/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * AES-XTS for modern x86_64 CPUs
 *
 * Copyright 2024 Google LLC
 *
 * Author: Eric Biggers <[email protected]>
 */

/*
 * This file implements AES-XTS for modern x86_64 CPUs.  To handle the
 * complexities of coding for x86 SIMD, e.g. where every vector length needs
 * different code, it uses a macro to generate several implementations that
 * share similar source code but are targeted at different CPUs, listed below:
 *
 * AES-NI + AVX
 *    - 128-bit vectors (1 AES block per vector)
 *    - VEX-coded instructions
 *    - xmm0-xmm15
 *    - This is for older CPUs that lack VAES but do have AVX.
 *
 * VAES + VPCLMULQDQ + AVX2
 *    - 256-bit vectors (2 AES blocks per vector)
 *    - VEX-coded instructions
 *    - ymm0-ymm15
 *    - This is for CPUs that have VAES but lack AVX512 or AVX10,
 *      e.g. Intel's Alder Lake and AMD's Zen 3.
 *
 * VAES + VPCLMULQDQ + AVX10/256 + BMI2
 *    - 256-bit vectors (2 AES blocks per vector)
 *    - EVEX-coded instructions
 *    - ymm0-ymm31
 *    - This is for CPUs that have AVX512 but where using zmm registers causes
 *      downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
 *    - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
 *      To avoid confusion with 512-bit, we just write AVX10/256.
 *
 * VAES + VPCLMULQDQ + AVX10/512 + BMI2
 *    - Same as the previous one, but upgrades to 512-bit vectors
 *      (4 AES blocks per vector) in zmm0-zmm31.
 *    - This is for CPUs that have good AVX512 or AVX10/512 support.
 *
 * This file doesn't have an implementation for AES-NI alone (without AVX), as
 * the lack of VEX would make all the assembly code different.
 *
 * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of
 * the XTS tweaks.  This avoids a bottleneck.  Currently there don't seem to be
 * any CPUs that support VAES but not VPCLMULQDQ.  If that changes, we might
 * need to start also providing an implementation using VAES alone.
 *
 * The AES-XTS implementations in this file support everything required by the
 * crypto API, including support for arbitrary input lengths and multi-part
 * processing.  However, they are most heavily optimized for the common case of
 * power-of-2 length inputs that are processed in a single part (disk sectors).
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>

.section .rodata
.p2align 4
.Lgf_poly:
	// The low 64 bits of this value represent the polynomial x^7 + x^2 + x
	// + 1.  It is the value that must be XOR'd into the low 64 bits of the
	// tweak each time a 1 is carried out of the high 64 bits.
	//
	// The high 64 bits of this value is just the internal carry bit that
	// exists when there's a carry out of the low 64 bits of the tweak.
	.quad	0x87, 1

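	// For reference, the tweak update that this constant supports is, in
	// scalar terms, multiplication by x in GF(2^128) with the reducing
	// polynomial x^128 + x^7 + x^2 + x + 1.  A minimal C sketch of that
	// update (purely illustrative; not part of this file or the build,
	// and using kernel-style u64):
	//
	//	/* t[0] = low 64 bits of the tweak, t[1] = high 64 bits */
	//	static void gf128_mul_x(u64 t[2])
	//	{
	//		u64 carry = t[1] >> 63;
	//
	//		t[1] = (t[1] << 1) | (t[0] >> 63);
	//		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
	//	}
	//
	// The _next_tweak macro below computes the same thing with SIMD
	// instructions, using .Lgf_poly both for the 0x87 reduction and for
	// the low-to-high carry bit.
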
	// This table contains constants for vpshufb and vpblendvb, used to
	// handle variable byte shifts and blending during ciphertext stealing
	// on CPUs that don't support AVX10-style masking.
.Lcts_permute_table:
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
.text

.macro	_define_Vi	i
.if VL == 16
	.set	V\i,		%xmm\i
.elseif VL == 32
	.set	V\i,		%ymm\i
.elseif VL == 64
	.set	V\i,		%zmm\i
.else
	.error "Unsupported Vector Length (VL)"
.endif
.endm

.macro _define_aliases
	// Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
	// are available, that map to the xmm, ymm, or zmm registers according
	// to the selected Vector Length (VL).
.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
	_define_Vi	\i
.endr
.if USE_AVX10
.irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
	_define_Vi	\i
.endr
.endif

	// Function parameters
	.set	KEY,		%rdi	// Initially points to crypto_aes_ctx, then is
					// advanced to point to 7th-from-last round key
	.set	SRC,		%rsi	// Pointer to next source data
	.set	DST,		%rdx	// Pointer to next destination data
	.set	LEN,		%ecx	// Remaining length in bytes
	.set	LEN8,		%cl
	.set	LEN64,		%rcx
	.set	TWEAK,		%r8	// Pointer to next tweak

	// %rax holds the AES key length in bytes.
	.set	KEYLEN,		%eax
	.set	KEYLEN64,	%rax

	// %r9-r11 are available as temporaries.

	// V0-V3 hold the data blocks during the main loop, or temporary values
	// otherwise.  V4-V5 hold temporary values.

	// V6-V9 hold XTS tweaks.  Each 128-bit lane holds one tweak.
	.set	TWEAK0_XMM,	%xmm6
	.set	TWEAK0,		V6
	.set	TWEAK1_XMM,	%xmm7
	.set	TWEAK1,		V7
	.set	TWEAK2,		V8
	.set	TWEAK3,		V9

	// V10-V13 are used for computing the next values of TWEAK[0-3].
	.set	NEXT_TWEAK0,	V10
	.set	NEXT_TWEAK1,	V11
	.set	NEXT_TWEAK2,	V12
	.set	NEXT_TWEAK3,	V13

	// V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
	.set	GF_POLY_XMM,	%xmm14
	.set	GF_POLY,	V14

	// V15 holds the key for AES "round 0", copied to all 128-bit lanes.
	.set	KEY0_XMM,	%xmm15
	.set	KEY0,		V15

	// If 32 SIMD registers are available, then V16-V29 hold the remaining
	// AES round keys, copied to all 128-bit lanes.
	//
	// AES-128, AES-192, and AES-256 use different numbers of round keys.
	// To allow handling all three variants efficiently, we align the round
	// keys to the *end* of this register range.  I.e., AES-128 uses
	// KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
	// (All also use KEY0 for the XOR-only "round" at the beginning.)
.if USE_AVX10
	.set	KEY1_XMM,	%xmm16
	.set	KEY1,		V16
	.set	KEY2_XMM,	%xmm17
	.set	KEY2,		V17
	.set	KEY3_XMM,	%xmm18
	.set	KEY3,		V18
	.set	KEY4_XMM,	%xmm19
	.set	KEY4,		V19
	.set	KEY5_XMM,	%xmm20
	.set	KEY5,		V20
	.set	KEY6_XMM,	%xmm21
	.set	KEY6,		V21
	.set	KEY7_XMM,	%xmm22
	.set	KEY7,		V22
	.set	KEY8_XMM,	%xmm23
	.set	KEY8,		V23
	.set	KEY9_XMM,	%xmm24
	.set	KEY9,		V24
	.set	KEY10_XMM,	%xmm25
	.set	KEY10,		V25
	.set	KEY11_XMM,	%xmm26
	.set	KEY11,		V26
	.set	KEY12_XMM,	%xmm27
	.set	KEY12,		V27
	.set	KEY13_XMM,	%xmm28
	.set	KEY13,		V28
	.set	KEY14_XMM,	%xmm29
	.set	KEY14,		V29
.endif
	// V30-V31 are currently unused.
.endm

// Move a vector between memory and a register.
// The register operand must be in the first 16 vector registers.
.macro	_vmovdqu	src, dst
.if VL < 64
	vmovdqu		\src, \dst
.else
	vmovdqu8	\src, \dst
.endif
.endm

// Broadcast a 128-bit value into a vector.
.macro	_vbroadcast128	src, dst
.if VL == 16 && !USE_AVX10
	vmovdqu		\src, \dst
.elseif VL == 32 && !USE_AVX10
	vbroadcasti128	\src, \dst
.else
	vbroadcasti32x4	\src, \dst
.endif
.endm

// XOR two vectors together.
// Any register operands must be in the first 16 vector registers.
.macro	_vpxor	src1, src2, dst
.if VL < 64
	vpxor		\src1, \src2, \dst
.else
	vpxord		\src1, \src2, \dst
.endif
.endm

// XOR three vectors together.
.macro	_xor3	src1, src2, src3_and_dst
.if USE_AVX10
	// vpternlogd with immediate 0x96 is a three-argument XOR.
	vpternlogd	$0x96, \src1, \src2, \src3_and_dst
.else
	vpxor		\src1, \src3_and_dst, \src3_and_dst
	vpxor		\src2, \src3_and_dst, \src3_and_dst
.endif
.endm

// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
// (by multiplying by the polynomial 'x') and write it to \dst.
.macro	_next_tweak	src, tmp, dst
	vpshufd		$0x13, \src, \tmp
	vpaddq		\src, \src, \dst
	vpsrad		$31, \tmp, \tmp
.if USE_AVX10
	vpternlogd	$0x78, GF_POLY_XMM, \tmp, \dst
.else
	vpand		GF_POLY_XMM, \tmp, \tmp
	vpxor		\tmp, \dst, \dst
.endif
.endm

// Given the XTS tweak(s) in the vector \src, compute the next vector of
// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst.
//
// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute
// all tweaks in the vector in parallel.  If VL=16, we just do the regular
// computation without vpclmulqdq, as it's the faster method for a single tweak.
.macro	_next_tweakvec	src, tmp1, tmp2, dst
.if VL == 16
	_next_tweak	\src, \tmp1, \dst
.else
	vpsrlq		$64 - VL/16, \src, \tmp1
	vpclmulqdq	$0x01, GF_POLY, \tmp1, \tmp2
	vpslldq		$8, \tmp1, \tmp1
	vpsllq		$VL/16, \src, \dst
	_xor3		\tmp1, \tmp2, \dst
.endif
.endm

// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
// store them in the vector registers TWEAK0-TWEAK3.  Clobbers V0-V5.
.macro	_compute_first_set_of_tweaks
	vmovdqu		(TWEAK), TWEAK0_XMM
	_vbroadcast128	.Lgf_poly(%rip), GF_POLY
.if VL == 16
	// With VL=16, multiplying by x serially is fastest.
	_next_tweak	TWEAK0, %xmm0, TWEAK1
	_next_tweak	TWEAK1, %xmm0, TWEAK2
	_next_tweak	TWEAK2, %xmm0, TWEAK3
.else
.if VL == 32
	// Compute the second block of TWEAK0.
	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
	vinserti128	$1, %xmm1, TWEAK0, TWEAK0
.elseif VL == 64
	// Compute the remaining blocks of TWEAK0.
	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
	_next_tweak	%xmm1, %xmm0, %xmm2
	_next_tweak	%xmm2, %xmm0, %xmm3
	vinserti32x4	$1, %xmm1, TWEAK0, TWEAK0
	vinserti32x4	$2, %xmm2, TWEAK0, TWEAK0
	vinserti32x4	$3, %xmm3, TWEAK0, TWEAK0
.endif
	// Compute TWEAK[1-3] from TWEAK0.
	vpsrlq		$64 - 1*VL/16, TWEAK0, V0
	vpsrlq		$64 - 2*VL/16, TWEAK0, V2
	vpsrlq		$64 - 3*VL/16, TWEAK0, V4
	vpclmulqdq	$0x01, GF_POLY, V0, V1
	vpclmulqdq	$0x01, GF_POLY, V2, V3
	vpclmulqdq	$0x01, GF_POLY, V4, V5
	vpslldq		$8, V0, V0
	vpslldq		$8, V2, V2
	vpslldq		$8, V4, V4
	vpsllq		$1*VL/16, TWEAK0, TWEAK1
	vpsllq		$2*VL/16, TWEAK0, TWEAK2
	vpsllq		$3*VL/16, TWEAK0, TWEAK3
.if USE_AVX10
	vpternlogd	$0x96, V0, V1, TWEAK1
	vpternlogd	$0x96, V2, V3, TWEAK2
	vpternlogd	$0x96, V4, V5, TWEAK3
.else
	vpxor		V0, TWEAK1, TWEAK1
	vpxor		V2, TWEAK2, TWEAK2
	vpxor		V4, TWEAK3, TWEAK3
	vpxor		V1, TWEAK1, TWEAK1
	vpxor		V3, TWEAK2, TWEAK2
	vpxor		V5, TWEAK3, TWEAK3
.endif
.endif
.endm

// Do one step in computing the next set of tweaks using the method of just
// multiplying by x repeatedly (the same method _next_tweak uses).
.macro	_tweak_step_mulx	i
.if \i == 0
	.set PREV_TWEAK, TWEAK3
	.set NEXT_TWEAK, NEXT_TWEAK0
.elseif \i == 5
	.set PREV_TWEAK, NEXT_TWEAK0
	.set NEXT_TWEAK, NEXT_TWEAK1
.elseif \i == 10
	.set PREV_TWEAK, NEXT_TWEAK1
	.set NEXT_TWEAK, NEXT_TWEAK2
.elseif \i == 15
	.set PREV_TWEAK, NEXT_TWEAK2
	.set NEXT_TWEAK, NEXT_TWEAK3
.endif
.if \i >= 0 && \i < 20 && \i % 5 == 0
	vpshufd		$0x13, PREV_TWEAK, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 1
	vpaddq		PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
.elseif \i >= 0 && \i < 20 && \i % 5 == 2
	vpsrad		$31, V5, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 3
	vpand		GF_POLY, V5, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 4
	vpxor		V5, NEXT_TWEAK, NEXT_TWEAK
.elseif \i == 1000
	vmovdqa		NEXT_TWEAK0, TWEAK0
	vmovdqa		NEXT_TWEAK1, TWEAK1
	vmovdqa		NEXT_TWEAK2, TWEAK2
	vmovdqa		NEXT_TWEAK3, TWEAK3
.endif
.endm

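// For reference, the VPCLMULQDQ-based tweak computation used below (and in
// _next_tweakvec above) is the SIMD form of the following scalar identity,
// where t is a 128-bit tweak, k is a small shift count, and clmul() denotes
// carryless (GF(2)) multiplication:
//
//	t * x^k  mod  (x^128 + x^7 + x^2 + x + 1)
//		== (t << k) ^ clmul(t >> (128 - k), 0x87)
//
// That is, the k bits shifted out of the top are reduced back in by a
// carryless multiplication by 0x87, since x^128 is congruent to
// x^7 + x^2 + x + 1 modulo the reducing polynomial.
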
// Do one step in computing the next set of tweaks using the VPCLMULQDQ method
// (the same method _next_tweakvec uses for VL > 16).  This means multiplying
// each tweak by x^(4*VL/16) independently.
//
// Since 4*VL/16 is a multiple of 8 when VL > 16 (which it is here), the needed
// shift amounts are byte-aligned, which allows the use of vpsrldq and vpslldq
// to do 128-bit wide shifts.  The 128-bit left shift (vpslldq) saves
// instructions directly.  The 128-bit right shift (vpsrldq) performs better
// than a 64-bit right shift on Intel CPUs in the context where it is used here,
// because it runs on a different execution port from the AES instructions.
.macro	_tweak_step_pclmul	i
.if \i == 0
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
.elseif \i == 2
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
.elseif \i == 4
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
.elseif \i == 6
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
.elseif \i == 8
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
.elseif \i == 10
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
.elseif \i == 12
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
.elseif \i == 14
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
.elseif \i == 1000
	vpslldq		$(4*VL/16) / 8, TWEAK0, TWEAK0
	vpslldq		$(4*VL/16) / 8, TWEAK1, TWEAK1
	vpslldq		$(4*VL/16) / 8, TWEAK2, TWEAK2
	vpslldq		$(4*VL/16) / 8, TWEAK3, TWEAK3
	_vpxor		NEXT_TWEAK0, TWEAK0, TWEAK0
	_vpxor		NEXT_TWEAK1, TWEAK1, TWEAK1
	_vpxor		NEXT_TWEAK2, TWEAK2, TWEAK2
	_vpxor		NEXT_TWEAK3, TWEAK3, TWEAK3
.endif
.endm

// _tweak_step does one step of the computation of the next set of tweaks from
// TWEAK[0-3].  To complete all steps, this is invoked with increasing values of
// \i that include at least 0 through 19, then 1000 which signals the last step.
//
// This is used to interleave the computation of the next set of tweaks with the
// AES en/decryptions, which increases performance in some cases.  Clobbers V5.
.macro	_tweak_step	i
.if VL == 16
	_tweak_step_mulx	\i
.else
	_tweak_step_pclmul	\i
.endif
.endm

.macro	_setup_round_keys	enc

	// Select either the encryption round keys or the decryption round keys.
.if \enc
	.set	OFFS, 0
.else
	.set	OFFS, 240
.endif

	// Load the round key for "round 0".
	_vbroadcast128	OFFS(KEY), KEY0

	// Increment KEY to make it so that 7*16(KEY) is the last round key.
	// For AES-128, increment by 3*16, resulting in the 10 round keys (not
	// counting the zero-th round key which was just loaded into KEY0) being
	// -2*16(KEY) through 7*16(KEY).  For AES-192, increment by 5*16 and use
	// 12 round keys -4*16(KEY) through 7*16(KEY).  For AES-256, increment
	// by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
	//
	// This rebasing provides two benefits.  First, it makes the offset to
	// any round key be in the range [-96, 112], fitting in a signed byte.
	// This shortens VEX-encoded instructions that access the later round
	// keys which otherwise would need 4-byte offsets.  Second, it makes it
	// easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
	// beginning.  Skipping rounds at the end doesn't work as well because
	// the last round needs different instructions.
	//
	// An alternative approach would be to roll up all the round loops.  We
	// don't do that because (a) it isn't compatible with caching the round
	// keys in registers which we do when possible (see below), (b) we
	// interleave the AES rounds with the XTS tweak computation, and (c) it
	// seems unwise to rely *too* heavily on the CPU's branch predictor.
	lea		OFFS-16(KEY, KEYLEN64, 4), KEY

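	// To spell out the arithmetic behind "7*16(KEY) is the last round
	// key": AES-128/192/256 use 11/13/15 round keys of 16 bytes each
	// (i.e. KEYLEN/4 + 7 of them), so the last one lives at byte offset
	// OFFS + 4*KEYLEN + 6*16 within the key struct.  The lea above adds
	// OFFS + 4*KEYLEN - 16 to KEY, which leaves the last round key at
	// offset (4*KEYLEN + 96) - (4*KEYLEN - 16) = 112 = 7*16 regardless of
	// the key length.
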
	// If all 32 SIMD registers are available, cache all the round keys.
.if USE_AVX10
	cmp		$24, KEYLEN
	jl		.Laes128\@
	je		.Laes192\@
	_vbroadcast128	-6*16(KEY), KEY1
	_vbroadcast128	-5*16(KEY), KEY2
.Laes192\@:
	_vbroadcast128	-4*16(KEY), KEY3
	_vbroadcast128	-3*16(KEY), KEY4
.Laes128\@:
	_vbroadcast128	-2*16(KEY), KEY5
	_vbroadcast128	-1*16(KEY), KEY6
	_vbroadcast128	0*16(KEY), KEY7
	_vbroadcast128	1*16(KEY), KEY8
	_vbroadcast128	2*16(KEY), KEY9
	_vbroadcast128	3*16(KEY), KEY10
	_vbroadcast128	4*16(KEY), KEY11
	_vbroadcast128	5*16(KEY), KEY12
	_vbroadcast128	6*16(KEY), KEY13
	_vbroadcast128	7*16(KEY), KEY14
.endif
.endm

// Do a single non-last round of AES encryption (if \enc==1) or decryption (if
// \enc==0) on the block(s) in \data using the round key(s) in \key.  The
// register length determines the number of AES blocks en/decrypted.
.macro	_vaes	enc, key, data
.if \enc
	vaesenc		\key, \data, \data
.else
	vaesdec		\key, \data, \data
.endif
.endm

// Same as _vaes, but does the last round.
.macro	_vaeslast	enc, key, data
.if \enc
	vaesenclast	\key, \data, \data
.else
	vaesdeclast	\key, \data, \data
.endif
.endm

// Do a single non-last round of AES en/decryption on the block(s) in \data,
// using the same key for all block(s).  The round key is loaded from the
// appropriate register or memory location for round \i.  May clobber \tmp.
.macro _vaes_1x	enc, i, xmm_suffix, data, tmp
.if USE_AVX10
	_vaes	\enc, KEY\i\xmm_suffix, \data
.else
.ifnb \xmm_suffix
	_vaes	\enc, (\i-7)*16(KEY), \data
.else
	_vbroadcast128	(\i-7)*16(KEY), \tmp
	_vaes	\enc, \tmp, \data
.endif
.endif
.endm

// Do a single non-last round of AES en/decryption on the blocks in registers
// V0-V3, using the same key for all blocks.  The round key is loaded from the
// appropriate register or memory location for round \i.  In addition, does two
// steps of the computation of the next set of tweaks.  May clobber V4 and V5.
.macro _vaes_4x	enc, i
.if USE_AVX10
	_tweak_step	(2*(\i-5))
	_vaes		\enc, KEY\i, V0
	_vaes		\enc, KEY\i, V1
	_tweak_step	(2*(\i-5) + 1)
	_vaes		\enc, KEY\i, V2
	_vaes		\enc, KEY\i, V3
.else
	_vbroadcast128	(\i-7)*16(KEY), V4
	_tweak_step	(2*(\i-5))
	_vaes		\enc, V4, V0
	_vaes		\enc, V4, V1
	_tweak_step	(2*(\i-5) + 1)
	_vaes		\enc, V4, V2
	_vaes		\enc, V4, V3
.endif
.endm

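// For reference, the per-block operation that _aes_crypt below implements is
// the standard XTS formula:
//
//	C = E_K(P ^ T) ^ T		(encryption)
//	P = D_K(C ^ T) ^ T		(decryption)
//
// A minimal C-style sketch of the encryption direction (illustrative only;
// xor_block() and aes_encrypt_block() are hypothetical helpers, not defined
// anywhere in this file):
//
//	xor_block(tmp, src, tweak);		/* P ^ T */
//	aes_encrypt_block(key, tmp, tmp);	/* E_K(P ^ T) */
//	xor_block(dst, tmp, tweak);		/* C = E_K(P ^ T) ^ T */
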
// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
// then XOR with \tweak again) of the block(s) in \data.  To process a single
// block, use xmm registers and set \xmm_suffix=_XMM.  To process a vector of
// length VL, use V* registers and leave \xmm_suffix empty.  Clobbers \tmp.
.macro	_aes_crypt	enc, xmm_suffix, tweak, data, tmp
	_xor3		KEY0\xmm_suffix, \tweak, \data
	cmp		$24, KEYLEN
	jl		.Laes128\@
	je		.Laes192\@
	_vaes_1x	\enc, 1, \xmm_suffix, \data, tmp=\tmp
	_vaes_1x	\enc, 2, \xmm_suffix, \data, tmp=\tmp
.Laes192\@:
	_vaes_1x	\enc, 3, \xmm_suffix, \data, tmp=\tmp
	_vaes_1x	\enc, 4, \xmm_suffix, \data, tmp=\tmp
.Laes128\@:
.irp i, 5,6,7,8,9,10,11,12,13
	_vaes_1x	\enc, \i, \xmm_suffix, \data, tmp=\tmp
.endr
.if USE_AVX10
	vpxord		KEY14\xmm_suffix, \tweak, \tmp
.else
.ifnb \xmm_suffix
	vpxor		7*16(KEY), \tweak, \tmp
.else
	_vbroadcast128	7*16(KEY), \tmp
	vpxor		\tweak, \tmp, \tmp
.endif
.endif
	_vaeslast	\enc, \tmp, \data
.endm

.macro _aes_xts_crypt	enc
	_define_aliases

.if !\enc
	// When decrypting a message whose length isn't a multiple of the AES
	// block length, exclude the last full block from the main loop by
	// subtracting 16 from LEN.  This is needed because ciphertext stealing
	// decryption uses the last two tweaks in reverse order.  We'll handle
	// the last full block and the partial block specially at the end.
	lea		-16(LEN), %eax
	test		$15, LEN8
	cmovnz		%eax, LEN
.endif

	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
	movl		480(KEY), KEYLEN

	// Setup the pointer to the round keys and cache as many as possible.
	_setup_round_keys	\enc

	// Compute the first set of tweaks TWEAK[0-3].
	_compute_first_set_of_tweaks

	add		$-4*VL, LEN	// shorter than 'sub 4*VL' when VL=32
	jl		.Lhandle_remainder\@

.Lmain_loop\@:
	// This is the main loop, en/decrypting 4*VL bytes per iteration.

	// XOR each source block with its tweak and the zero-th round key.
.if USE_AVX10
	_vmovdqu	0*VL(SRC), V0
	_vmovdqu	1*VL(SRC), V1
	_vmovdqu	2*VL(SRC), V2
	_vmovdqu	3*VL(SRC), V3
	vpternlogd	$0x96, TWEAK0, KEY0, V0
	vpternlogd	$0x96, TWEAK1, KEY0, V1
	vpternlogd	$0x96, TWEAK2, KEY0, V2
	vpternlogd	$0x96, TWEAK3, KEY0, V3
.else
	vpxor		0*VL(SRC), KEY0, V0
	vpxor		1*VL(SRC), KEY0, V1
	vpxor		2*VL(SRC), KEY0, V2
	vpxor		3*VL(SRC), KEY0, V3
	vpxor		TWEAK0, V0, V0
	vpxor		TWEAK1, V1, V1
	vpxor		TWEAK2, V2, V2
	vpxor		TWEAK3, V3, V3
.endif
	cmp		$24, KEYLEN
	jl		.Laes128\@
	je		.Laes192\@
	// Do all the AES rounds on the data blocks, interleaved with
	// the computation of the next set of tweaks.
	_vaes_4x	\enc, 1
	_vaes_4x	\enc, 2
.Laes192\@:
	_vaes_4x	\enc, 3
	_vaes_4x	\enc, 4
.Laes128\@:
.irp i, 5,6,7,8,9,10,11,12,13
	_vaes_4x	\enc, \i
.endr
	// Do the last AES round, then XOR the results with the tweaks again.
	// Reduce latency by doing the XOR before the vaesenclast, utilizing the
	// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a)
	// (and likewise for vaesdeclast).
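	//
	// That property holds because the last AES round consists of just
	// SubBytes, ShiftRows, and the final AddRoundKey, i.e.
	//	vaesenclast(key, a) = ShiftRows(SubBytes(a)) ^ key,
	// so XOR'ing a value into the result is the same as XOR'ing it into
	// the key first.  (vaesdeclast is InvShiftRows and InvSubBytes
	// followed by the key XOR, so the same argument applies.)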
.if USE_AVX10
	_tweak_step	18
	_tweak_step	19
	vpxord		TWEAK0, KEY14, V4
	vpxord		TWEAK1, KEY14, V5
	_vaeslast	\enc, V4, V0
	_vaeslast	\enc, V5, V1
	vpxord		TWEAK2, KEY14, V4
	vpxord		TWEAK3, KEY14, V5
	_vaeslast	\enc, V4, V2
	_vaeslast	\enc, V5, V3
.else
	_vbroadcast128	7*16(KEY), V4
	_tweak_step	18	// uses V5
	_tweak_step	19	// uses V5
	vpxor		TWEAK0, V4, V5
	_vaeslast	\enc, V5, V0
	vpxor		TWEAK1, V4, V5
	_vaeslast	\enc, V5, V1
	vpxor		TWEAK2, V4, V5
	vpxor		TWEAK3, V4, V4
	_vaeslast	\enc, V5, V2
	_vaeslast	\enc, V4, V3
.endif

	// Store the destination blocks.
	_vmovdqu	V0, 0*VL(DST)
	_vmovdqu	V1, 1*VL(DST)
	_vmovdqu	V2, 2*VL(DST)
	_vmovdqu	V3, 3*VL(DST)

	// Finish computing the next set of tweaks.
	_tweak_step	1000

	sub		$-4*VL, SRC	// shorter than 'add 4*VL' when VL=32
	sub		$-4*VL, DST
	add		$-4*VL, LEN
	jge		.Lmain_loop\@

	// Check for the uncommon case where the data length isn't a multiple of
	// 4*VL.  Handle it out-of-line in order to optimize for the common
	// case.  In the common case, just fall through to the ret.
	test		$4*VL-1, LEN8
	jnz		.Lhandle_remainder\@
.Ldone\@:
	// Store the next tweak back to *TWEAK to support continuation calls.
	vmovdqu		TWEAK0_XMM, (TWEAK)
.if VL > 16
	vzeroupper
.endif
	RET

.Lhandle_remainder\@:

	// En/decrypt any remaining full blocks, one vector at a time.
.if VL > 16
	add		$3*VL, LEN	// Undo extra sub of 4*VL, then sub VL.
	jl		.Lvec_at_a_time_done\@
.Lvec_at_a_time\@:
	_vmovdqu	(SRC), V0
	_aes_crypt	\enc, , TWEAK0, V0, tmp=V1
	_vmovdqu	V0, (DST)
	_next_tweakvec	TWEAK0, V0, V1, TWEAK0
	add		$VL, SRC
	add		$VL, DST
	sub		$VL, LEN
	jge		.Lvec_at_a_time\@
.Lvec_at_a_time_done\@:
	add		$VL-16, LEN	// Undo extra sub of VL, then sub 16.
.else
	add		$4*VL-16, LEN	// Undo extra sub of 4*VL, then sub 16.
.endif

	// En/decrypt any remaining full blocks, one at a time.
	jl		.Lblock_at_a_time_done\@
.Lblock_at_a_time\@:
	vmovdqu		(SRC), %xmm0
	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1
	vmovdqu		%xmm0, (DST)
	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK0_XMM
	add		$16, SRC
	add		$16, DST
	sub		$16, LEN
	jge		.Lblock_at_a_time\@
.Lblock_at_a_time_done\@:
	add		$16, LEN	// Undo the extra sub of 16.
	// Now 0 <= LEN <= 15.  If LEN is zero, we're done.
	jz		.Ldone\@

	// Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN.
	// Do ciphertext stealing to process the last 16 + LEN bytes.

.if \enc
	// If encrypting, the main loop already encrypted the last full block to
	// create the CTS intermediate ciphertext.  Prepare for the rest of CTS
	// by rewinding the pointers and loading the intermediate ciphertext.
	sub		$16, SRC
	sub		$16, DST
	vmovdqu		(DST), %xmm0
.else
	// If decrypting, the main loop didn't decrypt the last full block
	// because CTS decryption uses the last two tweaks in reverse order.
	// Do it now by advancing the tweak and decrypting the last full block.
	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK1_XMM
	vmovdqu		(SRC), %xmm0
	_aes_crypt	\enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1
.endif

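	// Both swap implementations below have the same net effect, shown here
	// as a C sketch (illustrative only; 'block' stands for the 16-byte
	// en/decryption of the last full block currently held in %xmm0):
	//
	//	u8 stolen[16];
	//
	//	memcpy(stolen, SRC + 16, LEN);	/* read src partial block first,
	//					   so in-place operation works */
	//	memcpy(DST + 16, block, LEN);	/* write dst partial block */
	//	memcpy(block, stolen, LEN);	/* block := src partial block ||
	//					   rest of block */
	//
	// The resulting block is then en/decrypted once more, with the first
	// tweak, and stored at DST as the last full output block.
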
.if USE_AVX10
	// Create a mask that has the first LEN bits set.
	mov		$-1, %r9d
	bzhi		LEN, %r9d, %r9d
	kmovd		%r9d, %k1

	// Swap the first LEN bytes of the en/decryption of the last full block
	// with the partial block.  Note that to support in-place en/decryption,
	// the load from the src partial block must happen before the store to
	// the dst partial block.
	vmovdqa		%xmm0, %xmm1
	vmovdqu8	16(SRC), %xmm0{%k1}
	vmovdqu8	%xmm1, 16(DST){%k1}
.else
	lea		.Lcts_permute_table(%rip), %r9

	// Load the src partial block, left-aligned.  Note that to support
	// in-place en/decryption, this must happen before the store to the dst
	// partial block.
	vmovdqu		(SRC, LEN64, 1), %xmm1

	// Shift the first LEN bytes of the en/decryption of the last full block
	// to the end of a register, then store it to DST+LEN.  This stores the
	// dst partial block.  It also writes to the second part of the dst last
	// full block, but that part is overwritten later.
	vpshufb		(%r9, LEN64, 1), %xmm0, %xmm2
	vmovdqu		%xmm2, (DST, LEN64, 1)

	// Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
	sub		LEN64, %r9
	vmovdqu		32(%r9), %xmm3

	// Shift the src partial block to the beginning of its register.
	vpshufb		%xmm3, %xmm1, %xmm1

	// Do a blend to generate the src partial block followed by the second
	// part of the en/decryption of the last full block.
	vpblendvb	%xmm3, %xmm0, %xmm1, %xmm0
.endif
	// En/decrypt again and store the last full block.
	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1
	vmovdqu		%xmm0, (DST)
	jmp		.Ldone\@
.endm

// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
//			   u8 iv[AES_BLOCK_SIZE]);
//
// Encrypt |iv| using the AES key |tweak_key| to get the first tweak.  Assumes
// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10.
SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
	.set	TWEAK_KEY,	%rdi
	.set	IV,		%rsi
	.set	KEYLEN,		%eax
	.set	KEYLEN64,	%rax

	vmovdqu		(IV), %xmm0
	vpxor		(TWEAK_KEY), %xmm0, %xmm0
	movl		480(TWEAK_KEY), KEYLEN
	lea		-16(TWEAK_KEY, KEYLEN64, 4), TWEAK_KEY
	cmp		$24, KEYLEN
	jl		.Lencrypt_iv_aes128
	je		.Lencrypt_iv_aes192
	vaesenc		-6*16(TWEAK_KEY), %xmm0, %xmm0
	vaesenc		-5*16(TWEAK_KEY), %xmm0, %xmm0
.Lencrypt_iv_aes192:
	vaesenc		-4*16(TWEAK_KEY), %xmm0, %xmm0
	vaesenc		-3*16(TWEAK_KEY), %xmm0, %xmm0
.Lencrypt_iv_aes128:
.irp i, -2,-1,0,1,2,3,4,5,6
	vaesenc		\i*16(TWEAK_KEY), %xmm0, %xmm0
.endr
	vaesenclast	7*16(TWEAK_KEY), %xmm0, %xmm0
	vmovdqu		%xmm0, (IV)
	RET
SYM_FUNC_END(aes_xts_encrypt_iv)

// Below are the actual AES-XTS encryption and decryption functions,
// instantiated from the above macro.  They all have the following prototype:
//
// void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
//			  const u8 *src, u8 *dst, int len,
//			  u8 tweak[AES_BLOCK_SIZE]);
//
// |key| is the data key.  |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done.  This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
// |len| must be a multiple of 16 except on the last call.  If |len| is a
// multiple of 16, then this function updates |tweak| to contain the next tweak.
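//
// For reference, a typical call sequence from C looks roughly like the
// following (illustrative only; the local variable names are made up):
//
//	u8 tweak[AES_BLOCK_SIZE];
//
//	memcpy(tweak, iv, AES_BLOCK_SIZE);
//	aes_xts_encrypt_iv(tweak_key, tweak);
//	aes_xts_encrypt_aesni_avx(key, src, dst, len, tweak);
//
// For multi-part processing, call the same function again with the updated
// |tweak|; every |len| except the final one must be a multiple of 16.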

.set	VL, 16
.set	USE_AVX10, 0
SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_aesni_avx)

#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
.set	VL, 32
.set	USE_AVX10, 0
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)

.set	VL, 32
.set	USE_AVX10, 1
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)

.set	VL, 64
.set	USE_AVX10, 1
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
#endif	/* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */