1#! /usr/bin/env perl 2 3# Copyright (c) 2022, ARM Inc. 4# 5# Permission to use, copy, modify, and/or distribute this software for any 6# purpose with or without fee is hereby granted, provided that the above 7# copyright notice and this permission notice appear in all copies. 8# 9# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 12# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION 14# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN 15# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ 16 17#======================================================================== 18# Written by Fangming Fang <[email protected]> for the OpenSSL project, 19# derived from https://github.com/ARM-software/AArch64cryptolib, original 20# author Samuel Lee <[email protected]>. 
#========================================================================
#
# Approach - assume we don't want to reload constants, so reserve ~half of
# vector register file for constants
#
# main loop to act on 4 16B blocks per iteration, and then do modulo of the
# accumulated intermediate hashes from the 4 blocks
#
#  ____________________________________________________
# |                                                    |
# | PRE                                                |
# |____________________________________________________|
# |                |                |                  |
# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
# |________________|________________|__________________|
# |                |                |                  |
# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
# |________________|________________|__________________|
# |                |                |                  |
# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
# |________________|________________|__________________|
# |                |                |                  |
# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
# |________________|____(mostly)____|__________________|
# |                                                    |
# | MODULO                                             |
# |____________________________________________________|
#
# PRE: Ensure previously generated intermediate hash is aligned and merged
# with result for GHASH 4k+0
#
#     EXT low_acc, low_acc, low_acc, #8
#     EOR res_curr (4k+0), res_curr (4k+0), low_acc
#
# CTR block: Increment and byte reverse counter in scalar registers and transfer
# to SIMD registers
#
#     REV ctr32, rev_ctr32
#     ORR ctr64, constctr96_top32, ctr32, LSL #32
#     // Keeping this in scalar registers to free up space in SIMD RF
#     INS ctr_next.d[0], constctr96_bottom64
#     INS ctr_next.d[1], ctr64X
#     ADD rev_ctr32, #1
#
# AES block:
#
# Do AES encryption/decryption on CTR block X and EOR it with input block X.
# Take a 256-bit key below for example. We use a small trick here: load the
# input into scalar registers, EOR it with the last round key, and then
# transfer it to SIMD registers. Given we are very constrained in our ASIMD
# registers this is quite important.
#
# Encrypt:
#     LDR input_low, [ input_ptr ], #8
#     LDR input_high, [ input_ptr ], #8
#     EOR input_low, k14_low
#     EOR input_high, k14_high
#     INS res_curr.d[0], input_low
#     INS res_curr.d[1], input_high
#     AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k13
#     EOR res_curr, res_curr, ctr_curr
#     ST1 { res_curr.16b }, [ output_ptr ], #16
#
# Decrypt:
#     AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
#     AESE ctr_curr, k13
#     LDR res_curr, [ input_ptr ], #16
#     EOR res_curr, res_curr, ctr_curr
#     MOV output_low, res_curr.d[0]
#     MOV output_high, res_curr.d[1]
#     EOR output_low, k14_low
#     EOR output_high, k14_high
#     STP output_low, output_high, [ output_ptr ], #16
#
# GHASH block X:
#     Do 128b karatsuba polynomial multiplication on block. We only have
#     64b->128b polynomial multipliers; naively that means we need to do 4 64b
#     multiplies to generate a 128b.
#
# multiplication:
#     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^
#                   (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
#
#     The idea behind Karatsuba multiplication is that we can do just 3 64b
#     multiplies:
#     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^
#                   (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^
#                   Pmull(Al,Bl))<<64
#
#     There is some complication here because the bit order of GHASH's PMULL is
#     reversed compared to elsewhere, so we are multiplying with "twisted"
#     powers of H
#
# Note: We can PMULL directly into the acc_x in first GHASH of the loop
#
# Note: For scheduling big cores we want to split the processing to happen over
#       two loop iterations - otherwise the critical path latency dominates the
#       performance.
#
#       This has a knock-on effect on register pressure, so we have to be a bit
#       more clever with our temporary registers than indicated here
#
#     REV64 res_curr, res_curr
#     INS t_m.d[0], res_curr.d[1]
#     EOR t_m.8B, t_m.8B, res_curr.8B
#     PMULL2 t_h, res_curr, HX
#     PMULL t_l, res_curr, HX
#     PMULL t_m, t_m, HX_k
#     EOR acc_h, acc_h, t_h
#     EOR acc_l, acc_l, t_l
#     EOR acc_m, acc_m, t_m
#
# MODULO: take the partial accumulators (~representing sum of 256b
#         multiplication results), from GHASH and do modulo reduction on them
#         There is some complication here because the bit order of GHASH's
#         PMULL is reversed compared to elsewhere, so we are doing modulo with
#         a reversed constant
#
#     EOR acc_m, acc_m, acc_h
#     EOR acc_m, acc_m, acc_l // Finish off karatsuba processing
#     PMULL t_mod, acc_h, mod_constant
#     EXT acc_h, acc_h, acc_h, #8
#     EOR acc_m, acc_m, acc_h
#     EOR acc_m, acc_m, t_mod
#     PMULL acc_h, acc_m, mod_constant
#     EXT acc_m, acc_m, acc_m, #8
#     EOR acc_l, acc_l, acc_h
#     EOR acc_l, acc_l, acc_m
#
# This code was then modified to merge the AES-128-GCM, AES-192-GCM, and
# AES-256-GCM implementations into a single function to reduce size. We move the
# last two round keys into consistent registers across all sizes, as they're
# treated specially. Then, after rounds 0 through 8, we added some branches to
# conditionally run rounds 9-10 (AES-192 + AES-256) and 11-12 (AES-256), before
# merging back into code which finishes up the last two rounds.
#
# There is a decision to be made around how much parallel work goes before or
# after the conditional part. We attempted to preserve the original
# scheduling where possible, but it's possible other schedulings are more
# optimal with the current ordering.
# Command-line arguments: the assembler flavour and the output file name.
# Both are forwarded verbatim to arm-xlate.pl below.
$flavour = shift;
$output = shift;

# Locate arm-xlate.pl: first in the directory of this script, then under
# ../../../perlasm relative to it. $dir keeps its trailing path separator.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Route everything printed on STDOUT through arm-xlate.pl.
# The translator path and the output file are quoted so that paths containing
# spaces survive the shell, and a failure to start the pipe is fatal instead
# of silently producing no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# File prologue of the generated assembly. The text between the markers is
# emitted as-is, so nothing may be added inside it.
$code=<<___;
#include <openssl/arm_arch.h>
#if __ARM_MAX_ARCH__ >= 8

.arch armv8-a+crypto
.text
___

# Register names shared by the generated kernels.
$input_ptr="x0";  #argument block
$bit_length="x1";
$output_ptr="x2";
$current_tag="x3";
$Htable="x6";     # same register as input_l0/output_l0 declared below
$counter="x16";   # the enc kernel prologue copies x4 here
$cc="x8";         # the enc kernel prologue copies x5 here

{
# Scalar working registers. input_lN/input_hN and output_lN/output_hN are two
# names for the same registers: x6/x7 for block 0 and x19..x24 for blocks 1-3.
my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
my ($output_l0,$output_h0)=map("x$_",(6..7));

# rkN_l and rkN_h store the final round key, which is handled slightly
# differently because it is EORed through general-purpose registers.
# Counter handling and last-round-key scalars: x9..x15, plus the 32-bit views
# (w9, w11, w12) of three of them.
my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rkN_l,$rkN_h,$len)
    = map { sprintf("x%d", $_) } 9..15;
my $ctr32w = "w9";
my ($ctr96_t32w,$rctr32w) = map { sprintf("w%d", $_) } 11..12;

# Number of AES rounds, as a 64-bit register and its 32-bit view.
my ($rounds,$roundsw) = ("x17","w17");

# v0..v3 hold the counter blocks and v4..v7 the result blocks; the same eight
# registers are named as .16b vectors, bare vectors and 64-bit scalars.
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)
    = map { sprintf("v%d.16b", $_) } 0..7;
my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)
    = map { sprintf("v%d", $_) } 0..7;
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)
    = map { sprintf("d%d", $_) } 0..7;
my ($res0q,$res1q,$res2q,$res3q) = map { sprintf("q%d", $_) } 4..7;

# GHASH accumulators (high, mid, low) in v9..v11.
my ($acc_hb,$acc_mb,$acc_lb) = map { sprintf("v%d.16b", $_) } 9..11;
my ($acc_h,$acc_m,$acc_l)    = map { sprintf("v%d", $_) } 9..11;
my ($acc_hd,$acc_md,$acc_ld) = map { sprintf("d%d", $_) } 9..11;

# Powers of H in v12..v15, followed by h12k (v16) and h34k (v17).
my ($h1,$h2,$h3,$h4,$h12k,$h34k) = map { sprintf("v%d", $_) } 12..17;
my ($h1q,$h2q,$h3q,$h4q)         = map { sprintf("q%d", $_) } 12..15;
my ($h1b,$h2b,$h3b,$h4b)         = map { sprintf("v%d.16b", $_) } 12..15;

# GHASH temporaries. Several names deliberately share a register
# (v4: t1/t3/t4/t9, v5: t5/t7, v8: t0/t2/t6).
my ($t0,$t0d) = ("v8","d8");
my ($t1,$t1d) = ("v4","d4");
my ($t2,$t2d) = ("v8","d8");
my ($t3,$t3d) = ("v4","d4");
my ($t4,$t4d) = ("v4","d4");
my ($t5,$t5d) = ("v5","d5");
my ($t6,$t6d) = ("v8","d8");
my ($t7,$t7d) = ("v5","d5");
my ($t8,$t8d) = ("v6","d6");
my ($t9,$t9d) = ("v4","d4");

# Staging registers for input moved in from the scalar side; these are the
# same v4..v7 as res0..res3.
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)     = map { sprintf("v%d", $_) } 4..7;
my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d) = map { sprintf("d%d", $_) } 4..7;
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b) = map { sprintf("v%d.16b", $_) } 4..7;

# Reduction constant (shares v8 with t0/t2/t6) and its scratch register.
my ($mod_constant,$mod_constantd) = ("v8","d8");
my $mod_t = "v7";

# rkNm1 stores the second-to-last round key, which is handled slightly
# differently because it uses plain AESE instead of an AESE + AESMC macro-op.
272my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rkNm1)=map("v$_.16b",(18..31)); 273my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rkNm1q)=map("q$_",(18..31)); 274my $rk2q1="v20.1q"; 275my $rk3q1="v21.1q"; 276my $rk4v="v22"; 277my $rk4d="d22"; 278 279################################################################################ 280# size_t aes_gcm_enc_kernel(const uint8_t *in, 281# size_t len_bits, 282# uint8_t *out, 283# u64 *Xi, 284# uint8_t ivec[16], 285# const void *key, 286# const void *Htable); 287# 288$code.=<<___; 289.global aes_gcm_enc_kernel 290.type aes_gcm_enc_kernel,%function 291.align 4 292aes_gcm_enc_kernel: 293 AARCH64_SIGN_LINK_REGISTER 294 stp x29, x30, [sp, #-128]! 295 mov x29, sp 296 stp x19, x20, [sp, #16] 297 mov $counter, x4 298 mov $cc, x5 299 stp x21, x22, [sp, #32] 300 stp x23, x24, [sp, #48] 301 stp d8, d9, [sp, #64] 302 stp d10, d11, [sp, #80] 303 stp d12, d13, [sp, #96] 304 stp d14, d15, [sp, #112] 305 ldr $roundsw, [$cc, #240] 306 add $input_l1, $cc, $rounds, lsl #4 // borrow input_l1 for last key 307 ldp $rkN_l, $rkN_h, [$input_l1] // load round N keys 308 ldr $rkNm1q, [$input_l1, #-16] // load round N-1 keys 309 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 // end_input_ptr 310 lsr $main_end_input_ptr, $bit_length, #3 // byte_len 311 mov $len, $main_end_input_ptr 312 ldp $ctr96_b64x, $ctr96_t32x, [$counter] // ctr96_b64, ctr96_t32 313 ld1 { $ctr0b}, [$counter] // special case vector load initial counter so we can start first AES block as quickly as possible 314 sub $main_end_input_ptr, $main_end_input_ptr, #1 // byte_len - 1 315 ldr $rk0q, [$cc, #0] // load rk0 316 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 317 ldr $rk7q, [$cc, #112] // load rk7 318 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr 319 lsr $rctr32x, $ctr96_t32x, #32 320 fmov 
$ctr2d, $ctr96_b64x // CTR block 2 321 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w 322 rev $rctr32w, $rctr32w // rev_ctr32 323 fmov $ctr1d, $ctr96_b64x // CTR block 1 324 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 0 325 add $rctr32w, $rctr32w, #1 // increment rev_ctr32 326 rev $ctr32w, $rctr32w // CTR block 1 327 fmov $ctr3d, $ctr96_b64x // CTR block 3 328 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 1 329 add $rctr32w, $rctr32w, #1 // CTR block 1 330 ldr $rk1q, [$cc, #16] // load rk1 331 fmov $ctr1.d[1], $ctr32x // CTR block 1 332 rev $ctr32w, $rctr32w // CTR block 2 333 add $rctr32w, $rctr32w, #1 // CTR block 2 334 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 2 335 ldr $rk2q, [$cc, #32] // load rk2 336 fmov $ctr2.d[1], $ctr32x // CTR block 2 337 rev $ctr32w, $rctr32w // CTR block 3 338 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 1 339 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 3 340 fmov $ctr3.d[1], $ctr32x // CTR block 3 341 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 0 342 ldr $rk3q, [$cc, #48] // load rk3 343 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 2 344 ldr $rk6q, [$cc, #96] // load rk6 345 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 0 346 ldr $rk5q, [$cc, #80] // load rk5 347 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 1 348 ldr $h3q, [$Htable, #48] // load h3l | h3h 349 ext $h3b, $h3b, $h3b, #8 350 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 0 351 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 1 352 ldr $rk4q, [$cc, #64] // load rk4 353 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 2 354 ldr $h2q, [$Htable, #32] // load h2l | h2h 355 ext $h2b, $h2b, $h2b, #8 356 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 1 357 ldr $rk12q, [$cc, #192] // load rk12 358 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 2 
359 ldr $h4q, [$Htable, #80] // load h4l | h4h 360 ext $h4b, $h4b, $h4b, #8 361 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 3 362 ldr $rk11q, [$cc, #176] // load rk11 363 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 2 364 ldr $rk8q, [$cc, #128] // load rk8 365 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 3 366 add $rctr32w, $rctr32w, #1 // CTR block 3 367 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 3 368 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 3 369 ld1 { $acc_lb}, [$current_tag] 370 ext $acc_lb, $acc_lb, $acc_lb, #8 371 rev64 $acc_lb, $acc_lb 372 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 4 373 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 4 374 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 4 375 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 4 376 cmp $rounds, #12 // setup flags for AES-128/192/256 check 377 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 5 378 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 5 379 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 5 380 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 5 381 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 6 382 trn2 $h34k.2d, $h3.2d, $h4.2d // h4l | h3l 383 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 6 384 ldr $rk9q, [$cc, #144] // load rk9 385 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 6 386 ldr $h1q, [$Htable] // load h1l | h1h 387 ext $h1b, $h1b, $h1b, #8 388 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 6 389 ldr $rk10q, [$cc, #160] // load rk10 390 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 7 391 trn1 $acc_h.2d, $h3.2d, $h4.2d // h4h | h3h 392 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 7 393 aese $ctr2b, $rk7 \n aesmc $ctr2b, 
$ctr2b // AES block 2 - round 7 394 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 7 395 trn2 $h12k.2d, $h1.2d, $h2.2d // h2l | h1l 396 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 8 397 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 8 398 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 8 399 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 8 400 b.lt .Lenc_finish_first_blocks // branch if AES-128 401 402 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 9 403 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 9 404 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 9 405 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 9 406 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 10 407 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 10 408 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 10 409 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 10 410 b.eq .Lenc_finish_first_blocks // branch if AES-192 411 412 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 11 413 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 11 414 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 11 415 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 11 416 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 12 417 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 12 418 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 12 419 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 12 420 421.Lenc_finish_first_blocks: 422 cmp $input_ptr, $main_end_input_ptr // check if we have <= 4 blocks 423 eor $h34k.16b, $h34k.16b, $acc_h.16b // h4k | h3k 424 aese $ctr2b, $rkNm1 // AES block 2 - round N-1 425 trn1 $t0.2d, $h1.2d, $h2.2d // h2h | h1h 426 aese $ctr1b, $rkNm1 // 
AES block 1 - round N-1 427 aese $ctr0b, $rkNm1 // AES block 0 - round N-1 428 aese $ctr3b, $rkNm1 // AES block 3 - round N-1 429 eor $h12k.16b, $h12k.16b, $t0.16b // h2k | h1k 430 b.ge .Lenc_tail // handle tail 431 432 ldp $input_l1, $input_h1, [$input_ptr, #16] // AES block 1 - load plaintext 433 rev $ctr32w, $rctr32w // CTR block 4 434 ldp $input_l0, $input_h0, [$input_ptr, #0] // AES block 0 - load plaintext 435 ldp $input_l3, $input_h3, [$input_ptr, #48] // AES block 3 - load plaintext 436 ldp $input_l2, $input_h2, [$input_ptr, #32] // AES block 2 - load plaintext 437 add $input_ptr, $input_ptr, #64 // AES input_ptr update 438 eor $input_l1, $input_l1, $rkN_l // AES block 1 - round N low 439 eor $input_h1, $input_h1, $rkN_h // AES block 1 - round N high 440 fmov $ctr_t1d, $input_l1 // AES block 1 - mov low 441 eor $input_l0, $input_l0, $rkN_l // AES block 0 - round N low 442 eor $input_h0, $input_h0, $rkN_h // AES block 0 - round N high 443 eor $input_h3, $input_h3, $rkN_h // AES block 3 - round N high 444 fmov $ctr_t0d, $input_l0 // AES block 0 - mov low 445 cmp $input_ptr, $main_end_input_ptr // check if we have <= 8 blocks 446 fmov $ctr_t0.d[1], $input_h0 // AES block 0 - mov high 447 eor $input_l3, $input_l3, $rkN_l // AES block 3 - round N low 448 eor $input_l2, $input_l2, $rkN_l // AES block 2 - round N low 449 fmov $ctr_t1.d[1], $input_h1 // AES block 1 - mov high 450 fmov $ctr_t2d, $input_l2 // AES block 2 - mov low 451 add $rctr32w, $rctr32w, #1 // CTR block 4 452 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4 453 fmov $ctr_t3d, $input_l3 // AES block 3 - mov low 454 eor $input_h2, $input_h2, $rkN_h // AES block 2 - round N high 455 fmov $ctr_t2.d[1], $input_h2 // AES block 2 - mov high 456 eor $res0b, $ctr_t0b, $ctr0b // AES block 0 - result 457 fmov $ctr0d, $ctr96_b64x // CTR block 4 458 fmov $ctr0.d[1], $ctr32x // CTR block 4 459 rev $ctr32w, $rctr32w // CTR block 5 460 add $rctr32w, $rctr32w, #1 // CTR block 5 461 eor $res1b, $ctr_t1b, 
$ctr1b // AES block 1 - result 462 fmov $ctr1d, $ctr96_b64x // CTR block 5 463 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 5 464 fmov $ctr1.d[1], $ctr32x // CTR block 5 465 rev $ctr32w, $rctr32w // CTR block 6 466 st1 { $res0b}, [$output_ptr], #16 // AES block 0 - store result 467 fmov $ctr_t3.d[1], $input_h3 // AES block 3 - mov high 468 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 6 469 eor $res2b, $ctr_t2b, $ctr2b // AES block 2 - result 470 st1 { $res1b}, [$output_ptr], #16 // AES block 1 - store result 471 add $rctr32w, $rctr32w, #1 // CTR block 6 472 fmov $ctr2d, $ctr96_b64x // CTR block 6 473 fmov $ctr2.d[1], $ctr32x // CTR block 6 474 st1 { $res2b}, [$output_ptr], #16 // AES block 2 - store result 475 rev $ctr32w, $rctr32w // CTR block 7 476 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 7 477 eor $res3b, $ctr_t3b, $ctr3b // AES block 3 - result 478 st1 { $res3b}, [$output_ptr], #16 // AES block 3 - store result 479 b.ge .Lenc_prepretail // do prepretail 480 481.Lenc_main_loop: // main loop start 482 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0 483 rev64 $res0b, $res0b // GHASH block 4k (only t0 is free) 484 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0 485 fmov $ctr3d, $ctr96_b64x // CTR block 4k+3 486 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0 487 ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0 488 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 1 489 fmov $ctr3.d[1], $ctr32x // CTR block 4k+3 490 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1 491 ldp $input_l3, $input_h3, [$input_ptr, #48] // AES block 4k+7 - load plaintext 492 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1 493 ldp $input_l2, $input_h2, [$input_ptr, #32] // AES block 4k+6 - load plaintext 494 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 2 495 eor $res0b, $res0b, $acc_lb // PRE 1 496 aese $ctr1b, 
$rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2 497 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 0 498 eor $input_l3, $input_l3, $rkN_l // AES block 4k+7 - round N low 499 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 3 500 mov $acc_md, $h34k.d[1] // GHASH block 4k - mid 501 pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high 502 eor $input_h2, $input_h2, $rkN_h // AES block 4k+6 - round N high 503 mov $t0d, $res0.d[1] // GHASH block 4k - mid 504 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1 505 rev64 $res1b, $res1b // GHASH block 4k+1 (t0 and t1 free) 506 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4 507 pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low 508 eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid 509 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2 510 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5 511 rev64 $res3b, $res3b // GHASH block 4k+3 (t0, t1, t2 and t3 free) 512 pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high 513 pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid 514 rev64 $res2b, $res2b // GHASH block 4k+2 (t0, t1, and t2 free) 515 pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low 516 eor $acc_hb, $acc_hb, $t1.16b // GHASH block 4k+1 - high 517 mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid 518 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3 519 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2 520 eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low 521 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3 522 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4 523 mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid 524 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3 525 eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid 526 aese $ctr2b, $rk4 \n aesmc 
$ctr2b, $ctr2b // AES block 4k+6 - round 4 527 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6 528 eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid 529 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4 530 pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid 531 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7 532 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5 533 ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid 534 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5 535 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8 536 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5 537 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6 538 eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid 539 pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high 540 pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low 541 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7 542 pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low 543 eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high 544 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6 545 ldp $input_l1, $input_h1, [$input_ptr, #16] // AES block 4k+5 - load plaintext 546 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 8 547 mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid 548 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 6 549 eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low 550 pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid 551 pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high 552 eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid 553 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7 554 eor $input_l1, $input_l1, $rkN_l // AES block 4k+5 - round N low 555 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - 
round 8 556 eor $acc_mb, $acc_mb, $t6.16b // GHASH block 4k+2 - mid 557 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7 558 eor $input_l2, $input_l2, $rkN_l // AES block 4k+6 - round N low 559 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8 560 movi $mod_constant.8b, #0xc2 561 pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid 562 eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high 563 cmp $rounds, #12 // setup flags for AES-128/192/256 check 564 fmov $ctr_t1d, $input_l1 // AES block 4k+5 - mov low 565 ldp $input_l0, $input_h0, [$input_ptr, #0] // AES block 4k+4 - load plaintext 566 b.lt .Lenc_main_loop_continue // branch if AES-128 567 568 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9 569 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9 570 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9 571 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9 572 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10 573 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10 574 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10 575 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10 576 b.eq .Lenc_main_loop_continue // branch if AES-192 577 578 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11 579 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11 580 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11 581 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11 582 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12 583 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12 584 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12 585 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12 586 587.Lenc_main_loop_continue: 
588 shl $mod_constantd, $mod_constantd, #56 // mod_constant 589 eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low 590 eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid 591 add $rctr32w, $rctr32w, #1 // CTR block 4k+3 592 eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up 593 add $input_ptr, $input_ptr, #64 // AES input_ptr update 594 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid 595 rev $ctr32w, $rctr32w // CTR block 4k+8 596 ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment 597 eor $input_l0, $input_l0, $rkN_l // AES block 4k+4 - round N low 598 eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up 599 eor $input_h0, $input_h0, $rkN_h // AES block 4k+4 - round N high 600 fmov $ctr_t0d, $input_l0 // AES block 4k+4 - mov low 601 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+8 602 eor $mod_t.16b, $acc_hb, $mod_t.16b // MODULO - fold into mid 603 eor $input_h1, $input_h1, $rkN_h // AES block 4k+5 - round N high 604 eor $input_h3, $input_h3, $rkN_h // AES block 4k+7 - round N high 605 add $rctr32w, $rctr32w, #1 // CTR block 4k+8 606 aese $ctr0b, $rkNm1 // AES block 4k+4 - round N-1 607 fmov $ctr_t0.d[1], $input_h0 // AES block 4k+4 - mov high 608 eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid 609 fmov $ctr_t3d, $input_l3 // AES block 4k+7 - mov low 610 aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1 611 fmov $ctr_t1.d[1], $input_h1 // AES block 4k+5 - mov high 612 fmov $ctr_t2d, $input_l2 // AES block 4k+6 - mov low 613 cmp $input_ptr, $main_end_input_ptr // LOOP CONTROL 614 fmov $ctr_t2.d[1], $input_h2 // AES block 4k+6 - mov high 615 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low 616 eor $res0b, $ctr_t0b, $ctr0b // AES block 4k+4 - result 617 fmov $ctr0d, $ctr96_b64x // CTR block 4k+8 618 fmov $ctr0.d[1], $ctr32x // CTR block 4k+8 619 rev $ctr32w, $rctr32w // CTR block 4k+9 620 add $rctr32w, $rctr32w, #1 // CTR block 4k+9 621 eor 
$res1b, $ctr_t1b, $ctr1b // AES block 4k+5 - result 622 fmov $ctr1d, $ctr96_b64x // CTR block 4k+9 623 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+9 624 fmov $ctr1.d[1], $ctr32x // CTR block 4k+9 625 aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1 626 rev $ctr32w, $rctr32w // CTR block 4k+10 627 st1 { $res0b}, [$output_ptr], #16 // AES block 4k+4 - store result 628 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+10 629 eor $acc_lb, $acc_lb, $acc_hb // MODULO - fold into low 630 fmov $ctr_t3.d[1], $input_h3 // AES block 4k+7 - mov high 631 ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment 632 st1 { $res1b}, [$output_ptr], #16 // AES block 4k+5 - store result 633 add $rctr32w, $rctr32w, #1 // CTR block 4k+10 634 aese $ctr3b, $rkNm1 // AES block 4k+7 - round N-1 635 eor $res2b, $ctr_t2b, $ctr2b // AES block 4k+6 - result 636 fmov $ctr2d, $ctr96_b64x // CTR block 4k+10 637 st1 { $res2b}, [$output_ptr], #16 // AES block 4k+6 - store result 638 fmov $ctr2.d[1], $ctr32x // CTR block 4k+10 639 rev $ctr32w, $rctr32w // CTR block 4k+11 640 eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low 641 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+11 642 eor $res3b, $ctr_t3b, $ctr3b // AES block 4k+7 - result 643 st1 { $res3b}, [$output_ptr], #16 // AES block 4k+7 - store result 644 b.lt .Lenc_main_loop 645 646.Lenc_prepretail: // PREPRETAIL 647 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0 648 rev64 $res2b, $res2b // GHASH block 4k+2 (t0, t1, and t2 free) 649 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0 650 fmov $ctr3d, $ctr96_b64x // CTR block 4k+3 651 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0 652 rev64 $res0b, $res0b // GHASH block 4k (only t0 is free) 653 fmov $ctr3.d[1], $ctr32x // CTR block 4k+3 654 ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0 655 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1 656 aese $ctr0b, $rk1 \n 
aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 1 657 eor $res0b, $res0b, $acc_lb // PRE 1 658 rev64 $res1b, $res1b // GHASH block 4k+1 (t0 and t1 free) 659 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2 660 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 0 661 mov $acc_md, $h34k.d[1] // GHASH block 4k - mid 662 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1 663 pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low 664 mov $t0d, $res0.d[1] // GHASH block 4k - mid 665 pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high 666 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3 667 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2 668 eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid 669 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 2 670 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1 671 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3 672 pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid 673 pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high 674 pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low 675 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2 676 eor $acc_hb, $acc_hb, $t1.16b // GHASH block 4k+1 - high 677 mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid 678 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 3 679 eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low 680 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3 681 eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid 682 mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid 683 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4 684 rev64 $res3b, $res3b // GHASH block 4k+3 (t0, t1, t2 and t3 free) 685 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4 686 pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid 
687 eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid 688 add $rctr32w, $rctr32w, #1 // CTR block 4k+3 689 pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low 690 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5 691 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 4 692 eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid 693 pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high 694 eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low 695 ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid 696 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5 697 eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high 698 mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid 699 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4 700 pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid 701 eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid 702 pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high 703 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5 704 pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid 705 eor $acc_mb, $acc_mb, $t6.16b // GHASH block 4k+2 - mid 706 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5 707 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6 708 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 6 709 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6 710 movi $mod_constant.8b, #0xc2 711 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6 712 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7 713 eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high 714 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7 715 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7 716 shl $mod_constantd, $mod_constantd, #56 // mod_constant 717 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 
4k+5 - round 8 718 eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid 719 pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low 720 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8 721 cmp $rounds, #12 // setup flags for AES-128/192/256 check 722 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8 723 eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low 724 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7 725 eor $acc_mb, $acc_mb, $acc_hb // karatsuba tidy up 726 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 8 727 pmull $t1.1q, $acc_h.1d, $mod_constant.1d 728 ext $acc_hb, $acc_hb, $acc_hb, #8 729 eor $acc_mb, $acc_mb, $acc_lb 730 b.lt .Lenc_finish_prepretail // branch if AES-128 731 732 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9 733 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9 734 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9 735 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9 736 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10 737 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10 738 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10 739 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10 740 b.eq .Lenc_finish_prepretail // branch if AES-192 741 742 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11 743 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11 744 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11 745 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11 746 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12 747 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12 748 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12 749 aese $ctr2b, $rk12 
\n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12 750 751.Lenc_finish_prepretail: 752 eor $acc_mb, $acc_mb, $t1.16b 753 eor $acc_mb, $acc_mb, $acc_hb 754 pmull $t1.1q, $acc_m.1d, $mod_constant.1d 755 ext $acc_mb, $acc_mb, $acc_mb, #8 756 aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1 757 eor $acc_lb, $acc_lb, $t1.16b 758 aese $ctr3b, $rkNm1 // AES block 4k+7 - round N-1 759 aese $ctr0b, $rkNm1 // AES block 4k+4 - round N-1 760 aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1 761 eor $acc_lb, $acc_lb, $acc_mb 762 763.Lenc_tail: // TAIL 764 ext $t0.16b, $acc_lb, $acc_lb, #8 // prepare final partial tag 765 sub $main_end_input_ptr, $end_input_ptr, $input_ptr // main_end_input_ptr is number of bytes left to process 766 ldp $input_l0, $input_h0, [$input_ptr], #16 // AES block 4k+4 - load plaintext 767 eor $input_l0, $input_l0, $rkN_l // AES block 4k+4 - round N low 768 eor $input_h0, $input_h0, $rkN_h // AES block 4k+4 - round N high 769 cmp $main_end_input_ptr, #48 770 fmov $ctr_t0d, $input_l0 // AES block 4k+4 - mov low 771 fmov $ctr_t0.d[1], $input_h0 // AES block 4k+4 - mov high 772 eor $res1b, $ctr_t0b, $ctr0b // AES block 4k+4 - result 773 b.gt .Lenc_blocks_more_than_3 774 cmp $main_end_input_ptr, #32 775 mov $ctr3b, $ctr2b 776 movi $acc_l.8b, #0 777 movi $acc_h.8b, #0 778 sub $rctr32w, $rctr32w, #1 779 mov $ctr2b, $ctr1b 780 movi $acc_m.8b, #0 781 b.gt .Lenc_blocks_more_than_2 782 mov $ctr3b, $ctr1b 783 sub $rctr32w, $rctr32w, #1 784 cmp $main_end_input_ptr, #16 785 b.gt .Lenc_blocks_more_than_1 786 sub $rctr32w, $rctr32w, #1 787 b .Lenc_blocks_less_than_1 788.Lenc_blocks_more_than_3: // blocks left > 3 789 st1 { $res1b}, [$output_ptr], #16 // AES final-3 block - store result 790 ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final-2 block - load input low & high 791 rev64 $res0b, $res1b // GHASH final-3 block 792 eor $input_l0, $input_l0, $rkN_l // AES final-2 block - round N low 793 eor $res0b, $res0b, $t0.16b // feed in partial tag 794 eor 
$input_h0, $input_h0, $rkN_h // AES final-2 block - round N high 795 mov $rk4d, $res0.d[1] // GHASH final-3 block - mid 796 fmov $res1d, $input_l0 // AES final-2 block - mov low 797 fmov $res1.d[1], $input_h0 // AES final-2 block - mov high 798 eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-3 block - mid 799 movi $t0.8b, #0 // suppress further partial tag feed in 800 mov $acc_md, $h34k.d[1] // GHASH final-3 block - mid 801 pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH final-3 block - low 802 pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH final-3 block - high 803 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d // GHASH final-3 block - mid 804 eor $res1b, $res1b, $ctr1b // AES final-2 block - result 805.Lenc_blocks_more_than_2: // blocks left > 2 806 st1 { $res1b}, [$output_ptr], #16 // AES final-2 block - store result 807 ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final-1 block - load input low & high 808 rev64 $res0b, $res1b // GHASH final-2 block 809 eor $input_l0, $input_l0, $rkN_l // AES final-1 block - round N low 810 eor $res0b, $res0b, $t0.16b // feed in partial tag 811 fmov $res1d, $input_l0 // AES final-1 block - mov low 812 eor $input_h0, $input_h0, $rkN_h // AES final-1 block - round N high 813 fmov $res1.d[1], $input_h0 // AES final-1 block - mov high 814 movi $t0.8b, #0 // suppress further partial tag feed in 815 pmull2 $rk2q1, $res0.2d, $h3.2d // GHASH final-2 block - high 816 mov $rk4d, $res0.d[1] // GHASH final-2 block - mid 817 pmull $rk3q1, $res0.1d, $h3.1d // GHASH final-2 block - low 818 eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-2 block - mid 819 eor $res1b, $res1b, $ctr2b // AES final-1 block - result 820 eor $acc_hb, $acc_hb, $rk2 // GHASH final-2 block - high 821 pmull $rk4v.1q, $rk4v.1d, $h34k.1d // GHASH final-2 block - mid 822 eor $acc_lb, $acc_lb, $rk3 // GHASH final-2 block - low 823 eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-2 block - mid 824.Lenc_blocks_more_than_1: // blocks left > 1 825 st1 { $res1b}, [$output_ptr], #16 // AES 
final-1 block - store result 826 rev64 $res0b, $res1b // GHASH final-1 block 827 ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final block - load input low & high 828 eor $res0b, $res0b, $t0.16b // feed in partial tag 829 movi $t0.8b, #0 // suppress further partial tag feed in 830 eor $input_l0, $input_l0, $rkN_l // AES final block - round N low 831 mov $rk4d, $res0.d[1] // GHASH final-1 block - mid 832 pmull2 $rk2q1, $res0.2d, $h2.2d // GHASH final-1 block - high 833 eor $input_h0, $input_h0, $rkN_h // AES final block - round N high 834 eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-1 block - mid 835 eor $acc_hb, $acc_hb, $rk2 // GHASH final-1 block - high 836 ins $rk4v.d[1], $rk4v.d[0] // GHASH final-1 block - mid 837 fmov $res1d, $input_l0 // AES final block - mov low 838 fmov $res1.d[1], $input_h0 // AES final block - mov high 839 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d // GHASH final-1 block - mid 840 pmull $rk3q1, $res0.1d, $h2.1d // GHASH final-1 block - low 841 eor $res1b, $res1b, $ctr3b // AES final block - result 842 eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-1 block - mid 843 eor $acc_lb, $acc_lb, $rk3 // GHASH final-1 block - low 844.Lenc_blocks_less_than_1: // blocks left <= 1 845 and $bit_length, $bit_length, #127 // bit_length %= 128 846 mvn $rkN_l, xzr // rkN_l = 0xffffffffffffffff 847 sub $bit_length, $bit_length, #128 // bit_length -= 128 848 neg $bit_length, $bit_length // bit_length = 128 - #bits in input (in range [1,128]) 849 ld1 { $rk0}, [$output_ptr] // load existing bytes where the possibly partial last block is to be stored 850 mvn $rkN_h, xzr // rkN_h = 0xffffffffffffffff 851 and $bit_length, $bit_length, #127 // bit_length %= 128 852 lsr $rkN_h, $rkN_h, $bit_length // rkN_h is mask for top 64b of last block 853 cmp $bit_length, #64 854 csel $input_l0, $rkN_l, $rkN_h, lt 855 csel $input_h0, $rkN_h, xzr, lt 856 fmov $ctr0d, $input_l0 // ctr0b is mask for last block 857 fmov $ctr0.d[1], $input_h0 858 and $res1b, $res1b, $ctr0b // 
possibly partial last block has zeroes in highest bits 859 rev64 $res0b, $res1b // GHASH final block 860 eor $res0b, $res0b, $t0.16b // feed in partial tag 861 bif $res1b, $rk0, $ctr0b // insert existing bytes in top end of result before storing 862 pmull2 $rk2q1, $res0.2d, $h1.2d // GHASH final block - high 863 mov $t0d, $res0.d[1] // GHASH final block - mid 864 rev $ctr32w, $rctr32w 865 pmull $rk3q1, $res0.1d, $h1.1d // GHASH final block - low 866 eor $acc_hb, $acc_hb, $rk2 // GHASH final block - high 867 eor $t0.8b, $t0.8b, $res0.8b // GHASH final block - mid 868 pmull $t0.1q, $t0.1d, $h12k.1d // GHASH final block - mid 869 eor $acc_lb, $acc_lb, $rk3 // GHASH final block - low 870 eor $acc_mb, $acc_mb, $t0.16b // GHASH final block - mid 871 movi $mod_constant.8b, #0xc2 872 eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up 873 shl $mod_constantd, $mod_constantd, #56 // mod_constant 874 eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up 875 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid 876 ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment 877 eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid 878 eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid 879 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low 880 ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment 881 str $ctr32w, [$counter, #12] // store the updated counter 882 st1 { $res1b}, [$output_ptr] // store all 16B 883 eor $acc_lb, $acc_lb, $acc_hb // MODULO - fold into low 884 eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low 885 ext $acc_lb, $acc_lb, $acc_lb, #8 886 rev64 $acc_lb, $acc_lb 887 mov x0, $len 888 st1 { $acc_l.16b }, [$current_tag] 889 ldp x19, x20, [sp, #16] 890 ldp x21, x22, [sp, #32] 891 ldp x23, x24, [sp, #48] 892 ldp d8, d9, [sp, #64] 893 ldp d10, d11, [sp, #80] 894 ldp d12, d13, [sp, #96] 895 ldp d14, d15, [sp, #112] 896 ldp x29, x30, [sp], #128 897 
AARCH64_VALIDATE_LINK_REGISTER 898 ret 899.size aes_gcm_enc_kernel,.-aes_gcm_enc_kernel 900___ 901 902{ 903my $t8="v4"; 904my $t8d="d4"; 905my $t9="v6"; 906my $t9d="d6"; 907################################################################################ 908# size_t aes_gcm_dec_kernel(const uint8_t *in, 909# size_t len_bits, 910# uint8_t *out, 911# u64 *Xi, 912# uint8_t ivec[16], 913# const void *key); 914# 915$code.=<<___; 916.global aes_gcm_dec_kernel 917.type aes_gcm_dec_kernel,%function 918.align 4 919aes_gcm_dec_kernel: 920 AARCH64_SIGN_LINK_REGISTER 921 stp x29, x30, [sp, #-128]! 922 mov x29, sp 923 stp x19, x20, [sp, #16] 924 mov $counter, x4 925 mov $cc, x5 926 stp x21, x22, [sp, #32] 927 stp x23, x24, [sp, #48] 928 stp d8, d9, [sp, #64] 929 stp d10, d11, [sp, #80] 930 stp d12, d13, [sp, #96] 931 stp d14, d15, [sp, #112] 932 ldr $roundsw, [$cc, #240] 933 add $input_l1, $cc, $rounds, lsl #4 // borrow input_l1 for last key 934 ldp $rkN_l, $rkN_h, [$input_l1] // load round N keys 935 ldr $rkNm1q, [$input_l1, #-16] // load round N-1 keys 936 lsr $main_end_input_ptr, $bit_length, #3 // byte_len 937 mov $len, $main_end_input_ptr 938 ldp $ctr96_b64x, $ctr96_t32x, [$counter] // ctr96_b64, ctr96_t32 939 ldr $rk8q, [$cc, #128] // load rk8 940 sub $main_end_input_ptr, $main_end_input_ptr, #1 // byte_len - 1 941 ldr $rk7q, [$cc, #112] // load rk7 942 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 943 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 // end_input_ptr 944 ldr $rk6q, [$cc, #96] // load rk6 945 lsr $rctr32x, $ctr96_t32x, #32 946 ldr $rk5q, [$cc, #80] // load rk5 947 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w 948 ldr $rk3q, [$cc, #48] // load rk3 949 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr 950 rev $rctr32w, $rctr32w // rev_ctr32 951 add $rctr32w, $rctr32w, #1 // increment rev_ctr32 952 fmov $ctr3d, $ctr96_b64x // CTR block 3 953 rev 
$ctr32w, $rctr32w // CTR block 1 954 add $rctr32w, $rctr32w, #1 // CTR block 1 955 fmov $ctr1d, $ctr96_b64x // CTR block 1 956 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 1 957 ld1 { $ctr0b}, [$counter] // special case vector load initial counter so we can start first AES block as quickly as possible 958 fmov $ctr1.d[1], $ctr32x // CTR block 1 959 rev $ctr32w, $rctr32w // CTR block 2 960 add $rctr32w, $rctr32w, #1 // CTR block 2 961 fmov $ctr2d, $ctr96_b64x // CTR block 2 962 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 2 963 fmov $ctr2.d[1], $ctr32x // CTR block 2 964 rev $ctr32w, $rctr32w // CTR block 3 965 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 3 966 ldr $rk0q, [$cc, #0] // load rk0 967 fmov $ctr3.d[1], $ctr32x // CTR block 3 968 add $rctr32w, $rctr32w, #1 // CTR block 3 969 ldr $rk4q, [$cc, #64] // load rk4 970 ldr $rk1q, [$cc, #16] // load rk1 971 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 0 972 ldr $h3q, [$Htable, #48] // load h3l | h3h 973 ext $h3b, $h3b, $h3b, #8 974 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 0 975 ldr $h4q, [$Htable, #80] // load h4l | h4h 976 ext $h4b, $h4b, $h4b, #8 977 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 0 978 ldr $h2q, [$Htable, #32] // load h2l | h2h 979 ext $h2b, $h2b, $h2b, #8 980 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 0 981 ldr $rk2q, [$cc, #32] // load rk2 982 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 1 983 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 1 984 ld1 { $acc_lb}, [$current_tag] 985 ext $acc_lb, $acc_lb, $acc_lb, #8 986 rev64 $acc_lb, $acc_lb 987 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 1 988 ldr $rk9q, [$cc, #144] // load rk9 989 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 1 990 ldr $rk12q, [$cc, #192] // load rk12 991 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 2 992 ldr $h1q, 
[$Htable] // load h1l | h1h 993 ext $h1b, $h1b, $h1b, #8 994 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 2 995 ldr $rk10q, [$cc, #160] // load rk10 996 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 2 997 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 3 998 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 2 999 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 3 1000 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 4 1001 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 3 1002 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 3 1003 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 4 1004 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 4 1005 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 4 1006 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 5 1007 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 5 1008 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 5 1009 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 5 1010 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 6 1011 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 6 1012 cmp $rounds, #12 // setup flags for AES-128/192/256 check 1013 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 6 1014 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 6 1015 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 7 1016 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 7 1017 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 7 1018 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 8 1019 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 7 1020 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 8 1021 aese $ctr1b, $rk8 \n aesmc $ctr1b, 
$ctr1b // AES block 1 - round 8 1022 ldr $rk11q, [$cc, #176] // load rk11 1023 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 8 1024 b.lt .Ldec_finish_first_blocks // branch if AES-128 1025 1026 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 9 1027 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 9 1028 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 9 1029 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 9 1030 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 10 1031 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 10 1032 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 10 1033 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 10 1034 b.eq .Ldec_finish_first_blocks // branch if AES-192 1035 1036 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 11 1037 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 11 1038 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 11 1039 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 11 1040 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 12 1041 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 12 1042 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 12 1043 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 12 1044 1045.Ldec_finish_first_blocks: 1046 cmp $input_ptr, $main_end_input_ptr // check if we have <= 4 blocks 1047 trn1 $acc_h.2d, $h3.2d, $h4.2d // h4h | h3h 1048 trn2 $h34k.2d, $h3.2d, $h4.2d // h4l | h3l 1049 trn1 $t0.2d, $h1.2d, $h2.2d // h2h | h1h 1050 trn2 $h12k.2d, $h1.2d, $h2.2d // h2l | h1l 1051 eor $h34k.16b, $h34k.16b, $acc_h.16b // h4k | h3k 1052 aese $ctr1b, $rkNm1 // AES block 1 - round N-1 1053 aese $ctr2b, $rkNm1 // AES block 2 - round N-1 1054 eor $h12k.16b, $h12k.16b, $t0.16b // h2k | h1k 1055 aese $ctr3b, $rkNm1 // AES block 3 - 
round N-1 1056 aese $ctr0b, $rkNm1 // AES block 0 - round N-1 1057 b.ge .Ldec_tail // handle tail 1058 1059 ldr $res0q, [$input_ptr, #0] // AES block 0 - load ciphertext 1060 ldr $res1q, [$input_ptr, #16] // AES block 1 - load ciphertext 1061 rev $ctr32w, $rctr32w // CTR block 4 1062 eor $ctr0b, $res0b, $ctr0b // AES block 0 - result 1063 eor $ctr1b, $res1b, $ctr1b // AES block 1 - result 1064 rev64 $res1b, $res1b // GHASH block 1 1065 ldr $res3q, [$input_ptr, #48] // AES block 3 - load ciphertext 1066 mov $output_h0, $ctr0.d[1] // AES block 0 - mov high 1067 mov $output_l0, $ctr0.d[0] // AES block 0 - mov low 1068 rev64 $res0b, $res0b // GHASH block 0 1069 add $rctr32w, $rctr32w, #1 // CTR block 4 1070 fmov $ctr0d, $ctr96_b64x // CTR block 4 1071 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4 1072 fmov $ctr0.d[1], $ctr32x // CTR block 4 1073 rev $ctr32w, $rctr32w // CTR block 5 1074 add $rctr32w, $rctr32w, #1 // CTR block 5 1075 mov $output_l1, $ctr1.d[0] // AES block 1 - mov low 1076 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 5 1077 mov $output_h1, $ctr1.d[1] // AES block 1 - mov high 1078 eor $output_h0, $output_h0, $rkN_h // AES block 0 - round N high 1079 eor $output_l0, $output_l0, $rkN_l // AES block 0 - round N low 1080 stp $output_l0, $output_h0, [$output_ptr], #16 // AES block 0 - store result 1081 fmov $ctr1d, $ctr96_b64x // CTR block 5 1082 ldr $res2q, [$input_ptr, #32] // AES block 2 - load ciphertext 1083 add $input_ptr, $input_ptr, #64 // AES input_ptr update 1084 fmov $ctr1.d[1], $ctr32x // CTR block 5 1085 rev $ctr32w, $rctr32w // CTR block 6 1086 add $rctr32w, $rctr32w, #1 // CTR block 6 1087 eor $output_l1, $output_l1, $rkN_l // AES block 1 - round N low 1088 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 6 1089 eor $output_h1, $output_h1, $rkN_h // AES block 1 - round N high 1090 stp $output_l1, $output_h1, [$output_ptr], #16 // AES block 1 - store result 1091 eor $ctr2b, $res2b, $ctr2b // AES block 2 - result 
1092 cmp $input_ptr, $main_end_input_ptr // check if we have <= 8 blocks 1093 b.ge .Ldec_prepretail // do prepretail 1094 1095.Ldec_main_loop: // main loop start 1096 mov $output_l2, $ctr2.d[0] // AES block 4k+2 - mov low 1097 ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0 1098 eor $ctr3b, $res3b, $ctr3b // AES block 4k+3 - result 1099 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0 1100 mov $output_h2, $ctr2.d[1] // AES block 4k+2 - mov high 1101 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0 1102 fmov $ctr2d, $ctr96_b64x // CTR block 4k+6 1103 fmov $ctr2.d[1], $ctr32x // CTR block 4k+6 1104 eor $res0b, $res0b, $acc_lb // PRE 1 1105 rev $ctr32w, $rctr32w // CTR block 4k+7 1106 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 1 1107 mov $output_h3, $ctr3.d[1] // AES block 4k+3 - mov high 1108 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1 1109 mov $output_l3, $ctr3.d[0] // AES block 4k+3 - mov low 1110 pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high 1111 mov $t0d, $res0.d[1] // GHASH block 4k - mid 1112 fmov $ctr3d, $ctr96_b64x // CTR block 4k+7 1113 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 2 1114 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+7 1115 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0 1116 fmov $ctr3.d[1], $ctr32x // CTR block 4k+7 1117 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2 1118 eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid 1119 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 3 1120 eor $output_h2, $output_h2, $rkN_h // AES block 4k+2 - round N high 1121 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1 1122 mov $acc_md, $h34k.d[1] // GHASH block 4k - mid 1123 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3 1124 rev64 $res2b, $res2b // GHASH block 4k+2 1125 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b 
// AES block 4k+7 - round 0 1126 eor $output_l2, $output_l2, $rkN_l // AES block 4k+2 - round N low 1127 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2 1128 stp $output_l2, $output_h2, [$output_ptr], #16 // AES block 4k+2 - store result 1129 pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low 1130 pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high 1131 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3 1132 rev64 $res3b, $res3b // GHASH block 4k+3 1133 pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid 1134 eor $output_l3, $output_l3, $rkN_l // AES block 4k+3 - round N low 1135 pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low 1136 eor $output_h3, $output_h3, $rkN_h // AES block 4k+3 - round N high 1137 eor $acc_hb, $acc_hb, $t1.16b // GHASH block 4k+1 - high 1138 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 4 1139 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1 1140 mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid 1141 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4 1142 eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low 1143 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5 1144 add $rctr32w, $rctr32w, #1 // CTR block 4k+7 1145 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2 1146 mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid 1147 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4 1148 eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid 1149 pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low 1150 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3 1151 eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid 1152 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5 1153 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5 1154 eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low 1155 pmull $t3.1q, 
$t3.1d, $h34k.1d // GHASH block 4k+1 - mid 1156 rev $ctr32w, $rctr32w // CTR block 4k+8 1157 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6 1158 ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid 1159 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6 1160 add $rctr32w, $rctr32w, #1 // CTR block 4k+8 1161 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4 1162 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7 1163 eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid 1164 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7 1165 pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high 1166 mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid 1167 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5 1168 pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid 1169 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8 1170 eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high 1171 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6 1172 pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low 1173 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+8 1174 eor $acc_mb, $acc_mb, $t6.16b // GHASH block 4k+2 - mid 1175 pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high 1176 cmp $rounds, #12 // setup flags for AES-128/192/256 check 1177 eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid 1178 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 8 1179 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 6 1180 eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high 1181 pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid 1182 movi $mod_constant.8b, #0xc2 1183 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7 1184 eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low 1185 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7 1186 shl 
$mod_constantd, $mod_constantd, #56 // mod_constant 1187 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 8 1188 eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid 1189 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8 1190 b.lt .Ldec_main_loop_continue // branch if AES-128 1191 1192 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9 1193 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9 1194 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9 1195 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9 1196 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10 1197 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10 1198 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10 1199 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10 1200 b.eq .Ldec_main_loop_continue // branch if AES-192 1201 1202 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11 1203 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11 1204 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11 1205 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11 1206 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12 1207 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12 1208 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12 1209 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12 1210 1211.Ldec_main_loop_continue: 1212 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid 1213 eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up 1214 ldr $res0q, [$input_ptr, #0] // AES block 4k+4 - load ciphertext 1215 aese $ctr0b, $rkNm1 // AES block 4k+4 - round N-1 1216 ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment 1217 eor 
$acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up 1218 ldr $res1q, [$input_ptr, #16] // AES block 4k+5 - load ciphertext 1219 eor $ctr0b, $res0b, $ctr0b // AES block 4k+4 - result 1220 stp $output_l3, $output_h3, [$output_ptr], #16 // AES block 4k+3 - store result 1221 eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid 1222 ldr $res3q, [$input_ptr, #48] // AES block 4k+7 - load ciphertext 1223 ldr $res2q, [$input_ptr, #32] // AES block 4k+6 - load ciphertext 1224 mov $output_h0, $ctr0.d[1] // AES block 4k+4 - mov high 1225 eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid 1226 aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1 1227 add $input_ptr, $input_ptr, #64 // AES input_ptr update 1228 mov $output_l0, $ctr0.d[0] // AES block 4k+4 - mov low 1229 fmov $ctr0d, $ctr96_b64x // CTR block 4k+8 1230 fmov $ctr0.d[1], $ctr32x // CTR block 4k+8 1231 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low 1232 eor $ctr1b, $res1b, $ctr1b // AES block 4k+5 - result 1233 rev $ctr32w, $rctr32w // CTR block 4k+9 1234 aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1 1235 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+9 1236 cmp $input_ptr, $main_end_input_ptr // LOOP CONTROL 1237 add $rctr32w, $rctr32w, #1 // CTR block 4k+9 1238 eor $output_l0, $output_l0, $rkN_l // AES block 4k+4 - round N low 1239 eor $output_h0, $output_h0, $rkN_h // AES block 4k+4 - round N high 1240 mov $output_h1, $ctr1.d[1] // AES block 4k+5 - mov high 1241 eor $ctr2b, $res2b, $ctr2b // AES block 4k+6 - result 1242 eor $acc_lb, $acc_lb, $mod_constant.16b // MODULO - fold into low 1243 mov $output_l1, $ctr1.d[0] // AES block 4k+5 - mov low 1244 fmov $ctr1d, $ctr96_b64x // CTR block 4k+9 1245 ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment 1246 fmov $ctr1.d[1], $ctr32x // CTR block 4k+9 1247 rev $ctr32w, $rctr32w // CTR block 4k+10 1248 add $rctr32w, $rctr32w, #1 // CTR block 4k+10 1249 aese $ctr3b, $rkNm1 // AES block 4k+7 - 
round N-1 1250 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+10 1251 rev64 $res1b, $res1b // GHASH block 4k+5 1252 eor $output_h1, $output_h1, $rkN_h // AES block 4k+5 - round N high 1253 stp $output_l0, $output_h0, [$output_ptr], #16 // AES block 4k+4 - store result 1254 eor $output_l1, $output_l1, $rkN_l // AES block 4k+5 - round N low 1255 stp $output_l1, $output_h1, [$output_ptr], #16 // AES block 4k+5 - store result 1256 rev64 $res0b, $res0b // GHASH block 4k+4 1257 eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low 1258 b.lt .Ldec_main_loop 1259 1260.Ldec_prepretail: // PREPRETAIL 1261 ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0 1262 mov $output_l2, $ctr2.d[0] // AES block 4k+2 - mov low 1263 eor $ctr3b, $res3b, $ctr3b // AES block 4k+3 - result 1264 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0 1265 mov $output_h2, $ctr2.d[1] // AES block 4k+2 - mov high 1266 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0 1267 fmov $ctr2d, $ctr96_b64x // CTR block 4k+6 1268 fmov $ctr2.d[1], $ctr32x // CTR block 4k+6 1269 rev $ctr32w, $rctr32w // CTR block 4k+7 1270 eor $res0b, $res0b, $acc_lb // PRE 1 1271 rev64 $res2b, $res2b // GHASH block 4k+2 1272 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+7 1273 mov $output_l3, $ctr3.d[0] // AES block 4k+3 - mov low 1274 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1 1275 mov $output_h3, $ctr3.d[1] // AES block 4k+3 - mov high 1276 pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low 1277 mov $t0d, $res0.d[1] // GHASH block 4k - mid 1278 fmov $ctr3d, $ctr96_b64x // CTR block 4k+7 1279 pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high 1280 fmov $ctr3.d[1], $ctr32x // CTR block 4k+7 1281 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0 1282 mov $acc_md, $h34k.d[1] // GHASH block 4k - mid 1283 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 1 1284 eor $t0.8b, $t0.8b, $res0.8b // 
GHASH block 4k - mid 1285 pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high 1286 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1 1287 rev64 $res3b, $res3b // GHASH block 4k+3 1288 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 0 1289 pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid 1290 eor $acc_hb, $acc_hb, $t1.16b // GHASH block 4k+1 - high 1291 pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low 1292 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1 1293 mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid 1294 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 2 1295 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2 1296 eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low 1297 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2 1298 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 3 1299 mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid 1300 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2 1301 eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid 1302 pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low 1303 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4 1304 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3 1305 eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid 1306 pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid 1307 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5 1308 eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low 1309 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4 1310 pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high 1311 eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid 1312 pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high 1313 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5 1314 ins $t6.d[1], $t6.d[0] // GHASH block 
4k+2 - mid 1315 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3 1316 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3 1317 eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high 1318 pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low 1319 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 4 1320 mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid 1321 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4 1322 pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid 1323 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5 1324 eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid 1325 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5 1326 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6 1327 eor $acc_mb, $acc_mb, $t6.16b // GHASH block 4k+2 - mid 1328 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 6 1329 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6 1330 movi $mod_constant.8b, #0xc2 1331 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6 1332 eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low 1333 pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid 1334 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7 1335 cmp $rounds, #12 // setup flags for AES-128/192/256 check 1336 eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high 1337 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7 1338 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7 1339 eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid 1340 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8 1341 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7 1342 eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up 1343 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 8 1344 aese $ctr0b, 
$rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8 1345 shl $mod_constantd, $mod_constantd, #56 // mod_constant 1346 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 8 1347 b.lt .Ldec_finish_prepretail // branch if AES-128 1348 1349 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9 1350 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9 1351 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9 1352 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9 1353 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10 1354 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10 1355 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10 1356 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10 1357 b.eq .Ldec_finish_prepretail // branch if AES-192 1358 1359 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11 1360 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11 1361 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11 1362 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12 1363 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11 1364 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12 1365 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12 1366 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12 1367 1368.Ldec_finish_prepretail: 1369 eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up 1370 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid 1371 ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment 1372 eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid 1373 eor $output_h2, $output_h2, $rkN_h // AES block 4k+2 - round N high 1374 eor $output_l3, $output_l3, $rkN_l // AES block 4k+3 - round N 
low 1375 eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid 1376 add $rctr32w, $rctr32w, #1 // CTR block 4k+7 1377 eor $output_l2, $output_l2, $rkN_l // AES block 4k+2 - round N low 1378 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low 1379 eor $output_h3, $output_h3, $rkN_h // AES block 4k+3 - round N high 1380 stp $output_l2, $output_h2, [$output_ptr], #16 // AES block 4k+2 - store result 1381 ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment 1382 stp $output_l3, $output_h3, [$output_ptr], #16 // AES block 4k+3 - store result 1383 1384 eor $acc_lb, $acc_lb, $mod_constant.16b // MODULO - fold into low 1385 aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1 1386 aese $ctr0b, $rkNm1 // AES block 4k+4 - round N-1 1387 aese $ctr3b, $rkNm1 // AES block 4k+7 - round N-1 1388 aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1 1389 eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low 1390 1391.Ldec_tail: // TAIL 1392 sub $main_end_input_ptr, $end_input_ptr, $input_ptr // main_end_input_ptr is number of bytes left to process 1393 ld1 { $res1b}, [$input_ptr], #16 // AES block 4k+4 - load ciphertext 1394 eor $ctr0b, $res1b, $ctr0b // AES block 4k+4 - result 1395 mov $output_l0, $ctr0.d[0] // AES block 4k+4 - mov low 1396 mov $output_h0, $ctr0.d[1] // AES block 4k+4 - mov high 1397 ext $t0.16b, $acc_lb, $acc_lb, #8 // prepare final partial tag 1398 cmp $main_end_input_ptr, #48 1399 eor $output_l0, $output_l0, $rkN_l // AES block 4k+4 - round N low 1400 eor $output_h0, $output_h0, $rkN_h // AES block 4k+4 - round N high 1401 b.gt .Ldec_blocks_more_than_3 1402 sub $rctr32w, $rctr32w, #1 1403 mov $ctr3b, $ctr2b 1404 movi $acc_m.8b, #0 1405 movi $acc_l.8b, #0 1406 cmp $main_end_input_ptr, #32 1407 movi $acc_h.8b, #0 1408 mov $ctr2b, $ctr1b 1409 b.gt .Ldec_blocks_more_than_2 1410 sub $rctr32w, $rctr32w, #1 1411 mov $ctr3b, $ctr1b 1412 cmp $main_end_input_ptr, #16 1413 b.gt .Ldec_blocks_more_than_1 1414 sub $rctr32w, 
$rctr32w, #1 1415 b .Ldec_blocks_less_than_1 1416.Ldec_blocks_more_than_3: // blocks left > 3 1417 rev64 $res0b, $res1b // GHASH final-3 block 1418 ld1 { $res1b}, [$input_ptr], #16 // AES final-2 block - load ciphertext 1419 stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-3 block - store result 1420 mov $acc_md, $h34k.d[1] // GHASH final-3 block - mid 1421 eor $res0b, $res0b, $t0.16b // feed in partial tag 1422 eor $ctr0b, $res1b, $ctr1b // AES final-2 block - result 1423 mov $rk4d, $res0.d[1] // GHASH final-3 block - mid 1424 mov $output_l0, $ctr0.d[0] // AES final-2 block - mov low 1425 mov $output_h0, $ctr0.d[1] // AES final-2 block - mov high 1426 eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-3 block - mid 1427 movi $t0.8b, #0 // suppress further partial tag feed in 1428 pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH final-3 block - high 1429 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d // GHASH final-3 block - mid 1430 eor $output_l0, $output_l0, $rkN_l // AES final-2 block - round N low 1431 pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH final-3 block - low 1432 eor $output_h0, $output_h0, $rkN_h // AES final-2 block - round N high 1433.Ldec_blocks_more_than_2: // blocks left > 2 1434 rev64 $res0b, $res1b // GHASH final-2 block 1435 ld1 { $res1b}, [$input_ptr], #16 // AES final-1 block - load ciphertext 1436 eor $res0b, $res0b, $t0.16b // feed in partial tag 1437 stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-2 block - store result 1438 eor $ctr0b, $res1b, $ctr2b // AES final-1 block - result 1439 mov $rk4d, $res0.d[1] // GHASH final-2 block - mid 1440 pmull $rk3q1, $res0.1d, $h3.1d // GHASH final-2 block - low 1441 pmull2 $rk2q1, $res0.2d, $h3.2d // GHASH final-2 block - high 1442 eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-2 block - mid 1443 mov $output_l0, $ctr0.d[0] // AES final-1 block - mov low 1444 mov $output_h0, $ctr0.d[1] // AES final-1 block - mov high 1445 eor $acc_lb, $acc_lb, $rk3 // GHASH final-2 block - low 1446 movi $t0.8b, #0 
// suppress further partial tag feed in 1447 pmull $rk4v.1q, $rk4v.1d, $h34k.1d // GHASH final-2 block - mid 1448 eor $acc_hb, $acc_hb, $rk2 // GHASH final-2 block - high 1449 eor $output_l0, $output_l0, $rkN_l // AES final-1 block - round N low 1450 eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-2 block - mid 1451 eor $output_h0, $output_h0, $rkN_h // AES final-1 block - round N high 1452.Ldec_blocks_more_than_1: // blocks left > 1 1453 stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-1 block - store result 1454 rev64 $res0b, $res1b // GHASH final-1 block 1455 ld1 { $res1b}, [$input_ptr], #16 // AES final block - load ciphertext 1456 eor $res0b, $res0b, $t0.16b // feed in partial tag 1457 movi $t0.8b, #0 // suppress further partial tag feed in 1458 mov $rk4d, $res0.d[1] // GHASH final-1 block - mid 1459 eor $ctr0b, $res1b, $ctr3b // AES final block - result 1460 pmull2 $rk2q1, $res0.2d, $h2.2d // GHASH final-1 block - high 1461 eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-1 block - mid 1462 pmull $rk3q1, $res0.1d, $h2.1d // GHASH final-1 block - low 1463 mov $output_l0, $ctr0.d[0] // AES final block - mov low 1464 ins $rk4v.d[1], $rk4v.d[0] // GHASH final-1 block - mid 1465 mov $output_h0, $ctr0.d[1] // AES final block - mov high 1466 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d // GHASH final-1 block - mid 1467 eor $output_l0, $output_l0, $rkN_l // AES final block - round N low 1468 eor $acc_lb, $acc_lb, $rk3 // GHASH final-1 block - low 1469 eor $acc_hb, $acc_hb, $rk2 // GHASH final-1 block - high 1470 eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-1 block - mid 1471 eor $output_h0, $output_h0, $rkN_h // AES final block - round N high 1472.Ldec_blocks_less_than_1: // blocks left <= 1 1473 and $bit_length, $bit_length, #127 // bit_length %= 128 1474 mvn $rkN_h, xzr // rkN_h = 0xffffffffffffffff 1475 sub $bit_length, $bit_length, #128 // bit_length -= 128 1476 mvn $rkN_l, xzr // rkN_l = 0xffffffffffffffff 1477 ldp $end_input_ptr, $main_end_input_ptr, 
[$output_ptr] // load existing bytes we need to not overwrite 1478 neg $bit_length, $bit_length // bit_length = 128 - #bits in input (in range [1,128]) 1479 and $bit_length, $bit_length, #127 // bit_length %= 128 1480 lsr $rkN_h, $rkN_h, $bit_length // rkN_h is mask for top 64b of last block 1481 cmp $bit_length, #64 1482 csel $ctr32x, $rkN_l, $rkN_h, lt 1483 csel $ctr96_b64x, $rkN_h, xzr, lt 1484 fmov $ctr0d, $ctr32x // ctr0b is mask for last block 1485 and $output_l0, $output_l0, $ctr32x 1486 mov $ctr0.d[1], $ctr96_b64x 1487 bic $end_input_ptr, $end_input_ptr, $ctr32x // mask out low existing bytes 1488 rev $ctr32w, $rctr32w 1489 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x // mask out high existing bytes 1490 orr $output_l0, $output_l0, $end_input_ptr 1491 and $output_h0, $output_h0, $ctr96_b64x 1492 orr $output_h0, $output_h0, $main_end_input_ptr 1493 and $res1b, $res1b, $ctr0b // possibly partial last block has zeroes in highest bits 1494 rev64 $res0b, $res1b // GHASH final block 1495 eor $res0b, $res0b, $t0.16b // feed in partial tag 1496 pmull $rk3q1, $res0.1d, $h1.1d // GHASH final block - low 1497 mov $t0d, $res0.d[1] // GHASH final block - mid 1498 eor $t0.8b, $t0.8b, $res0.8b // GHASH final block - mid 1499 pmull2 $rk2q1, $res0.2d, $h1.2d // GHASH final block - high 1500 pmull $t0.1q, $t0.1d, $h12k.1d // GHASH final block - mid 1501 eor $acc_hb, $acc_hb, $rk2 // GHASH final block - high 1502 eor $acc_lb, $acc_lb, $rk3 // GHASH final block - low 1503 eor $acc_mb, $acc_mb, $t0.16b // GHASH final block - mid 1504 movi $mod_constant.8b, #0xc2 1505 eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up 1506 shl $mod_constantd, $mod_constantd, #56 // mod_constant 1507 eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up 1508 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid 1509 ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment 1510 eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid 
1511 eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid 1512 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low 1513 ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment 1514 eor $acc_lb, $acc_lb, $mod_constant.16b // MODULO - fold into low 1515 stp $output_l0, $output_h0, [$output_ptr] 1516 str $ctr32w, [$counter, #12] // store the updated counter 1517 eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low 1518 ext $acc_lb, $acc_lb, $acc_lb, #8 1519 rev64 $acc_lb, $acc_lb 1520 mov x0, $len 1521 st1 { $acc_l.16b }, [$current_tag] 1522 ldp x19, x20, [sp, #16] 1523 ldp x21, x22, [sp, #32] 1524 ldp x23, x24, [sp, #48] 1525 ldp d8, d9, [sp, #64] 1526 ldp d10, d11, [sp, #80] 1527 ldp d12, d13, [sp, #96] 1528 ldp d14, d15, [sp, #112] 1529 ldp x29, x30, [sp], #128 1530 AARCH64_VALIDATE_LINK_REGISTER 1531 ret 1532.size aes_gcm_dec_kernel,.-aes_gcm_dec_kernel 1533___ 1534} 1535} 1536 1537$code.=<<___; 1538#endif 1539___ 1540 1541print $code; 1542close STDOUT or die "error closing STDOUT: $!"; # enforce flush 1543