#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big- and
# little-endian cases, and it likewise supports both 32- and 64-bit
# modes of operation. The latter is achieved by limiting the number of
# utilized registers to 16, which implies additional NEON load and
# integer instructions. This has no effect on the mighty Apple A7, where
# results are literally equal to the theoretical estimates based on AES
# instruction latencies and issue rates. On Cortex-A53, an in-order
# execution core, this costs up to 10-15%, which is partially
# compensated by implementing a dedicated code path for the 128-bit
# CBC encrypt case. On Cortex-A57, parallelizable-mode performance
# seems to be limited by the sheer amount of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
# Mongoose	1.33		1.20		1.20
#
# (*)	original 3.64/1.34/1.32 results were for the r0p0 revision
#	and are still the same even for the updated module.

$flavour = shift;
$output = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$prefix="aes_hw";

$code=<<___;
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=<<___						if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with armv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON mnemonics are mostly 32-bit, integer ones mostly 64-bit. The goal
# is to maintain both 32- and 64-bit code within a single module and
# transliterate common code to either flavour with regex voodoo.
#
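# For example, an input line such as "vld1.32 {q8},[x7],#16" should come
# out of the 64-bit post-processing pass at the bottom of this file as
# "ld1 {v16.4s},[x7],#16" (q8 maps to v16, the .32 suffix becomes a .4s
# arrangement, the v prefix is stripped), while the 32-bit pass keeps the
# mnemonic and only rewrites the addressing mode: "vld1.32 {q8},[r7]!".
#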
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# On AArch64, put the data in .rodata and use adrp + add for compatibility
# with execute-only memory. On AArch32, put it in .text and use adr.
$code.= ".section .rodata\n" if ($flavour =~ /64/);
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.text

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___ if ($flavour =~ /64/);
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

___
$code.=<<___ if ($flavour =~ /64/);
	adrp	$ptr,:pg_hi21:.Lrcon
	add	$ptr,$ptr,:lo12:.Lrcon
___
$code.=<<___ if ($flavour !~ /64/);
	adr	$ptr,.Lrcon
___
$code.=<<___;
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

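	// One round-key expansion step per iteration: vtbl with the
	// rotate-n-splat mask leaves RotWord of the last word splatted
	// into all four lanes of $key, and aese with the all-zero vector
	// reduces to plain SubBytes (ShiftRows cannot change a splatted
	// vector), i.e. SubWord(RotWord(w)); $rcon carries the round
	// constant and is doubled between iterations.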
.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

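	// Convert the schedule in place for the equivalent inverse cipher:
	// swap the round keys end-for-end and run every key except the
	// first and the last through aesimc (InvMixColumns).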
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___ if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	AARCH64_VALID_CALL_TARGET
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___ if ($flavour =~ /64/);
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	vld1.32	{q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

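	// The loop below decrypts three blocks at a time to hide the
	// aesd/aesimc latency; unlike CBC encryption, CBC decryption has
	// no dependency between consecutive blocks, so independent blocks
	// can be interleaved.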
.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$ivec,$rndlast
	subs	$len,$len,#0x30
	veor	$tmp1,$in0,$rndlast
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	add	$inp,$inp,x6		// $inp is adjusted in such a way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	vorr	$ivec,$in2,$in2
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

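# The IV is treated as four 32-bit words with the block counter in the
# last one: the final word is loaded separately, byte-swapped to host
# order on little-endian (the rev instructions below), incremented per
# block and spliced back into lane 3 of the counter vector.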
$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___ if ($flavour =~ /64/);
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r10,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldr	r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr	$rounds,[$key,#240]

	ldr	$ctr, [$ivp, #12]
	vld1.32	{$dat0},[$ivp]

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#4
	mov	$step,#16
	cmp	$len,#2
	add	$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
	cclr	$step,lo

	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
	// affected by silicon errata #1742098 [0] and #1655431 [1],
	// respectively, where the second instruction of an aese/aesmc
	// instruction pair may execute twice if an interrupt is taken right
	// after the first instruction consumes an input register of which a
	// single 32-bit lane has been updated the last time it was modified.
	//
	// This function uses a counter in one 32-bit lane. The vmov.32 lines
	// could write to $dat1 and $dat2 directly, but that trips these bugs.
	// We write to $ivec and copy to the final register as a workaround.
	//
	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __ARMEB__
	rev	$ctr, $ctr
#endif
	add	$tctr1, $ctr, #1
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${ivec}[3],$tctr1
	add	$ctr, $ctr, #2
	vorr	$dat1,$ivec,$ivec
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	vmov.32	${ivec}[3],$tctr2
	sub	$len,$len,#3		// bias
	vorr	$dat2,$ivec,$ivec
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ctr32

	aese	$dat0,q8
	aesmc	$tmp0,$dat0
	aese	$dat1,q8
	aesmc	$tmp1,$dat1
	vld1.8	{$in0},[$inp],#16
	add	$tctr0,$ctr,#1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	rev	$tctr0,$tctr0
	aese	$tmp0,q9
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q9
	aesmc	$tmp1,$tmp1
	vld1.8	{$in2},[$inp],#16
	mov	$key_,$key
	aese	$dat2,q9
	aesmc	$tmp2,$dat2
	aese	$tmp0,q12
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q12
	aesmc	$tmp1,$tmp1
	veor	$in0,$in0,$rndlast
	add	$tctr1,$ctr,#2
	aese	$tmp2,q12
	aesmc	$tmp2,$tmp2
	veor	$in1,$in1,$rndlast
	add	$ctr,$ctr,#3
	aese	$tmp0,q13
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q13
	aesmc	$tmp1,$tmp1
	// Note the logic to update $dat0, $dat1, and $dat2 is written to work
	// around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
	// 32-bit mode. See the comment above.
	veor	$in2,$in2,$rndlast
	vmov.32	${ivec}[3], $tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vorr	$dat0,$ivec,$ivec
	rev	$tctr1,$tctr1
	aese	$tmp0,q14
	aesmc	$tmp0,$tmp0
	vmov.32	${ivec}[3], $tctr1
	rev	$tctr2,$ctr
	aese	$tmp1,q14
	aesmc	$tmp1,$tmp1
	vorr	$dat1,$ivec,$ivec
	vmov.32	${ivec}[3], $tctr2
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vorr	$dat2,$ivec,$ivec
	subs	$len,$len,#3
	aese	$tmp0,q15
	aese	$tmp1,q15
	aese	$tmp2,q15

	veor	$in0,$in0,$tmp0
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8	{$in0},[$out],#16
	veor	$in1,$in1,$tmp1
	mov	$cnt,$rounds
	vst1.8	{$in1},[$out],#16
	veor	$in2,$in2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$in2},[$out],#16
	b.hs	.Loop3x_ctr32

	adds	$len,$len,#3
	b.eq	.Lctr32_done
	cmp	$len,#1
	mov	$step,#16
	cclr	$step,eq

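	// Tail: one or two blocks remain. $step was zeroed above when only
	// one block is left, so the second load re-reads the same address
	// instead of running past the input.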
.Lctr32_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.32	{q9},[$key_],#16
	b.gt	.Lctr32_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.8	{$in0},[$inp],$step
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp]
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	veor	$in0,$in0,$rndlast
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	veor	$in1,$in1,$rndlast
	aese	$dat0,q15
	aese	$dat1,q15

	cmp	$len,#1
	veor	$in0,$in0,$dat0
	veor	$in1,$in1,$dat1
	vst1.8	{$in0},[$out],#16
	b.eq	.Lctr32_done
	vst1.8	{$in1},[$out]

.Lctr32_done:
___
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r10,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

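	# cclr and mov.cond are pseudo-instructions local to this module;
	# the csel substitutions just below should turn e.g. "cclr x12,lo"
	# into "csel x12,xzr,x12,lo" and "mov.lo x6,x2" into
	# "csel x6,x2,x6,lo" (the 32-bit pass instead maps them onto
	# predicated instructions, "movlo r12,#0" and so on).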
	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	# Switch preprocessor checks to aarch64 versions.
	s/__ARME([BL])__/__AARCH64E$1__/go;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, hence the
	    # explicit byte order here; the correct solution would be the
	    # .inst directive, but older assemblers don't implement it :-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;	# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;			# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!";