#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPLv2 terms is granted.
# ====================================================================
#
# SHA256/512 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
#               SHA256-hw       SHA256(*)       SHA512
# Apple A7      1.97            10.5 (+33%)     6.73 (-1%(**))
# Cortex-A53    2.38            15.5 (+115%)    10.0 (+150%(***))
# Cortex-A57    2.31            11.6 (+86%)     7.51 (+260%(***))
# Denver        2.01            10.5 (+26%)     6.70 (+8%)
# X-Gene                        20.0 (+100%)    12.8 (+300%(***))
# Mongoose      2.36            13.0 (+50%)     8.36 (+33%)
# Kryo          1.92            17.4 (+30%)     11.2 (+8%)
#
# (*)   Software SHA256 results are of lesser relevance, presented
#       mostly for informational purposes.
# (**)  The result is a trade-off: it's possible to improve it by
#       10% (or by 1 cycle per round), but at the cost of 20% loss
#       on Cortex-A53 (or by 4 cycles per round).
# (***) Super-impressive coefficients over gcc-generated code are
#       indication of some compiler "pathology", most notably code
#       generated with -mgeneral-regs-only is significantly faster
#       and the gap is only 40-90%.
my ($flavour, $hash, $output) = @ARGV;

# NOTE: keep Perl comments *below* the first statement of this script.  The
# banner loop near the end of the file re-reads this script and copies every
# leading comment/blank line (up to the first line of code) into the
# generated output, so a comment placed above the statement above would
# leak into the emitted assembly.
#
# Usage: <script> <flavour> <hash> <output>
#   flavour  passed through to arm-xlate.pl; empty or "void" bypasses the
#            translator and writes the untranslated text to <output>
#   hash     "sha256" or "sha512"; selects word size, rotation constants,
#            round count and the general-purpose register width prefix
#   output   destination file name

if ($hash eq "sha512") {
	$BITS=512;
	$SZ=8;
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
	$reg_t="x";
} elsif ($hash eq "sha256") {
	$BITS=256;
	$SZ=4;
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
	$reg_t="w";
} else {
	die "unknown hash: $hash";
}

# Redirect STDOUT either into the arm-xlate.pl translator or straight into
# the output file.  Both opens are checked: previously a failure here went
# unnoticed until the final "close STDOUT" reported an unrelated-looking
# error after all the generation work had been done.
if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
        or die "can't call $xlate: $!";
    *STDOUT=*OUT;
} else {
    open OUT,">$output"
        or die "can't open $output: $!";
    *STDOUT=*OUT;
}

$func="sha${BITS}_block_data_order_nohw";

# Arguments arrive in x0..x2 (context, input pointer, block count); x30 is
# reused as the pointer into the K constant table.
($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));

# @X: the 16-entry message schedule window, @V: the eight working variables,
# $t0..$t3: scratch.  All use the width prefix chosen above ("w" or "x").
@X=map("$reg_t$_",(3..15,0..2));
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));

# BODY_00_xx($i, $a..$h)
#
# Appends the assembly for one compression round to $code.
#
#   $i       round index as seen by the generator (called with 0..31)
#   $a..$h   register names currently playing the roles a..h; the caller
#            rotates @V after every call
#
# Rounds with $i<16 also byte-swap the freshly loaded input word (skipped
# when __AARCH64EB__ is defined) and fetch further input; rounds with
# $i>=15 additionally compute the next message schedule word X[$j].
#
# For $i<15 the final "h+=Sigma0(a)" is left out of the round (see the
# commented-out instruction) and is issued at the top of the following
# round instead, where the previous $h is addressed as $a.
#
# The temporaries $T0..$T2 are borrowed from @X, which is why schedule
# words are parked in, and reloaded from, the small scratch area at [sp].
#
# Side effect: swaps the globals $t2 and $t3 so that the next round finds
# the freshly loaded K word in $t2 and the carried a^b value in $t3.
sub BODY_00_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)&15;
my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
   $T0=@X[$i+3] if ($i<11);

$code.=<<___	if ($i<16);
#ifndef	__AARCH64EB__
	rev	@X[$i],@X[$i]			// $i
#endif
___
$code.=<<___	if ($i<13 && ($i&1));
	ldp	@X[$i+1],@X[$i+2],[$inp],#2*$SZ
___
$code.=<<___	if ($i==13);
	ldp	@X[14],@X[15],[$inp]
___
$code.=<<___	if ($i>=14);
	ldr	@X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
___
$code.=<<___	if ($i>0 && $i<16);
	add	$a,$a,$t1			// h+=Sigma0(a)
___
$code.=<<___	if ($i>=11);
	str	@X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
___
# While ARMv8 specifies merged rotate-n-logical operation such as
# 'eor x,y,z,ror#n', it was found to negatively affect performance
# on Apple A7. The reason seems to be that it requires even 'y' to
# be available earlier. This means that such merged instruction is
# not necessarily best choice on critical path... On the other hand
# Cortex-A5x handles merged instructions much better than disjoint
# rotate and logical... See (**) footnote above.
$code.=<<___	if ($i<15);
	ror	$t0,$e,#$Sigma1[0]
	add	$h,$h,$t2			// h+=K[i]
	eor	$T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
	and	$t1,$f,$e
	bic	$t2,$g,$e
	add	$h,$h,@X[$i&15]			// h+=X[i]
	orr	$t1,$t1,$t2			// Ch(e,f,g)
	eor	$t2,$a,$b			// a^b, b^c in next round
	eor	$t0,$t0,$T0,ror#$Sigma1[1]	// Sigma1(e)
	ror	$T0,$a,#$Sigma0[0]
	add	$h,$h,$t1			// h+=Ch(e,f,g)
	eor	$t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
	add	$h,$h,$t0			// h+=Sigma1(e)
	and	$t3,$t3,$t2			// (b^c)&=(a^b)
	add	$d,$d,$h			// d+=h
	eor	$t3,$t3,$b			// Maj(a,b,c)
	eor	$t1,$T0,$t1,ror#$Sigma0[1]	// Sigma0(a)
	add	$h,$h,$t3			// h+=Maj(a,b,c)
	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
	//add	$h,$h,$t1			// h+=Sigma0(a)
___
$code.=<<___	if ($i>=15);
	ror	$t0,$e,#$Sigma1[0]
	add	$h,$h,$t2			// h+=K[i]
	ror	$T1,@X[($j+1)&15],#$sigma0[0]
	and	$t1,$f,$e
	ror	$T2,@X[($j+14)&15],#$sigma1[0]
	bic	$t2,$g,$e
	ror	$T0,$a,#$Sigma0[0]
	add	$h,$h,@X[$i&15]			// h+=X[i]
	eor	$t0,$t0,$e,ror#$Sigma1[1]
	eor	$T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
	orr	$t1,$t1,$t2			// Ch(e,f,g)
	eor	$t2,$a,$b			// a^b, b^c in next round
	eor	$t0,$t0,$e,ror#$Sigma1[2]	// Sigma1(e)
	eor	$T0,$T0,$a,ror#$Sigma0[1]
	add	$h,$h,$t1			// h+=Ch(e,f,g)
	and	$t3,$t3,$t2			// (b^c)&=(a^b)
	eor	$T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
	eor	$T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]	// sigma0(X[i+1])
	add	$h,$h,$t0			// h+=Sigma1(e)
	eor	$t3,$t3,$b			// Maj(a,b,c)
	eor	$t1,$T0,$a,ror#$Sigma0[2]	// Sigma0(a)
	eor	$T2,$T2,@X[($j+14)&15],lsr#$sigma1[2]	// sigma1(X[i+14])
	add	@X[$j],@X[$j],@X[($j+9)&15]
	add	$d,$d,$h			// d+=h
	add	$h,$h,$t3			// h+=Maj(a,b,c)
	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
	add	@X[$j],@X[$j],$T1
	add	$h,$h,$t1			// h+=Sigma0(a)
	add	@X[$j],@X[$j],$T2
___
	($t2,$t3)=($t3,$t2);
}

# Scalar implementation.  The prologue reserves a 128-byte frame (x29/x30,
# x19..x28, plus slots at #96/#112 for ctx, end-of-input and the input
# pointer) and a further 4*$SZ bytes below it for the schedule scratch
# used by BODY_00_xx.
$code.=<<___;
#ifndef	__KERNEL__
# include <openssl/arm_arch.h>
#endif

.text

.globl	$func
.type	$func,%function
.align	6
$func:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0

	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#4*$SZ

	ldp	$A,$B,[$ctx]				// load context
	ldp	$C,$D,[$ctx,#2*$SZ]
	ldp	$E,$F,[$ctx,#4*$SZ]
	add	$num,$inp,$num,lsl#`log(16*$SZ)/log(2)`	// end of input
	ldp	$G,$H,[$ctx,#6*$SZ]
	adrp	$Ktbl,:pg_hi21:.LK$BITS
	add	$Ktbl,$Ktbl,:lo12:.LK$BITS
	stp	$ctx,$num,[x29,#96]

.Loop:
	ldp	@X[0],@X[1],[$inp],#2*$SZ
	ldr	$t2,[$Ktbl],#$SZ			// *K++
	eor	$t3,$B,$C				// magic seed
	str	$inp,[x29,#112]
___
# Rounds 0..15 are emitted once; rounds 16..31 form the body of
# .Loop_16_xx, which the generated code repeats until the K word loaded
# for the next round is the zero terminator that ends the table.  $i is
# deliberately carried over from the first loop into the second.
for ($i=0;$i<16;$i++)	{ BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=".Loop_16_xx:\n";
for (;$i<32;$i++)	{ BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	cbnz	$t2,.Loop_16_xx

	ldp	$ctx,$num,[x29,#96]
	ldr	$inp,[x29,#112]
	sub	$Ktbl,$Ktbl,#`$SZ*($rounds+1)`	// rewind

	ldp	@X[0],@X[1],[$ctx]
	ldp	@X[2],@X[3],[$ctx,#2*$SZ]
	add	$inp,$inp,#14*$SZ	// advance input pointer
	ldp	@X[4],@X[5],[$ctx,#4*$SZ]
	add	$A,$A,@X[0]
	ldp	@X[6],@X[7],[$ctx,#6*$SZ]
	add	$B,$B,@X[1]
	add	$C,$C,@X[2]
	add	$D,$D,@X[3]
	stp	$A,$B,[$ctx]
	add	$E,$E,@X[4]
	add	$F,$F,@X[5]
	stp	$C,$D,[$ctx,#2*$SZ]
	add	$G,$G,@X[6]
	add	$H,$H,@X[7]
	cmp	$inp,$num
	stp	$E,$F,[$ctx,#4*$SZ]
	stp	$G,$H,[$ctx,#6*$SZ]
	b.ne	.Loop

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#4*$SZ
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	$func,.-$func

.section	.rodata
.align	6
.type	.LK$BITS,%object
.LK$BITS:
___
# Round constants.  Each table ends with a zero word: the scalar code above
# uses it as its loop terminator and accounts for it when rewinding
# ($rounds+1 entries).
$code.=<<___ if ($SZ==8);
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
	.quad	0	// terminator
___
$code.=<<___ if ($SZ==4);
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0	//terminator
___
$code.=<<___;
.size	.LK$BITS,.-.LK$BITS
.asciz	"SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# SHA-256 path built on the sha256h/sha256h2/sha256su0/sha256su1
# instructions.  Registers are written with a ".16b" suffix and mnemonics
# with ".32"/".i32" suffixes; the post-processing loop at the end of the
# script strips those suffixes and rewrites the arrangement to ".4s".
if ($SZ==4) {
my $Ktbl="x3";

my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
my @MSG=map("v$_.16b",(4..7));
my ($W0,$W1)=("v16.4s","v17.4s");
my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");

$code.=<<___;
.text
#ifndef	__KERNEL__
.globl	sha256_block_data_order_hw
.type	sha256_block_data_order_hw,%function
.align	6
sha256_block_data_order_hw:
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0

	ld1.32		{$ABCD,$EFGH},[$ctx]
	adrp		$Ktbl,:pg_hi21:.LK256
	add		$Ktbl,$Ktbl,:lo12:.LK256

.Loop_hw:
	ld1		{@MSG[0]-@MSG[3]},[$inp],#64
	sub		$num,$num,#1
	ld1.32		{$W0},[$Ktbl],#16
	rev32		@MSG[0],@MSG[0]
	rev32		@MSG[1],@MSG[1]
	rev32		@MSG[2],@MSG[2]
	rev32		@MSG[3],@MSG[3]
	orr		$ABCD_SAVE,$ABCD,$ABCD		// offload
	orr		$EFGH_SAVE,$EFGH,$EFGH
___
# Twelve groups that also extend the message schedule; $W0/$W1 alternate
# as the K+W operand and @MSG rotates by one register per group.
for($i=0;$i<12;$i++) {
$code.=<<___;
	ld1.32		{$W1},[$Ktbl],#16
	add.i32		$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	orr		$abcd,$ABCD,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
# Final four groups: no further schedule words are needed.
$code.=<<___;
	ld1.32		{$W1},[$Ktbl],#16
	add.i32		$W0,$W0,@MSG[0]
	orr		$abcd,$ABCD,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	ld1.32		{$W0},[$Ktbl],#16
	add.i32		$W1,$W1,@MSG[1]
	orr		$abcd,$ABCD,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	ld1.32		{$W1},[$Ktbl]
	add.i32		$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#$rounds*$SZ-16	// rewind
	orr		$abcd,$ABCD,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	add.i32		$W1,$W1,@MSG[3]
	orr		$abcd,$ABCD,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	add.i32		$ABCD,$ABCD,$ABCD_SAVE
	add.i32		$EFGH,$EFGH,$EFGH_SAVE

	cbnz		$num,.Loop_hw

	st1.32		{$ABCD,$EFGH},[$ctx]

	ldr		x29,[sp],#16
	ret
.size	sha256_block_data_order_hw,.-sha256_block_data_order_hw
#endif
___
}

# SHA-512 path built on the sha512h/sha512h2/sha512su0/sha512su1
# instructions.  @H has five entries: four hold the state and the fifth is
# a spare that is rotated in on every round by the permutation applied at
# the bottom of each generator loop iteration.
if ($SZ==8) {
my $Ktbl="x3";

my @H = map("v$_.16b",(0..4));
my ($fg,$de,$m9_10)=map("v$_.16b",(5..7));
my @MSG=map("v$_.16b",(16..23));
my ($W0,$W1)=("v24.2d","v25.2d");
my ($AB,$CD,$EF,$GH)=map("v$_.16b",(26..29));

$code.=<<___;
.text
#ifndef	__KERNEL__
.globl	sha512_block_data_order_hw
.type	sha512_block_data_order_hw,%function
.align	6
sha512_block_data_order_hw:
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0

	ld1		{@MSG[0]-@MSG[3]},[$inp],#64	// load input
	ld1		{@MSG[4]-@MSG[7]},[$inp],#64

	ld1.64		{@H[0]-@H[3]},[$ctx]		// load context
	adrp		$Ktbl,:pg_hi21:.LK512
	add		$Ktbl,$Ktbl,:lo12:.LK512

	rev64		@MSG[0],@MSG[0]
	rev64		@MSG[1],@MSG[1]
	rev64		@MSG[2],@MSG[2]
	rev64		@MSG[3],@MSG[3]
	rev64		@MSG[4],@MSG[4]
	rev64		@MSG[5],@MSG[5]
	rev64		@MSG[6],@MSG[6]
	rev64		@MSG[7],@MSG[7]
	b		.Loop_hw

.align	4
.Loop_hw:
	ld1.64		{$W0},[$Ktbl],#16
	subs		$num,$num,#1
	sub		x4,$inp,#128
	orr		$AB,@H[0],@H[0]			// offload
	orr		$CD,@H[1],@H[1]
	orr		$EF,@H[2],@H[2]
	orr		$GH,@H[3],@H[3]
	csel		$inp,$inp,x4,ne			// conditional rewind
___
# 32 iterations that also extend the message schedule.
for($i=0;$i<32;$i++) {
$code.=<<___;
	add.i64		$W0,$W0,@MSG[0]
	ld1.64		{$W1},[$Ktbl],#16
	ext		$W0,$W0,$W0,#8
	ext		$fg,@H[2],@H[3],#8
	ext		$de,@H[1],@H[2],#8
	add.i64		@H[3],@H[3],$W0			// "T1 + H + K512[i]"
	sha512su0	@MSG[0],@MSG[1]
	ext		$m9_10,@MSG[4],@MSG[5],#8
	sha512h		@H[3],$fg,$de
	sha512su1	@MSG[0],@MSG[7],$m9_10
	add.i64		@H[4],@H[1],@H[3]		// "D + T1"
	sha512h2	@H[3],$H[1],@H[0]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
	@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
}
# Last 8 iterations: the schedule is complete, so each one loads (and
# byte-swaps) 16 bytes of the next block instead.  The "conditional
# rewind" above steps $inp back on the final block so that these loads
# re-read the block just consumed rather than bytes past it.
for(;$i<40;$i++) {
$code.=<<___	if ($i<39);
	ld1.64		{$W1},[$Ktbl],#16
___
$code.=<<___	if ($i==39);
	sub		$Ktbl,$Ktbl,#$rounds*$SZ	// rewind
___
$code.=<<___;
	add.i64		$W0,$W0,@MSG[0]
	ld1		{@MSG[0]},[$inp],#16		// load next input
	ext		$W0,$W0,$W0,#8
	ext		$fg,@H[2],@H[3],#8
	ext		$de,@H[1],@H[2],#8
	add.i64		@H[3],@H[3],$W0			// "T1 + H + K512[i]"
	sha512h		@H[3],$fg,$de
	rev64		@MSG[0],@MSG[0]
	add.i64		@H[4],@H[1],@H[3]		// "D + T1"
	sha512h2	@H[3],$H[1],@H[0]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
	@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
}
$code.=<<___;
	add.i64		@H[0],@H[0],$AB			// accumulate
	add.i64		@H[1],@H[1],$CD
	add.i64		@H[2],@H[2],$EF
	add.i64		@H[3],@H[3],$GH

	cbnz		$num,.Loop_hw

	st1.64		{@H[0]-@H[3]},[$ctx]		// store context

	ldr		x29,[sp],#16
	ret
.size	sha512_block_data_order_hw,.-sha512_block_data_order_hw
#endif
___
}

# unsha256($mnemonic, $operands)
#
# Turns one SHA-256 instruction into a raw ".inst" word, keeping the
# original text as a trailing comment.  The register numbers found in
# $operands are OR-ed into the base opcode at bit 0, bit 5 and bit 16.
# Two-operand forms leave the third capture undefined, which contributes
# zero.  If $operands does not match, the false result of the match is
# returned and the caller's substitution inserts an empty string.
{   my  %opcode = (
	"sha256h"	=> 0x5e004000,	"sha256h2"	=> 0x5e005000,
	"sha256su0"	=> 0x5e282800,	"sha256su1"	=> 0x5e006000	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/
	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
			$mnemonic,$arg;
    }
}

# unsha512($mnemonic, $operands)
#
# Same scheme as unsha256, using the SHA-512 base opcodes.
{   my  %opcode = (
	"sha512h"	=> 0xce608000,	"sha512h2"	=> 0xce608400,
	"sha512su0"	=> 0xcec08000,	"sha512su1"	=> 0xce608800	);

    sub unsha512 {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/
	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
			$mnemonic,$arg;
    }
}

# Copy this script's leading comment block into the output as "//"
# comments: the "#!" line is skipped and copying stops at the first line
# that is neither a comment nor blank.  This is best-effort; if the script
# cannot be re-opened the banner is simply omitted, as before.
if (open(my $self, '<', $0)) {
    while(<$self>) {
	next if (/^#!/);
	last if (!s/^#/\/\// and !/^$/);
	print;
    }
    close $self;
}

# Post-process the accumulated text line by line and print it.
foreach(split("\n",$code)) {

	# evaluate `...` arithmetic embedded in the templates
	s/\`([^\`]*)\`/eval($1)/ge;

	# encode SHA instructions as .inst words
	s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge	or
	s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;

	s/\bq([0-9]+)\b/v$1.16b/g;		# old->new registers

	# drop element-size suffixes from mnemonics and move the size onto
	# the register arrangement instead
	s/\.[ui]?8(\s)/$1/;
	s/\.\w?64\b//		and s/\.16b/\.2d/g	or
	s/\.\w?32\b//		and s/\.16b/\.4s/g;
	m/\bext\b/		and s/\.2d/\.16b/g	or
	m/(ld|st)1[^\[]+\[0\]/	and s/\.4s/\.s/g;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";