#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in
# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles
# per byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
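
# For reference, each round generated by BODY_00_15/BODY_16_XX below computes
# the standard FIPS 180-4 SHA-256 round
#
#	h += Sigma1(e) + Ch(e,f,g) + K256[i] + X[i]
#	d += h
#	h += Sigma0(a) + Maj(a,b,c)
#
# with the Maj() addition deferred into the following round (the
# "h+=Maj(a,b,c) from the past" comments). @Sigma0, @Sigma1, @sigma0 and
# @sigma1 hold the corresponding rotate/shift amounts: Sigma0 rotates by
# 2,13,22; Sigma1 by 6,11,25; sigma0 rotates by 7,18 and shifts by 3;
# sigma1 rotates by 17,19 and shifts by 10.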

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
    *STDOUT=*OUT;
} else {
    open OUT,">$output";
    *STDOUT=*OUT;
}

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
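	@ (0xf2 is the low byte of the last K256 word, 0xc67178f2, so this
	@ test detects that the final 16-round batch has just been processed)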
#endif
#if $i<15
# if __ARM_ARCH>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
#else
# define __ARM_ARCH __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
@ instructions are manually-encoded. (See unsha256.)
.arch	armv7-a

.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code	32
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
.align	5

.global	sha256_block_data_order_nohw
.type	sha256_block_data_order_nohw,%function
sha256_block_data_order_nohw:
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	adr	$Ktbl,K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4		@ destroy frame
#if __ARM_ARCH>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order_nohw,.-sha256_block_data_order_nohw
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")			if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")		if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.LK256_shortcut_neon:
@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
#if defined(__thumb2__)
.word	K256-(.LK256_add_neon+4)
#else
.word	K256-(.LK256_add_neon+8)
#endif

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	5
.skip	16
sha256_block_data_order_neon:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16

	@ K256 is just at the boundary of being easily referenced by an ADR from
	@ this function. In Arm mode, when building with __ARM_ARCH=6, it does
	@ not fit. By moving code around, we could make it fit, but this is too
	@ fragile. For simplicity, just load the offset from
	@ .LK256_shortcut_neon.
	@
	@ TODO(davidben): adrl would avoid a load, but clang-assembler does not
	@ support it. We might be able to emulate it with a macro, but Android's
	@ did not work when I tried it.
	@ https://android.googlesource.com/platform/ndk/+/refs/heads/master/docs/ClangMigration.md#arm
	ldr	$Ktbl,.LK256_shortcut_neon
.LK256_add_neon:
	add	$Ktbl,pc,$Ktbl

	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str	$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str	$inp,[sp,#68]
	mov	$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str	$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str	$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia	$ctx,{$A-$H}
	sub	$Xfer,$Xfer,#64
	ldr	$t1,[sp,#0]
	eor	$t2,$t2,$t2
	eor	$t3,$B,$C
	b	.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr	$inp,[sp,#68]
	ldr	$t0,[sp,#72]
	sub	$Ktbl,$Ktbl,#256		@ rewind $Ktbl
	teq	$inp,$t0
	it	eq
	subeq	$inp,$inp,#64			@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it	ne
	strne	$inp,[sp,#68]
	mov	$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.LK256_shortcut_hw:
@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
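@ The word below is the distance from the pc value read by the "add" at
@ .LK256_add_hw to K256, so "add r3,pc,r3" there yields the address of K256.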
#if defined(__thumb2__)
.word	K256-(.LK256_add_hw+4)
#else
.word	K256-(.LK256_add_hw+8)
#endif

.global	sha256_block_data_order_hw
.type	sha256_block_data_order_hw,%function
.align	5
sha256_block_data_order_hw:
	@ K256 is too far to reference from one ADR command in Thumb mode. In
	@ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte
	@ boundary. For simplicity, just load the offset from .LK256_shortcut_hw.
	ldr	$Ktbl,.LK256_shortcut_hw
.LK256_add_hw:
	add	$Ktbl,pc,$Ktbl

	vld1.32	{$ABCD,$EFGH},[$ctx]
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	b	.Loop_v8

.align	4
.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq	$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub	$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it	ne
	bne	.Loop_v8

	vst1.32	{$ABCD,$EFGH},[$ctx]

	ret	@ bx lr
.size	sha256_block_data_order_hw,.-sha256_block_data_order_hw
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{ my %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # Emit the encoding as raw little-endian bytes via INST(), since
	    # ARMv7 instructions are always encoded little-endian. The correct
	    # solution would be to use the .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush