#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <[email protected]> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014
#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of large buffer.
#
#			IALU/gcc-4.4	1xNEON	3xNEON+1xIALU
#
# Cortex-A5		19.3(*)/+95%	21.8	14.1
# Cortex-A8		10.5(*)/+160%	13.9	6.35
# Cortex-A9		12.9(**)/+110%	14.3	6.50
# Cortex-A15		11.0/+40%	16.0	5.00
# Snapdragon S4		11.5/+125%	13.6	4.90
#
# (*)	most "favourable" result for aligned data on little-endian
#	processor, result for misaligned data is 10-15% lower;
# (**)	this result is a trade-off: it can be improved by 20%,
#	but then Snapdragon S4 and Cortex-A8 results get
#	20-25% worse;

# Command-line convention (shared by all perlasm modules): the first
# argument is either the "flavour" (e.g. linux32, ios32) or, if it looks
# like a filename, the output file; remaining arguments are scanned for
# the output filename.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Locate the arm-xlate.pl translator next to this script or in the
    # shared perlasm directory, and pipe our output through it.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Checked open: a silently failed pipe would produce an empty .S file.
    open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	or die "can't call $xlate: $!";
    *STDOUT=*OUT;
} else {
    # Three-arg checked open (the 2-arg form ">$output" is susceptible to
    # mode injection via the filename and hid open failures).
    open OUT,">",$output or die "can't open $output: $!";
    *STDOUT=*OUT;
}

# Thunk so that bareword calls like &add(...)/&mov(...) in eval'd strings
# append a formatted assembly line to $code (simplified x86-style perlasm).
sub AUTOLOAD()
{ my $opcode = $AUTOLOAD; $opcode =~
s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);	# bare numbers become immediates
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

# Scalar register allocation for the integer-only path: @x holds the
# ChaCha state words that live in registers ("x" marks slots kept in
# memory instead), @t are scratch registers r8-r11.
my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));

# Emit two interleaved ChaCha quarter-round pairs for state indices
# ($a0,$b0,$c0,$d0); the three sibling column/diagonal index tuples are
# derived below. Returns a list of strings to be eval'ed (via AUTOLOAD).
sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
my ($xc,$xc_)=(@t[0..1]);
my ($xd,$xd_)=$odd?(@t[2],@x[$d1]):(@x[$d0],@t[2]);
my @ret;

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' are permanently allocated in registers, @x[0..7],
	# while 'c's and pair of 'd's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end. If you observe 'd' column, you'll
	# notice that 15 and 13 are reused in next pair of rounds.
	# This is why these two are chosen for offloading to memory,
	# to make loads count more.
	push @ret,(
	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	 "&mov	($xd,$xd,'ror#16')",
	"&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a0],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#16')",

	"&add	($xc,$xc,$xd)",
	 "&mov	(@x[$b0],@x[$b0],'ror#20')",
	"&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#20')",
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#20')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#20')",

	"&add	(@x[$a0],@x[$a0],@x[$b0])",
	 "&mov	($xd,$xd,'ror#24')",
	"&add	(@x[$a1],@x[$a1],@x[$b1])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a0],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a1],'ror#24')",

	"&add	($xc,$xc,$xd)",
	 "&mov	(@x[$b0],@x[$b0],'ror#25')" );
	# Mid-round spill/reload of the offloaded 'd' words (see table above).
	push @ret,(
	"&str	($xd,'[sp,#4*(16+$d0)]')",
	"&ldr	($xd,'[sp,#4*(16+$d2)]')" )	if ($odd);
	push @ret,(
	"&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b1],@x[$b1],'ror#25')" );
	push @ret,(
	"&str	($xd_,'[sp,#4*(16+$d1)]')",
	"&ldr	($xd_,'[sp,#4*(16+$d3)]')" )	if (!$odd);
	push @ret,(
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#25')",
	 "&eor	(@x[$b1],@x[$b1],$xc_,'ror#25')" );

	$xd=@x[$d2]				if (!$odd);
	$xd_=@x[$d3]				if ($odd);
	# Second pair of quarter-rounds; 'c' words are swapped through the
	# stack here, in the middle, as explained above.
	push @ret,(
	"&str	($xc,'[sp,#4*(16+$c0)]')",
	"&ldr	($xc,'[sp,#4*(16+$c2)]')",
	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	 "&mov	($xd,$xd,'ror#16')",
	"&str	($xc_,'[sp,#4*(16+$c1)]')",
	"&ldr	($xc_,'[sp,#4*(16+$c3)]')",
	"&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#16')",
	"&eor	($xd,$xd,@x[$a2],'ror#16')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#16')",

	"&add	($xc,$xc,$xd)",
	 "&mov	(@x[$b2],@x[$b2],'ror#20')",
	"&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#20')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#20')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#20')",

	"&add	(@x[$a2],@x[$a2],@x[$b2])",
	 "&mov	($xd,$xd,'ror#24')",
	"&add	(@x[$a3],@x[$a3],@x[$b3])",
	 "&mov	($xd_,$xd_,'ror#24')",
	"&eor	($xd,$xd,@x[$a2],'ror#24')",
	 "&eor	($xd_,$xd_,@x[$a3],'ror#24')",

	"&add	($xc,$xc,$xd)",
	 "&mov	(@x[$b2],@x[$b2],'ror#25')",
	"&add	($xc_,$xc_,$xd_)",
	 "&mov	(@x[$b3],@x[$b3],'ror#25')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#25')",
	 "&eor	(@x[$b3],@x[$b3],$xc_,'ror#25')" );

	@ret;
}

$code.=<<___;
#include <openssl/arm_arch.h>

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch	armv7-a

.text
#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

#if defined(__thumb2__) || defined(__clang__)
#define ldrhsb	ldrbhs
#endif

.align	5
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
.Lone:
.long	1,0,0,0

.globl	ChaCha20_ctr32_nohw
.type	ChaCha20_ctr32_nohw,%function
.align	5
ChaCha20_ctr32_nohw:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
	adr	r14,.Lsigma
	ldmia	r12,{r4-r7}		@ load counter and nonce
	sub	sp,sp,#4*(16)		@ off-load area
	stmdb	sp!,{r4-r7}		@ copy counter and nonce
	ldmia	r3,{r4-r11}		@ load key
	ldmia	r14,{r0-r3}		@ load sigma
	stmdb	sp!,{r4-r11}		@ copy key
	stmdb	sp!,{r0-r3}		@ copy sigma
	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	b	.Loop_outer_enter

.align	4
.Loop_outer:
	ldmia	sp,{r0-r9}		@ load key material
	str	@t[3],[sp,#4*(32+2)]	@ save len
	str	r12, [sp,#4*(32+1)]	@ save inp
	str	r14, [sp,#4*(32+0)]	@ save out
.Loop_outer_enter:
	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	b	.Loop

.align	4
.Loop:
	subs	@t[3],@t[3],#1
___
	foreach
(&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	bne	.Loop

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	cmp	@t[3],#64		@ done yet?
#ifdef	__thumb2__
	itete	lo
#endif
	addlo	r12,sp,#4*(0)		@ shortcut or ...
	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
	addlo	r14,sp,#4*(0)		@ shortcut or ...
	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out

	ldr	@t[0],[sp,#4*(0)]	@ load key material
	ldr	@t[1],[sp,#4*(1)]

#if __ARM_ARCH>=6 || !defined(__ARMEB__)
# if __ARM_ARCH<7
	orr	@t[2],r12,r14
	tst	@t[2],#3		@ are input and output aligned?
	ldr	@t[2],[sp,#4*(2)]
	bne	.Lunaligned
	cmp	@t[3],#64		@ restore flags
# else
	ldr	@t[2],[sp,#4*(2)]
# endif
	ldr	@t[3],[sp,#4*(3)]

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]

	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]	@ xor with input
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(4)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[1],[r14,#-12]
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
	 add	@t[0],sp,#4*(8)
	str	@x[4],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	 add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
# ifdef	__thumb2__
	itt	hi
# endif
	 strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	 strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]
	eorhs	@x[1],@x[1],@t[1]
	 add	@t[0],sp,#4*(12)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	str	@x[1],[r14,#-12]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	add	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	itt	hi
# endif
	 addhi	@t[0],@t[0],#1		@ next counter value
	 strhi	@t[0],[sp,#4*(12)]	@ save next counter value
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
# ifdef	__thumb2__
	 it	ne
# endif
	 ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[4],[r14],#16		@ store output
	str	@x[5],[r14,#-12]
# ifdef	__thumb2__
	it	hs
# endif
	 subhs	@t[3],@t[0],#64		@ len-=64
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_outer

	beq	.Ldone
# if __ARM_ARCH<7
	b	.Ltail

.align	4
.Lunaligned:				@ unaligned endian-neutral path
	cmp	@t[3],#64		@ restore flags
# endif
#endif
#if __ARM_ARCH<7
	ldr	@t[3],[sp,#4*(3)]
___
# Unaligned path generator: four state words per iteration, emitted
# byte-by-byte (ldrhsb/strb below), so it works for any alignment and
# either endianness.
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;

$code.=<<___	if ($i==4);
	add	@x[0],sp,#4*(16+8)
___
$code.=<<___	if ($i==8);
	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]"
	strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]"
___
$code.=<<___;
	add	@x[$j+0],@x[$j+0],@t[0]	@ accumulate key material
___
$code.=<<___	if ($i==12);
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1		@ next counter value
	strhi	@t[0],[sp,#4*(12)]	@ save next counter value
___
$code.=<<___;
	add	@x[$j+1],@x[$j+1],@t[1]
	add	@x[$j+2],@x[$j+2],@t[2]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[0],@t[0],@t[0]	@ zero or ...
	ldrhsb	@t[0],[r12],#16		@ ... load input
	eorlo	@t[1],@t[1],@t[1]
	ldrhsb	@t[1],[r12,#-12]

	add	@x[$j+3],@x[$j+3],@t[3]
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[2],@t[2],@t[2]
	ldrhsb	@t[2],[r12,#-8]
	eorlo	@t[3],@t[3],@t[3]
	ldrhsb	@t[3],[r12,#-4]

	eor	@x[$j+0],@t[0],@x[$j+0]	@ xor with input (or zero)
	eor	@x[$j+1],@t[1],@x[$j+1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-15]	@ load more input
	ldrhsb	@t[1],[r12,#-11]
	eor	@x[$j+2],@t[2],@x[$j+2]
	 strb	@x[$j+0],[r14],#16	@ store output
	eor	@x[$j+3],@t[3],@x[$j+3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-7]
	ldrhsb	@t[3],[r12,#-3]
	 strb	@x[$j+1],[r14,#-12]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-8]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-14]	@ load more input
	ldrhsb	@t[1],[r12,#-10]
	 strb	@x[$j+3],[r14,#-4]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-15]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-6]
	ldrhsb	@t[3],[r12,#-2]
	 strb	@x[$j+1],[r14,#-11]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+2],[r14,#-7]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-13]	@ load more input
	ldrhsb	@t[1],[r12,#-9]
	 strb	@x[$j+3],[r14,#-3]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+0],[r14,#-14]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-5]
	ldrhsb	@t[3],[r12,#-1]
	 strb	@x[$j+1],[r14,#-10]
	 strb	@x[$j+2],[r14,#-6]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	 strb	@x[$j+3],[r14,#-2]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
	 strb	@x[$j+0],[r14,#-13]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	 strb	@x[$j+1],[r14,#-9]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
	 strb	@x[$j+2],[r14,#-5]
	 strb	@x[$j+3],[r14,#-1]
___
$code.=<<___	if ($i<12);
	 add	@t[0],sp,#4*(4+$i)
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
___
}
$code.=<<___;
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64		@ len-=64
	bhi	.Loop_outer

	beq	.Ldone
#endif

.Ltail:
	ldr	r12,[sp,#4*(32+1)]	@ load inp
	add	@t[1],sp,#4*(0)
	ldr	r14,[sp,#4*(32+0)]	@ load out

.Loop_tail:
	ldrb	@t[2],[@t[1]],#1	@ read buffer on stack
	ldrb	@t[3],[r12],#1		@ read input
	subs	@t[0],@t[0],#1
	eor	@t[3],@t[3],@t[2]
	strb	@t[3],[r14],#1		@ store output
	bne	.Loop_tail

.Ldone:
	add	sp,sp,#4*(32+3)
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw
___

{{{
# NEON register allocation: three parallel 4x32-bit state quartets
# (a,b,c,d) plus four temporaries, mapped onto q0-q15.
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map("q$_",(0..15));

# Emit one full NEON ChaCha round (add/xor/rotate via vshr+vsli) on one
# (a,b,c,d) quartet; $odd selects the diagonalize/undiagonalize vext
# permutation. Returns strings to be eval'ed (via AUTOLOAD).
sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&vadd_i32	($a,$a,$b)",
	"&veor		($d,$d,$a)",
	"&vrev32_16	($d,$d)",	# vrot ($d,16)

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,20)",
	"&vsli_32	($b,$t,12)",

	"&vadd_i32	($a,$a,$b)",
	"&veor		($t,$d,$a)",
	"&vshr_u32	($d,$t,24)",
	"&vsli_32	($d,$t,8)",

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,25)",
	"&vsli_32	($b,$t,7)",

	"&vext_8	($c,$c,$c,8)",
	"&vext_8	($b,$b,$b,$odd?12:4)",
	"&vext_8	($d,$d,$d,$odd?4:12)"
	);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.globl	ChaCha20_ctr32_neon
.type	ChaCha20_ctr32_neon,%function
.align	5
ChaCha20_ctr32_neon:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
	adr	r14,.Lsigma
	vstmdb	sp!,{d8-d15}		@ ABI spec says so
	stmdb	sp!,{r0-r3}

	vld1.32	{$b0-$c0},[r3]		@ load key
	ldmia	r3,{r4-r11}		@ load key

	sub	sp,sp,#4*(16+16)
	vld1.32	{$d0},[r12]		@ load counter and nonce
	add	r12,sp,#4*8
	ldmia	r14,{r0-r3}		@ load sigma
	vld1.32	{$a0},[r14]!		@ load sigma
	vld1.32	{$t0},[r14]		@ one
	vst1.32	{$c0-$d0},[r12]		@ copy 1/2key|counter|nonce
	vst1.32	{$a0-$b0},[sp]		@ copy sigma|1/2key

	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	vshl.i32	$t1#lo,$t0#lo,#1	@ two
	vstr	$t0#lo,[sp,#4*(16+0)]
	vshl.i32	$t2#lo,$t0#lo,#2	@ four
	vstr	$t1#lo,[sp,#4*(16+2)]
	vmov	$a1,$a0
	vstr	$t2#lo,[sp,#4*(16+4)]
	vmov	$a2,$a0
	vmov	$b1,$b0
	vmov	$b2,$b0
	b	.Loop_neon_enter

.align	4
.Loop_neon_outer:
	ldmia	sp,{r0-r9}		@ load key material
	cmp	@t[3],#64*2		@ if len<=64*2
	bls	.Lbreak_neon		@ switch to integer-only
	vmov	$a1,$a0
	str	@t[3],[sp,#4*(32+2)]	@ save len
	vmov	$a2,$a0
	str	r12, [sp,#4*(32+1)]	@ save inp
	vmov	$b1,$b0
	str	r14, [sp,#4*(32+0)]	@ save out
	vmov	$b2,$b0
.Loop_neon_enter:
	ldr	@t[3], [sp,#4*(15)]
	vadd.i32	$d1,$d0,$t0	@ counter+1
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	vmov	$c1,$c0
	ldr	@t[2], [sp,#4*(13)]
	vmov	$c2,$c0
	ldr	@x[14],[sp,#4*(14)]
	vadd.i32	$d2,$d1,$t0	@ counter+2
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	add	@x[12],@x[12],#3	@ counter+3
	b	.Loop_neon

.align	4
.Loop_neon:
	subs	@t[3],@t[3],#1
___
	# Interleave three NEON quarter-round streams with one scalar
	# ROUND stream (3xNEON+1xIALU); even (column) then odd (diagonal).
	my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
	my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
	my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}

	@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
	@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
	@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval;			eval(shift(@thread3));
		eval(shift(@thread1));	eval(shift(@thread3));
		eval(shift(@thread2));	eval(shift(@thread3));
	}
$code.=<<___;
	bne	.Loop_neon

	add	@t[3],sp,#32
	vld1.32	{$t0-$t1},[sp]		@ load key material
	vld1.32	{$t2-$t3},[@t[3]]

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	ldr	r12,[sp,#4*(32+1)]	@ load inp
	ldr	r14,[sp,#4*(32+0)]	@ load out

	vadd.i32	$a0,$a0,$t0	@ accumulate key material
	vadd.i32	$a1,$a1,$t0
	vadd.i32	$a2,$a2,$t0
	vldr		$t0#lo,[sp,#4*(16+0)]	@ one

	vadd.i32	$b0,$b0,$t1
	vadd.i32	$b1,$b1,$t1
	vadd.i32	$b2,$b2,$t1
	vldr		$t1#lo,[sp,#4*(16+2)]	@ two

	vadd.i32	$c0,$c0,$t2
	vadd.i32	$c1,$c1,$t2
	vadd.i32	$c2,$c2,$t2
	vadd.i32	$d1#lo,$d1#lo,$t0#lo	@ counter+1
	vadd.i32	$d2#lo,$d2#lo,$t1#lo	@ counter+2

	vadd.i32	$d0,$d0,$t3
	vadd.i32	$d1,$d1,$t3
	vadd.i32	$d2,$d2,$t3

	cmp	@t[3],#64*4
	blo	.Ltail_neon

	vld1.8	{$t0-$t1},[r12]!	@ load input
	 mov	@t[3],sp
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0		@ xor with input
	veor	$b0,$b0,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a1,$a1,$t0
	vst1.8	{$a0-$b0},[r14]!	@ store output
	veor	$b1,$b1,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c1,$c1,$t2
	vst1.8	{$c0-$d0},[r14]!
	veor	$d1,$d1,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a2,$a2,$t0
	 vld1.32	{$a0-$b0},[@t[3]]!	@ load for next iteration
	veor	$t0#hi,$t0#hi,$t0#hi
	vldr	$t0#lo,[sp,#4*(16+4)]	@ four
	veor	$b2,$b2,$t1
	 vld1.32	{$c0-$d0},[@t[3]]
	veor	$c2,$c2,$t2
	vst1.8	{$a1-$b1},[r14]!
	veor	$d2,$d2,$t3
	vst1.8	{$c1-$d1},[r14]!

	vadd.i32	$d0#lo,$d0#lo,$t0#lo	@ next counter value
	vldr	$t0#lo,[sp,#4*(16+0)]	@ one

	ldmia	sp,{@t[0]-@t[3]}	@ load key material
	add	@x[0],@x[0],@t[0]	@ accumulate key material
	 ldr	@t[0],[r12],#16		@ load input
	 vst1.8	{$a2-$b2},[r14]!
	add	@x[1],@x[1],@t[1]
	 ldr	@t[1],[r12,#-12]
	 vst1.8	{$c2-$d2},[r14]!
	add	@x[2],@x[2],@t[2]
	 ldr	@t[2],[r12,#-8]
	add	@x[3],@x[3],@t[3]
	 ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
	eor	@x[0],@x[0],@t[0]	@ xor with input
	 add	@t[0],sp,#4*(4)
	eor	@x[1],@x[1],@t[1]
	str	@x[0],[r14],#16		@ store output
	eor	@x[2],@x[2],@t[2]
	str	@x[1],[r14,#-12]
	eor	@x[3],@x[3],@t[3]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	 ldr	@t[0],[r12],#16		@ load input
	add	@x[5],@x[5],@t[1]
	 ldr	@t[1],[r12,#-12]
	add	@x[6],@x[6],@t[2]
	 ldr	@t[2],[r12,#-8]
	add	@x[7],@x[7],@t[3]
	 ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	eor	@x[4],@x[4],@t[0]
	 add	@t[0],sp,#4*(8)
	eor	@x[5],@x[5],@t[1]
	str	@x[4],[r14],#16		@ store output
	eor	@x[6],@x[6],@t[2]
	str	@x[5],[r14,#-12]
	eor	@x[7],@x[7],@t[3]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	 add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	 ldr	@t[0],[r12],#16		@ load input
	add	@x[1],@x[1],@t[1]
	 ldr	@t[1],[r12,#-12]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	add	@x[2],@x[2],@t[2]
	 ldr	@t[2],[r12,#-8]
# ifdef	__thumb2__
	it	hi
# endif
	 strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[3],@x[3],@t[3]
	 ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
	eor	@x[0],@x[0],@t[0]
	 add	@t[0],sp,#4*(12)
	eor	@x[1],@x[1],@t[1]
	str	@x[0],[r14],#16		@ store output
	eor	@x[2],@x[2],@t[2]
	str	@x[1],[r14,#-12]
	eor	@x[3],@x[3],@t[3]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	 add	@t[0],@t[0],#4		@ next counter value
	add	@x[5],@x[5],@t[1]
	 str	@t[0],[sp,#4*(12)]	@ save next counter value
	 ldr	@t[0],[r12],#16		@ load input
	add	@x[6],@x[6],@t[2]
	 add	@x[4],@x[4],#3		@ counter+3
	 ldr	@t[1],[r12,#-12]
	add	@x[7],@x[7],@t[3]
	 ldr	@t[2],[r12,#-8]
	 ldr	@t[3],[r12,#-4]
# ifdef	__ARMEB__
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	eor	@x[4],@x[4],@t[0]
# ifdef	__thumb2__
	it	hi
# endif
	 ldrhi	@t[0],[sp,#4*(32+2)]	@ re-load len
	eor	@x[5],@x[5],@t[1]
	eor	@x[6],@x[6],@t[2]
	str	@x[4],[r14],#16		@ store output
	eor	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	 sub	@t[3],@t[0],#64*4	@ len-=64*4
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_neon_outer

	b	.Ldone_neon

.align	4
.Lbreak_neon:
	@ harmonize NEON and integer-only stack frames: load data
	@ from NEON frame, but save to integer-only one; distance
	@ between the two is 4*(32+4+16-32)=4*(20).

	str	@t[3], [sp,#4*(20+32+2)]	@ save len
	 add	@t[3],sp,#4*(32+4)
	str	r12,   [sp,#4*(20+32+1)]	@ save inp
	str	r14,   [sp,#4*(20+32+0)]	@ save out

	ldr	@x[12],[sp,#4*(16+10)]
	ldr	@x[14],[sp,#4*(16+11)]
	 vldmia	@t[3],{d8-d15}			@ fulfill ABI requirement
	str	@x[12],[sp,#4*(20+16+10)]	@ copy "@x[10]"
	str	@x[14],[sp,#4*(20+16+11)]	@ copy "@x[11]"

	ldr	@t[3], [sp,#4*(15)]
	ldr	@x[12],[sp,#4*(12)]		@ modulo-scheduled load
	ldr	@t[2], [sp,#4*(13)]
	ldr	@x[14],[sp,#4*(14)]
	str	@t[3], [sp,#4*(20+16+15)]
	add	@t[3],sp,#4*(20)
	vst1.32	{$a0-$b0},[@t[3]]!		@ copy key
	add	sp,sp,#4*(20)			@ switch frame
	vst1.32	{$c0-$d0},[@t[3]]
	mov	@t[3],#10
	b	.Loop				@ go integer-only

.align	4
.Ltail_neon:
	cmp	@t[3],#64*3
	bhs	.L192_or_more_neon
	cmp	@t[3],#64*2
	bhs	.L128_or_more_neon
	cmp	@t[3],#64*1
	bhs	.L64_or_more_neon

	add	@t[0],sp,#4*(8)
	vst1.8	{$a0-$b0},[sp]
	add	@t[2],sp,#4*(0)
	vst1.8	{$c0-$d0},[@t[0]]
	b	.Loop_tail_neon

.align	4
.L64_or_more_neon:
	vld1.8	{$t0-$t1},[r12]!
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0
	veor	$b0,$b0,$t1
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vst1.8	{$a0-$b0},[r14]!
	vst1.8	{$c0-$d0},[r14]!

	beq	.Ldone_neon

	add	@t[0],sp,#4*(8)
	vst1.8	{$a1-$b1},[sp]
	add	@t[2],sp,#4*(0)
	vst1.8	{$c1-$d1},[@t[0]]
	sub	@t[3],@t[3],#64*1	@ len-=64*1
	b	.Loop_tail_neon

.align	4
.L128_or_more_neon:
	vld1.8	{$t0-$t1},[r12]!
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0
	veor	$b0,$b0,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a1,$a1,$t0
	veor	$b1,$b1,$t1
	vst1.8	{$a0-$b0},[r14]!
	veor	$c1,$c1,$t2
	vst1.8	{$c0-$d0},[r14]!
	veor	$d1,$d1,$t3
	vst1.8	{$a1-$b1},[r14]!
	vst1.8	{$c1-$d1},[r14]!

	beq	.Ldone_neon

	add	@t[0],sp,#4*(8)
	vst1.8	{$a2-$b2},[sp]
	add	@t[2],sp,#4*(0)
	vst1.8	{$c2-$d2},[@t[0]]
	sub	@t[3],@t[3],#64*2	@ len-=64*2
	b	.Loop_tail_neon

.align	4
.L192_or_more_neon:
	vld1.8	{$t0-$t1},[r12]!
	vld1.8	{$t2-$t3},[r12]!
	veor	$a0,$a0,$t0
	veor	$b0,$b0,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c0,$c0,$t2
	veor	$d0,$d0,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a1,$a1,$t0
	veor	$b1,$b1,$t1
	vld1.8	{$t0-$t1},[r12]!
	veor	$c1,$c1,$t2
	vst1.8	{$a0-$b0},[r14]!
	veor	$d1,$d1,$t3
	vld1.8	{$t2-$t3},[r12]!

	veor	$a2,$a2,$t0
	vst1.8	{$c0-$d0},[r14]!
	veor	$b2,$b2,$t1
	vst1.8	{$a1-$b1},[r14]!
	veor	$c2,$c2,$t2
	vst1.8	{$c1-$d1},[r14]!
	veor	$d2,$d2,$t3
	vst1.8	{$a2-$b2},[r14]!
	vst1.8	{$c2-$d2},[r14]!

	beq	.Ldone_neon

	ldmia	sp,{@t[0]-@t[3]}	@ load key material
	add	@x[0],@x[0],@t[0]	@ accumulate key material
	 add	@t[0],sp,#4*(4)
	add	@x[1],@x[1],@t[1]
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	 add	@t[0],sp,#4*(8)
	add	@x[5],@x[5],@t[1]
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	stmia	sp,{@x[0]-@x[7]}
	 add	@x[0],sp,#4*(16+8)

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	 add	@t[0],sp,#4*(12)
	add	@x[1],@x[1],@t[1]
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
	 ldmia	@t[0],{@t[0]-@t[3]}	@ load key material

	add	@x[4],@x[4],@t[0]	@ accumulate key material
	 add	@t[0],sp,#4*(8)
	add	@x[5],@x[5],@t[1]
	 add	@x[4],@x[4],#3		@ counter+3
	add	@x[6],@x[6],@t[2]
	add	@x[7],@x[7],@t[3]
	 ldr	@t[3],[sp,#4*(32+2)]	@ re-load len
# ifdef	__ARMEB__
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
	stmia	@t[0],{@x[0]-@x[7]}
	 add	@t[2],sp,#4*(0)
	 sub	@t[3],@t[3],#64*3	@ len-=64*3

.Loop_tail_neon:
	ldrb	@t[0],[@t[2]],#1	@ read buffer on stack
	ldrb	@t[1],[r12],#1		@ read input
	subs	@t[3],@t[3],#1
	eor	@t[0],@t[0],@t[1]
	strb	@t[0],[r14],#1		@ store output
	bne	.Loop_tail_neon

.Ldone_neon:
	add	sp,sp,#4*(32+4)
	vldmia	sp,{d8-d15}
	add	sp,sp,#4*(16+3)
	ldmia	sp!,{r4-r11,pc}
.size	ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon
#endif
___
}}}

# Final emit: resolve `...` inline evals and rewrite qN#lo/qN#hi into the
# corresponding dN register names before printing each line.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";