#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2015
#
# ChaCha20 for x86.
#
# Performance in cycles per byte out of large buffer.
#
#		1xIALU/gcc	4xSSSE3
# Pentium	17.5/+80%
# PIII		14.2/+60%
# P4		18.6/+84%
# Core2		9.56/+89%	4.83
# Westmere	9.50/+45%	3.35
# Sandy Bridge	10.5/+47%	3.20
# Haswell	8.15/+50%	2.83
# Skylake	7.53/+22%	2.75
# Silvermont	17.4/+36%	8.35
# Goldmont	13.4/+40%	4.36
# Sledgehammer	10.2/+54%
# Bulldozer	13.4/+50%	4.38(*)
#
# (*)	Bulldozer actually executes 4xXOP code path that delivers 3.55;
#
# Modified from upstream OpenSSL to remove the XOP code.

# Locate the directory this script lives in so that x86asm.pl can be
# found either next to it or in the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file; everything the
# perlasm helpers print to STDOUT is redirected there.  Use the
# three-argument form of open so the file name is never interpreted as a
# mode, and fail immediately if the file cannot be created instead of
# silently generating into a closed handle.  A missing argument is
# rejected explicitly because three-argument open on undef would open an
# anonymous temporary file.
$output=pop;
defined($output) or die "no output file specified";
open STDOUT,">",$output or die "can't open $output: $!";

# First remaining argument is handed to asm_init (defined in x86asm.pl).
&asm_init($ARGV[0]);

$xmm=$ymm=1;
$gasver=999;		# enable everything

# Register names used by the integer code path.  Each of b, c and d has
# an alternate ("_") register; QUARTERROUND swaps the pairs after every
# call so consecutive quarter-rounds alternate between them.
$a="eax";
($b,$b_)=("ebx","ebp");
($c,$c_)=("ecx","esi");
($d,$d_)=("edx","edi");

# QUARTERROUND(ai, bi, ci, di, i)
#
# Emits one scalar ChaCha quarter-round.
#   ai,bi,ci,di	indices (0..15) of the four state words being mixed; state
#		word k lives at 4*k(%esp).
#   i		position (0..7) of this quarter-round within the loop body,
#		used to decide which neighbouring words to spill/reload.
#
# The quarter-round arithmetic is interleaved with stores of the previous
# quarter-round's results and loads of the next quarter-round's inputs.
# The leading "a += b" of each quarter-round is not emitted here: it is
# issued by the previous call (the "elsewhere" add below) or, for i==0, by
# the loop head.  On exit the (b,b_), (c,c_) and (d,d_) register names are
# swapped for the following call.
sub QUARTERROUND {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous

	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14

	# Positions 0, 3, 4 and 7 border a switch between the even and odd
	# index patterns shown above, so the plain +1/-1 neighbours computed
	# above are corrected for those positions.
	if ($i==0) {
            my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==3) {
            my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
	} elsif ($i==4) {
            my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==7) {
            my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
	}

	#&add	($a,$b);			# see elsewhere
	&xor	($d,$a);
	 &mov	(&DWP(4*$cp,"esp"),$c_)		if ($ai>0 && $ai<3);
	&rol	($d,16);
	 &mov	(&DWP(4*$bp,"esp"),$b_)		if ($i!=0);
	&add	($c,$d);
	 &mov	($c_,&DWP(4*$cn,"esp"))		if ($ai>0 && $ai<3);
	&xor	($b,$c);
	 &mov	($d_,&DWP(4*$dn,"esp"))		if ($di!=$dn);
	&rol	($b,12);
	 &mov	($b_,&DWP(4*$bn,"esp"))		if ($i<7);
	 &mov	($b_,&DWP(128,"esp"))		if ($i==7);	# loop counter
	&add	($a,$b);
	&xor	($d,$a);
	&mov	(&DWP(4*$ai,"esp"),$a);
	&rol	($d,8);
	&mov	($a,&DWP(4*$an,"esp"));
	&add	($c,$d);
	&mov	(&DWP(4*$di,"esp"),$d)		if ($di!=$dn);
	&mov	($d_,$d)			if ($di==$dn);
	&xor	($b,$c);
	 &add	($a,$b_)			if ($i<7);	# elsewhere
	&rol	($b,7);

	# Swap primary/alternate register names for the next quarter-round.
	($b,$b_)=($b_,$b);
	($c,$c_)=($c_,$c);
	($d,$d_)=($d_,$d);
}

&static_label("ssse3_data");
&static_label("pic_point");

# ChaCha20_ctr32_nohw: integer-only code path.
#
# Arguments are read through wparam(0..4); per the inline comments they are
# output pointer, input pointer, length, key and counter-and-nonce.
#
# Stack frame as used below (reserved with stack_push(33)):
#   4*k(%esp),    k=0..15	working state for the current block
#   64+4*k(%esp), k=4..15	copy of key (4..11) and counter/nonce (12..15);
#				words 0..3 are the sigma constants, supplied
#				as immediates
#   128(%esp)			round-loop counter
&function_begin("ChaCha20_ctr32_nohw");
	&mov	("esi",&wparam(3));		# key
	&mov	("edi",&wparam(4));		# counter and nonce

	&stack_push(33);

	&mov	("eax",&DWP(4*0,"esi"));	# copy key
	&mov	("ebx",&DWP(4*1,"esi"));
	&mov	("ecx",&DWP(4*2,"esi"));
	&mov	("edx",&DWP(4*3,"esi"));
	&mov	(&DWP(64+4*4,"esp"),"eax");
	&mov	(&DWP(64+4*5,"esp"),"ebx");
	&mov	(&DWP(64+4*6,"esp"),"ecx");
	&mov	(&DWP(64+4*7,"esp"),"edx");
	&mov	("eax",&DWP(4*4,"esi"));
	&mov	("ebx",&DWP(4*5,"esi"));
	&mov	("ecx",&DWP(4*6,"esi"));
	&mov	("edx",&DWP(4*7,"esi"));
	&mov	(&DWP(64+4*8,"esp"),"eax");
	&mov	(&DWP(64+4*9,"esp"),"ebx");
	&mov	(&DWP(64+4*10,"esp"),"ecx");
	&mov	(&DWP(64+4*11,"esp"),"edx");
	&mov	("eax",&DWP(4*0,"edi"));	# copy counter and nonce
	&mov	("ebx",&DWP(4*1,"edi"));
	&mov	("ecx",&DWP(4*2,"edi"));
	&mov	("edx",&DWP(4*3,"edi"));
	# Store the counter word pre-decremented: the "entry" path below adds
	# 1 to it before every block, including the first.
	&sub	("eax",1);
	&mov	(&DWP(64+4*12,"esp"),"eax");
	&mov	(&DWP(64+4*13,"esp"),"ebx");
	&mov	(&DWP(64+4*14,"esp"),"ecx");
	&mov	(&DWP(64+4*15,"esp"),"edx");
	&jmp	(&label("entry"));

&set_label("outer_loop",16);
	# Write the advanced pointers and the remaining length back to the
	# argument slots; all seven registers named by $a..$d_ are reused by
	# the rounds.
	&mov	(&wparam(1),$b);		# save input
	&mov	(&wparam(0),$a);		# save output
	&mov	(&wparam(2),$c);		# save len
&set_label("entry");
	# Sigma constants: word 0 stays in $a, words 1..3 go to the working
	# state.
	&mov	($a,0x61707865);
	&mov	(&DWP(4*1,"esp"),0x3320646e);
	&mov	(&DWP(4*2,"esp"),0x79622d32);
	&mov	(&DWP(4*3,"esp"),0x6b206574);

	&mov	($b, &DWP(64+4*5,"esp"));	# copy key material
	&mov	($b_,&DWP(64+4*6,"esp"));
	&mov	($c, &DWP(64+4*10,"esp"));
	&mov	($c_,&DWP(64+4*11,"esp"));
	&mov	($d, &DWP(64+4*13,"esp"));
	&mov	($d_,&DWP(64+4*14,"esp"));
	&mov	(&DWP(4*5,"esp"),$b);
	&mov	(&DWP(4*6,"esp"),$b_);
	&mov	(&DWP(4*10,"esp"),$c);
	&mov	(&DWP(4*11,"esp"),$c_);
	&mov	(&DWP(4*13,"esp"),$d);
	&mov	(&DWP(4*14,"esp"),$d_);

	# Words 4, 8, 9 and 12 are left in registers ($b_, $c, $c_, $d) as
	# inputs of the first quarter-round; 7 and 15 go to the working state.
	&mov	($b, &DWP(64+4*7,"esp"));
	&mov	($d_,&DWP(64+4*15,"esp"));
	&mov	($d, &DWP(64+4*12,"esp"));
	&mov	($b_,&DWP(64+4*4,"esp"));
	&mov	($c, &DWP(64+4*8,"esp"));
	&mov	($c_,&DWP(64+4*9,"esp"));
	&add	($d,1);				# counter value
	&mov	(&DWP(4*7,"esp"),$b);
	&mov	(&DWP(4*15,"esp"),$d_);
	&mov	(&DWP(64+4*12,"esp"),$d);	# save counter value

	&mov	($b,10);			# loop counter
	&jmp	(&label("loop"));

&set_label("loop",16);
	# Ten iterations of eight quarter-rounds each.  The first "a += b" is
	# issued here because QUARTERROUND expects it to have been done
	# already.
	&add	($a,$b_);			# elsewhere
	&mov	(&DWP(128,"esp"),$b);		# save loop counter
	&mov	($b,$b_);
	&QUARTERROUND(0, 4, 8, 12, 0);
	&QUARTERROUND(1, 5, 9, 13, 1);
	&QUARTERROUND(2, 6,10, 14, 2);
	&QUARTERROUND(3, 7,11, 15, 3);
	&QUARTERROUND(0, 5,10, 15, 4);
	&QUARTERROUND(1, 6,11, 12, 5);
	&QUARTERROUND(2, 7, 8, 13, 6);
	&QUARTERROUND(3, 4, 9, 14, 7);
	&dec	($b);
	&jnz	(&label("loop"));

	&mov	($b,&wparam(2));		# load len

	&add	($a,0x61707865);		# accumulate key material
	&add	($b_,&DWP(64+4*4,"esp"));
	&add	($c, &DWP(64+4*8,"esp"));
	&add	($c_,&DWP(64+4*9,"esp"));

	# Fewer than 64 bytes left: take the byte-wise tail path.
	&cmp	($b,64);
	&jb	(&label("tail"));

	&mov	($b,&wparam(1));		# load input pointer
	&add	($d, &DWP(64+4*12,"esp"));
	&add	($d_,&DWP(64+4*14,"esp"));

	# Full block: words are processed in three batches, limited by the
	# registers left once $a and $b hold the output and input pointers.
	&xor	($a, &DWP(4*0,$b));		# xor with input
	&xor	($b_,&DWP(4*4,$b));
	&mov	(&DWP(4*0,"esp"),$a);		# park word 0; $a is needed as a pointer
	&mov	($a,&wparam(0));		# load output pointer
	&xor	($c, &DWP(4*8,$b));
	&xor	($c_,&DWP(4*9,$b));
	&xor	($d, &DWP(4*12,$b));
	&xor	($d_,&DWP(4*14,$b));
	&mov	(&DWP(4*4,$a),$b_);		# write output
	&mov	(&DWP(4*8,$a),$c);
	&mov	(&DWP(4*9,$a),$c_);
	&mov	(&DWP(4*12,$a),$d);
	&mov	(&DWP(4*14,$a),$d_);

	&mov	($b_,&DWP(4*1,"esp"));
	&mov	($c, &DWP(4*2,"esp"));
	&mov	($c_,&DWP(4*3,"esp"));
	&mov	($d, &DWP(4*5,"esp"));
	&mov	($d_,&DWP(4*6,"esp"));
	&add	($b_,0x3320646e);		# accumulate key material
	&add	($c, 0x79622d32);
	&add	($c_,0x6b206574);
	&add	($d, &DWP(64+4*5,"esp"));
	&add	($d_,&DWP(64+4*6,"esp"));
	&xor	($b_,&DWP(4*1,$b));
	&xor	($c, &DWP(4*2,$b));
	&xor	($c_,&DWP(4*3,$b));
	&xor	($d, &DWP(4*5,$b));
	&xor	($d_,&DWP(4*6,$b));
	&mov	(&DWP(4*1,$a),$b_);
	&mov	(&DWP(4*2,$a),$c);
	&mov	(&DWP(4*3,$a),$c_);
	&mov	(&DWP(4*5,$a),$d);
	&mov	(&DWP(4*6,$a),$d_);

	&mov	($b_,&DWP(4*7,"esp"));
	&mov	($c, &DWP(4*10,"esp"));
	&mov	($c_,&DWP(4*11,"esp"));
	&mov	($d, &DWP(4*13,"esp"));
	&mov	($d_,&DWP(4*15,"esp"));
	&add	($b_,&DWP(64+4*7,"esp"));
	&add	($c, &DWP(64+4*10,"esp"));
	&add	($c_,&DWP(64+4*11,"esp"));
	&add	($d, &DWP(64+4*13,"esp"));
	&add	($d_,&DWP(64+4*15,"esp"));
	&xor	($b_,&DWP(4*7,$b));
	&xor	($c, &DWP(4*10,$b));
	&xor	($c_,&DWP(4*11,$b));
	&xor	($d, &DWP(4*13,$b));
	&xor	($d_,&DWP(4*15,$b));
	&lea	($b,&DWP(4*16,$b));		# advance input pointer by 64
	&mov	(&DWP(4*7,$a),$b_);
	&mov	($b_,&DWP(4*0,"esp"));		# fetch parked word 0
	&mov	(&DWP(4*10,$a),$c);
	&mov	($c,&wparam(2));		# len
	&mov	(&DWP(4*11,$a),$c_);
	&mov	(&DWP(4*13,$a),$d);
	&mov	(&DWP(4*15,$a),$d_);
	&mov	(&DWP(4*0,$a),$b_);
	&lea	($a,&DWP(4*16,$a));		# advance output pointer by 64
	&sub	($c,64);
	&jnz	(&label("outer_loop"));

	&jmp	(&label("done"));

&set_label("tail");
	# Partial final block: finish the key-material additions, leave the
	# complete 64-byte keystream block at 0..63(%esp), then XOR it into
	# the output one byte at a time.  $b still holds the remaining length.
	&add	($d, &DWP(64+4*12,"esp"));
	&add	($d_,&DWP(64+4*14,"esp"));
	&mov	(&DWP(4*0,"esp"),$a);
	&mov	(&DWP(4*4,"esp"),$b_);
	&mov	(&DWP(4*8,"esp"),$c);
	&mov	(&DWP(4*9,"esp"),$c_);
	&mov	(&DWP(4*12,"esp"),$d);
	&mov	(&DWP(4*14,"esp"),$d_);

	&mov	($b_,&DWP(4*1,"esp"));
	&mov	($c, &DWP(4*2,"esp"));
	&mov	($c_,&DWP(4*3,"esp"));
	&mov	($d, &DWP(4*5,"esp"));
	&mov	($d_,&DWP(4*6,"esp"));
	&add	($b_,0x3320646e);		# accumulate key material
	&add	($c, 0x79622d32);
	&add	($c_,0x6b206574);
	&add	($d, &DWP(64+4*5,"esp"));
	&add	($d_,&DWP(64+4*6,"esp"));
	&mov	(&DWP(4*1,"esp"),$b_);
	&mov	(&DWP(4*2,"esp"),$c);
	&mov	(&DWP(4*3,"esp"),$c_);
	&mov	(&DWP(4*5,"esp"),$d);
	&mov	(&DWP(4*6,"esp"),$d_);

	&mov	($b_,&DWP(4*7,"esp"));
	&mov	($c, &DWP(4*10,"esp"));
	&mov	($c_,&DWP(4*11,"esp"));
	&mov	($d, &DWP(4*13,"esp"));
	&mov	($d_,&DWP(4*15,"esp"));
	&add	($b_,&DWP(64+4*7,"esp"));
	&add	($c, &DWP(64+4*10,"esp"));
	&add	($c_,&DWP(64+4*11,"esp"));
	&add	($d, &DWP(64+4*13,"esp"));
	&add	($d_,&DWP(64+4*15,"esp"));
	&mov	(&DWP(4*7,"esp"),$b_);
	&mov	($b_,&wparam(1));		# load input
	&mov	(&DWP(4*10,"esp"),$c);
	&mov	($c,&wparam(0));		# load output
	&mov	(&DWP(4*11,"esp"),$c_);
	&xor	($c_,$c_);			# byte index = 0
	&mov	(&DWP(4*13,"esp"),$d);
	&mov	(&DWP(4*15,"esp"),$d_);

	&xor	("eax","eax");
	&xor	("edx","edx");
&set_label("tail_loop");
	&movb	("al",&BP(0,$c_,$b_));		# input byte
	&movb	("dl",&BP(0,"esp",$c_));	# keystream byte
	&lea	($c_,&DWP(1,$c_));		# index++
	&xor	("al","dl");
	&mov	(&BP(-1,$c,$c_),"al");		# output byte (index already advanced)
	&dec	($b);
	&jnz	(&label("tail_loop"));

&set_label("done");
	&stack_pop(33);
&function_end("ChaCha20_ctr32_nohw");

if ($xmm) {
# Register names for the 4x SSSE3 path: as in the integer path, each of
# a..d has a primary and an alternate ("_") register that
# QUARTERROUND_SSSE3 swaps after every call.
my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
my ($out,$inp,$len)=("edi","esi","ecx");

# QUARTERROUND_SSSE3(ai, bi, ci, di, i)
#
# SSE counterpart of QUARTERROUND.  State row k is kept at 16*k-128(%ebx).
# The 16- and 8-bit rotations use pshufb with the masks at 0(%eax) and
# 16(%eax); the 12- and 7-bit rotations use a shift pair and por.  The
# leading paddd/pxor of each quarter-round are issued by the previous call
# (or by the loop head for i==0); see the "elsewhere" lines.
sub QUARTERROUND_SSSE3 {
my ($ai,$bi,$ci,$di,$i)=@_;
my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous

	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14

	if ($i==0) {
            my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==3) {
            my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
	} elsif ($i==4) {
            my $j=4;
	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
	} elsif ($i==7) {
            my $j=0;
	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
	}

	#&paddd	($xa,$xb);			# see elsewhere
	#&pxor	($xd,$xa);			# see elsewhere
	 &movdqa(&QWP(16*$cp-128,"ebx"),$xc_)	if ($ai>0 && $ai<3);
	&pshufb	($xd,&QWP(0,"eax"));		# rot16
	 &movdqa(&QWP(16*$bp-128,"ebx"),$xb_)	if ($i!=0);
	&paddd	($xc,$xd);
	 &movdqa($xc_,&QWP(16*$cn-128,"ebx"))	if ($ai>0 && $ai<3);
	&pxor	($xb,$xc);
	 &movdqa($xb_,&QWP(16*$bn-128,"ebx"))	if ($i<7);
	&movdqa	($xa_,$xb);			# borrow as temporary
	&pslld	($xb,12);
	&psrld	($xa_,20);
	&por	($xb,$xa_);
	 &movdqa($xa_,&QWP(16*$an-128,"ebx"));
	&paddd	($xa,$xb);
	 &movdqa($xd_,&QWP(16*$dn-128,"ebx"))	if ($di!=$dn);
	&pxor	($xd,$xa);
	 &movdqa	(&QWP(16*$ai-128,"ebx"),$xa);
	&pshufb	($xd,&QWP(16,"eax"));		# rot8
	&paddd	($xc,$xd);
	 &movdqa	(&QWP(16*$di-128,"ebx"),$xd)	if ($di!=$dn);
	 &movdqa	($xd_,$xd)			if ($di==$dn);
	&pxor	($xb,$xc);
	 &paddd	($xa_,$xb_)			if ($i<7);	# elsewhere
	&movdqa	($xa,$xb);			# borrow as temporary
	&pslld	($xb,7);
	&psrld	($xa,25);
	 &pxor	($xd_,$xa_)			if ($i<7);	# elsewhere
	&por	($xb,$xa);

	# Swap primary/alternate register names for the next quarter-round.
	($xa,$xa_)=($xa_,$xa);
	($xb,$xb_)=($xb_,$xb);
	($xc,$xc_)=($xc_,$xc);
	($xd,$xd_)=($xd_,$xd);
}

# ChaCha20_ctr32_ssse3: SSSE3 code path.  Takes the same five arguments as
# ChaCha20_ctr32_nohw.  Inputs of at least 256 bytes go through a path that
# computes four blocks at once; the remainder (or a short input) is
# handled one block at a time by the "1x" path.
&function_begin("ChaCha20_ctr32_ssse3");
	# call/pop yields the run-time address of pic_point in %eax, from
	# which the address of ssse3_data is derived below.
	&call	(&label("pic_point"));
&set_label("pic_point");
	&blindpop("eax");

	&mov	($out,&wparam(0));
	&mov	($inp,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	("edx",&wparam(3));		# key
	&mov	("ebx",&wparam(4));		# counter and nonce

	# Build a 64-byte-aligned frame and remember the caller's %esp at
	# 512(%esp); it is restored from there at "done".
	&mov	("ebp","esp");
	&stack_push	(131);
	&and	("esp",-64);
	&mov	(&DWP(512,"esp"),"ebp");

	&lea	("eax",&DWP(&label("ssse3_data")."-".
			    &label("pic_point"),"eax"));
	&movdqu	("xmm3",&QWP(0,"ebx"));		# counter and nonce

if (defined($gasver) && $gasver>=2.17) {	# even though we encode
						# pshufb manually, we
						# handle only register
						# operands, while this
						# segment uses memory
						# operand...
	&cmp	($len,64*4);
	&jb	(&label("1x"));

	&mov	(&DWP(512+4,"esp"),"edx");	# offload pointers
	&mov	(&DWP(512+8,"esp"),"ebx");
	&sub	($len,64*4);			# bias len
	&lea	("ebp",&DWP(256+128,"esp"));	# size optimization

	# Broadcast every 32-bit word of key/counter/nonce/sigma across a
	# full register and store it as row k at 16*k-128(%ebp), i.e. at
	# 256..511(%esp).
	&movdqu	("xmm7",&QWP(0,"edx"));		# key
	&pshufd	("xmm0","xmm3",0x00);
	&pshufd	("xmm1","xmm3",0x55);
	&pshufd	("xmm2","xmm3",0xaa);
	&pshufd	("xmm3","xmm3",0xff);
	# Lane counters become ctr+{0,1,2,3}, then are pre-biased by -4
	# because outer_loop adds {4,4,4,4} before every batch.
	 &paddd	("xmm0",&QWP(16*3,"eax"));	# fix counters
	&pshufd	("xmm4","xmm7",0x00);
	&pshufd	("xmm5","xmm7",0x55);
	 &psubd	("xmm0",&QWP(16*4,"eax"));
	&pshufd	("xmm6","xmm7",0xaa);
	&pshufd	("xmm7","xmm7",0xff);
	&movdqa	(&QWP(16*12-128,"ebp"),"xmm0");
	&movdqa	(&QWP(16*13-128,"ebp"),"xmm1");
	&movdqa	(&QWP(16*14-128,"ebp"),"xmm2");
	&movdqa	(&QWP(16*15-128,"ebp"),"xmm3");
	 &movdqu	("xmm3",&QWP(16,"edx"));	# key
	&movdqa	(&QWP(16*4-128,"ebp"),"xmm4");
	&movdqa	(&QWP(16*5-128,"ebp"),"xmm5");
	&movdqa	(&QWP(16*6-128,"ebp"),"xmm6");
	&movdqa	(&QWP(16*7-128,"ebp"),"xmm7");
	 &movdqa	("xmm7",&QWP(16*2,"eax"));	# sigma
	 &lea	("ebx",&DWP(128,"esp"));	# size optimization

	&pshufd	("xmm0","xmm3",0x00);
	&pshufd	("xmm1","xmm3",0x55);
	&pshufd	("xmm2","xmm3",0xaa);
	&pshufd	("xmm3","xmm3",0xff);
	&pshufd	("xmm4","xmm7",0x00);
	&pshufd	("xmm5","xmm7",0x55);
	&pshufd	("xmm6","xmm7",0xaa);
	&pshufd	("xmm7","xmm7",0xff);
	&movdqa	(&QWP(16*8-128,"ebp"),"xmm0");
	&movdqa	(&QWP(16*9-128,"ebp"),"xmm1");
	&movdqa	(&QWP(16*10-128,"ebp"),"xmm2");
	&movdqa	(&QWP(16*11-128,"ebp"),"xmm3");
	&movdqa	(&QWP(16*0-128,"ebp"),"xmm4");
	&movdqa	(&QWP(16*1-128,"ebp"),"xmm5");
	&movdqa	(&QWP(16*2-128,"ebp"),"xmm6");
	&movdqa	(&QWP(16*3-128,"ebp"),"xmm7");

	# Both data pointers are biased by +128 here; the bias is removed
	# again before falling through to the 1x path.
	&lea	($inp,&DWP(128,$inp));		# size optimization
	&lea	($out,&DWP(128,$out));		# size optimization
	&jmp	(&label("outer_loop"));

&set_label("outer_loop",16);
	# Copy the key-material rows (%ebp area) into the working state
	# (%ebx area).  Rows 0, 4, 8, 9 and 12 are not copied through
	# memory: they are loaded straight into the registers that the first
	# quarter-round consumes (see the loads after the counter is saved).
	#&movdqa	("xmm0",&QWP(16*0-128,"ebp"));	# copy key material
	&movdqa	("xmm1",&QWP(16*1-128,"ebp"));
	&movdqa	("xmm2",&QWP(16*2-128,"ebp"));
	&movdqa	("xmm3",&QWP(16*3-128,"ebp"));
	#&movdqa	("xmm4",&QWP(16*4-128,"ebp"));
	&movdqa	("xmm5",&QWP(16*5-128,"ebp"));
	&movdqa	("xmm6",&QWP(16*6-128,"ebp"));
	&movdqa	("xmm7",&QWP(16*7-128,"ebp"));
	#&movdqa	(&QWP(16*0-128,"ebx"),"xmm0");
	&movdqa	(&QWP(16*1-128,"ebx"),"xmm1");
	&movdqa	(&QWP(16*2-128,"ebx"),"xmm2");
	&movdqa	(&QWP(16*3-128,"ebx"),"xmm3");
	#&movdqa	(&QWP(16*4-128,"ebx"),"xmm4");
	&movdqa	(&QWP(16*5-128,"ebx"),"xmm5");
	&movdqa	(&QWP(16*6-128,"ebx"),"xmm6");
	&movdqa	(&QWP(16*7-128,"ebx"),"xmm7");
	#&movdqa	("xmm0",&QWP(16*8-128,"ebp"));
	#&movdqa	("xmm1",&QWP(16*9-128,"ebp"));
	&movdqa	("xmm2",&QWP(16*10-128,"ebp"));
	&movdqa	("xmm3",&QWP(16*11-128,"ebp"));
	&movdqa	("xmm4",&QWP(16*12-128,"ebp"));
	&movdqa	("xmm5",&QWP(16*13-128,"ebp"));
	&movdqa	("xmm6",&QWP(16*14-128,"ebp"));
	&movdqa	("xmm7",&QWP(16*15-128,"ebp"));
	&paddd	("xmm4",&QWP(16*4,"eax"));		# counter value
	#&movdqa	(&QWP(16*8-128,"ebx"),"xmm0");
	#&movdqa	(&QWP(16*9-128,"ebx"),"xmm1");
	&movdqa	(&QWP(16*10-128,"ebx"),"xmm2");
	&movdqa	(&QWP(16*11-128,"ebx"),"xmm3");
	&movdqa	(&QWP(16*12-128,"ebx"),"xmm4");
	&movdqa	(&QWP(16*13-128,"ebx"),"xmm5");
	&movdqa	(&QWP(16*14-128,"ebx"),"xmm6");
	&movdqa	(&QWP(16*15-128,"ebx"),"xmm7");
	&movdqa	(&QWP(16*12-128,"ebp"),"xmm4");	# save counter value

	&movdqa	($xa, &QWP(16*0-128,"ebp"));
	&movdqa	($xd, "xmm4");
	&movdqa	($xb_,&QWP(16*4-128,"ebp"));
	&movdqa	($xc, &QWP(16*8-128,"ebp"));
	&movdqa	($xc_,&QWP(16*9-128,"ebp"));

	&mov	("edx",10);			# loop counter
	&nop	();

&set_label("loop",16);
	&paddd	($xa,$xb_);			# elsewhere
	&movdqa	($xb,$xb_);
	&pxor	($xd,$xa);			# elsewhere
	&QUARTERROUND_SSSE3(0, 4, 8, 12, 0);
	&QUARTERROUND_SSSE3(1, 5, 9, 13, 1);
	&QUARTERROUND_SSSE3(2, 6,10, 14, 2);
	&QUARTERROUND_SSSE3(3, 7,11, 15, 3);
	&QUARTERROUND_SSSE3(0, 5,10, 15, 4);
	&QUARTERROUND_SSSE3(1, 6,11, 12, 5);
	&QUARTERROUND_SSSE3(2, 7, 8, 13, 6);
	&QUARTERROUND_SSSE3(3, 4, 9, 14, 7);
	&dec	("edx");
	&jnz	(&label("loop"));

	# Spill the rows still held in registers so the whole working state
	# is in memory (row 0 stays in xmm0, see below).
	&movdqa	(&QWP(16*4-128,"ebx"),$xb_);
	&movdqa	(&QWP(16*8-128,"ebx"),$xc);
	&movdqa	(&QWP(16*9-128,"ebx"),$xc_);
	&movdqa	(&QWP(16*12-128,"ebx"),$xd);
	&movdqa	(&QWP(16*14-128,"ebx"),$xd_);

    my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));

	#&movdqa	($xa0,&QWP(16*0-128,"ebx"));	# it's there
	&movdqa	($xa1,&QWP(16*1-128,"ebx"));
	&movdqa	($xa2,&QWP(16*2-128,"ebx"));
	&movdqa	($xa3,&QWP(16*3-128,"ebx"));

    # Four passes, one per group of four state rows.  Each pass adds the
    # key material, transposes the 4x4 dword matrix held in xa0..xa3, and
    # XORs the result with 16 bytes taken from each of the four 64-byte
    # input blocks.
    for($i=0;$i<256;$i+=64) {
	&paddd		($xa0,&QWP($i+16*0-128,"ebp"));	# accumulate key material
	&paddd		($xa1,&QWP($i+16*1-128,"ebp"));
	&paddd		($xa2,&QWP($i+16*2-128,"ebp"));
	&paddd		($xa3,&QWP($i+16*3-128,"ebp"));

	&movdqa		($xt2,$xa0);		# "de-interlace" data
	&punpckldq	($xa0,$xa1);
	&movdqa		($xt3,$xa2);
	&punpckldq	($xa2,$xa3);
	&punpckhdq	($xt2,$xa1);
	&punpckhdq	($xt3,$xa3);
	&movdqa		($xa1,$xa0);
	&punpcklqdq	($xa0,$xa2);		# "a0"
	&movdqa		($xa3,$xt2);
	&punpcklqdq	($xt2,$xt3);		# "a2"
	&punpckhqdq	($xa1,$xa2);		# "a1"
	&punpckhqdq	($xa3,$xt3);		# "a3"

	#($xa2,$xt2)=($xt2,$xa2);

	&movdqu		($xt0,&QWP(64*0-128,$inp));	# load input
	&movdqu		($xt1,&QWP(64*1-128,$inp));
	&movdqu		($xa2,&QWP(64*2-128,$inp));
	&movdqu		($xt3,&QWP(64*3-128,$inp));
	# Step 16 bytes within the blocks; after the last pass jump to the
	# start of the next group of four blocks.
	&lea		($inp,&QWP($i<192?16:(64*4-16*3),$inp));
	&pxor		($xt0,$xa0);
	&movdqa		($xa0,&QWP($i+16*4-128,"ebx"))	if ($i<192);
	&pxor		($xt1,$xa1);
	&movdqa		($xa1,&QWP($i+16*5-128,"ebx"))	if ($i<192);
	&pxor		($xt2,$xa2);
	&movdqa		($xa2,&QWP($i+16*6-128,"ebx"))	if ($i<192);
	&pxor		($xt3,$xa3);
	&movdqa		($xa3,&QWP($i+16*7-128,"ebx"))	if ($i<192);
	&movdqu		(&QWP(64*0-128,$out),$xt0);	# store output
	&movdqu		(&QWP(64*1-128,$out),$xt1);
	&movdqu		(&QWP(64*2-128,$out),$xt2);
	&movdqu		(&QWP(64*3-128,$out),$xt3);
	&lea		($out,&QWP($i<192?16:(64*4-16*3),$out));
    }
	# len was biased by -256 above, so "no borrow" means at least
	# another 256 bytes remain.
	&sub	($len,64*4);
	&jnc	(&label("outer_loop"));

	&add	($len,64*4);			# undo the bias
	&jz	(&label("done"));

	&mov	("ebx",&DWP(512+8,"esp"));	# restore pointers
	&lea	($inp,&DWP(-128,$inp));
	&mov	("edx",&DWP(512+4,"esp"));
	&lea	($out,&DWP(-128,$out));

	# Rebuild a single counter/nonce vector for the 1x path: lane 0 of the
	# saved counter row plus 4, merged with the original nonce words.
	&movd	("xmm2",&DWP(16*12-128,"ebp"));	# counter value
	&movdqu	("xmm3",&QWP(0,"ebx"));
	&paddd	("xmm2",&QWP(16*6,"eax"));	# +four
	&pand	("xmm3",&QWP(16*7,"eax"));
	&por	("xmm3","xmm2");		# counter value
}
{
# Register names for the 1x path: four state rows, two temporaries and the
# two pshufb rotation masks.
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));

# SSSE3ROUND: emits four quarter-rounds at once on the rows $a..$d, one
# per 32-bit lane.
sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
	&paddd		($a,$b);
	&pxor		($d,$a);
	&pshufb		($d,$rot16);

	&paddd		($c,$d);
	&pxor		($b,$c);
	&movdqa		($t,$b);
	&psrld		($b,20);
	&pslld		($t,12);
	&por		($b,$t);

	&paddd		($a,$b);
	&pxor		($d,$a);
	&pshufb		($d,$rot24);

	&paddd		($c,$d);
	&pxor		($b,$c);
	&movdqa		($t,$b);
	&psrld		($b,25);
	&pslld		($t,7);
	&por		($b,$t);
}

&set_label("1x");
	&movdqa	($a,&QWP(16*2,"eax"));		# sigma
	&movdqu	($b,&QWP(0,"edx"));
	&movdqu	($c,&QWP(16,"edx"));
	#&movdqu	($d,&QWP(0,"ebx"));		# already loaded
	&movdqa	($rot16,&QWP(0,"eax"));
	&movdqa	($rot24,&QWP(16,"eax"));
	# NOTE(review): the dword written here is overwritten by the movdqa
	# to 16*3(%esp) four instructions below; this store looks dead --
	# confirm before removing.
	&mov	(&DWP(16*3,"esp"),"ebp");

	# Keep the initial state at 0..63(%esp) for the final additions.
	&movdqa	(&QWP(16*0,"esp"),$a);
	&movdqa	(&QWP(16*1,"esp"),$b);
	&movdqa	(&QWP(16*2,"esp"),$c);
	&movdqa	(&QWP(16*3,"esp"),$d);
	&mov	("edx",10);
	&jmp	(&label("loop1x"));

&set_label("outer1x",16);
	# Next block: reload the saved state and add 1 to the counter word.
	&movdqa	($d,&QWP(16*5,"eax"));		# one
	&movdqa	($a,&QWP(16*0,"esp"));
	&movdqa	($b,&QWP(16*1,"esp"));
	&movdqa	($c,&QWP(16*2,"esp"));
	&paddd	($d,&QWP(16*3,"esp"));
	&mov	("edx",10);
	&movdqa	(&QWP(16*3,"esp"),$d);
	&jmp	(&label("loop1x"));

&set_label("loop1x",16);
	# Two rounds per iteration.  The pshufd lane rotations after the
	# first round line the rows up for the second; the rotations after the
	# second round undo them.
	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&nop	();

	&SSSE3ROUND();
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);

	&dec	("edx");
	&jnz	(&label("loop1x"));

	# Add the saved initial state.
	&paddd	($a,&QWP(16*0,"esp"));
	&paddd	($b,&QWP(16*1,"esp"));
	&paddd	($c,&QWP(16*2,"esp"));
	&paddd	($d,&QWP(16*3,"esp"));

	&cmp	($len,64);
	&jb	(&label("tail"));

	&movdqu	($t,&QWP(16*0,$inp));
	&movdqu	($t1,&QWP(16*1,$inp));
	&pxor	($a,$t);			# xor with input
	&movdqu	($t,&QWP(16*2,$inp));
	&pxor	($b,$t1);
	&movdqu	($t1,&QWP(16*3,$inp));
	&pxor	($c,$t);
	&pxor	($d,$t1);
	&lea	($inp,&DWP(16*4,$inp));		# inp+=64

	&movdqu	(&QWP(16*0,$out),$a);		# write output
	&movdqu	(&QWP(16*1,$out),$b);
	&movdqu	(&QWP(16*2,$out),$c);
	&movdqu	(&QWP(16*3,$out),$d);
	&lea	($out,&DWP(16*4,$out));		# out+=64

	&sub	($len,64);
	&jnz	(&label("outer1x"));

	&jmp	(&label("done"));

&set_label("tail");
	# Partial final block: store the keystream block at 0..63(%esp) and
	# XOR it into the output byte by byte.  %eax, %edx and %ebp are reused
	# as scratch and index registers from here on.
	&movdqa	(&QWP(16*0,"esp"),$a);
	&movdqa	(&QWP(16*1,"esp"),$b);
	&movdqa	(&QWP(16*2,"esp"),$c);
	&movdqa	(&QWP(16*3,"esp"),$d);

	&xor	("eax","eax");
	&xor	("edx","edx");
	&xor	("ebp","ebp");

&set_label("tail_loop");
	&movb	("al",&BP(0,"esp","ebp"));	# keystream byte
	&movb	("dl",&BP(0,$inp,"ebp"));	# input byte
	&lea	("ebp",&DWP(1,"ebp"));		# index++
	&xor	("al","dl");
	&movb	(&BP(-1,$out,"ebp"),"al");	# output byte (index already advanced)
	&dec	($len);
	&jnz	(&label("tail_loop"));
}
&set_label("done");
	&mov	("esp",&DWP(512,"esp"));	# restore caller's stack pointer
&function_end("ChaCha20_ctr32_ssse3");

# Constant table addressed through %eax, one 16-byte entry per line:
#   16*0  pshufb mask, rotate each dword left by 16
#   16*1  pshufb mask, rotate each dword left by 8
#   16*2  sigma
#   16*3  {0,1,2,3}    per-lane counter offsets for the 4x path
#   16*4  {4,4,4,4}    counter step for the 4x path
#   16*5  {1,0,0,0}    counter step for the 1x path
#   16*6  {4,0,0,0}    counter adjustment when leaving the 4x path
#   16*7  {0,-1,-1,-1} mask that clears the counter word
&align	(64);
&set_label("ssse3_data");
&data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);
&data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);
&data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574);
&data_word(0,1,2,3);
&data_word(4,4,4,4);
&data_word(1,0,0,0);
&data_word(4,0,0,0);
&data_word(0,-1,-1,-1);
&align	(64);
}
&asciz	("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

# Buffered write errors only surface at close, so check it.
close STDOUT or die "error closing STDOUT: $!";