// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <ring-core/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
.text
.extern OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P
.globl gcm_init_clmul
.hidden gcm_init_clmul
.type gcm_init_clmul,@function
.align 16
gcm_init_clmul:
.cfi_startproc

_CET_ENDBR
.L_init_clmul:
	movdqu (%rsi),%xmm2
	pshufd $78,%xmm2,%xmm2


	pshufd $255,%xmm2,%xmm4
	movdqa %xmm2,%xmm3
	psllq $1,%xmm2
	pxor %xmm5,%xmm5
	psrlq $63,%xmm3
	pcmpgtd %xmm4,%xmm5
	pslldq $8,%xmm3
	por %xmm3,%xmm2


	pand .L0x1c2_polynomial(%rip),%xmm5
	pxor %xmm5,%xmm2


	pshufd $78,%xmm2,%xmm6
	movdqa %xmm2,%xmm0
	pxor %xmm2,%xmm6
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm3
	pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,222,0
	pxor %xmm0,%xmm3
	pxor %xmm1,%xmm3

	movdqa %xmm3,%xmm4
	psrldq $8,%xmm3
	pslldq $8,%xmm4
	pxor %xmm3,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	pshufd $78,%xmm2,%xmm3
	pshufd $78,%xmm0,%xmm4
	pxor %xmm2,%xmm3
	movdqu %xmm2,0(%rdi)
	pxor %xmm0,%xmm4
	movdqu %xmm0,16(%rdi)
.byte 102,15,58,15,227,8
	movdqu %xmm4,32(%rdi)
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm3
	pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,222,0
	pxor %xmm0,%xmm3
	pxor %xmm1,%xmm3

	movdqa %xmm3,%xmm4
	psrldq $8,%xmm3
	pslldq $8,%xmm4
	pxor %xmm3,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	movdqa %xmm0,%xmm5
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm3
	pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,222,0
	pxor %xmm0,%xmm3
	pxor %xmm1,%xmm3

	movdqa %xmm3,%xmm4
	psrldq $8,%xmm3
	pslldq $8,%xmm4
	pxor %xmm3,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	pshufd $78,%xmm5,%xmm3
	pshufd $78,%xmm0,%xmm4
	pxor %xmm5,%xmm3
	movdqu %xmm5,48(%rdi)
	pxor %xmm0,%xmm4
	movdqu %xmm0,64(%rdi)
.byte 102,15,58,15,227,8
	movdqu %xmm4,80(%rdi)
	ret
.cfi_endproc

.size gcm_init_clmul,.-gcm_init_clmul
.globl gcm_gmult_clmul
.hidden gcm_gmult_clmul
.type gcm_gmult_clmul,@function
.align 16
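// gcm_gmult_clmul: multiply the 16-byte GHASH state at (%rdi) by the key
// material at (%rsi) and write the product back to (%rdi). Note that the
// .byte runs throughout this file are hand-encoded SSE instructions (for
// example 102,15,58,68,... is pclmulqdq and 102,15,56,0,... is pshufb),
// emitted as raw opcodes by the generator.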
gcm_gmult_clmul:
.cfi_startproc
_CET_ENDBR
.L_gmult_clmul:
	movdqu (%rdi),%xmm0
	movdqa .Lbswap_mask(%rip),%xmm5
	movdqu (%rsi),%xmm2
	movdqu 32(%rsi),%xmm4
.byte 102,15,56,0,197
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm3
	pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,220,0
	pxor %xmm0,%xmm3
	pxor %xmm1,%xmm3

	movdqa %xmm3,%xmm4
	psrldq $8,%xmm3
	pslldq $8,%xmm4
	pxor %xmm3,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
.byte 102,15,56,0,197
	movdqu %xmm0,(%rdi)
	ret
.cfi_endproc
.size gcm_gmult_clmul,.-gcm_gmult_clmul
.globl gcm_ghash_clmul
.hidden gcm_ghash_clmul
.type gcm_ghash_clmul,@function
.align 32
gcm_ghash_clmul:
.cfi_startproc

_CET_ENDBR
.L_ghash_clmul:
	movdqa .Lbswap_mask(%rip),%xmm10

	movdqu (%rdi),%xmm0
	movdqu (%rsi),%xmm2
	movdqu 32(%rsi),%xmm7
.byte 102,65,15,56,0,194

	subq $0x10,%rcx
	jz .Lodd_tail

	movdqu 16(%rsi),%xmm6
	leaq OPENSSL_ia32cap_P(%rip),%rax
	movl 4(%rax),%eax
	cmpq $0x30,%rcx
	jb .Lskip4x

	andl $71303168,%eax
	cmpl $4194304,%eax
	je .Lskip4x

	subq $0x30,%rcx
	movq $0xA040608020C0E000,%rax
	movdqu 48(%rsi),%xmm14
	movdqu 64(%rsi),%xmm15




	movdqu 48(%rdx),%xmm3
	movdqu 32(%rdx),%xmm11
.byte 102,65,15,56,0,218
.byte 102,69,15,56,0,218
	movdqa %xmm3,%xmm5
	pshufd $78,%xmm3,%xmm4
	pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
.byte 102,15,58,68,231,0

	movdqa %xmm11,%xmm13
	pshufd $78,%xmm11,%xmm12
	pxor %xmm11,%xmm12
.byte 102,68,15,58,68,222,0
.byte 102,68,15,58,68,238,17
.byte 102,68,15,58,68,231,16
	xorps %xmm11,%xmm3
	xorps %xmm13,%xmm5
	movups 80(%rsi),%xmm7
	xorps %xmm12,%xmm4

	movdqu 16(%rdx),%xmm11
	movdqu 0(%rdx),%xmm8
.byte 102,69,15,56,0,218
.byte 102,69,15,56,0,194
	movdqa %xmm11,%xmm13
	pshufd $78,%xmm11,%xmm12
	pxor %xmm8,%xmm0
	pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm8
	pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
.byte 102,68,15,58,68,231,0
	xorps %xmm11,%xmm3
	xorps %xmm13,%xmm5

	leaq 64(%rdx),%rdx
	subq $0x40,%rcx
	jc .Ltail4x

	jmp .Lmod4_loop
.align 32
.Lmod4_loop:
.byte 102,65,15,58,68,199,0
	xorps %xmm12,%xmm4
	movdqu 48(%rdx),%xmm11
.byte 102,69,15,56,0,218
.byte 102,65,15,58,68,207,17
	xorps %xmm3,%xmm0
	movdqu 32(%rdx),%xmm3
	movdqa %xmm11,%xmm13
.byte 102,68,15,58,68,199,16
	pshufd $78,%xmm11,%xmm12
	xorps %xmm5,%xmm1
	pxor %xmm11,%xmm12
.byte 102,65,15,56,0,218
	movups 32(%rsi),%xmm7
	xorps %xmm4,%xmm8
.byte 102,68,15,58,68,218,0
	pshufd $78,%xmm3,%xmm4

	pxor %xmm0,%xmm8
	movdqa %xmm3,%xmm5
	pxor %xmm1,%xmm8
	pxor %xmm3,%xmm4
	movdqa %xmm8,%xmm9
.byte 102,68,15,58,68,234,17
	pslldq $8,%xmm8
	psrldq $8,%xmm9
	pxor %xmm8,%xmm0
	movdqa .L7_mask(%rip),%xmm8
	pxor %xmm9,%xmm1
.byte 102,76,15,110,200

	pand %xmm0,%xmm8
.byte 102,69,15,56,0,200
	pxor %xmm0,%xmm9
.byte 102,68,15,58,68,231,0
	psllq $57,%xmm9
	movdqa %xmm9,%xmm8
	pslldq $8,%xmm9
.byte 102,15,58,68,222,0
	psrldq $8,%xmm8
	pxor %xmm9,%xmm0
	pxor %xmm8,%xmm1
	movdqu 0(%rdx),%xmm8

	movdqa %xmm0,%xmm9
	psrlq $1,%xmm0
.byte 102,15,58,68,238,17
	xorps %xmm11,%xmm3
	movdqu 16(%rdx),%xmm11
.byte 102,69,15,56,0,218
.byte 102,15,58,68,231,16
	xorps %xmm13,%xmm5
	movups 80(%rsi),%xmm7
.byte 102,69,15,56,0,194
	pxor %xmm9,%xmm1
	pxor %xmm0,%xmm9
	psrlq $5,%xmm0

	movdqa %xmm11,%xmm13
	pxor %xmm12,%xmm4
	pshufd $78,%xmm11,%xmm12
	pxor %xmm9,%xmm0
	pxor %xmm8,%xmm1
	pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	movdqa %xmm0,%xmm1
.byte 102,69,15,58,68,238,17
	xorps %xmm11,%xmm3
	pshufd $78,%xmm0,%xmm8
	pxor %xmm0,%xmm8

.byte 102,68,15,58,68,231,0
	xorps %xmm13,%xmm5

	leaq 64(%rdx),%rdx
	subq $0x40,%rcx
	jnc .Lmod4_loop

.Ltail4x:
.byte 102,65,15,58,68,199,0
.byte 102,65,15,58,68,207,17
.byte 102,68,15,58,68,199,16
	xorps %xmm12,%xmm4
	xorps %xmm3,%xmm0
	xorps %xmm5,%xmm1
	pxor %xmm0,%xmm1
	pxor %xmm4,%xmm8

	pxor %xmm1,%xmm8
	pxor %xmm0,%xmm1

	movdqa %xmm8,%xmm9
	psrldq $8,%xmm8
	pslldq $8,%xmm9
	pxor %xmm8,%xmm1
	pxor %xmm9,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	addq $0x40,%rcx
	jz .Ldone
	movdqu 32(%rsi),%xmm7
	subq $0x10,%rcx
	jz .Lodd_tail
.Lskip4x:





	movdqu (%rdx),%xmm8
	movdqu 16(%rdx),%xmm3
.byte 102,69,15,56,0,194
.byte 102,65,15,56,0,218
	pxor %xmm8,%xmm0

	movdqa %xmm3,%xmm5
	pshufd $78,%xmm3,%xmm4
	pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
.byte 102,15,58,68,231,0

	leaq 32(%rdx),%rdx
	nop
	subq $0x20,%rcx
	jbe .Leven_tail
	nop
	jmp .Lmod_loop

.align 32
.Lmod_loop:
	movdqa %xmm0,%xmm1
	movdqa %xmm4,%xmm8
	pshufd $78,%xmm0,%xmm4
	pxor %xmm0,%xmm4

.byte 102,15,58,68,198,0
.byte 102,15,58,68,206,17
.byte 102,15,58,68,231,16

	pxor %xmm3,%xmm0
	pxor %xmm5,%xmm1
	movdqu (%rdx),%xmm9
	pxor %xmm0,%xmm8
.byte 102,69,15,56,0,202
	movdqu 16(%rdx),%xmm3

	pxor %xmm1,%xmm8
	pxor %xmm9,%xmm1
	pxor %xmm8,%xmm4
.byte 102,65,15,56,0,218
	movdqa %xmm4,%xmm8
	psrldq $8,%xmm8
	pslldq $8,%xmm4
	pxor %xmm8,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm3,%xmm5

	movdqa %xmm0,%xmm9
	movdqa %xmm0,%xmm8
	psllq $5,%xmm0
	pxor %xmm0,%xmm8
.byte 102,15,58,68,218,0
	psllq $1,%xmm0
	pxor %xmm8,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm8
	pslldq $8,%xmm0
	psrldq $8,%xmm8
	pxor %xmm9,%xmm0
	pshufd $78,%xmm5,%xmm4
	pxor %xmm8,%xmm1
	pxor %xmm5,%xmm4

	movdqa %xmm0,%xmm9
	psrlq $1,%xmm0
.byte 102,15,58,68,234,17
	pxor %xmm9,%xmm1
	pxor %xmm0,%xmm9
	psrlq $5,%xmm0
	pxor %xmm9,%xmm0
	leaq 32(%rdx),%rdx
	psrlq $1,%xmm0
.byte 102,15,58,68,231,0
	pxor %xmm1,%xmm0

	subq $0x20,%rcx
	ja .Lmod_loop

.Leven_tail:
	movdqa %xmm0,%xmm1
	movdqa %xmm4,%xmm8
	pshufd $78,%xmm0,%xmm4
	pxor %xmm0,%xmm4

.byte 102,15,58,68,198,0
.byte 102,15,58,68,206,17
.byte 102,15,58,68,231,16

	pxor %xmm3,%xmm0
	pxor %xmm5,%xmm1
	pxor %xmm0,%xmm8
	pxor %xmm1,%xmm8
	pxor %xmm8,%xmm4
	movdqa %xmm4,%xmm8
	psrldq $8,%xmm8
	pslldq $8,%xmm4
	pxor %xmm8,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	testq %rcx,%rcx
	jnz .Ldone

.Lodd_tail:
	movdqu (%rdx),%xmm8
.byte 102,69,15,56,0,194
	pxor %xmm8,%xmm0
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm3
	pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,223,0
	pxor %xmm0,%xmm3
	pxor %xmm1,%xmm3

	movdqa %xmm3,%xmm4
	psrldq $8,%xmm3
	pslldq $8,%xmm4
	pxor %xmm3,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
.Ldone:
.byte 102,65,15,56,0,194
	movdqu %xmm0,(%rdi)
	ret
.cfi_endproc

.size gcm_ghash_clmul,.-gcm_ghash_clmul
.globl gcm_init_avx
.hidden gcm_init_avx
.type gcm_init_avx,@function
.align 32
gcm_init_avx:
.cfi_startproc
_CET_ENDBR
	vzeroupper

	vmovdqu (%rsi),%xmm2
	vpshufd $78,%xmm2,%xmm2


	vpshufd $255,%xmm2,%xmm4
	vpsrlq $63,%xmm2,%xmm3
	vpsllq $1,%xmm2,%xmm2
	vpxor %xmm5,%xmm5,%xmm5
	vpcmpgtd %xmm4,%xmm5,%xmm5
	vpslldq $8,%xmm3,%xmm3
	vpor %xmm3,%xmm2,%xmm2


	vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
	vpxor %xmm5,%xmm2,%xmm2

	vpunpckhqdq %xmm2,%xmm2,%xmm6
	vmovdqa %xmm2,%xmm0
	vpxor %xmm2,%xmm6,%xmm6
	movq $4,%r10
	jmp .Linit_start_avx
.align 32
.Linit_loop_avx:
	vpalignr $8,%xmm3,%xmm4,%xmm5
	vmovdqu %xmm5,-16(%rdi)
	vpunpckhqdq %xmm0,%xmm0,%xmm3
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
	vpxor %xmm0,%xmm1,%xmm4
	vpxor %xmm4,%xmm3,%xmm3

	vpslldq $8,%xmm3,%xmm4
	vpsrldq $8,%xmm3,%xmm3
	vpxor %xmm4,%xmm0,%xmm0
	vpxor %xmm3,%xmm1,%xmm1
	vpsllq $57,%xmm0,%xmm3
	vpsllq $62,%xmm0,%xmm4
	vpxor %xmm3,%xmm4,%xmm4
	vpsllq $63,%xmm0,%xmm3
	vpxor %xmm3,%xmm4,%xmm4
	vpslldq $8,%xmm4,%xmm3
	vpsrldq $8,%xmm4,%xmm4
	vpxor %xmm3,%xmm0,%xmm0
	vpxor %xmm4,%xmm1,%xmm1

	vpsrlq $1,%xmm0,%xmm4
	vpxor %xmm0,%xmm1,%xmm1
	vpxor %xmm4,%xmm0,%xmm0
	vpsrlq $5,%xmm4,%xmm4
	vpxor %xmm4,%xmm0,%xmm0
	vpsrlq $1,%xmm0,%xmm0
	vpxor %xmm1,%xmm0,%xmm0
.Linit_start_avx:
	vmovdqa %xmm0,%xmm5
	vpunpckhqdq %xmm0,%xmm0,%xmm3
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
	vpxor %xmm0,%xmm1,%xmm4
	vpxor %xmm4,%xmm3,%xmm3

	vpslldq $8,%xmm3,%xmm4
	vpsrldq $8,%xmm3,%xmm3
	vpxor %xmm4,%xmm0,%xmm0
	vpxor %xmm3,%xmm1,%xmm1
	vpsllq $57,%xmm0,%xmm3
	vpsllq $62,%xmm0,%xmm4
	vpxor %xmm3,%xmm4,%xmm4
	vpsllq $63,%xmm0,%xmm3
	vpxor %xmm3,%xmm4,%xmm4
	vpslldq $8,%xmm4,%xmm3
	vpsrldq $8,%xmm4,%xmm4
	vpxor %xmm3,%xmm0,%xmm0
	vpxor %xmm4,%xmm1,%xmm1

	vpsrlq $1,%xmm0,%xmm4
	vpxor %xmm0,%xmm1,%xmm1
	vpxor %xmm4,%xmm0,%xmm0
	vpsrlq $5,%xmm4,%xmm4
	vpxor %xmm4,%xmm0,%xmm0
	vpsrlq $1,%xmm0,%xmm0
	vpxor %xmm1,%xmm0,%xmm0
	vpshufd $78,%xmm5,%xmm3
	vpshufd $78,%xmm0,%xmm4
	vpxor %xmm5,%xmm3,%xmm3
	vmovdqu %xmm5,0(%rdi)
	vpxor %xmm0,%xmm4,%xmm4
	vmovdqu %xmm0,16(%rdi)
	leaq 48(%rdi),%rdi
	subq $1,%r10
	jnz .Linit_loop_avx

	vpalignr $8,%xmm4,%xmm3,%xmm5
	vmovdqu %xmm5,-16(%rdi)

	vzeroupper
	ret

.cfi_endproc
.size gcm_init_avx,.-gcm_init_avx
.globl gcm_ghash_avx
.hidden gcm_ghash_avx
.type gcm_ghash_avx,@function
.align 32
gcm_ghash_avx:
.cfi_startproc
_CET_ENDBR
	vzeroupper

	vmovdqu (%rdi),%xmm10
	leaq .L0x1c2_polynomial(%rip),%r10
	leaq 64(%rsi),%rsi
	vmovdqu .Lbswap_mask(%rip),%xmm13
	vpshufb %xmm13,%xmm10,%xmm10
	cmpq $0x80,%rcx
	jb .Lshort_avx
	subq $0x80,%rcx

	vmovdqu 112(%rdx),%xmm14
	vmovdqu 0-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm14
	vmovdqu 32-64(%rsi),%xmm7

	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vmovdqu 96(%rdx),%xmm15
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpxor %xmm14,%xmm9,%xmm9
	vpshufb %xmm13,%xmm15,%xmm15
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 16-64(%rsi),%xmm6
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vmovdqu 80(%rdx),%xmm14
	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
	vpxor %xmm15,%xmm8,%xmm8

	vpshufb %xmm13,%xmm14,%xmm14
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
	vmovdqu 48-64(%rsi),%xmm6
	vpxor %xmm14,%xmm9,%xmm9
	vmovdqu 64(%rdx),%xmm15
	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
	vmovdqu 80-64(%rsi),%xmm7

	vpshufb %xmm13,%xmm15,%xmm15
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpxor %xmm1,%xmm4,%xmm4
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 64-64(%rsi),%xmm6
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
	vpxor %xmm15,%xmm8,%xmm8

	vmovdqu 48(%rdx),%xmm14
	vpxor %xmm3,%xmm0,%xmm0
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
	vpxor %xmm4,%xmm1,%xmm1
	vpshufb %xmm13,%xmm14,%xmm14
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
	vmovdqu 96-64(%rsi),%xmm6
	vpxor %xmm5,%xmm2,%xmm2
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
	vmovdqu 128-64(%rsi),%xmm7
	vpxor %xmm14,%xmm9,%xmm9

	vmovdqu 32(%rdx),%xmm15
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpxor %xmm1,%xmm4,%xmm4
	vpshufb %xmm13,%xmm15,%xmm15
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 112-64(%rsi),%xmm6
	vpxor %xmm2,%xmm5,%xmm5
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
	vpxor %xmm15,%xmm8,%xmm8

	vmovdqu 16(%rdx),%xmm14
	vpxor %xmm3,%xmm0,%xmm0
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
	vpxor %xmm4,%xmm1,%xmm1
	vpshufb %xmm13,%xmm14,%xmm14
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
	vmovdqu 144-64(%rsi),%xmm6
	vpxor %xmm5,%xmm2,%xmm2
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
	vmovdqu 176-64(%rsi),%xmm7
	vpxor %xmm14,%xmm9,%xmm9

	vmovdqu (%rdx),%xmm15
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpxor %xmm1,%xmm4,%xmm4
	vpshufb %xmm13,%xmm15,%xmm15
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 160-64(%rsi),%xmm6
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2

	leaq 128(%rdx),%rdx
	cmpq $0x80,%rcx
	jb .Ltail_avx

	vpxor %xmm10,%xmm15,%xmm15
	subq $0x80,%rcx
	jmp .Loop8x_avx

.align 32
.Loop8x_avx:
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vmovdqu 112(%rdx),%xmm14
	vpxor %xmm0,%xmm3,%xmm3
	vpxor %xmm15,%xmm8,%xmm8
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
	vpshufb %xmm13,%xmm14,%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
	vmovdqu 0-64(%rsi),%xmm6
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
	vmovdqu 32-64(%rsi),%xmm7
	vpxor %xmm14,%xmm9,%xmm9

	vmovdqu 96(%rdx),%xmm15
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpxor %xmm3,%xmm10,%xmm10
	vpshufb %xmm13,%xmm15,%xmm15
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vxorps %xmm4,%xmm11,%xmm11
	vmovdqu 16-64(%rsi),%xmm6
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
	vpxor %xmm5,%xmm12,%xmm12
	vxorps %xmm15,%xmm8,%xmm8

	vmovdqu 80(%rdx),%xmm14
	vpxor %xmm10,%xmm12,%xmm12
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
	vpxor %xmm11,%xmm12,%xmm12
	vpslldq $8,%xmm12,%xmm9
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
	vpsrldq $8,%xmm12,%xmm12
	vpxor %xmm9,%xmm10,%xmm10
	vmovdqu 48-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm14
	vxorps %xmm12,%xmm11,%xmm11
	vpxor %xmm1,%xmm4,%xmm4
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
	vmovdqu 80-64(%rsi),%xmm7
	vpxor %xmm14,%xmm9,%xmm9
	vpxor %xmm2,%xmm5,%xmm5

	vmovdqu 64(%rdx),%xmm15
	vpalignr $8,%xmm10,%xmm10,%xmm12
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpshufb %xmm13,%xmm15,%xmm15
	vpxor %xmm3,%xmm0,%xmm0
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 64-64(%rsi),%xmm6
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm4,%xmm1,%xmm1
	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
	vxorps %xmm15,%xmm8,%xmm8
	vpxor %xmm5,%xmm2,%xmm2

	vmovdqu 48(%rdx),%xmm14
	vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
	vpshufb %xmm13,%xmm14,%xmm14
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
	vmovdqu 96-64(%rsi),%xmm6
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
	vmovdqu 128-64(%rsi),%xmm7
	vpxor %xmm14,%xmm9,%xmm9
	vpxor %xmm2,%xmm5,%xmm5

	vmovdqu 32(%rdx),%xmm15
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpshufb %xmm13,%xmm15,%xmm15
	vpxor %xmm3,%xmm0,%xmm0
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 112-64(%rsi),%xmm6
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm4,%xmm1,%xmm1
	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
	vpxor %xmm15,%xmm8,%xmm8
	vpxor %xmm5,%xmm2,%xmm2
	vxorps %xmm12,%xmm10,%xmm10

	vmovdqu 16(%rdx),%xmm14
	vpalignr $8,%xmm10,%xmm10,%xmm12
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
	vpshufb %xmm13,%xmm14,%xmm14
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
	vmovdqu 144-64(%rsi),%xmm6
	vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
	vxorps %xmm11,%xmm12,%xmm12
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
	vmovdqu 176-64(%rsi),%xmm7
	vpxor %xmm14,%xmm9,%xmm9
	vpxor %xmm2,%xmm5,%xmm5

	vmovdqu (%rdx),%xmm15
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpshufb %xmm13,%xmm15,%xmm15
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 160-64(%rsi),%xmm6
	vpxor %xmm12,%xmm15,%xmm15
	vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
	vpxor %xmm10,%xmm15,%xmm15

	leaq 128(%rdx),%rdx
	subq $0x80,%rcx
	jnc .Loop8x_avx

	addq $0x80,%rcx
	jmp .Ltail_no_xor_avx

.align 32
.Lshort_avx:
	vmovdqu -16(%rdx,%rcx,1),%xmm14
	leaq (%rdx,%rcx,1),%rdx
	vmovdqu 0-64(%rsi),%xmm6
	vmovdqu 32-64(%rsi),%xmm7
	vpshufb %xmm13,%xmm14,%xmm15

	vmovdqa %xmm0,%xmm3
	vmovdqa %xmm1,%xmm4
	vmovdqa %xmm2,%xmm5
	subq $0x10,%rcx
	jz .Ltail_avx

	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vmovdqu -32(%rdx),%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vmovdqu 16-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm15
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
	vpsrldq $8,%xmm7,%xmm7
	subq $0x10,%rcx
	jz .Ltail_avx

	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vmovdqu -48(%rdx),%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vmovdqu 48-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm15
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
	vmovdqu 80-64(%rsi),%xmm7
	subq $0x10,%rcx
	jz .Ltail_avx

	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vmovdqu -64(%rdx),%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vmovdqu 64-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm15
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
	vpsrldq $8,%xmm7,%xmm7
	subq $0x10,%rcx
	jz .Ltail_avx

	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vmovdqu -80(%rdx),%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vmovdqu 96-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm15
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
	vmovdqu 128-64(%rsi),%xmm7
	subq $0x10,%rcx
	jz .Ltail_avx

	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vmovdqu -96(%rdx),%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vmovdqu 112-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm15
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
	vpsrldq $8,%xmm7,%xmm7
	subq $0x10,%rcx
	jz .Ltail_avx

	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vmovdqu -112(%rdx),%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vmovdqu 144-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm15
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
	vmovq 184-64(%rsi),%xmm7
	subq $0x10,%rcx
	jmp .Ltail_avx

.align 32
.Ltail_avx:
	vpxor %xmm10,%xmm15,%xmm15
.Ltail_no_xor_avx:
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2

	vmovdqu (%r10),%xmm12

	vpxor %xmm0,%xmm3,%xmm10
	vpxor %xmm1,%xmm4,%xmm11
	vpxor %xmm2,%xmm5,%xmm5

	vpxor %xmm10,%xmm5,%xmm5
	vpxor %xmm11,%xmm5,%xmm5
	vpslldq $8,%xmm5,%xmm9
	vpsrldq $8,%xmm5,%xmm5
	vpxor %xmm9,%xmm10,%xmm10
	vpxor %xmm5,%xmm11,%xmm11

	vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
	vpalignr $8,%xmm10,%xmm10,%xmm10
	vpxor %xmm9,%xmm10,%xmm10

	vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
	vpalignr $8,%xmm10,%xmm10,%xmm10
	vpxor %xmm11,%xmm10,%xmm10
	vpxor %xmm9,%xmm10,%xmm10

	cmpq $0,%rcx
	jne .Lshort_avx

	vpshufb %xmm13,%xmm10,%xmm10
	vmovdqu %xmm10,(%rdi)
	vzeroupper
	ret
.cfi_endproc

.size gcm_ghash_avx,.-gcm_ghash_avx
.section .rodata
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
.long 7,0,7,0
.align 64

.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
.text
#endif