// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
.text
.globl gcm_init_clmul
.hidden gcm_init_clmul
.type gcm_init_clmul,@function
.align 16
gcm_init_clmul:
.cfi_startproc

_CET_ENDBR
.L_init_clmul:
	movdqu (%rsi),%xmm2
	pshufd $78,%xmm2,%xmm2


	pshufd $255,%xmm2,%xmm4
	movdqa %xmm2,%xmm3
	psllq $1,%xmm2
	pxor %xmm5,%xmm5
	psrlq $63,%xmm3
	pcmpgtd %xmm4,%xmm5
	pslldq $8,%xmm3
	por %xmm3,%xmm2


	pand .L0x1c2_polynomial(%rip),%xmm5
	pxor %xmm5,%xmm2


	pshufd $78,%xmm2,%xmm6
	movdqa %xmm2,%xmm0
	pxor %xmm2,%xmm6
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm3
	pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,222,0
	pxor %xmm0,%xmm3
	pxor %xmm1,%xmm3

	movdqa %xmm3,%xmm4
	psrldq $8,%xmm3
	pslldq $8,%xmm4
	pxor %xmm3,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	pshufd $78,%xmm2,%xmm3
	pshufd $78,%xmm0,%xmm4
	pxor %xmm2,%xmm3
	movdqu %xmm2,0(%rdi)
	pxor %xmm0,%xmm4
	movdqu %xmm0,16(%rdi)
.byte 102,15,58,15,227,8
	movdqu %xmm4,32(%rdi)
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm3
	pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,222,0
	pxor %xmm0,%xmm3
	pxor %xmm1,%xmm3

	movdqa %xmm3,%xmm4
	psrldq $8,%xmm3
	pslldq $8,%xmm4
	pxor %xmm3,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	movdqa %xmm0,%xmm5
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm3
	pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,222,0
	pxor %xmm0,%xmm3
	pxor %xmm1,%xmm3

	movdqa %xmm3,%xmm4
	psrldq $8,%xmm3
	pslldq $8,%xmm4
	pxor %xmm3,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	pshufd $78,%xmm5,%xmm3
	pshufd $78,%xmm0,%xmm4
	pxor %xmm5,%xmm3
	movdqu %xmm5,48(%rdi)
	pxor %xmm0,%xmm4
	movdqu %xmm0,64(%rdi)
.byte 102,15,58,15,227,8
	movdqu %xmm4,80(%rdi)
	ret
.cfi_endproc

.size gcm_init_clmul,.-gcm_init_clmul
.globl gcm_gmult_clmul
.hidden gcm_gmult_clmul
.type gcm_gmult_clmul,@function
.align 16
gcm_gmult_clmul:
.cfi_startproc
_CET_ENDBR
.L_gmult_clmul:
	movdqu (%rdi),%xmm0
	movdqa .Lbswap_mask(%rip),%xmm5
	movdqu (%rsi),%xmm2
	movdqu 32(%rsi),%xmm4
.byte 102,15,56,0,197
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm3
	pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,220,0
	pxor %xmm0,%xmm3
	pxor %xmm1,%xmm3

	movdqa %xmm3,%xmm4
	psrldq $8,%xmm3
	pslldq $8,%xmm4
	pxor %xmm3,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
.byte 102,15,56,0,197
	movdqu %xmm0,(%rdi)
	ret
.cfi_endproc
.size gcm_gmult_clmul,.-gcm_gmult_clmul
.globl gcm_ghash_clmul
.hidden gcm_ghash_clmul
.type gcm_ghash_clmul,@function
.align 32
gcm_ghash_clmul:
.cfi_startproc

_CET_ENDBR
.L_ghash_clmul:
	movdqa .Lbswap_mask(%rip),%xmm10

	movdqu (%rdi),%xmm0
	movdqu (%rsi),%xmm2
	movdqu 32(%rsi),%xmm7
.byte 102,65,15,56,0,194

	subq $0x10,%rcx
	jz .Lodd_tail

	movdqu 16(%rsi),%xmm6
	cmpq $0x30,%rcx
	jb .Lskip4x

	subq $0x30,%rcx
	movq $0xA040608020C0E000,%rax
	movdqu 48(%rsi),%xmm14
	movdqu 64(%rsi),%xmm15




	movdqu 48(%rdx),%xmm3
	movdqu 32(%rdx),%xmm11
.byte 102,65,15,56,0,218
.byte 102,69,15,56,0,218
	movdqa %xmm3,%xmm5
	pshufd $78,%xmm3,%xmm4
	pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
.byte 102,15,58,68,231,0

	movdqa %xmm11,%xmm13
	pshufd $78,%xmm11,%xmm12
	pxor %xmm11,%xmm12
.byte 102,68,15,58,68,222,0
.byte 102,68,15,58,68,238,17
.byte 102,68,15,58,68,231,16
	xorps %xmm11,%xmm3
	xorps %xmm13,%xmm5
	movups 80(%rsi),%xmm7
	xorps %xmm12,%xmm4

	movdqu 16(%rdx),%xmm11
	movdqu 0(%rdx),%xmm8
.byte 102,69,15,56,0,218
.byte 102,69,15,56,0,194
	movdqa %xmm11,%xmm13
	pshufd $78,%xmm11,%xmm12
	pxor %xmm8,%xmm0
	pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm8
	pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
.byte 102,68,15,58,68,231,0
	xorps %xmm11,%xmm3
	xorps %xmm13,%xmm5

	leaq 64(%rdx),%rdx
	subq $0x40,%rcx
	jc .Ltail4x

	jmp .Lmod4_loop
.align 32
.Lmod4_loop:
.byte 102,65,15,58,68,199,0
	xorps %xmm12,%xmm4
	movdqu 48(%rdx),%xmm11
.byte 102,69,15,56,0,218
.byte 102,65,15,58,68,207,17
	xorps %xmm3,%xmm0
	movdqu 32(%rdx),%xmm3
	movdqa %xmm11,%xmm13
.byte 102,68,15,58,68,199,16
	pshufd $78,%xmm11,%xmm12
	xorps %xmm5,%xmm1
	pxor %xmm11,%xmm12
.byte 102,65,15,56,0,218
	movups 32(%rsi),%xmm7
	xorps %xmm4,%xmm8
.byte 102,68,15,58,68,218,0
	pshufd $78,%xmm3,%xmm4

	pxor %xmm0,%xmm8
	movdqa %xmm3,%xmm5
	pxor %xmm1,%xmm8
	pxor %xmm3,%xmm4
	movdqa %xmm8,%xmm9
.byte 102,68,15,58,68,234,17
	pslldq $8,%xmm8
	psrldq $8,%xmm9
	pxor %xmm8,%xmm0
	movdqa .L7_mask(%rip),%xmm8
	pxor %xmm9,%xmm1
.byte 102,76,15,110,200

	pand %xmm0,%xmm8
.byte 102,69,15,56,0,200
	pxor %xmm0,%xmm9
.byte 102,68,15,58,68,231,0
	psllq $57,%xmm9
	movdqa %xmm9,%xmm8
	pslldq $8,%xmm9
.byte 102,15,58,68,222,0
	psrldq $8,%xmm8
	pxor %xmm9,%xmm0
	pxor %xmm8,%xmm1
	movdqu 0(%rdx),%xmm8

	movdqa %xmm0,%xmm9
	psrlq $1,%xmm0
.byte 102,15,58,68,238,17
	xorps %xmm11,%xmm3
	movdqu 16(%rdx),%xmm11
.byte 102,69,15,56,0,218
.byte 102,15,58,68,231,16
	xorps %xmm13,%xmm5
	movups 80(%rsi),%xmm7
.byte 102,69,15,56,0,194
	pxor %xmm9,%xmm1
	pxor %xmm0,%xmm9
	psrlq $5,%xmm0

	movdqa %xmm11,%xmm13
	pxor %xmm12,%xmm4
	pshufd $78,%xmm11,%xmm12
	pxor %xmm9,%xmm0
	pxor %xmm8,%xmm1
	pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	movdqa %xmm0,%xmm1
.byte 102,69,15,58,68,238,17
	xorps %xmm11,%xmm3
	pshufd $78,%xmm0,%xmm8
	pxor %xmm0,%xmm8

.byte 102,68,15,58,68,231,0
	xorps %xmm13,%xmm5

	leaq 64(%rdx),%rdx
	subq $0x40,%rcx
	jnc .Lmod4_loop

.Ltail4x:
.byte 102,65,15,58,68,199,0
.byte 102,65,15,58,68,207,17
.byte 102,68,15,58,68,199,16
	xorps %xmm12,%xmm4
	xorps %xmm3,%xmm0
	xorps %xmm5,%xmm1
	pxor %xmm0,%xmm1
	pxor %xmm4,%xmm8

	pxor %xmm1,%xmm8
	pxor %xmm0,%xmm1

	movdqa %xmm8,%xmm9
	psrldq $8,%xmm8
	pslldq $8,%xmm9
	pxor %xmm8,%xmm1
	pxor %xmm9,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	addq $0x40,%rcx
	jz .Ldone
	movdqu 32(%rsi),%xmm7
	subq $0x10,%rcx
	jz .Lodd_tail
.Lskip4x:





	movdqu (%rdx),%xmm8
	movdqu 16(%rdx),%xmm3
.byte 102,69,15,56,0,194
.byte 102,65,15,56,0,218
	pxor %xmm8,%xmm0

	movdqa %xmm3,%xmm5
	pshufd $78,%xmm3,%xmm4
	pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
.byte 102,15,58,68,231,0

	leaq 32(%rdx),%rdx
	nop
	subq $0x20,%rcx
	jbe .Leven_tail
	nop
	jmp .Lmod_loop

.align 32
.Lmod_loop:
	movdqa %xmm0,%xmm1
	movdqa %xmm4,%xmm8
	pshufd $78,%xmm0,%xmm4
	pxor %xmm0,%xmm4

.byte 102,15,58,68,198,0
.byte 102,15,58,68,206,17
.byte 102,15,58,68,231,16

	pxor %xmm3,%xmm0
	pxor %xmm5,%xmm1
	movdqu (%rdx),%xmm9
	pxor %xmm0,%xmm8
.byte 102,69,15,56,0,202
	movdqu 16(%rdx),%xmm3

	pxor %xmm1,%xmm8
	pxor %xmm9,%xmm1
	pxor %xmm8,%xmm4
.byte 102,65,15,56,0,218
	movdqa %xmm4,%xmm8
	psrldq $8,%xmm8
	pslldq $8,%xmm4
	pxor %xmm8,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm3,%xmm5

	movdqa %xmm0,%xmm9
	movdqa %xmm0,%xmm8
	psllq $5,%xmm0
	pxor %xmm0,%xmm8
.byte 102,15,58,68,218,0
	psllq $1,%xmm0
	pxor %xmm8,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm8
	pslldq $8,%xmm0
	psrldq $8,%xmm8
	pxor %xmm9,%xmm0
	pshufd $78,%xmm5,%xmm4
	pxor %xmm8,%xmm1
	pxor %xmm5,%xmm4

	movdqa %xmm0,%xmm9
	psrlq $1,%xmm0
.byte 102,15,58,68,234,17
	pxor %xmm9,%xmm1
	pxor %xmm0,%xmm9
	psrlq $5,%xmm0
	pxor %xmm9,%xmm0
	leaq 32(%rdx),%rdx
	psrlq $1,%xmm0
.byte 102,15,58,68,231,0
	pxor %xmm1,%xmm0

	subq $0x20,%rcx
	ja .Lmod_loop

.Leven_tail:
	movdqa %xmm0,%xmm1
	movdqa %xmm4,%xmm8
	pshufd $78,%xmm0,%xmm4
	pxor %xmm0,%xmm4

.byte 102,15,58,68,198,0
.byte 102,15,58,68,206,17
.byte 102,15,58,68,231,16

	pxor %xmm3,%xmm0
	pxor %xmm5,%xmm1
	pxor %xmm0,%xmm8
	pxor %xmm1,%xmm8
	pxor %xmm8,%xmm4
	movdqa %xmm4,%xmm8
	psrldq $8,%xmm8
	pslldq $8,%xmm4
	pxor %xmm8,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	testq %rcx,%rcx
	jnz .Ldone

.Lodd_tail:
	movdqu (%rdx),%xmm8
.byte 102,69,15,56,0,194
	pxor %xmm8,%xmm0
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm3
	pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,223,0
	pxor %xmm0,%xmm3
	pxor %xmm1,%xmm3

	movdqa %xmm3,%xmm4
	psrldq $8,%xmm3
	pslldq $8,%xmm4
	pxor %xmm3,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
.Ldone:
.byte 102,65,15,56,0,194
	movdqu %xmm0,(%rdi)
	ret
.cfi_endproc

.size gcm_ghash_clmul,.-gcm_ghash_clmul
.globl gcm_init_avx
.hidden gcm_init_avx
.type gcm_init_avx,@function
.align 32
gcm_init_avx:
.cfi_startproc

_CET_ENDBR
	vzeroupper

	vmovdqu (%rsi),%xmm2
	vpshufd $78,%xmm2,%xmm2


	vpshufd $255,%xmm2,%xmm4
	vpsrlq $63,%xmm2,%xmm3
	vpsllq $1,%xmm2,%xmm2
	vpxor %xmm5,%xmm5,%xmm5
	vpcmpgtd %xmm4,%xmm5,%xmm5
	vpslldq $8,%xmm3,%xmm3
	vpor %xmm3,%xmm2,%xmm2


	vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
	vpxor %xmm5,%xmm2,%xmm2

	vpunpckhqdq %xmm2,%xmm2,%xmm6
	vmovdqa %xmm2,%xmm0
	vpxor %xmm2,%xmm6,%xmm6
	movq $4,%r10
	jmp .Linit_start_avx
.align 32
.Linit_loop_avx:
	vpalignr $8,%xmm3,%xmm4,%xmm5
	vmovdqu %xmm5,-16(%rdi)
	vpunpckhqdq %xmm0,%xmm0,%xmm3
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
	vpxor %xmm0,%xmm1,%xmm4
	vpxor %xmm4,%xmm3,%xmm3

	vpslldq $8,%xmm3,%xmm4
	vpsrldq $8,%xmm3,%xmm3
	vpxor %xmm4,%xmm0,%xmm0
	vpxor %xmm3,%xmm1,%xmm1
	vpsllq $57,%xmm0,%xmm3
	vpsllq $62,%xmm0,%xmm4
	vpxor %xmm3,%xmm4,%xmm4
	vpsllq $63,%xmm0,%xmm3
	vpxor %xmm3,%xmm4,%xmm4
	vpslldq $8,%xmm4,%xmm3
	vpsrldq $8,%xmm4,%xmm4
	vpxor %xmm3,%xmm0,%xmm0
	vpxor %xmm4,%xmm1,%xmm1

	vpsrlq $1,%xmm0,%xmm4
	vpxor %xmm0,%xmm1,%xmm1
	vpxor %xmm4,%xmm0,%xmm0
	vpsrlq $5,%xmm4,%xmm4
	vpxor %xmm4,%xmm0,%xmm0
	vpsrlq $1,%xmm0,%xmm0
	vpxor %xmm1,%xmm0,%xmm0
.Linit_start_avx:
	vmovdqa %xmm0,%xmm5
	vpunpckhqdq %xmm0,%xmm0,%xmm3
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
	vpxor %xmm0,%xmm1,%xmm4
	vpxor %xmm4,%xmm3,%xmm3

	vpslldq $8,%xmm3,%xmm4
	vpsrldq $8,%xmm3,%xmm3
	vpxor %xmm4,%xmm0,%xmm0
	vpxor %xmm3,%xmm1,%xmm1
	vpsllq $57,%xmm0,%xmm3
	vpsllq $62,%xmm0,%xmm4
	vpxor %xmm3,%xmm4,%xmm4
	vpsllq $63,%xmm0,%xmm3
	vpxor %xmm3,%xmm4,%xmm4
	vpslldq $8,%xmm4,%xmm3
	vpsrldq $8,%xmm4,%xmm4
	vpxor %xmm3,%xmm0,%xmm0
	vpxor %xmm4,%xmm1,%xmm1

	vpsrlq $1,%xmm0,%xmm4
	vpxor %xmm0,%xmm1,%xmm1
	vpxor %xmm4,%xmm0,%xmm0
	vpsrlq $5,%xmm4,%xmm4
	vpxor %xmm4,%xmm0,%xmm0
	vpsrlq $1,%xmm0,%xmm0
	vpxor %xmm1,%xmm0,%xmm0
	vpshufd $78,%xmm5,%xmm3
	vpshufd $78,%xmm0,%xmm4
	vpxor %xmm5,%xmm3,%xmm3
	vmovdqu %xmm5,0(%rdi)
	vpxor %xmm0,%xmm4,%xmm4
	vmovdqu %xmm0,16(%rdi)
	leaq 48(%rdi),%rdi
	subq $1,%r10
	jnz .Linit_loop_avx

	vpalignr $8,%xmm4,%xmm3,%xmm5
	vmovdqu %xmm5,-16(%rdi)

	vzeroupper
	ret

.cfi_endproc
.size gcm_init_avx,.-gcm_init_avx
.globl gcm_gmult_avx
.hidden gcm_gmult_avx
.type gcm_gmult_avx,@function
.align 32
gcm_gmult_avx:
.cfi_startproc
_CET_ENDBR
	jmp .L_gmult_clmul
.cfi_endproc
.size gcm_gmult_avx,.-gcm_gmult_avx
.globl gcm_ghash_avx
.hidden gcm_ghash_avx
.type gcm_ghash_avx,@function
.align 32
gcm_ghash_avx:
.cfi_startproc

_CET_ENDBR
	vzeroupper

	vmovdqu (%rdi),%xmm10
	leaq .L0x1c2_polynomial(%rip),%r10
	leaq 64(%rsi),%rsi
	vmovdqu .Lbswap_mask(%rip),%xmm13
	vpshufb %xmm13,%xmm10,%xmm10
	cmpq $0x80,%rcx
	jb .Lshort_avx
	subq $0x80,%rcx

	vmovdqu 112(%rdx),%xmm14
	vmovdqu 0-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm14
	vmovdqu 32-64(%rsi),%xmm7

	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vmovdqu 96(%rdx),%xmm15
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpxor %xmm14,%xmm9,%xmm9
	vpshufb %xmm13,%xmm15,%xmm15
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 16-64(%rsi),%xmm6
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vmovdqu 80(%rdx),%xmm14
	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
	vpxor %xmm15,%xmm8,%xmm8

	vpshufb %xmm13,%xmm14,%xmm14
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
	vmovdqu 48-64(%rsi),%xmm6
	vpxor %xmm14,%xmm9,%xmm9
	vmovdqu 64(%rdx),%xmm15
	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
	vmovdqu 80-64(%rsi),%xmm7

	vpshufb %xmm13,%xmm15,%xmm15
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpxor %xmm1,%xmm4,%xmm4
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 64-64(%rsi),%xmm6
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
	vpxor %xmm15,%xmm8,%xmm8

	vmovdqu 48(%rdx),%xmm14
	vpxor %xmm3,%xmm0,%xmm0
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
	vpxor %xmm4,%xmm1,%xmm1
	vpshufb %xmm13,%xmm14,%xmm14
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
	vmovdqu 96-64(%rsi),%xmm6
	vpxor %xmm5,%xmm2,%xmm2
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
	vmovdqu 128-64(%rsi),%xmm7
	vpxor %xmm14,%xmm9,%xmm9

	vmovdqu 32(%rdx),%xmm15
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpxor %xmm1,%xmm4,%xmm4
	vpshufb %xmm13,%xmm15,%xmm15
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 112-64(%rsi),%xmm6
	vpxor %xmm2,%xmm5,%xmm5
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
	vpxor %xmm15,%xmm8,%xmm8

	vmovdqu 16(%rdx),%xmm14
	vpxor %xmm3,%xmm0,%xmm0
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
	vpxor %xmm4,%xmm1,%xmm1
	vpshufb %xmm13,%xmm14,%xmm14
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
	vmovdqu 144-64(%rsi),%xmm6
	vpxor %xmm5,%xmm2,%xmm2
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
	vmovdqu 176-64(%rsi),%xmm7
	vpxor %xmm14,%xmm9,%xmm9

	vmovdqu (%rdx),%xmm15
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpxor %xmm1,%xmm4,%xmm4
	vpshufb %xmm13,%xmm15,%xmm15
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 160-64(%rsi),%xmm6
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2

	leaq 128(%rdx),%rdx
	cmpq $0x80,%rcx
	jb .Ltail_avx

	vpxor %xmm10,%xmm15,%xmm15
	subq $0x80,%rcx
	jmp .Loop8x_avx

.align 32
.Loop8x_avx:
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vmovdqu 112(%rdx),%xmm14
	vpxor %xmm0,%xmm3,%xmm3
	vpxor %xmm15,%xmm8,%xmm8
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
	vpshufb %xmm13,%xmm14,%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
	vmovdqu 0-64(%rsi),%xmm6
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
	vmovdqu 32-64(%rsi),%xmm7
	vpxor %xmm14,%xmm9,%xmm9

	vmovdqu 96(%rdx),%xmm15
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpxor %xmm3,%xmm10,%xmm10
	vpshufb %xmm13,%xmm15,%xmm15
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vxorps %xmm4,%xmm11,%xmm11
	vmovdqu 16-64(%rsi),%xmm6
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
	vpxor %xmm5,%xmm12,%xmm12
	vxorps %xmm15,%xmm8,%xmm8

	vmovdqu 80(%rdx),%xmm14
	vpxor %xmm10,%xmm12,%xmm12
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
	vpxor %xmm11,%xmm12,%xmm12
	vpslldq $8,%xmm12,%xmm9
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
	vpsrldq $8,%xmm12,%xmm12
	vpxor %xmm9,%xmm10,%xmm10
	vmovdqu 48-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm14
	vxorps %xmm12,%xmm11,%xmm11
	vpxor %xmm1,%xmm4,%xmm4
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
	vmovdqu 80-64(%rsi),%xmm7
	vpxor %xmm14,%xmm9,%xmm9
	vpxor %xmm2,%xmm5,%xmm5

	vmovdqu 64(%rdx),%xmm15
	vpalignr $8,%xmm10,%xmm10,%xmm12
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpshufb %xmm13,%xmm15,%xmm15
	vpxor %xmm3,%xmm0,%xmm0
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 64-64(%rsi),%xmm6
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm4,%xmm1,%xmm1
	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
	vxorps %xmm15,%xmm8,%xmm8
	vpxor %xmm5,%xmm2,%xmm2

	vmovdqu 48(%rdx),%xmm14
	vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
	vpshufb %xmm13,%xmm14,%xmm14
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
	vmovdqu 96-64(%rsi),%xmm6
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
	vmovdqu 128-64(%rsi),%xmm7
	vpxor %xmm14,%xmm9,%xmm9
	vpxor %xmm2,%xmm5,%xmm5

	vmovdqu 32(%rdx),%xmm15
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpshufb %xmm13,%xmm15,%xmm15
	vpxor %xmm3,%xmm0,%xmm0
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 112-64(%rsi),%xmm6
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm4,%xmm1,%xmm1
	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
	vpxor %xmm15,%xmm8,%xmm8
	vpxor %xmm5,%xmm2,%xmm2
	vxorps %xmm12,%xmm10,%xmm10

	vmovdqu 16(%rdx),%xmm14
	vpalignr $8,%xmm10,%xmm10,%xmm12
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
	vpshufb %xmm13,%xmm14,%xmm14
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
	vmovdqu 144-64(%rsi),%xmm6
	vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
	vxorps %xmm11,%xmm12,%xmm12
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
	vmovdqu 176-64(%rsi),%xmm7
	vpxor %xmm14,%xmm9,%xmm9
	vpxor %xmm2,%xmm5,%xmm5

	vmovdqu (%rdx),%xmm15
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpshufb %xmm13,%xmm15,%xmm15
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 160-64(%rsi),%xmm6
	vpxor %xmm12,%xmm15,%xmm15
	vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
	vpxor %xmm10,%xmm15,%xmm15

	leaq 128(%rdx),%rdx
	subq $0x80,%rcx
	jnc .Loop8x_avx

	addq $0x80,%rcx
	jmp .Ltail_no_xor_avx

.align 32
.Lshort_avx:
	vmovdqu -16(%rdx,%rcx,1),%xmm14
	leaq (%rdx,%rcx,1),%rdx
	vmovdqu 0-64(%rsi),%xmm6
	vmovdqu 32-64(%rsi),%xmm7
	vpshufb %xmm13,%xmm14,%xmm15

	vmovdqa %xmm0,%xmm3
	vmovdqa %xmm1,%xmm4
	vmovdqa %xmm2,%xmm5
	subq $0x10,%rcx
	jz .Ltail_avx

	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vmovdqu -32(%rdx),%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vmovdqu 16-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm15
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
	vpsrldq $8,%xmm7,%xmm7
	subq $0x10,%rcx
	jz .Ltail_avx

	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vmovdqu -48(%rdx),%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vmovdqu 48-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm15
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
	vmovdqu 80-64(%rsi),%xmm7
	subq $0x10,%rcx
	jz .Ltail_avx

	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vmovdqu -64(%rdx),%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vmovdqu 64-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm15
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
	vpsrldq $8,%xmm7,%xmm7
	subq $0x10,%rcx
	jz .Ltail_avx

	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vmovdqu -80(%rdx),%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vmovdqu 96-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm15
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
	vmovdqu 128-64(%rsi),%xmm7
	subq $0x10,%rcx
	jz .Ltail_avx

	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vmovdqu -96(%rdx),%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vmovdqu 112-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm15
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
	vpsrldq $8,%xmm7,%xmm7
	subq $0x10,%rcx
	jz .Ltail_avx

	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vmovdqu -112(%rdx),%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vmovdqu 144-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm15
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
	vmovq 184-64(%rsi),%xmm7
	subq $0x10,%rcx
	jmp .Ltail_avx

.align 32
.Ltail_avx:
	vpxor %xmm10,%xmm15,%xmm15
.Ltail_no_xor_avx:
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2

	vmovdqu (%r10),%xmm12

	vpxor %xmm0,%xmm3,%xmm10
	vpxor %xmm1,%xmm4,%xmm11
	vpxor %xmm2,%xmm5,%xmm5

	vpxor %xmm10,%xmm5,%xmm5
	vpxor %xmm11,%xmm5,%xmm5
	vpslldq $8,%xmm5,%xmm9
	vpsrldq $8,%xmm5,%xmm5
	vpxor %xmm9,%xmm10,%xmm10
	vpxor %xmm5,%xmm11,%xmm11

	vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
	vpalignr $8,%xmm10,%xmm10,%xmm10
	vpxor %xmm9,%xmm10,%xmm10

	vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
	vpalignr $8,%xmm10,%xmm10,%xmm10
	vpxor %xmm11,%xmm10,%xmm10
	vpxor %xmm9,%xmm10,%xmm10

	cmpq $0,%rcx
	jne .Lshort_avx

	vpshufb %xmm13,%xmm10,%xmm10
	vmovdqu %xmm10,(%rdi)
	vzeroupper
	ret
.cfi_endproc

.size gcm_ghash_avx,.-gcm_ghash_avx
.section .rodata
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
.long 7,0,7,0
.align 64

.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
.text
#endif