// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI
// The implementation uses some optimization as described in:
// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
//     Instruction and its Usage for Computing the GCM Mode rev. 2.02
// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
//     Hardware

#include "textflag.h"

// B0-B7: the (up to) eight 16-byte blocks processed in parallel.
#define B0 X0
#define B1 X1
#define B2 X2
#define B3 X3
#define B4 X4
#define B5 X5
#define B6 X6
#define B7 X7

// Karatsuba accumulators: ACC0 collects the low*low products, ACC1 the
// high*high products and ACCM the products of the XOR-folded halves.
#define ACC0 X8
#define ACC1 X9
#define ACCM X10

// T0-T2 are scratch. POLY and BSWAP are loaded from gcmPoly<> and
// bswapMask<> at the start of every routine.
#define T0 X11
#define T1 X12
#define T2 X13
#define POLY X14
#define BSWAP X15

// PSHUFB control that reverses the order of all 16 bytes of a register.
DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607

// Constant multiplied in by the two-step reduction after each product.
DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000

// Fifteen 16-byte masks. The entry at offset 16*(n-1) has its n lowest
// bytes set to 0xff and the remaining bytes clear, for n = 1..15.
DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA andMask<>+0x08(SB)/8, $0x0000000000000000
DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA andMask<>+0x18(SB)/8, $0x0000000000000000
DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA andMask<>+0x28(SB)/8, $0x0000000000000000
DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA andMask<>+0x38(SB)/8, $0x0000000000000000
DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA andMask<>+0x48(SB)/8, $0x0000000000000000
DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0x58(SB)/8, $0x0000000000000000
DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA andMask<>+0x68(SB)/8, $0x0000000000000000
DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x78(SB)/8, $0x0000000000000000
DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff

GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
GLOBL andMask<>(SB), (NOPTR+RODATA), $240

// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
//
// Completes the tag held in T, in place:
//   1. builds a block whose low quadword is pLen*8 and whose high quadword
//      is dLen*8 (the lengths shifted left by 3) and XORs it into *T,
//   2. multiplies the result by the table entry at 16*14 (with its
//      companion entry at 16*15) and reduces it,
//   3. byte-reverses the product, XORs it with *tagMask and stores it to *T.
TEXT ·gcmAesFinish(SB),NOSPLIT,$0
#define pTbl DI
#define tMsk SI
#define tPtr DX
#define plen AX
#define dlen CX

	MOVQ productTable+0(FP), pTbl
	MOVQ tagMask+8(FP), tMsk
	MOVQ T+16(FP), tPtr
	MOVQ pLen+24(FP), plen
	MOVQ dLen+32(FP), dlen

	MOVOU (tPtr), ACC0
	MOVOU (tMsk), T2

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	// Scale both lengths by 8.
	SHLQ $3, plen
	SHLQ $3, dlen

	// B0 = dlen:plen (high:low quadword), then fold in the running value.
	MOVQ plen, B0
	PINSRQ $1, dlen, B0

	PXOR ACC0, B0

	MOVOU (16*14)(pTbl), ACC0
	MOVOU (16*15)(pTbl), ACCM
	MOVOU ACC0, ACC1

	// Three carry-less multiplies: low*low, high*high, and the product of
	// the XOR-folded halves (PSHUFD $78 swaps the two quadwords).
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	// Recover the middle term and split it across the low and high halves.
	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	// First reduction step.
	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0

	// Second reduction step.
	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0

	PXOR ACC1, ACC0

	// Byte-reverse, apply the tag mask and write the result back to *T.
	PSHUFB BSWAP, ACC0
	PXOR T2, ACC0
	MOVOU ACC0, (tPtr)

	RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

// func gcmAesInit(productTable *[256]byte, ks []uint32)
//
// Derives the hash key H by running the AES rounds over round key 0 (the
// all-zero block XORed with that key), then fills productTable with eight
// pairs of 16-byte entries. The first pair written, at 16*14 and 16*15, is
// the byte-reversed, doubled H and the XOR of its two halves. Each of the
// seven loop iterations multiplies the previous value by H and stores the
// new pair 32 bytes lower, ending at 16*0 and 16*1.
TEXT ·gcmAesInit(SB),NOSPLIT,$0
#define dst DI
#define KS SI
#define NR DX

	MOVQ productTable+0(FP), dst
	MOVQ ks_base+8(FP), KS
	MOVQ ks_len+16(FP), NR

	// NR = len(ks)/4 - 1
	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	// The state starts out as round key 0, i.e. a zero block already
	// XORed with that key.
	MOVOU (16*0)(KS), B0
	MOVOU (16*1)(KS), T0
	AESENC T0, B0
	MOVOU (16*2)(KS), T0
	AESENC T0, B0
	MOVOU (16*3)(KS), T0
	AESENC T0, B0
	MOVOU (16*4)(KS), T0
	AESENC T0, B0
	MOVOU (16*5)(KS), T0
	AESENC T0, B0
	MOVOU (16*6)(KS), T0
	AESENC T0, B0
	MOVOU (16*7)(KS), T0
	AESENC T0, B0
	MOVOU (16*8)(KS), T0
	AESENC T0, B0
	MOVOU (16*9)(KS), T0
	AESENC T0, B0
	MOVOU (16*10)(KS), T0
	// NR < 12: key 10 is the last one. NR == 12: key 12 is the last one.
	// Otherwise key 14. The JE below reuses the flags of this CMPQ.
	CMPQ NR, $12
	JB initEncLast
	AESENC T0, B0
	MOVOU (16*11)(KS), T0
	AESENC T0, B0
	MOVOU (16*12)(KS), T0
	JE initEncLast
	AESENC T0, B0
	MOVOU (16*13)(KS), T0
	AESENC T0, B0
	MOVOU (16*14)(KS), T0
initEncLast:
	AESENCLAST T0, B0

	PSHUFB BSWAP, B0
	// Double H: shift the 128-bit value left by one bit, carrying between
	// the 32-bit lanes, and XOR in POLY when the top bit was set.
	PSHUFD $0xff, B0, T0
	MOVOU B0, T1
	PSRAL $31, T0
	PAND POLY, T0
	PSRLL $31, T1
	PSLLDQ $4, T1
	PSLLL $1, B0
	PXOR T0, B0
	PXOR T1, B0
	// Store the first pair: the value itself and the XOR of its halves.
	MOVOU B0, (16*14)(dst)
	PSHUFD $78, B0, B1
	PXOR B0, B1
	MOVOU B1, (16*15)(dst)

	MOVOU B0, B2
	MOVOU B1, B3
	// Seven more pairs follow, each the previous value times B0.
	MOVQ $7, AX

initLoop:
	MOVOU B2, T0
	MOVOU B2, T1
	MOVOU B3, T2
	PCLMULQDQ $0x00, B0, T0
	PCLMULQDQ $0x11, B0, T1
	PCLMULQDQ $0x00, B1, T2

	PXOR T0, T2
	PXOR T1, T2
	MOVOU T2, B4
	PSLLDQ $8, B4
	PSRLDQ $8, T2
	PXOR B4, T0
	PXOR T2, T1

	MOVOU POLY, B2
	PCLMULQDQ $0x01, T0, B2
	PSHUFD $78, T0, T0
	PXOR B2, T0
	MOVOU POLY, B2
	PCLMULQDQ $0x01, T0, B2
	PSHUFD $78, T0, T0
	PXOR T0, B2
	PXOR T1, B2

	MOVOU B2, (16*12)(dst)
	PSHUFD $78, B2, B3
	PXOR B2, B3
	MOVOU B3, (16*13)(dst)

	// LEAQ leaves the flags alone, so JNE still tests the DECQ result.
	DECQ AX
	LEAQ (-16*2)(dst), dst
	JNE initLoop

	RET
#undef NR
#undef KS
#undef dst

// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
//
// Hashes data starting from a zero accumulator and stores the 16-byte
// result to *T; the previous contents of *T are not read. Input is consumed
// 128 bytes at a time while that much remains, then 16 bytes at a time. A
// final partial block is loaded byte by byte with its upper bytes left zero.
// A length of exactly 13 takes a dedicated path.
TEXT ·gcmAesData(SB),NOSPLIT,$0
#define pTbl DI
#define aut SI
#define tPtr CX
#define autLen DX

// reduceRound(a): one reduction step on register a. Clobbers T0.
#define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
// mulRoundAAD(X, i): multiply block X by table pair i and add the three
// partial products into ACC0/ACC1/ACCM. Clobbers X, T1 and T2.
#define mulRoundAAD(X ,i) \
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	PCLMULQDQ $0x00, X, T1;\
	PXOR T1, ACC0;\
	PCLMULQDQ $0x11, X, T2;\
	PXOR T2, ACC1;\
	PSHUFD $78, X, T1;\
	PXOR T1, X;\
	MOVOU (16*(i*2+1))(pTbl), T1;\
	PCLMULQDQ $0x00, X, T1;\
	PXOR T1, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ data_base+8(FP), aut
	MOVQ data_len+16(FP), autLen
	MOVQ T+32(FP), tPtr

	PXOR ACC0, ACC0
	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	TESTQ autLen, autLen
	JEQ dataBail

	// Fast path for 13-byte input (the TLS case).
	CMPQ autLen, $13
	JE dataTLS
	CMPQ autLen, $128
	JB startSinglesLoop
	JMP dataOctaLoop

dataTLS:
	// Load exactly 13 bytes as 8 + 4 + 1, then clear the length so the
	// singles loop exits after this one multiplication.
	MOVOU (16*14)(pTbl), T1
	MOVOU (16*15)(pTbl), T2
	PXOR B0, B0
	MOVQ (aut), B0
	PINSRD $2, 8(aut), B0
	PINSRB $12, 12(aut), B0
	XORQ autLen, autLen
	JMP dataMul

dataOctaLoop:
	CMPQ autLen, $128
	JB startSinglesLoop
	SUBQ $128, autLen

	MOVOU (16*0)(aut), X0
	MOVOU (16*1)(aut), X1
	MOVOU (16*2)(aut), X2
	MOVOU (16*3)(aut), X3
	MOVOU (16*4)(aut), X4
	MOVOU (16*5)(aut), X5
	MOVOU (16*6)(aut), X6
	MOVOU (16*7)(aut), X7
	LEAQ (16*8)(aut), aut
	PSHUFB BSWAP, X0
	PSHUFB BSWAP, X1
	PSHUFB BSWAP, X2
	PSHUFB BSWAP, X3
	PSHUFB BSWAP, X4
	PSHUFB BSWAP, X5
	PSHUFB BSWAP, X6
	PSHUFB BSWAP, X7
	// The running hash is folded into the first block only.
	PXOR ACC0, X0

	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1
	PSHUFD $78, X0, T1
	PXOR X0, T1
	PCLMULQDQ $0x00, X0, ACC0
	PCLMULQDQ $0x11, X0, ACC1
	PCLMULQDQ $0x00, T1, ACCM

	mulRoundAAD(X1, 1)
	mulRoundAAD(X2, 2)
	mulRoundAAD(X3, 3)
	mulRoundAAD(X4, 4)
	mulRoundAAD(X5, 5)
	mulRoundAAD(X6, 6)
	mulRoundAAD(X7, 7)

	// One shared reduction for all eight products.
	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0
	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0
	JMP dataOctaLoop

startSinglesLoop:
	// T1/T2 keep the table pair at 16*14 / 16*15 for the whole loop.
	MOVOU (16*14)(pTbl), T1
	MOVOU (16*15)(pTbl), T2

dataSinglesLoop:

	CMPQ autLen, $16
	JB dataEnd
	SUBQ $16, autLen

	MOVOU (aut), B0
dataMul:
	// Also entered from dataTLS and dataLoadLoop with B0 already loaded
	// and autLen == 0.
	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T1, ACC0
	MOVOU T2, ACCM
	MOVOU T1, ACC1

	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	LEAQ 16(aut), aut

	JMP dataSinglesLoop

dataEnd:

	TESTQ autLen, autLen
	JEQ dataBail

	// Fewer than 16 bytes remain. Read them last-to-first, shifting B0 up
	// one byte each time, so they land in order with zeros above them.
	PXOR B0, B0
	LEAQ -1(aut)(autLen*1), aut

dataLoadLoop:

	PSLLDQ $1, B0
	PINSRB $0, (aut), B0

	LEAQ -1(aut), aut
	DECQ autLen
	JNE dataLoadLoop

	JMP dataMul

dataBail:
	MOVOU ACC0, (tPtr)
	RET
#undef pTbl
#undef aut
#undef tPtr
#undef autLen

// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
//
// XORs src with the AES counter keystream, writes the result to dst, and
// folds every output block into the hash value read from and written back
// to *T.
//
// Stack frame (256 bytes):
//   (0..7)*16(SP)        byte-reversed output blocks of the previous group
//                        of eight, waiting to be hashed
//   (8*16 + i*16)(SP)    counter block i, already XORed with round key 0
//
// aluCTR holds the byte-swapped 32-bit word at offset 12 of *ctr; aluK is
// the byte-swapped word at offset 12 of round key 0. Only that word of a
// counter slot is ever rewritten. *ctr itself is not written back.
TEXT ·gcmAesEnc(SB),0,$256-96
#define pTbl DI
#define ctx DX
#define ctrPtr CX
#define ptx SI
#define ks AX
#define tPtr R8
#define ptxLen R9
#define aluCTR R10
#define aluTMP R11
#define aluK R12
#define NR R13

// increment(i): add 1 to aluCTR and store (aluCTR XOR aluK), byte-swapped,
// into word 3 of counter slot i.
#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
// aesRnd(k): one AES round with key register k on B0-B7.
#define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
// aesRound(i): load round key i into T0 and apply it to B0-B7.
#define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
// aesRndLast(k): final AES round with key register k on B0-B7.
#define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
// combinedRound(i): AES round i on B0-B7, interleaved with multiplying the
// saved block in stack slot i by table pair i. Clobbers T0, T1 and T2.
#define combinedRound(i) \
	MOVOU (16*i)(ks), T0;\
	AESENC T0, B0;\
	AESENC T0, B1;\
	AESENC T0, B2;\
	AESENC T0, B3;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	AESENC T0, B4;\
	AESENC T0, B5;\
	AESENC T0, B6;\
	AESENC T0, B7;\
	MOVOU (16*i)(SP), T0;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PSHUFD $78, T0, T1;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T1, T0;\
	PXOR T2, ACC1;\
	MOVOU (16*(i*2+1))(pTbl), T2;\
	PCLMULQDQ $0x00, T2, T0;\
	PXOR T0, ACCM
// mulRound(i): the multiplication half of combinedRound alone.
#define mulRound(i) \
	MOVOU (16*i)(SP), T0;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T2, ACC1;\
	PSHUFD $78, T0, T1;\
	PXOR T1, T0;\
	MOVOU (16*(i*2+1))(pTbl), T1;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ dst+8(FP), ctx
	MOVQ src_base+32(FP), ptx
	MOVQ src_len+40(FP), ptxLen
	MOVQ ctr+56(FP), ctrPtr
	MOVQ T+64(FP), tPtr
	MOVQ ks_base+72(FP), ks
	MOVQ ks_len+80(FP), NR

	// NR = len(ks)/4 - 1
	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	MOVOU (tPtr), ACC0
	PXOR ACC1, ACC1
	PXOR ACCM, ACCM
	MOVOU (ctrPtr), B0
	MOVL (3*4)(ctrPtr), aluCTR
	MOVOU (ks), T0
	MOVL (3*4)(ks), aluK
	BSWAPL aluCTR
	BSWAPL aluK

	// T0 = counter block XOR round key 0: the template for every slot.
	PXOR B0, T0
	MOVOU T0, (8*16 + 0*16)(SP)
	increment(0)

	CMPQ ptxLen, $128
	JB gcmAesEncSingles
	SUBQ $128, ptxLen

	// At least eight blocks: set up the remaining seven counter slots.
	MOVOU T0, (8*16 + 1*16)(SP)
	increment(1)
	MOVOU T0, (8*16 + 2*16)(SP)
	increment(2)
	MOVOU T0, (8*16 + 3*16)(SP)
	increment(3)
	MOVOU T0, (8*16 + 4*16)(SP)
	increment(4)
	MOVOU T0, (8*16 + 5*16)(SP)
	increment(5)
	MOVOU T0, (8*16 + 6*16)(SP)
	increment(6)
	MOVOU T0, (8*16 + 7*16)(SP)
	increment(7)

	MOVOU (8*16 + 0*16)(SP), B0
	MOVOU (8*16 + 1*16)(SP), B1
	MOVOU (8*16 + 2*16)(SP), B2
	MOVOU (8*16 + 3*16)(SP), B3
	MOVOU (8*16 + 4*16)(SP), B4
	MOVOU (8*16 + 5*16)(SP), B5
	MOVOU (8*16 + 6*16)(SP), B6
	MOVOU (8*16 + 7*16)(SP), B7

	// First group of eight: nothing to hash yet, so the AES rounds are
	// interleaved only with preparing the next eight counters.
	aesRound(1)
	increment(0)
	aesRound(2)
	increment(1)
	aesRound(3)
	increment(2)
	aesRound(4)
	increment(3)
	aesRound(5)
	increment(4)
	aesRound(6)
	increment(5)
	aesRound(7)
	increment(6)
	aesRound(8)
	increment(7)
	aesRound(9)
	MOVOU (16*10)(ks), T0
	// Same last-round-key selection as in gcmAesInit; flags from this
	// CMPQ are reused by the JE.
	CMPQ NR, $12
	JB encLast1
	aesRnd(T0)
	aesRound(11)
	MOVOU (16*12)(ks), T0
	JE encLast1
	aesRnd(T0)
	aesRound(13)
	MOVOU (16*14)(ks), T0
encLast1:
	aesRndLast(T0)

	MOVOU (16*0)(ptx), T0
	PXOR T0, B0
	MOVOU (16*1)(ptx), T0
	PXOR T0, B1
	MOVOU (16*2)(ptx), T0
	PXOR T0, B2
	MOVOU (16*3)(ptx), T0
	PXOR T0, B3
	MOVOU (16*4)(ptx), T0
	PXOR T0, B4
	MOVOU (16*5)(ptx), T0
	PXOR T0, B5
	MOVOU (16*6)(ptx), T0
	PXOR T0, B6
	MOVOU (16*7)(ptx), T0
	PXOR T0, B7

	// Write the output, then byte-reverse each block for hashing. The
	// running hash is folded into block 0 only.
	MOVOU B0, (16*0)(ctx)
	PSHUFB BSWAP, B0
	PXOR ACC0, B0
	MOVOU B1, (16*1)(ctx)
	PSHUFB BSWAP, B1
	MOVOU B2, (16*2)(ctx)
	PSHUFB BSWAP, B2
	MOVOU B3, (16*3)(ctx)
	PSHUFB BSWAP, B3
	MOVOU B4, (16*4)(ctx)
	PSHUFB BSWAP, B4
	MOVOU B5, (16*5)(ctx)
	PSHUFB BSWAP, B5
	MOVOU B6, (16*6)(ctx)
	PSHUFB BSWAP, B6
	MOVOU B7, (16*7)(ctx)
	PSHUFB BSWAP, B7

	MOVOU B0, (16*0)(SP)
	MOVOU B1, (16*1)(SP)
	MOVOU B2, (16*2)(SP)
	MOVOU B3, (16*3)(SP)
	MOVOU B4, (16*4)(SP)
	MOVOU B5, (16*5)(SP)
	MOVOU B6, (16*6)(SP)
	MOVOU B7, (16*7)(SP)

	LEAQ 128(ptx), ptx
	LEAQ 128(ctx), ctx

gcmAesEncOctetsLoop:

	CMPQ ptxLen, $128
	JB gcmAesEncOctetsEnd
	SUBQ $128, ptxLen

	MOVOU (8*16 + 0*16)(SP), B0
	MOVOU (8*16 + 1*16)(SP), B1
	MOVOU (8*16 + 2*16)(SP), B2
	MOVOU (8*16 + 3*16)(SP), B3
	MOVOU (8*16 + 4*16)(SP), B4
	MOVOU (8*16 + 5*16)(SP), B5
	MOVOU (8*16 + 6*16)(SP), B6
	MOVOU (8*16 + 7*16)(SP), B7

	// Start hashing the previous group: saved block 0 times table pair 0.
	MOVOU (16*0)(SP), T0
	PSHUFD $78, T0, T1
	PXOR T0, T1

	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, T1, ACCM
	PCLMULQDQ $0x00, T0, ACC0
	PCLMULQDQ $0x11, T0, ACC1

	combinedRound(1)
	increment(0)
	combinedRound(2)
	increment(1)
	combinedRound(3)
	increment(2)
	combinedRound(4)
	increment(3)
	combinedRound(5)
	increment(4)
	combinedRound(6)
	increment(5)
	combinedRound(7)
	increment(6)

	aesRound(8)
	increment(7)

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	aesRound(9)

	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast2
	aesRnd(T0)
	aesRound(11)
	MOVOU (16*12)(ks), T0
	JE encLast2
	aesRnd(T0)
	aesRound(13)
	MOVOU (16*14)(ks), T0
encLast2:
	aesRndLast(T0)

	MOVOU (16*0)(ptx), T0
	PXOR T0, B0
	MOVOU (16*1)(ptx), T0
	PXOR T0, B1
	MOVOU (16*2)(ptx), T0
	PXOR T0, B2
	MOVOU (16*3)(ptx), T0
	PXOR T0, B3
	MOVOU (16*4)(ptx), T0
	PXOR T0, B4
	MOVOU (16*5)(ptx), T0
	PXOR T0, B5
	MOVOU (16*6)(ptx), T0
	PXOR T0, B6
	MOVOU (16*7)(ptx), T0
	PXOR T0, B7

	MOVOU B0, (16*0)(ctx)
	PSHUFB BSWAP, B0
	PXOR ACC0, B0
	MOVOU B1, (16*1)(ctx)
	PSHUFB BSWAP, B1
	MOVOU B2, (16*2)(ctx)
	PSHUFB BSWAP, B2
	MOVOU B3, (16*3)(ctx)
	PSHUFB BSWAP, B3
	MOVOU B4, (16*4)(ctx)
	PSHUFB BSWAP, B4
	MOVOU B5, (16*5)(ctx)
	PSHUFB BSWAP, B5
	MOVOU B6, (16*6)(ctx)
	PSHUFB BSWAP, B6
	MOVOU B7, (16*7)(ctx)
	PSHUFB BSWAP, B7

	MOVOU B0, (16*0)(SP)
	MOVOU B1, (16*1)(SP)
	MOVOU B2, (16*2)(SP)
	MOVOU B3, (16*3)(SP)
	MOVOU B4, (16*4)(SP)
	MOVOU B5, (16*5)(SP)
	MOVOU B6, (16*6)(SP)
	MOVOU B7, (16*7)(SP)

	LEAQ 128(ptx), ptx
	LEAQ 128(ctx), ctx

	JMP gcmAesEncOctetsLoop

gcmAesEncOctetsEnd:

	// Hash the last group of eight saved blocks; no AES work to overlap.
	MOVOU (16*0)(SP), T0
	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1
	PSHUFD $78, T0, T1
	PXOR T0, T1
	PCLMULQDQ $0x00, T0, ACC0
	PCLMULQDQ $0x11, T0, ACC1
	PCLMULQDQ $0x00, T1, ACCM

	mulRound(1)
	mulRound(2)
	mulRound(3)
	mulRound(4)
	mulRound(5)
	mulRound(6)
	mulRound(7)

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	TESTQ ptxLen, ptxLen
	JE gcmAesEncDone

	// Eight slots were prepared ahead, so aluCTR is 7 past the counter in
	// slot 0. The code below uses slot 0 only; rewind to match it.
	SUBQ $7, aluCTR

gcmAesEncSingles:

	// B1-B7 cache round keys 1-7 and T2 caches the table entry at 16*14.
	MOVOU (16*1)(ks), B1
	MOVOU (16*2)(ks), B2
	MOVOU (16*3)(ks), B3
	MOVOU (16*4)(ks), B4
	MOVOU (16*5)(ks), B5
	MOVOU (16*6)(ks), B6
	MOVOU (16*7)(ks), B7

	MOVOU (16*14)(pTbl), T2

gcmAesEncSinglesLoop:

	CMPQ ptxLen, $16
	JB gcmAesEncTail
	SUBQ $16, ptxLen

	MOVOU (8*16 + 0*16)(SP), B0
	increment(0)

	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast3
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE encLast3
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
encLast3:
	AESENCLAST T0, B0

	MOVOU (ptx), T0
	PXOR T0, B0
	MOVOU B0, (ctx)

	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T2, ACC0
	MOVOU T2, ACC1
	MOVOU (16*15)(pTbl), ACCM

	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	LEAQ (16*1)(ptx), ptx
	LEAQ (16*1)(ctx), ctx

	JMP gcmAesEncSinglesLoop

gcmAesEncTail:
	TESTQ ptxLen, ptxLen
	JE gcmAesEncDone

	// 1..15 bytes remain: produce one more keystream block.
	MOVOU (8*16 + 0*16)(SP), B0
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast4
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE encLast4
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
encLast4:
	AESENCLAST T0, B0
	MOVOU B0, T0

	// Point ptx at the last input byte.
	LEAQ -1(ptx)(ptxLen*1), ptx

	// T1 = andMask entry at 16*(ptxLen-1). aluCTR is reused as the table
	// base; the counter is not needed past this point.
	MOVQ ptxLen, aluTMP
	SHLQ $4, aluTMP

	LEAQ andMask<>(SB), aluCTR
	MOVOU -16(aluCTR)(aluTMP*1), T1

	// Gather the remaining input bytes last-to-first into B0.
	PXOR B0, B0
ptxLoadLoop:
	PSLLDQ $1, B0
	PINSRB $0, (ptx), B0
	LEAQ -1(ptx), ptx
	DECQ ptxLen
	JNE ptxLoadLoop

	// XOR with the keystream and clear the bytes beyond the input length.
	PXOR T0, B0
	PAND T1, B0
	// This stores a full 16 bytes. The original author's note assumes there
	// is always room because the tag follows the ciphertext.
	MOVOU B0, (ctx)

	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T2, ACC0
	MOVOU T2, ACC1
	MOVOU (16*15)(pTbl), ACCM

	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

gcmAesEncDone:
	MOVOU ACC0, (tPtr)
	RET
#undef increment

// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
//
// Counterpart of gcmAesEnc: XORs src with the counter keystream into dst
// and folds every src block into the hash value read from and written back
// to *T. The register aliases of gcmAesEnc are still in force, but here ctx
// is the input (src) and ptx is the output (dst).
//
// The hash input is read straight from ctx, so the 128-byte frame holds
// only the eight counter slots, at (i*16)(SP).
TEXT ·gcmAesDec(SB),0,$128-96
// increment(i): as in gcmAesEnc, but with the counter slots at (i*16)(SP).
#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
// combinedDecRound(i): AES round i on B0-B7, interleaved with multiplying
// input block i (loaded from ctx and byte-reversed) by table pair i.
// Clobbers T0, T1 and T2.
#define combinedDecRound(i) \
	MOVOU (16*i)(ks), T0;\
	AESENC T0, B0;\
	AESENC T0, B1;\
	AESENC T0, B2;\
	AESENC T0, B3;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	AESENC T0, B4;\
	AESENC T0, B5;\
	AESENC T0, B6;\
	AESENC T0, B7;\
	MOVOU (16*i)(ctx), T0;\
	PSHUFB BSWAP, T0;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PSHUFD $78, T0, T1;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T1, T0;\
	PXOR T2, ACC1;\
	MOVOU (16*(i*2+1))(pTbl), T2;\
	PCLMULQDQ $0x00, T2, T0;\
	PXOR T0, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ dst+8(FP), ptx
	MOVQ src_base+32(FP), ctx
	MOVQ src_len+40(FP), ptxLen
	MOVQ ctr+56(FP), ctrPtr
	MOVQ T+64(FP), tPtr
	MOVQ ks_base+72(FP), ks
	MOVQ ks_len+80(FP), NR

	// NR = len(ks)/4 - 1
	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	MOVOU (tPtr), ACC0
	PXOR ACC1, ACC1
	PXOR ACCM, ACCM
	MOVOU (ctrPtr), B0
	MOVL (3*4)(ctrPtr), aluCTR
	MOVOU (ks), T0
	MOVL (3*4)(ks), aluK
	BSWAPL aluCTR
	BSWAPL aluK

	// T0 = counter block XOR round key 0: the template for every slot.
	PXOR B0, T0
	MOVOU T0, (0*16)(SP)
	increment(0)

	CMPQ ptxLen, $128
	JB gcmAesDecSingles

	MOVOU T0, (1*16)(SP)
	increment(1)
	MOVOU T0, (2*16)(SP)
	increment(2)
	MOVOU T0, (3*16)(SP)
	increment(3)
	MOVOU T0, (4*16)(SP)
	increment(4)
	MOVOU T0, (5*16)(SP)
	increment(5)
	MOVOU T0, (6*16)(SP)
	increment(6)
	MOVOU T0, (7*16)(SP)
	increment(7)

gcmAesDecOctetsLoop:

	CMPQ ptxLen, $128
	JB gcmAesDecEndOctets
	SUBQ $128, ptxLen

	MOVOU (0*16)(SP), B0
	MOVOU (1*16)(SP), B1
	MOVOU (2*16)(SP), B2
	MOVOU (3*16)(SP), B3
	MOVOU (4*16)(SP), B4
	MOVOU (5*16)(SP), B5
	MOVOU (6*16)(SP), B6
	MOVOU (7*16)(SP), B7

	// Input block 0, folded with the running hash, times table pair 0.
	MOVOU (16*0)(ctx), T0
	PSHUFB BSWAP, T0
	PXOR ACC0, T0
	PSHUFD $78, T0, T1
	PXOR T0, T1

	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, T1, ACCM
	PCLMULQDQ $0x00, T0, ACC0
	PCLMULQDQ $0x11, T0, ACC1

	combinedDecRound(1)
	increment(0)
	combinedDecRound(2)
	increment(1)
	combinedDecRound(3)
	increment(2)
	combinedDecRound(4)
	increment(3)
	combinedDecRound(5)
	increment(4)
	combinedDecRound(6)
	increment(5)
	combinedDecRound(7)
	increment(6)

	aesRound(8)
	increment(7)

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	aesRound(9)

	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (16*10)(ks), T0
	// Last-round-key selection; flags from this CMPQ are reused by the JE.
	CMPQ NR, $12
	JB decLast1
	aesRnd(T0)
	aesRound(11)
	MOVOU (16*12)(ks), T0
	JE decLast1
	aesRnd(T0)
	aesRound(13)
	MOVOU (16*14)(ks), T0
decLast1:
	aesRndLast(T0)

	MOVOU (16*0)(ctx), T0
	PXOR T0, B0
	MOVOU (16*1)(ctx), T0
	PXOR T0, B1
	MOVOU (16*2)(ctx), T0
	PXOR T0, B2
	MOVOU (16*3)(ctx), T0
	PXOR T0, B3
	MOVOU (16*4)(ctx), T0
	PXOR T0, B4
	MOVOU (16*5)(ctx), T0
	PXOR T0, B5
	MOVOU (16*6)(ctx), T0
	PXOR T0, B6
	MOVOU (16*7)(ctx), T0
	PXOR T0, B7

	MOVOU B0, (16*0)(ptx)
	MOVOU B1, (16*1)(ptx)
	MOVOU B2, (16*2)(ptx)
	MOVOU B3, (16*3)(ptx)
	MOVOU B4, (16*4)(ptx)
	MOVOU B5, (16*5)(ptx)
	MOVOU B6, (16*6)(ptx)
	MOVOU B7, (16*7)(ptx)

	LEAQ 128(ptx), ptx
	LEAQ 128(ctx), ctx

	JMP gcmAesDecOctetsLoop

gcmAesDecEndOctets:

	// Eight slots were prepared ahead, so aluCTR is 7 past the counter in
	// slot 0. The code below uses slot 0 only; rewind to match it.
	SUBQ $7, aluCTR

gcmAesDecSingles:

	// B1-B7 cache round keys 1-7 and T2 caches the table entry at 16*14.
	MOVOU (16*1)(ks), B1
	MOVOU (16*2)(ks), B2
	MOVOU (16*3)(ks), B3
	MOVOU (16*4)(ks), B4
	MOVOU (16*5)(ks), B5
	MOVOU (16*6)(ks), B6
	MOVOU (16*7)(ks), B7

	MOVOU (16*14)(pTbl), T2

gcmAesDecSinglesLoop:

	CMPQ ptxLen, $16
	JB gcmAesDecTail
	SUBQ $16, ptxLen

	// Hash the input block first; T1 keeps the raw copy for the XOR.
	MOVOU (ctx), B0
	MOVOU B0, T1
	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T2, ACC0
	MOVOU T2, ACC1
	MOVOU (16*15)(pTbl), ACCM

	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (0*16)(SP), B0
	increment(0)
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB decLast2
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE decLast2
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
decLast2:
	AESENCLAST T0, B0

	PXOR T1, B0
	MOVOU B0, (ptx)

	LEAQ (16*1)(ptx), ptx
	LEAQ (16*1)(ctx), ctx

	JMP gcmAesDecSinglesLoop

gcmAesDecTail:

	TESTQ ptxLen, ptxLen
	JE gcmAesDecDone

	// T1 = andMask entry at 16*(ptxLen-1). aluCTR is reused as the table
	// base and then rebuilt only as a side effect of increment(0) below.
	MOVQ ptxLen, aluTMP
	SHLQ $4, aluTMP
	LEAQ andMask<>(SB), aluCTR
	MOVOU -16(aluCTR)(aluTMP*1), T1

	// This reads a full 16 bytes. The original author's note assumes the tag
	// follows the input, so the read does not run past the buffer.
	MOVOU (ctx), B0
	PAND T1, B0

	MOVOU B0, T1
	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU (16*14)(pTbl), ACC0
	MOVOU (16*15)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	MOVOU (0*16)(SP), B0
	increment(0)
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB decLast3
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE decLast3
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
decLast3:
	AESENCLAST T0, B0
	PXOR T1, B0

	// Emit exactly ptxLen bytes, lowest byte first.
ptxStoreLoop:
	PEXTRB $0, B0, (ptx)
	PSRLDQ $1, B0
	LEAQ 1(ptx), ptx
	DECQ ptxLen

	JNE ptxStoreLoop

gcmAesDecDone:

	MOVOU ACC0, (tPtr)
	RET