// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"

#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

#define ACC0 V8
#define ACC1 V9
#define ACCM V10

#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14

#define POLY V15
#define ZERO V16
#define INC V17
#define CTR V18

#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define K4 V23
#define K5 V24
#define K6 V25
#define K7 V26
#define K8 V27
#define K9 V28
#define K10 V29
#define K11 V30
#define KLAST V31

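// reduce() folds the three partial products of a Karatsuba multiplication
// (low half in ACC1, high half in ACC0, middle term in ACCM) into a single
// 128-bit value in ACC0, reducing the 256-bit product modulo the GCM
// polynomial. POLY holds the reduction constant (0xC2 in the top byte of
// D[0], 1 in D[1]); the two VPMULLs by POLY fold the high half down 64 bits
// at a time.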
#define reduce() \
	VEOR	ACC0.B16, ACCM.B16, ACCM.B16     \
	VEOR	ACC1.B16, ACCM.B16, ACCM.B16     \
	VEXT	$8, ZERO.B16, ACCM.B16, T0.B16   \
	VEXT	$8, ACCM.B16, ZERO.B16, ACCM.B16 \
	VEOR	ACCM.B16, ACC0.B16, ACC0.B16     \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEXT	$8, ACC0.B16, ACC0.B16, ACC0.B16 \
	VEOR	T0.B16, ACC0.B16, ACC0.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VEXT	$8, ACC1.B16, ACC1.B16, ACC1.B16 \
	VEOR	ACC1.B16, ACC0.B16, ACC0.B16     \

// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
TEXT ·gcmAesFinish(SB),NOSPLIT,$0
#define pTbl R0
#define tMsk R1
#define tPtr R2
#define plen R3
#define dlen R4

	MOVD	$0xC2, R1
	LSL	$56, R1
	MOVD	$1, R0
	VMOV	R1, POLY.D[0]
	VMOV	R0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	tagMask+8(FP), tMsk
	MOVD	T+16(FP), tPtr
	MOVD	pLen+24(FP), plen
	MOVD	dLen+32(FP), dlen

	VLD1	(tPtr), [ACC0.B16]
	VLD1	(tMsk), [B1.B16]

	LSL	$3, plen
	LSL	$3, dlen

	VMOV	dlen, B0.D[0]
	VMOV	plen, B0.D[1]

	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]

	VEOR	ACC0.B16, B0.B16, B0.B16

	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	VREV64	ACC0.B16, ACC0.B16
	VEOR	B1.B16, ACC0.B16, ACC0.B16

	VST1	[ACC0.B16], (tPtr)
	RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

// func gcmAesInit(productTable *[256]byte, ks []uint32)
TEXT ·gcmAesInit(SB),NOSPLIT,$0
#define pTbl R0
#define KS R1
#define NR R2
#define I R3
	MOVD	productTable+0(FP), pTbl
	MOVD	ks_base+8(FP), KS
	MOVD	ks_len+16(FP), NR

	MOVD	$0xC2, I
	LSL	$56, I
	VMOV	I, POLY.D[0]
	MOVD	$1, I
	VMOV	I, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	// Encrypt block 0 with the AES key to generate the hash key H
	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	VEOR	B0.B16, B0.B16, B0.B16
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	TBZ	$4, NR, initEncFinish
	VLD1.P	32(KS), [T0.B16, T1.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	TBZ	$3, NR, initEncFinish
	VLD1.P	32(KS), [T0.B16, T1.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
initEncFinish:
	VLD1	(KS), [T0.B16, T1.B16, T2.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	VEOR	T2.B16, B0.B16, B0.B16

	VREV64	B0.B16, B0.B16

	// Multiply by 2 modulo P
	VMOV	B0.D[0], I
	ASR	$63, I
	VMOV	I, T1.D[0]
	VMOV	I, T1.D[1]
	VAND	POLY.B16, T1.B16, T1.B16
	VUSHR	$63, B0.D2, T2.D2
	VEXT	$8, ZERO.B16, T2.B16, T2.B16
	VSHL	$1, B0.D2, B0.D2
	VEOR	T1.B16, B0.B16, B0.B16
	VEOR	T2.B16, B0.B16, B0.B16 // Can avoid this when VSLI is available

	// Karatsuba pre-computation
	VEXT	$8, B0.B16, B0.B16, B1.B16
	VEOR	B0.B16, B1.B16, B1.B16

	ADD	$14*16, pTbl
	VST1	[B0.B16, B1.B16], (pTbl)
	SUB	$2*16, pTbl

	VMOV	B0.B16, B2.B16
	VMOV	B1.B16, B3.B16

	MOVD	$7, I

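// initLoop computes the remaining powers of H (H^2 .. H^8) by multiplying
// the previous power (B2, B3) by H (B0, B1) on every iteration. The table is
// filled back to front, so productTable ends up holding H^8 .. H^1, each
// power paired with the XOR of its two 64-bit halves, which mulRound uses as
// the pre-computed Karatsuba middle operand.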
initLoop:
	// Compute powers of H
	SUBS	$1, I

	VPMULL	B0.D1, B2.D1, T1.Q1
	VPMULL2	B0.D2, B2.D2, T0.Q1
	VPMULL	B1.D1, B3.D1, T2.Q1
	VEOR	T0.B16, T2.B16, T2.B16
	VEOR	T1.B16, T2.B16, T2.B16
	VEXT	$8, ZERO.B16, T2.B16, T3.B16
	VEXT	$8, T2.B16, ZERO.B16, T2.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VEOR	T3.B16, T1.B16, T1.B16
	VPMULL	POLY.D1, T0.D1, T2.Q1
	VEXT	$8, T0.B16, T0.B16, T0.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VPMULL	POLY.D1, T0.D1, T2.Q1
	VEXT	$8, T0.B16, T0.B16, T0.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VEOR	T1.B16, T0.B16, B2.B16
	VMOV	B2.B16, B3.B16
	VEXT	$8, B2.B16, B2.B16, B2.B16
	VEOR	B2.B16, B3.B16, B3.B16

	VST1	[B2.B16, B3.B16], (pTbl)
	SUB	$2*16, pTbl

	BNE	initLoop
	RET
#undef I
#undef NR
#undef KS
#undef pTbl

// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
TEXT ·gcmAesData(SB),NOSPLIT,$0
#define pTbl R0
#define aut R1
#define tPtr R2
#define autLen R3
#define H0 R4
#define pTblSave R5

#define mulRound(X) \
	VLD1.P	32(pTbl), [T1.B16, T2.B16] \
	VREV64	X.B16, X.B16               \
	VEXT	$8, X.B16, X.B16, T0.B16   \
	VEOR	X.B16, T0.B16, T0.B16      \
	VPMULL	X.D1, T1.D1, T3.Q1         \
	VEOR	T3.B16, ACC1.B16, ACC1.B16 \
	VPMULL2	X.D2, T1.D2, T3.Q1         \
	VEOR	T3.B16, ACC0.B16, ACC0.B16 \
	VPMULL	T0.D1, T2.D1, T3.Q1        \
	VEOR	T3.B16, ACCM.B16, ACCM.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	data_base+8(FP), aut
	MOVD	data_len+16(FP), autLen
	MOVD	T+32(FP), tPtr

	VEOR	ACC0.B16, ACC0.B16, ACC0.B16
	CBZ	autLen, dataBail

	MOVD	$0xC2, H0
	LSL	$56, H0
	VMOV	H0, POLY.D[0]
	MOVD	$1, H0
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	MOVD	pTbl, pTblSave

	CMP	$13, autLen
	BEQ	dataTLS
	CMP	$128, autLen
	BLT	startSinglesLoop
	B	octetsLoop

dataTLS:
	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	VEOR	B0.B16, B0.B16, B0.B16

	MOVD	(aut), H0
	VMOV	H0, B0.D[0]
	MOVW	8(aut), H0
	VMOV	H0, B0.S[2]
	MOVB	12(aut), H0
	VMOV	H0, B0.B[12]

	MOVD	$0, autLen
	B	dataMul

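// Process the additional data 128 bytes (8 blocks) at a time: the running
// tag is folded into the first block, which is multiplied by H^8 directly,
// the remaining seven blocks are accumulated with mulRound using the
// decreasing powers of H from productTable, and a single reduce() finishes
// the batch.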
octetsLoop:
	CMP	$128, autLen
	BLT	startSinglesLoop
	SUB	$128, autLen

	VLD1.P	32(aut), [B0.B16, B1.B16]

	VLD1.P	32(pTbl), [T1.B16, T2.B16]
	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16
	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	mulRound(B1)
	VLD1.P	32(aut), [B2.B16, B3.B16]
	mulRound(B2)
	mulRound(B3)
	VLD1.P	32(aut), [B4.B16, B5.B16]
	mulRound(B4)
	mulRound(B5)
	VLD1.P	32(aut), [B6.B16, B7.B16]
	mulRound(B6)
	mulRound(B7)

	MOVD	pTblSave, pTbl
	reduce()
	B	octetsLoop

startSinglesLoop:

	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]

singlesLoop:

	CMP	$16, autLen
	BLT	dataEnd
	SUB	$16, autLen

	VLD1.P	16(aut), [B0.B16]
dataMul:
	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16

	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	B	singlesLoop

dataEnd:

	CBZ	autLen, dataBail
	VEOR	B0.B16, B0.B16, B0.B16
	ADD	autLen, aut

dataLoadLoop:
	MOVB.W	-1(aut), H0
	VEXT	$15, B0.B16, ZERO.B16, B0.B16
	VMOV	H0, B0.B[0]
	SUBS	$1, autLen
	BNE	dataLoadLoop
	B	dataMul

dataBail:
	VST1	[ACC0.B16], (tPtr)
	RET

#undef pTbl
#undef aut
#undef tPtr
#undef autLen
#undef H0
#undef pTblSave

// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
TEXT ·gcmAesEnc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define ks R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define aluK R9
#define NR R10
#define H0 R11
#define H1 R12
#define curK R13
#define pTblSave R14

#define aesrndx8(K) \
	AESE	K.B16, B0.B16    \
	AESMC	B0.B16, B0.B16   \
	AESE	K.B16, B1.B16    \
	AESMC	B1.B16, B1.B16   \
	AESE	K.B16, B2.B16    \
	AESMC	B2.B16, B2.B16   \
	AESE	K.B16, B3.B16    \
	AESMC	B3.B16, B3.B16   \
	AESE	K.B16, B4.B16    \
	AESMC	B4.B16, B4.B16   \
	AESE	K.B16, B5.B16    \
	AESMC	B5.B16, B5.B16   \
	AESE	K.B16, B6.B16    \
	AESMC	B6.B16, B6.B16   \
	AESE	K.B16, B7.B16    \
	AESMC	B7.B16, B7.B16

#define aesrndlastx8(K) \
	AESE	K.B16, B0.B16    \
	AESE	K.B16, B1.B16    \
	AESE	K.B16, B2.B16    \
	AESE	K.B16, B3.B16    \
	AESE	K.B16, B4.B16    \
	AESE	K.B16, B5.B16    \
	AESE	K.B16, B6.B16    \
	AESE	K.B16, B7.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	ks_base+72(FP), ks
	MOVD	ks_len+80(FP), NR

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	// Compute NR from len(ks)
	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4
	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen

	MOVD	ks, H0
	// For AES-128, round keys are stored in: K0 .. K10, KLAST
	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16

	BLT	startSingles
	// There are at least 8 blocks to encrypt
	TBZ	$4, NR, octetsLoop

	// For AES-192, round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV	K8.B16, K10.B16
	VMOV	K9.B16, K11.B16
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P	16(H0), [KLAST.B16]
	TBZ	$3, NR, octetsLoop
	// For AES-256, round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P	16(H0), [KLAST.B16]
	ADD	$10*16, ks, H0
	MOVD	H0, curK

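// Wide encryption loop: derive 8 consecutive counter blocks from CTR, run
// them through the AES rounds in parallel with aesrndx8/aesrndlastx8, XOR
// the keystream with 128 bytes of plaintext, and fold the resulting 8
// ciphertext blocks into the GHASH accumulator using the precomputed powers
// of H, with one reduce() per iteration.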
octetsLoop:
	SUB	$128, srcPtrLen

	VMOV	CTR.B16, B0.B16
	VADD	B0.S4, INC.S4, B1.S4
	VREV32	B0.B16, B0.B16
	VADD	B1.S4, INC.S4, B2.S4
	VREV32	B1.B16, B1.B16
	VADD	B2.S4, INC.S4, B3.S4
	VREV32	B2.B16, B2.B16
	VADD	B3.S4, INC.S4, B4.S4
	VREV32	B3.B16, B3.B16
	VADD	B4.S4, INC.S4, B5.S4
	VREV32	B4.B16, B4.B16
	VADD	B5.S4, INC.S4, B6.S4
	VREV32	B5.B16, B5.B16
	VADD	B6.S4, INC.S4, B7.S4
	VREV32	B6.B16, B6.B16
	VADD	B7.S4, INC.S4, CTR.S4
	VREV32	B7.B16, B7.B16

	aesrndx8(K0)
	aesrndx8(K1)
	aesrndx8(K2)
	aesrndx8(K3)
	aesrndx8(K4)
	aesrndx8(K5)
	aesrndx8(K6)
	aesrndx8(K7)
	TBZ	$4, NR, octetsFinish
	aesrndx8(K10)
	aesrndx8(K11)
	TBZ	$3, NR, octetsFinish
	VLD1.P	32(curK), [T1.B16, T2.B16]
	aesrndx8(T1)
	aesrndx8(T2)
	MOVD	H0, curK
octetsFinish:
	aesrndx8(K8)
	aesrndlastx8(K9)

	VEOR	KLAST.B16, B0.B16, B0.B16
	VEOR	KLAST.B16, B1.B16, B1.B16
	VEOR	KLAST.B16, B2.B16, B2.B16
	VEOR	KLAST.B16, B3.B16, B3.B16
	VEOR	KLAST.B16, B4.B16, B4.B16
	VEOR	KLAST.B16, B5.B16, B5.B16
	VEOR	KLAST.B16, B6.B16, B6.B16
	VEOR	KLAST.B16, B7.B16, B7.B16

	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B0.B16, T1.B16, B0.B16
	VEOR	B1.B16, T2.B16, B1.B16
	VST1.P	[B0.B16, B1.B16], 32(dstPtr)
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B2.B16, T1.B16, B2.B16
	VEOR	B3.B16, T2.B16, B3.B16
	VST1.P	[B2.B16, B3.B16], 32(dstPtr)
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B4.B16, T1.B16, B4.B16
	VEOR	B5.B16, T2.B16, B5.B16
	VST1.P	[B4.B16, B5.B16], 32(dstPtr)
	VLD1.P	32(srcPtr), [T1.B16, T2.B16]
	VEOR	B6.B16, T1.B16, B6.B16
	VEOR	B7.B16, T2.B16, B7.B16
	VST1.P	[B6.B16, B7.B16], 32(dstPtr)

	VLD1.P	32(pTbl), [T1.B16, T2.B16]
	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16
	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	mulRound(B1)
	mulRound(B2)
	mulRound(B3)
	mulRound(B4)
	mulRound(B5)
	mulRound(B6)
	mulRound(B7)
	MOVD	pTblSave, pTbl
	reduce()

	CMP	$128, srcPtrLen
	BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD	$128, ks
	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16
	TBZ	$4, NR, singlesLoop
	VLD1.P	32(ks), [B1.B16, B2.B16]
	VMOV	B2.B16, KLAST.B16
	TBZ	$3, NR, singlesLoop
	VLD1.P	32(ks), [B3.B16, B4.B16]
	VMOV	B4.B16, KLAST.B16

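// Process the remaining whole blocks one at a time: encrypt a single counter
// block (the round count depends on NR; the extra round keys for AES-192/256
// are kept in B1..B4), XOR it with 16 bytes of plaintext, and fold the
// ciphertext block into the tag. The tail code below handles a final partial
// block, masking the output so it is zero-padded beyond the message before
// it is hashed.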
singlesLoop:
	CMP	$16, srcPtrLen
	BLT	tail
	SUB	$16, srcPtrLen

	VLD1.P	16(srcPtr), [T0.B16]
	VEOR	KLAST.B16, T0.B16, T0.B16

	VREV32	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, singlesLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, singlesLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16
singlesLast:
	VEOR	T0.B16, B0.B16, B0.B16
encReduce:
	VST1.P	[B0.B16], 16(dstPtr)

	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16

	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done

	VEOR	T0.B16, T0.B16, T0.B16
	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1
	ADD	srcPtrLen, srcPtr

	TBZ	$3, srcPtrLen, ld4
	MOVD.W	-8(srcPtr), H0
	VMOV	H0, T0.D[0]
	VMOV	H1, T3.D[0]
ld4:
	TBZ	$2, srcPtrLen, ld2
	MOVW.W	-4(srcPtr), H0
	VEXT	$12, T0.B16, ZERO.B16, T0.B16
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.S[0]
	VMOV	H1, T3.S[0]
ld2:
	TBZ	$1, srcPtrLen, ld1
	MOVH.W	-2(srcPtr), H0
	VEXT	$14, T0.B16, ZERO.B16, T0.B16
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.H[0]
	VMOV	H1, T3.H[0]
ld1:
	TBZ	$0, srcPtrLen, ld0
	MOVB.W	-1(srcPtr), H0
	VEXT	$15, T0.B16, ZERO.B16, T0.B16
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.B[0]
	VMOV	H1, T3.B[0]
ld0:

	MOVD	ZR, srcPtrLen
	VEOR	KLAST.B16, T0.B16, T0.B16
	VREV32	CTR.B16, B0.B16

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16

tailLast:
	VEOR	T0.B16, B0.B16, B0.B16
	VAND	T3.B16, B0.B16, B0.B16
	B	encReduce

done:
	VST1	[ACC0.B16], (tPtr)
	RET

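// The decryption path mirrors gcmAesEnc and reuses its register definitions
// and macros. The difference is that GHASH is computed over the ciphertext,
// i.e. over the input blocks before they are XORed with the keystream, so
// each batch keeps the loaded ciphertext around for hashing and the tail
// masks the final partial block before folding it into the tag.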
// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
TEXT ·gcmAesDec(SB),NOSPLIT,$0
	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	ks_base+72(FP), ks
	MOVD	ks_len+80(FP), NR

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	// Compute NR from len(ks)
	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	MOVD	ks, H0
	// For AES-128, round keys are stored in: K0 .. K10, KLAST
	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16

	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen
	BLT	startSingles
	// There are at least 8 blocks to decrypt
	TBZ	$4, NR, octetsLoop

	// For AES-192, round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV	K8.B16, K10.B16
	VMOV	K9.B16, K11.B16
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P	16(H0), [KLAST.B16]
	TBZ	$3, NR, octetsLoop
	// For AES-256, round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P	16(H0), [KLAST.B16]
	ADD	$10*16, ks, H0
	MOVD	H0, curK

octetsLoop:
	SUB	$128, srcPtrLen

	VMOV	CTR.B16, B0.B16
	VADD	B0.S4, INC.S4, B1.S4
	VREV32	B0.B16, B0.B16
	VADD	B1.S4, INC.S4, B2.S4
	VREV32	B1.B16, B1.B16
	VADD	B2.S4, INC.S4, B3.S4
	VREV32	B2.B16, B2.B16
	VADD	B3.S4, INC.S4, B4.S4
	VREV32	B3.B16, B3.B16
	VADD	B4.S4, INC.S4, B5.S4
	VREV32	B4.B16, B4.B16
	VADD	B5.S4, INC.S4, B6.S4
	VREV32	B5.B16, B5.B16
	VADD	B6.S4, INC.S4, B7.S4
	VREV32	B6.B16, B6.B16
	VADD	B7.S4, INC.S4, CTR.S4
	VREV32	B7.B16, B7.B16

	aesrndx8(K0)
	aesrndx8(K1)
	aesrndx8(K2)
	aesrndx8(K3)
	aesrndx8(K4)
	aesrndx8(K5)
	aesrndx8(K6)
	aesrndx8(K7)
	TBZ	$4, NR, octetsFinish
	aesrndx8(K10)
	aesrndx8(K11)
	TBZ	$3, NR, octetsFinish
	VLD1.P	32(curK), [T1.B16, T2.B16]
	aesrndx8(T1)
	aesrndx8(T2)
	MOVD	H0, curK
octetsFinish:
	aesrndx8(K8)
	aesrndlastx8(K9)

	VEOR	KLAST.B16, B0.B16, T1.B16
	VEOR	KLAST.B16, B1.B16, T2.B16
	VEOR	KLAST.B16, B2.B16, B2.B16
	VEOR	KLAST.B16, B3.B16, B3.B16
	VEOR	KLAST.B16, B4.B16, B4.B16
	VEOR	KLAST.B16, B5.B16, B5.B16
	VEOR	KLAST.B16, B6.B16, B6.B16
	VEOR	KLAST.B16, B7.B16, B7.B16

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B0.B16, T1.B16, T1.B16
	VEOR	B1.B16, T2.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)

	VLD1.P	32(pTbl), [T1.B16, T2.B16]
	VREV64	B0.B16, B0.B16
	VEOR	ACC0.B16, B0.B16, B0.B16
	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	mulRound(B1)

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B2.B16, B0.B16, T1.B16
	VEOR	B3.B16, B1.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B4.B16, B0.B16, T1.B16
	VEOR	B5.B16, B1.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	VLD1.P	32(srcPtr), [B0.B16, B1.B16]
	VEOR	B6.B16, B0.B16, T1.B16
	VEOR	B7.B16, B1.B16, T2.B16
	VST1.P	[T1.B16, T2.B16], 32(dstPtr)
	mulRound(B0)
	mulRound(B1)

	MOVD	pTblSave, pTbl
	reduce()

	CMP	$128, srcPtrLen
	BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD	$128, ks
	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16
	TBZ	$4, NR, singlesLoop
	VLD1.P	32(ks), [B1.B16, B2.B16]
	VMOV	B2.B16, KLAST.B16
	TBZ	$3, NR, singlesLoop
	VLD1.P	32(ks), [B3.B16, B4.B16]
	VMOV	B4.B16, KLAST.B16

singlesLoop:
	CMP	$16, srcPtrLen
	BLT	tail
	SUB	$16, srcPtrLen

	VLD1.P	16(srcPtr), [T0.B16]
	VREV64	T0.B16, B5.B16
	VEOR	KLAST.B16, T0.B16, T0.B16

	VREV32	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, singlesLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, singlesLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16
singlesLast:
	VEOR	T0.B16, B0.B16, B0.B16

	VST1.P	[B0.B16], 16(dstPtr)

	VEOR	ACC0.B16, B5.B16, B5.B16
	VEXT	$8, B5.B16, B5.B16, T0.B16
	VEOR	B5.B16, T0.B16, T0.B16
	VPMULL	B5.D1, T1.D1, ACC1.Q1
	VPMULL2	B5.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done

	VREV32	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16
tailLast:
	VEOR	KLAST.B16, B0.B16, B0.B16

	// Assuming it is safe to load past srcPtr due to the presence of the tag
	VLD1	(srcPtr), [B5.B16]

	VEOR	B5.B16, B0.B16, B0.B16

	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1

	TBZ	$3, srcPtrLen, ld4
	VMOV	B0.D[0], H0
	MOVD.P	H0, 8(dstPtr)
	VMOV	H1, T3.D[0]
	VEXT	$8, ZERO.B16, B0.B16, B0.B16
ld4:
	TBZ	$2, srcPtrLen, ld2
	VMOV	B0.S[0], H0
	MOVW.P	H0, 4(dstPtr)
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.S[0]
	VEXT	$4, ZERO.B16, B0.B16, B0.B16
ld2:
	TBZ	$1, srcPtrLen, ld1
	VMOV	B0.H[0], H0
	MOVH.P	H0, 2(dstPtr)
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.H[0]
	VEXT	$2, ZERO.B16, B0.B16, B0.B16
ld1:
	TBZ	$0, srcPtrLen, ld0
	VMOV	B0.B[0], H0
	MOVB.P	H0, 1(dstPtr)
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.B[0]
ld0:

	VAND	T3.B16, B5.B16, B5.B16
	VREV64	B5.B16, B5.B16

	VEOR	ACC0.B16, B5.B16, B5.B16
	VEXT	$8, B5.B16, B5.B16, T0.B16
	VEOR	B5.B16, T0.B16, T0.B16
	VPMULL	B5.D1, T1.D1, ACC1.Q1
	VPMULL2	B5.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	reduce()
done:
	VST1	[ACC0.B16], (tPtr)

	RET