// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

// This is an optimized implementation of AES-GCM using AES-NI and CLMUL-NI.
// The implementation uses some of the optimizations described in:
// [1] Gueron, S., Kounavis, M.E.: Intel® Carry-Less Multiplication
//     Instruction and its Usage for Computing the GCM Mode rev. 2.02
// [2] Gueron, S., Krasnov, V.: Speeding up Counter Mode in Software and
//     Hardware

#include "textflag.h"

// Register roles shared by every routine in this file.

// B0-B7 hold the data or counter blocks being processed.
#define B0 X0
#define B1 X1
#define B2 X2
#define B3 X3
#define B4 X4
#define B5 X5
#define B6 X6
#define B7 X7

// Accumulators for the three Karatsuba partial products of a carry-less
// multiplication: ACC0 collects the low-half products (PCLMULQDQ $0x00),
// ACC1 the high-half products (PCLMULQDQ $0x11) and ACCM the products of
// the folded (high XOR low) halves.
#define ACC0 X8
#define ACC1 X9
#define ACCM X10

// T0-T2 are scratch registers. POLY and BSWAP hold the gcmPoly and
// bswapMask constants, which each routine loads on entry.
#define T0 X11
#define T1 X12
#define T2 X13
#define POLY X14
#define BSWAP X15

// PSHUFB control mask that reverses the order of the 16 bytes of a register.
DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f
DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607

// Constant used by the reduction steps that follow each 128x128-bit
// carry-less multiplication.
DATA gcmPoly<>+0x00(SB)/8, $0x0000000000000001
DATA gcmPoly<>+0x08(SB)/8, $0xc200000000000000

// andMask holds 15 masks of 16 bytes each. The mask at offset 16*(n-1)
// keeps the low n bytes of a register and clears the rest, for n = 1..15.
// The tail paths index it as -16(base)(n*16).
DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA andMask<>+0x08(SB)/8, $0x0000000000000000
DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA andMask<>+0x18(SB)/8, $0x0000000000000000
DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA andMask<>+0x28(SB)/8, $0x0000000000000000
DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA andMask<>+0x38(SB)/8, $0x0000000000000000
DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA andMask<>+0x48(SB)/8, $0x0000000000000000
DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0x58(SB)/8, $0x0000000000000000
DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA andMask<>+0x68(SB)/8, $0x0000000000000000
DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x78(SB)/8, $0x0000000000000000
DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff

GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
GLOBL andMask<>(SB), (NOPTR+RODATA), $240

// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
//
// gcmAesFinish folds the length block into the running hash stored at T and
// overwrites T with the byte-swapped result XORed with tagMask.
// The length block carries pLen*8 in its low quadword and dLen*8 in its high
// quadword. It is multiplied by productTable entry 14, using entry 15 for
// the Karatsuba middle term.
TEXT ·gcmAesFinish(SB),NOSPLIT,$0
#define pTbl DI
#define tMsk SI
#define tPtr DX
#define plen AX
#define dlen CX

	MOVQ productTable+0(FP), pTbl
	MOVQ tagMask+8(FP), tMsk
	MOVQ T+16(FP), tPtr
	MOVQ pLen+24(FP), plen
	MOVQ dLen+32(FP), dlen

	// ACC0 = running hash, T2 = tag mask (kept until the final XOR).
	MOVOU (tPtr), ACC0
	MOVOU (tMsk), T2

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	// Convert both byte counts to bit counts.
	SHLQ $3, plen
	SHLQ $3, dlen

	// B0 = length block: plen in the low quadword, dlen in the high one.
	MOVQ plen, B0
	PINSRQ $1, dlen, B0

	PXOR ACC0, B0

	// Karatsuba multiplication of B0 by table entry 14.
	MOVOU (16*14)(pTbl), ACC0
	MOVOU (16*15)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	// Recover the middle term and spread it over the low (ACC0) and
	// high (ACC1) halves of the 256-bit product.
	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	// First reduction step.
	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0

	// Second reduction step.
	MOVOU POLY, T0
	PCLMULQDQ $0x01, ACC0, T0
	PSHUFD $78, ACC0, ACC0
	PXOR T0, ACC0

	PXOR ACC1, ACC0

	// Reverse the bytes, apply the tag mask and write the result back to T.
	PSHUFB BSWAP, ACC0
	PXOR T2, ACC0
	MOVOU ACC0, (tPtr)

	RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

// func gcmAesInit(productTable *[256]byte, ks []uint32)
//
// gcmAesInit derives the hash key H from the expanded key schedule ks and
// fills productTable with eight powers of H together with their Karatsuba
// helper values.
//
// Table layout, as 16 entries of 16 bytes: entry 14 holds the doubled H and
// entry 15 the XOR of its two halves. Each loop iteration multiplies the
// previous power by H and stores the result two entries lower, so entries
// 12, 10, ..., 0 hold successive higher powers and each odd entry holds the
// folded halves of the even entry before it.
TEXT ·gcmAesInit(SB),NOSPLIT,$0
#define dst DI
#define KS SI
#define NR DX

	MOVQ productTable+0(FP), dst
	MOVQ ks_base+8(FP), KS
	MOVQ ks_len+16(FP), NR

	// NR = len(ks)/4 - 1. It is compared against 12 below to choose which
	// round key is the last one.
	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	// Encrypt block 0, with the AES key to generate the hash key H.
	// Round key 0 is loaded directly as the state: the all-zero block XORed
	// with round key 0 is round key 0 itself.
	MOVOU (16*0)(KS), B0
	MOVOU (16*1)(KS), T0
	AESENC T0, B0
	MOVOU (16*2)(KS), T0
	AESENC T0, B0
	MOVOU (16*3)(KS), T0
	AESENC T0, B0
	MOVOU (16*4)(KS), T0
	AESENC T0, B0
	MOVOU (16*5)(KS), T0
	AESENC T0, B0
	MOVOU (16*6)(KS), T0
	AESENC T0, B0
	MOVOU (16*7)(KS), T0
	AESENC T0, B0
	MOVOU (16*8)(KS), T0
	AESENC T0, B0
	MOVOU (16*9)(KS), T0
	AESENC T0, B0
	MOVOU (16*10)(KS), T0
	// NR < 12: round key 10 is the last one. NR == 12: round key 12.
	// Otherwise round key 14. The flags set by CMPQ survive until the JE
	// because neither AESENC nor MOVOU modifies them.
	CMPQ NR, $12
	JB initEncLast
	AESENC T0, B0
	MOVOU (16*11)(KS), T0
	AESENC T0, B0
	MOVOU (16*12)(KS), T0
	JE initEncLast
	AESENC T0, B0
	MOVOU (16*13)(KS), T0
	AESENC T0, B0
	MOVOU (16*14)(KS), T0
initEncLast:
	AESENCLAST T0, B0

	PSHUFB BSWAP, B0
	// H * 2
	// Shift the 128-bit value left by one bit, carrying between the 32-bit
	// lanes through T1, and XOR in POLY when the top bit was set (T0 is the
	// top lane's sign bit broadcast to all lanes).
	PSHUFD $0xff, B0, T0
	MOVOU B0, T1
	PSRAL $31, T0
	PAND POLY, T0
	PSRLL $31, T1
	PSLLDQ $4, T1
	PSLLL $1, B0
	PXOR T0, B0
	PXOR T1, B0
	// Karatsuba pre-computations
	MOVOU B0, (16*14)(dst)
	PSHUFD $78, B0, B1
	PXOR B0, B1
	MOVOU B1, (16*15)(dst)

	// B0/B1 keep H and its folded halves; B2/B3 track the current power.
	MOVOU B0, B2
	MOVOU B1, B3
	// Now prepare powers of H and pre-computations for them
	MOVQ $7, AX

initLoop:
		// T1:T0 = B2 * B0 (Karatsuba, middle term in T2).
		MOVOU B2, T0
		MOVOU B2, T1
		MOVOU B3, T2
		PCLMULQDQ $0x00, B0, T0
		PCLMULQDQ $0x11, B0, T1
		PCLMULQDQ $0x00, B1, T2

		PXOR T0, T2
		PXOR T1, T2
		MOVOU T2, B4
		PSLLDQ $8, B4
		PSRLDQ $8, T2
		PXOR B4, T0
		PXOR T2, T1

		// Two reduction steps; the reduced product ends up in B2.
		MOVOU POLY, B2
		PCLMULQDQ $0x01, T0, B2
		PSHUFD $78, T0, T0
		PXOR B2, T0
		MOVOU POLY, B2
		PCLMULQDQ $0x01, T0, B2
		PSHUFD $78, T0, T0
		PXOR T0, B2
		PXOR T1, B2

		// Store the new power and its folded halves.
		MOVOU B2, (16*12)(dst)
		PSHUFD $78, B2, B3
		PXOR B2, B3
		MOVOU B3, (16*13)(dst)

		// LEAQ leaves the flags from DECQ intact for the JNE below.
		DECQ AX
		LEAQ (-16*2)(dst), dst
	JNE initLoop

	RET
#undef NR
#undef KS
#undef dst

// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
//
// gcmAesData hashes data with the powers stored in productTable and writes
// the 16-byte result to T. The accumulator starts from zero; the previous
// contents of T are not read.
//
// Input of 128 bytes or more is consumed eight blocks at a time, the rest in
// single 16-byte blocks, and a final partial block is zero-padded. A length
// of exactly 13 bytes takes a dedicated path that loads the data with three
// fixed-size loads.
TEXT ·gcmAesData(SB),NOSPLIT,$0
#define pTbl DI
#define aut SI
#define tPtr CX
#define autLen DX

// reduceRound(a) performs one reduction step on a, using T0 as scratch.
// It is also used by the routines that follow this one in the file.
#define reduceRound(a) 	MOVOU POLY, T0;	PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
// mulRoundAAD(X, i) multiplies block X by table entry 2*i (Karatsuba helper
// in entry 2*i+1) and accumulates the three partial products into ACC0,
// ACC1 and ACCM. It clobbers X, T1 and T2.
#define mulRoundAAD(X ,i) \
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	PCLMULQDQ $0x00, X, T1;\
	PXOR T1, ACC0;\
	PCLMULQDQ $0x11, X, T2;\
	PXOR T2, ACC1;\
	PSHUFD $78, X, T1;\
	PXOR T1, X;\
	MOVOU (16*(i*2+1))(pTbl), T1;\
	PCLMULQDQ $0x00, X, T1;\
	PXOR T1, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ data_base+8(FP), aut
	MOVQ data_len+16(FP), autLen
	MOVQ T+32(FP), tPtr

	PXOR ACC0, ACC0
	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	// Empty input: store the zero accumulator and return.
	TESTQ autLen, autLen
	JEQ dataBail

	CMPQ autLen, $13	// optimize the TLS case
	JE dataTLS
	CMPQ autLen, $128
	JB startSinglesLoop
	JMP dataOctaLoop

dataTLS:
	// Load exactly 13 bytes (8 + 4 + 1) into a zeroed register. autLen is
	// cleared so that the code after dataMul finds nothing left to do.
	MOVOU (16*14)(pTbl), T1
	MOVOU (16*15)(pTbl), T2
	PXOR B0, B0
	MOVQ (aut), B0
	PINSRD $2, 8(aut), B0
	PINSRB $12, 12(aut), B0
	XORQ autLen, autLen
	JMP dataMul

dataOctaLoop:
		CMPQ autLen, $128
		JB startSinglesLoop
		SUBQ $128, autLen

		MOVOU (16*0)(aut), X0
		MOVOU (16*1)(aut), X1
		MOVOU (16*2)(aut), X2
		MOVOU (16*3)(aut), X3
		MOVOU (16*4)(aut), X4
		MOVOU (16*5)(aut), X5
		MOVOU (16*6)(aut), X6
		MOVOU (16*7)(aut), X7
		LEAQ (16*8)(aut), aut
		PSHUFB BSWAP, X0
		PSHUFB BSWAP, X1
		PSHUFB BSWAP, X2
		PSHUFB BSWAP, X3
		PSHUFB BSWAP, X4
		PSHUFB BSWAP, X5
		PSHUFB BSWAP, X6
		PSHUFB BSWAP, X7
		// The running hash is folded into the first block only.
		PXOR ACC0, X0

		// First block times table entry 0 initialises the accumulators.
		MOVOU (16*0)(pTbl), ACC0
		MOVOU (16*1)(pTbl), ACCM
		MOVOU ACC0, ACC1
		PSHUFD $78, X0, T1
		PXOR X0, T1
		PCLMULQDQ $0x00, X0, ACC0
		PCLMULQDQ $0x11, X0, ACC1
		PCLMULQDQ $0x00, T1, ACCM

		mulRoundAAD(X1, 1)
		mulRoundAAD(X2, 2)
		mulRoundAAD(X3, 3)
		mulRoundAAD(X4, 4)
		mulRoundAAD(X5, 5)
		mulRoundAAD(X6, 6)
		mulRoundAAD(X7, 7)

		// Combine the partial products and reduce once for all eight blocks.
		PXOR ACC0, ACCM
		PXOR ACC1, ACCM
		MOVOU ACCM, T0
		PSRLDQ $8, ACCM
		PSLLDQ $8, T0
		PXOR ACCM, ACC1
		PXOR T0, ACC0
		reduceRound(ACC0)
		reduceRound(ACC0)
		PXOR ACC1, ACC0
	JMP dataOctaLoop

startSinglesLoop:
	// T1/T2 keep table entries 14 and 15 for the single-block path.
	MOVOU (16*14)(pTbl), T1
	MOVOU (16*15)(pTbl), T2

dataSinglesLoop:

		CMPQ autLen, $16
		JB dataEnd
		SUBQ $16, autLen

		MOVOU (aut), B0
dataMul:
		// Entered from the loop above, from dataTLS, and from the partial
		// block loader below with the block already in B0.
		PSHUFB BSWAP, B0
		PXOR ACC0, B0

		MOVOU T1, ACC0
		MOVOU T2, ACCM
		MOVOU T1, ACC1

		PSHUFD $78, B0, T0
		PXOR B0, T0
		PCLMULQDQ $0x00, B0, ACC0
		PCLMULQDQ $0x11, B0, ACC1
		PCLMULQDQ $0x00, T0, ACCM

		PXOR ACC0, ACCM
		PXOR ACC1, ACCM
		MOVOU ACCM, T0
		PSRLDQ $8, ACCM
		PSLLDQ $8, T0
		PXOR ACCM, ACC1
		PXOR T0, ACC0

		reduceRound(ACC0)
		reduceRound(ACC0)
		PXOR ACC1, ACC0

		LEAQ 16(aut), aut

	JMP dataSinglesLoop

dataEnd:

	TESTQ autLen, autLen
	JEQ dataBail

	// Build the final partial block in B0 one byte at a time, starting from
	// the last byte, so no memory past the end of data is read. The loop
	// leaves autLen at zero, which ends processing after dataMul.
	PXOR B0, B0
	LEAQ -1(aut)(autLen*1), aut

dataLoadLoop:

		PSLLDQ $1, B0
		PINSRB $0, (aut), B0

		LEAQ -1(aut), aut
		DECQ autLen
		JNE dataLoadLoop

	JMP dataMul

dataBail:
	MOVOU ACC0, (tPtr)
	RET
#undef pTbl
#undef aut
#undef tPtr
#undef autLen

// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
//
// gcmAesEnc encrypts src into dst in counter mode and folds the ciphertext
// it produces into the hash state at T, which is read on entry and written
// on exit.
//
// The block at ctr is read but never written. Its last 32-bit word is
// handled as a big-endian counter that is incremented before every block,
// so the first keystream block uses the counter value plus one. Only that
// 32-bit word is ever modified.
//
// Stack frame (256 bytes):
//	0..127(SP)	eight byte-swapped ciphertext blocks waiting to be hashed
//	128..255(SP)	eight counter blocks, already XORed with round key 0
//
// The register aliases and the aesRnd, aesRound and aesRndLast macros
// defined here stay defined for the code that follows; only increment is
// undefined at the end.
TEXT ·gcmAesEnc(SB),0,$256-96
#define pTbl DI
#define ctx DX
#define ctrPtr CX
#define ptx SI
#define ks AX
#define tPtr R8
#define ptxLen R9
#define aluCTR R10
#define aluTMP R11
#define aluK R12
#define NR R13

// increment(i) bumps the counter in aluCTR and patches the last word of
// counter slot i on the stack with the new value XORed with the matching
// word of round key 0 (aluK), byte-swapped back to memory order.
#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + 8*16 + i*16)(SP)
// aesRnd(k) applies one AES round with the round key in register k to B0-B7.
#define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
// aesRound(i) loads round key i into T0 and applies it to B0-B7.
#define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
// aesRndLast(k) applies the final AES round with key register k to B0-B7.
#define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
// combinedRound(i) interleaves AES round i on B0-B7 with the hash
// multiplication of staged ciphertext block i (stack slot i) by table entry
// 2*i, accumulating into ACC0, ACC1 and ACCM. Clobbers T0, T1 and T2.
#define combinedRound(i) \
	MOVOU (16*i)(ks), T0;\
	AESENC T0, B0;\
	AESENC T0, B1;\
	AESENC T0, B2;\
	AESENC T0, B3;\
	 MOVOU (16*(i*2))(pTbl), T1;\
	 MOVOU T1, T2;\
	AESENC T0, B4;\
	AESENC T0, B5;\
	AESENC T0, B6;\
	AESENC T0, B7;\
	 MOVOU (16*i)(SP), T0;\
	 PCLMULQDQ $0x00, T0, T1;\
	 PXOR T1, ACC0;\
	 PSHUFD $78, T0, T1;\
	 PCLMULQDQ $0x11, T0, T2;\
	 PXOR T1, T0;\
	 PXOR T2, ACC1;\
	 MOVOU (16*(i*2+1))(pTbl), T2;\
	 PCLMULQDQ $0x00, T2, T0;\
	 PXOR T0, ACCM
// mulRound(i) is the hash-only half of combinedRound: it multiplies staged
// ciphertext block i by table entry 2*i. Clobbers T0, T1 and T2.
#define mulRound(i) \
	MOVOU (16*i)(SP), T0;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T2, ACC1;\
	PSHUFD $78, T0, T1;\
	PXOR T1, T0;\
	MOVOU (16*(i*2+1))(pTbl), T1;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ dst+8(FP), ctx
	MOVQ src_base+32(FP), ptx
	MOVQ src_len+40(FP), ptxLen
	MOVQ ctr+56(FP), ctrPtr
	MOVQ T+64(FP), tPtr
	MOVQ ks_base+72(FP), ks
	MOVQ ks_len+80(FP), NR

	// NR = len(ks)/4 - 1, compared against 12 to select the last round key.
	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	MOVOU (tPtr), ACC0
	PXOR ACC1, ACC1
	PXOR ACCM, ACCM
	// aluCTR = last word of the counter block, aluK = last word of round
	// key 0, both byte-swapped so that ADDL increments the counter.
	MOVOU (ctrPtr), B0
	MOVL (3*4)(ctrPtr), aluCTR
	MOVOU (ks), T0
	MOVL (3*4)(ks), aluK
	BSWAPL aluCTR
	BSWAPL aluK

	// T0 = counter block XOR round key 0. Each slot starts as a copy of T0
	// and increment() then patches its counter word.
	PXOR B0, T0
	MOVOU T0, (8*16 + 0*16)(SP)
	increment(0)

	CMPQ ptxLen, $128
	JB gcmAesEncSingles
	SUBQ $128, ptxLen

	// We have at least 8 blocks to encrypt, prepare the rest of the counters
	MOVOU T0, (8*16 + 1*16)(SP)
	increment(1)
	MOVOU T0, (8*16 + 2*16)(SP)
	increment(2)
	MOVOU T0, (8*16 + 3*16)(SP)
	increment(3)
	MOVOU T0, (8*16 + 4*16)(SP)
	increment(4)
	MOVOU T0, (8*16 + 5*16)(SP)
	increment(5)
	MOVOU T0, (8*16 + 6*16)(SP)
	increment(6)
	MOVOU T0, (8*16 + 7*16)(SP)
	increment(7)

	MOVOU (8*16 + 0*16)(SP), B0
	MOVOU (8*16 + 1*16)(SP), B1
	MOVOU (8*16 + 2*16)(SP), B2
	MOVOU (8*16 + 3*16)(SP), B3
	MOVOU (8*16 + 4*16)(SP), B4
	MOVOU (8*16 + 5*16)(SP), B5
	MOVOU (8*16 + 6*16)(SP), B6
	MOVOU (8*16 + 7*16)(SP), B7

	// Encrypt the first eight counters. The interleaved increments already
	// prepare the counter slots for the next group of eight.
	aesRound(1)
	increment(0)
	aesRound(2)
	increment(1)
	aesRound(3)
	increment(2)
	aesRound(4)
	increment(3)
	aesRound(5)
	increment(4)
	aesRound(6)
	increment(5)
	aesRound(7)
	increment(6)
	aesRound(8)
	increment(7)
	aesRound(9)
	MOVOU (16*10)(ks), T0
	// The flags from this CMPQ are still valid at the JE below: the
	// instructions in between do not modify them.
	CMPQ NR, $12
	JB encLast1
	aesRnd(T0)
	aesRound(11)
	MOVOU (16*12)(ks), T0
	JE encLast1
	aesRnd(T0)
	aesRound(13)
	MOVOU (16*14)(ks), T0
encLast1:
	aesRndLast(T0)

	// XOR the keystream with the plaintext.
	MOVOU (16*0)(ptx), T0
	PXOR T0, B0
	MOVOU (16*1)(ptx), T0
	PXOR T0, B1
	MOVOU (16*2)(ptx), T0
	PXOR T0, B2
	MOVOU (16*3)(ptx), T0
	PXOR T0, B3
	MOVOU (16*4)(ptx), T0
	PXOR T0, B4
	MOVOU (16*5)(ptx), T0
	PXOR T0, B5
	MOVOU (16*6)(ptx), T0
	PXOR T0, B6
	MOVOU (16*7)(ptx), T0
	PXOR T0, B7

	// Write the ciphertext out, then byte-swap each block for hashing.
	// The running hash is folded into the first block only.
	MOVOU B0, (16*0)(ctx)
	PSHUFB BSWAP, B0
	PXOR ACC0, B0
	MOVOU B1, (16*1)(ctx)
	PSHUFB BSWAP, B1
	MOVOU B2, (16*2)(ctx)
	PSHUFB BSWAP, B2
	MOVOU B3, (16*3)(ctx)
	PSHUFB BSWAP, B3
	MOVOU B4, (16*4)(ctx)
	PSHUFB BSWAP, B4
	MOVOU B5, (16*5)(ctx)
	PSHUFB BSWAP, B5
	MOVOU B6, (16*6)(ctx)
	PSHUFB BSWAP, B6
	MOVOU B7, (16*7)(ctx)
	PSHUFB BSWAP, B7

	// Stage the blocks on the stack; they are hashed while the next group
	// of eight is being encrypted.
	MOVOU B0, (16*0)(SP)
	MOVOU B1, (16*1)(SP)
	MOVOU B2, (16*2)(SP)
	MOVOU B3, (16*3)(SP)
	MOVOU B4, (16*4)(SP)
	MOVOU B5, (16*5)(SP)
	MOVOU B6, (16*6)(SP)
	MOVOU B7, (16*7)(SP)

	LEAQ 128(ptx), ptx
	LEAQ 128(ctx), ctx

gcmAesEncOctetsLoop:

		CMPQ ptxLen, $128
		JB gcmAesEncOctetsEnd
		SUBQ $128, ptxLen

		MOVOU (8*16 + 0*16)(SP), B0
		MOVOU (8*16 + 1*16)(SP), B1
		MOVOU (8*16 + 2*16)(SP), B2
		MOVOU (8*16 + 3*16)(SP), B3
		MOVOU (8*16 + 4*16)(SP), B4
		MOVOU (8*16 + 5*16)(SP), B5
		MOVOU (8*16 + 6*16)(SP), B6
		MOVOU (8*16 + 7*16)(SP), B7

		// Staged block 0 times table entry 0 initialises the accumulators.
		MOVOU (16*0)(SP), T0
		PSHUFD $78, T0, T1
		PXOR T0, T1

		MOVOU (16*0)(pTbl), ACC0
		MOVOU (16*1)(pTbl), ACCM
		MOVOU ACC0, ACC1

		PCLMULQDQ $0x00, T1, ACCM
		PCLMULQDQ $0x00, T0, ACC0
		PCLMULQDQ $0x11, T0, ACC1

		combinedRound(1)
		increment(0)
		combinedRound(2)
		increment(1)
		combinedRound(3)
		increment(2)
		combinedRound(4)
		increment(3)
		combinedRound(5)
		increment(4)
		combinedRound(6)
		increment(5)
		combinedRound(7)
		increment(6)

		aesRound(8)
		increment(7)

		// Combine the partial products of the eight staged blocks.
		PXOR ACC0, ACCM
		PXOR ACC1, ACCM
		MOVOU ACCM, T0
		PSRLDQ $8, ACCM
		PSLLDQ $8, T0
		PXOR ACCM, ACC1
		PXOR T0, ACC0

		reduceRound(ACC0)
		aesRound(9)

		reduceRound(ACC0)
		PXOR ACC1, ACC0

		MOVOU (16*10)(ks), T0
		CMPQ NR, $12
		JB encLast2
		aesRnd(T0)
		aesRound(11)
		MOVOU (16*12)(ks), T0
		JE encLast2
		aesRnd(T0)
		aesRound(13)
		MOVOU (16*14)(ks), T0
encLast2:
		aesRndLast(T0)

		MOVOU (16*0)(ptx), T0
		PXOR T0, B0
		MOVOU (16*1)(ptx), T0
		PXOR T0, B1
		MOVOU (16*2)(ptx), T0
		PXOR T0, B2
		MOVOU (16*3)(ptx), T0
		PXOR T0, B3
		MOVOU (16*4)(ptx), T0
		PXOR T0, B4
		MOVOU (16*5)(ptx), T0
		PXOR T0, B5
		MOVOU (16*6)(ptx), T0
		PXOR T0, B6
		MOVOU (16*7)(ptx), T0
		PXOR T0, B7

		MOVOU B0, (16*0)(ctx)
		PSHUFB BSWAP, B0
		PXOR ACC0, B0
		MOVOU B1, (16*1)(ctx)
		PSHUFB BSWAP, B1
		MOVOU B2, (16*2)(ctx)
		PSHUFB BSWAP, B2
		MOVOU B3, (16*3)(ctx)
		PSHUFB BSWAP, B3
		MOVOU B4, (16*4)(ctx)
		PSHUFB BSWAP, B4
		MOVOU B5, (16*5)(ctx)
		PSHUFB BSWAP, B5
		MOVOU B6, (16*6)(ctx)
		PSHUFB BSWAP, B6
		MOVOU B7, (16*7)(ctx)
		PSHUFB BSWAP, B7

		MOVOU B0, (16*0)(SP)
		MOVOU B1, (16*1)(SP)
		MOVOU B2, (16*2)(SP)
		MOVOU B3, (16*3)(SP)
		MOVOU B4, (16*4)(SP)
		MOVOU B5, (16*5)(SP)
		MOVOU B6, (16*6)(SP)
		MOVOU B7, (16*7)(SP)

		LEAQ 128(ptx), ptx
		LEAQ 128(ctx), ctx

		JMP gcmAesEncOctetsLoop

gcmAesEncOctetsEnd:

	// Hash the last group of eight ciphertext blocks still staged on the
	// stack.
	MOVOU (16*0)(SP), T0
	MOVOU (16*0)(pTbl), ACC0
	MOVOU (16*1)(pTbl), ACCM
	MOVOU ACC0, ACC1
	PSHUFD $78, T0, T1
	PXOR T0, T1
	PCLMULQDQ $0x00, T0, ACC0
	PCLMULQDQ $0x11, T0, ACC1
	PCLMULQDQ $0x00, T1, ACCM

	mulRound(1)
	mulRound(2)
	mulRound(3)
	mulRound(4)
	mulRound(5)
	mulRound(6)
	mulRound(7)

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	TESTQ ptxLen, ptxLen
	JE gcmAesEncDone

	// Eight counters were prepared ahead but only slot 0 is used from here
	// on, so rewind aluCTR to the value stored in slot 0.
	SUBQ $7, aluCTR

gcmAesEncSingles:

	// Keep round keys 1-7 in B1-B7 and table entry 14 in T2 for the
	// single-block loop and the tail.
	MOVOU (16*1)(ks), B1
	MOVOU (16*2)(ks), B2
	MOVOU (16*3)(ks), B3
	MOVOU (16*4)(ks), B4
	MOVOU (16*5)(ks), B5
	MOVOU (16*6)(ks), B6
	MOVOU (16*7)(ks), B7

	MOVOU (16*14)(pTbl), T2

gcmAesEncSinglesLoop:

		CMPQ ptxLen, $16
		JB gcmAesEncTail
		SUBQ $16, ptxLen

		// Take the prepared counter from slot 0 and prepare the next one.
		MOVOU (8*16 + 0*16)(SP), B0
		increment(0)

		AESENC B1, B0
		AESENC B2, B0
		AESENC B3, B0
		AESENC B4, B0
		AESENC B5, B0
		AESENC B6, B0
		AESENC B7, B0
		MOVOU (16*8)(ks), T0
		AESENC T0, B0
		MOVOU (16*9)(ks), T0
		AESENC T0, B0
		MOVOU (16*10)(ks), T0
		CMPQ NR, $12
		JB encLast3
		AESENC T0, B0
		MOVOU (16*11)(ks), T0
		AESENC T0, B0
		MOVOU (16*12)(ks), T0
		JE encLast3
		AESENC T0, B0
		MOVOU (16*13)(ks), T0
		AESENC T0, B0
		MOVOU (16*14)(ks), T0
encLast3:
		AESENCLAST T0, B0

		MOVOU (ptx), T0
		PXOR T0, B0
		MOVOU B0, (ctx)

		// Fold the new ciphertext block into the hash.
		PSHUFB BSWAP, B0
		PXOR ACC0, B0

		MOVOU T2, ACC0
		MOVOU T2, ACC1
		MOVOU (16*15)(pTbl), ACCM

		PSHUFD $78, B0, T0
		PXOR B0, T0
		PCLMULQDQ $0x00, B0, ACC0
		PCLMULQDQ $0x11, B0, ACC1
		PCLMULQDQ $0x00, T0, ACCM

		PXOR ACC0, ACCM
		PXOR ACC1, ACCM
		MOVOU ACCM, T0
		PSRLDQ $8, ACCM
		PSLLDQ $8, T0
		PXOR ACCM, ACC1
		PXOR T0, ACC0

		reduceRound(ACC0)
		reduceRound(ACC0)
		PXOR ACC1, ACC0

		LEAQ (16*1)(ptx), ptx
		LEAQ (16*1)(ctx), ctx

	JMP gcmAesEncSinglesLoop

gcmAesEncTail:
	TESTQ ptxLen, ptxLen
	JE gcmAesEncDone

	// Final partial block (1..15 bytes): produce one more keystream block.
	MOVOU (8*16 + 0*16)(SP), B0
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB encLast4
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE encLast4
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
encLast4:
	AESENCLAST T0, B0
	MOVOU B0, T0

	// Point ptx at the last plaintext byte.
	LEAQ -1(ptx)(ptxLen*1), ptx

	// T1 = andMask entry that keeps the low ptxLen bytes. aluCTR is reused
	// as a plain address register; the counter is no longer needed.
	MOVQ ptxLen, aluTMP
	SHLQ $4, aluTMP

	LEAQ andMask<>(SB), aluCTR
	MOVOU -16(aluCTR)(aluTMP*1), T1

	// Gather the remaining plaintext bytes last-to-first so that nothing
	// past the end of src is read.
	PXOR B0, B0
ptxLoadLoop:
		PSLLDQ $1, B0
		PINSRB $0, (ptx), B0
		LEAQ -1(ptx), ptx
		DECQ ptxLen
	JNE ptxLoadLoop

	// Encrypt, clear the bytes beyond the message, and store. This writes a
	// full 16 bytes, so dst must have room past the partial block.
	PXOR T0, B0
	PAND T1, B0
	MOVOU B0, (ctx)	// I assume there is always space, due to TAG in the end of the CT

	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU T2, ACC0
	MOVOU T2, ACC1
	MOVOU (16*15)(pTbl), ACCM

	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

gcmAesEncDone:
	MOVOU ACC0, (tPtr)
	RET
#undef increment

// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
//
// gcmAesDec folds the ciphertext in src into the hash state at T (read on
// entry, written on exit) and decrypts it into dst in counter mode.
//
// Counter handling matches gcmAesEnc: the block at ctr is read but never
// written, and its last 32-bit word is incremented before every block.
//
// Stack frame (128 bytes): eight counter blocks, already XORed with round
// key 0, at 0..127(SP). The ciphertext is hashed straight from src, so no
// staging area is needed.
//
// This routine relies on the register aliases (pTbl, ctx, ptx, ks, ...) and
// on the aesRnd, aesRound, aesRndLast and reduceRound macros defined earlier
// in the file. Note that here ptx points at dst and ctx at src.
TEXT ·gcmAesDec(SB),0,$128-96
// increment(i) bumps the counter in aluCTR and patches the last word of
// counter slot i on the stack with the new value XORed with the matching
// word of round key 0 (aluK), byte-swapped back to memory order.
#define increment(i) ADDL $1, aluCTR; MOVL aluCTR, aluTMP; XORL aluK, aluTMP; BSWAPL aluTMP; MOVL aluTMP, (3*4 + i*16)(SP)
// combinedDecRound(i) interleaves AES round i on B0-B7 with the hash
// multiplication of ciphertext block i, read from ctx and byte-swapped, by
// table entry 2*i. Accumulates into ACC0, ACC1 and ACCM and clobbers T0, T1
// and T2.
#define combinedDecRound(i) \
	MOVOU (16*i)(ks), T0;\
	AESENC T0, B0;\
	AESENC T0, B1;\
	AESENC T0, B2;\
	AESENC T0, B3;\
	MOVOU (16*(i*2))(pTbl), T1;\
	MOVOU T1, T2;\
	AESENC T0, B4;\
	AESENC T0, B5;\
	AESENC T0, B6;\
	AESENC T0, B7;\
	MOVOU (16*i)(ctx), T0;\
	PSHUFB BSWAP, T0;\
	PCLMULQDQ $0x00, T0, T1;\
	PXOR T1, ACC0;\
	PSHUFD $78, T0, T1;\
	PCLMULQDQ $0x11, T0, T2;\
	PXOR T1, T0;\
	PXOR T2, ACC1;\
	MOVOU (16*(i*2+1))(pTbl), T2;\
	PCLMULQDQ $0x00, T2, T0;\
	PXOR T0, ACCM

	MOVQ productTable+0(FP), pTbl
	MOVQ dst+8(FP), ptx
	MOVQ src_base+32(FP), ctx
	MOVQ src_len+40(FP), ptxLen
	MOVQ ctr+56(FP), ctrPtr
	MOVQ T+64(FP), tPtr
	MOVQ ks_base+72(FP), ks
	MOVQ ks_len+80(FP), NR

	// NR = len(ks)/4 - 1, compared against 12 to select the last round key.
	SHRQ $2, NR
	DECQ NR

	MOVOU bswapMask<>(SB), BSWAP
	MOVOU gcmPoly<>(SB), POLY

	MOVOU (tPtr), ACC0
	PXOR ACC1, ACC1
	PXOR ACCM, ACCM
	// aluCTR = last word of the counter block, aluK = last word of round
	// key 0, both byte-swapped so that ADDL increments the counter.
	MOVOU (ctrPtr), B0
	MOVL (3*4)(ctrPtr), aluCTR
	MOVOU (ks), T0
	MOVL (3*4)(ks), aluK
	BSWAPL aluCTR
	BSWAPL aluK

	// T0 = counter block XOR round key 0. Each slot starts as a copy of T0
	// and increment() then patches its counter word.
	PXOR B0, T0
	MOVOU T0, (0*16)(SP)
	increment(0)

	CMPQ ptxLen, $128
	JB gcmAesDecSingles

	// At least eight blocks: prepare the remaining seven counters.
	MOVOU T0, (1*16)(SP)
	increment(1)
	MOVOU T0, (2*16)(SP)
	increment(2)
	MOVOU T0, (3*16)(SP)
	increment(3)
	MOVOU T0, (4*16)(SP)
	increment(4)
	MOVOU T0, (5*16)(SP)
	increment(5)
	MOVOU T0, (6*16)(SP)
	increment(6)
	MOVOU T0, (7*16)(SP)
	increment(7)

gcmAesDecOctetsLoop:

		CMPQ ptxLen, $128
		JB gcmAesDecEndOctets
		SUBQ $128, ptxLen

		MOVOU (0*16)(SP), B0
		MOVOU (1*16)(SP), B1
		MOVOU (2*16)(SP), B2
		MOVOU (3*16)(SP), B3
		MOVOU (4*16)(SP), B4
		MOVOU (5*16)(SP), B5
		MOVOU (6*16)(SP), B6
		MOVOU (7*16)(SP), B7

		// Ciphertext block 0, with the running hash folded in, times table
		// entry 0 initialises the accumulators.
		MOVOU (16*0)(ctx), T0
		PSHUFB BSWAP, T0
		PXOR ACC0, T0
		PSHUFD $78, T0, T1
		PXOR T0, T1

		MOVOU (16*0)(pTbl), ACC0
		MOVOU (16*1)(pTbl), ACCM
		MOVOU ACC0, ACC1

		PCLMULQDQ $0x00, T1, ACCM
		PCLMULQDQ $0x00, T0, ACC0
		PCLMULQDQ $0x11, T0, ACC1

		// The interleaved increments prepare the counters for the next
		// group of eight.
		combinedDecRound(1)
		increment(0)
		combinedDecRound(2)
		increment(1)
		combinedDecRound(3)
		increment(2)
		combinedDecRound(4)
		increment(3)
		combinedDecRound(5)
		increment(4)
		combinedDecRound(6)
		increment(5)
		combinedDecRound(7)
		increment(6)

		aesRound(8)
		increment(7)

		// Combine the partial products of the eight blocks and reduce.
		PXOR ACC0, ACCM
		PXOR ACC1, ACCM
		MOVOU ACCM, T0
		PSRLDQ $8, ACCM
		PSLLDQ $8, T0
		PXOR ACCM, ACC1
		PXOR T0, ACC0

		reduceRound(ACC0)
		aesRound(9)

		reduceRound(ACC0)
		PXOR ACC1, ACC0

		MOVOU (16*10)(ks), T0
		// The flags from this CMPQ are still valid at the JE below: the
		// instructions in between do not modify them.
		CMPQ NR, $12
		JB decLast1
		aesRnd(T0)
		aesRound(11)
		MOVOU (16*12)(ks), T0
		JE decLast1
		aesRnd(T0)
		aesRound(13)
		MOVOU (16*14)(ks), T0
decLast1:
		aesRndLast(T0)

		// XOR the keystream with the ciphertext to recover the plaintext.
		MOVOU (16*0)(ctx), T0
		PXOR T0, B0
		MOVOU (16*1)(ctx), T0
		PXOR T0, B1
		MOVOU (16*2)(ctx), T0
		PXOR T0, B2
		MOVOU (16*3)(ctx), T0
		PXOR T0, B3
		MOVOU (16*4)(ctx), T0
		PXOR T0, B4
		MOVOU (16*5)(ctx), T0
		PXOR T0, B5
		MOVOU (16*6)(ctx), T0
		PXOR T0, B6
		MOVOU (16*7)(ctx), T0
		PXOR T0, B7

		MOVOU B0, (16*0)(ptx)
		MOVOU B1, (16*1)(ptx)
		MOVOU B2, (16*2)(ptx)
		MOVOU B3, (16*3)(ptx)
		MOVOU B4, (16*4)(ptx)
		MOVOU B5, (16*5)(ptx)
		MOVOU B6, (16*6)(ptx)
		MOVOU B7, (16*7)(ptx)

		LEAQ 128(ptx), ptx
		LEAQ 128(ctx), ctx

		JMP gcmAesDecOctetsLoop

gcmAesDecEndOctets:

	// Eight counters were prepared ahead but only slot 0 is used from here
	// on, so rewind aluCTR to the value stored in slot 0.
	SUBQ $7, aluCTR

gcmAesDecSingles:

	// Keep round keys 1-7 in B1-B7 and table entry 14 in T2 for the
	// single-block loop.
	MOVOU (16*1)(ks), B1
	MOVOU (16*2)(ks), B2
	MOVOU (16*3)(ks), B3
	MOVOU (16*4)(ks), B4
	MOVOU (16*5)(ks), B5
	MOVOU (16*6)(ks), B6
	MOVOU (16*7)(ks), B7

	MOVOU (16*14)(pTbl), T2

gcmAesDecSinglesLoop:

		CMPQ ptxLen, $16
		JB gcmAesDecTail
		SUBQ $16, ptxLen

		// T1 keeps the raw ciphertext block for the final XOR.
		MOVOU (ctx), B0
		MOVOU B0, T1
		PSHUFB BSWAP, B0
		PXOR ACC0, B0

		MOVOU T2, ACC0
		MOVOU T2, ACC1
		MOVOU (16*15)(pTbl), ACCM

		PCLMULQDQ $0x00, B0, ACC0
		PCLMULQDQ $0x11, B0, ACC1
		PSHUFD $78, B0, T0
		PXOR B0, T0
		PCLMULQDQ $0x00, T0, ACCM

		PXOR ACC0, ACCM
		PXOR ACC1, ACCM
		MOVOU ACCM, T0
		PSRLDQ $8, ACCM
		PSLLDQ $8, T0
		PXOR ACCM, ACC1
		PXOR T0, ACC0

		reduceRound(ACC0)
		reduceRound(ACC0)
		PXOR ACC1, ACC0

		// Take the prepared counter from slot 0 and prepare the next one.
		MOVOU (0*16)(SP), B0
		increment(0)
		AESENC B1, B0
		AESENC B2, B0
		AESENC B3, B0
		AESENC B4, B0
		AESENC B5, B0
		AESENC B6, B0
		AESENC B7, B0
		MOVOU (16*8)(ks), T0
		AESENC T0, B0
		MOVOU (16*9)(ks), T0
		AESENC T0, B0
		MOVOU (16*10)(ks), T0
		CMPQ NR, $12
		JB decLast2
		AESENC T0, B0
		MOVOU (16*11)(ks), T0
		AESENC T0, B0
		MOVOU (16*12)(ks), T0
		JE decLast2
		AESENC T0, B0
		MOVOU (16*13)(ks), T0
		AESENC T0, B0
		MOVOU (16*14)(ks), T0
decLast2:
		AESENCLAST T0, B0

		PXOR T1, B0
		MOVOU B0, (ptx)

		LEAQ (16*1)(ptx), ptx
		LEAQ (16*1)(ctx), ctx

	JMP gcmAesDecSinglesLoop

gcmAesDecTail:

	TESTQ ptxLen, ptxLen
	JE gcmAesDecDone

	// Final partial block (1..15 bytes). T1 = andMask entry that keeps the
	// low ptxLen bytes. aluCTR is reused as a plain address register; the
	// counter value it held is not needed again.
	MOVQ ptxLen, aluTMP
	SHLQ $4, aluTMP
	LEAQ andMask<>(SB), aluCTR
	MOVOU -16(aluCTR)(aluTMP*1), T1

	// This reads a full 16 bytes from src; the bytes beyond the partial
	// block are masked off but must be readable.
	MOVOU (ctx), B0	// I assume there is TAG attached to the ctx, and there is no read overflow
	PAND T1, B0

	// T1 now keeps the masked ciphertext for the final XOR.
	MOVOU B0, T1
	PSHUFB BSWAP, B0
	PXOR ACC0, B0

	MOVOU (16*14)(pTbl), ACC0
	MOVOU (16*15)(pTbl), ACCM
	MOVOU ACC0, ACC1

	PCLMULQDQ $0x00, B0, ACC0
	PCLMULQDQ $0x11, B0, ACC1
	PSHUFD $78, B0, T0
	PXOR B0, T0
	PCLMULQDQ $0x00, T0, ACCM

	PXOR ACC0, ACCM
	PXOR ACC1, ACCM
	MOVOU ACCM, T0
	PSRLDQ $8, ACCM
	PSLLDQ $8, T0
	PXOR ACCM, ACC1
	PXOR T0, ACC0

	reduceRound(ACC0)
	reduceRound(ACC0)
	PXOR ACC1, ACC0

	// Last keystream block. No further counter is prepared: aluCTR no
	// longer holds the counter and slot 0 is not read again.
	MOVOU (0*16)(SP), B0
	AESENC B1, B0
	AESENC B2, B0
	AESENC B3, B0
	AESENC B4, B0
	AESENC B5, B0
	AESENC B6, B0
	AESENC B7, B0
	MOVOU (16*8)(ks), T0
	AESENC T0, B0
	MOVOU (16*9)(ks), T0
	AESENC T0, B0
	MOVOU (16*10)(ks), T0
	CMPQ NR, $12
	JB decLast3
	AESENC T0, B0
	MOVOU (16*11)(ks), T0
	AESENC T0, B0
	MOVOU (16*12)(ks), T0
	JE decLast3
	AESENC T0, B0
	MOVOU (16*13)(ks), T0
	AESENC T0, B0
	MOVOU (16*14)(ks), T0
decLast3:
	AESENCLAST T0, B0
	PXOR T1, B0

	// Store the plaintext one byte at a time so that nothing past the end
	// of the partial block is written to dst.
ptxStoreLoop:
		PEXTRB $0, B0, (ptx)
		PSRLDQ $1, B0
		LEAQ 1(ptx), ptx
		DECQ ptxLen

	JNE ptxStoreLoop

gcmAesDecDone:

	MOVOU ACC0, (tPtr)
	RET
