// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"

#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

#define ACC0 V8
#define ACC1 V9
#define ACCM V10

#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14

#define POLY V15
#define ZERO V16
#define INC V17
#define CTR V18

#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define K4 V23
#define K5 V24
#define K6 V25
#define K7 V26
#define K8 V27
#define K9 V28
#define K10 V29
#define K11 V30
#define KLAST V31

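// reduce() collapses one 256-bit GHASH product back to 128 bits. On entry
// ACC1:ACC0 hold the high and low halves of the product and ACCM holds the
// Karatsuba middle term; on exit ACC0 holds the reduced value. The two
// VPMULLs by POLY (the bit-reflected reduction constant 0xC2<<56 : 1) fold
// the low half up in two 64-bit steps. As a rough Go sketch, with
// hypothetical clmul/swap64 helpers (for orientation only; the macro below
// is the actual implementation):
//
//	mid ^= lo ^ hi                        // Karatsuba recombination
//	lo ^= mid << 64                       // middle term into the halves
//	hi ^= mid >> 64
//	lo = swap64(lo) ^ clmul(poly, lo[0])  // fold, twice
//	lo = swap64(lo) ^ clmul(poly, lo[0])
//	return lo ^ hi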
#define reduce() \
	VEOR	ACC0.B16, ACCM.B16, ACCM.B16     \
	VEOR	ACC1.B16, ACCM.B16, ACCM.B16     \
	VEXT	$8, ZERO.B16, ACCM.B16, T0.B16   \
	VEXT	$8, ACCM.B16, ZERO.B16, ACCM.B16 \
	VEOR	ACCM.B16, ACC0.B16, ACC0.B16     \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEXT	$8, ACC0.B16, ACC0.B16, ACC0.B16 \
	VEOR	T0.B16, ACC0.B16, ACC0.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VEXT	$8, ACC1.B16, ACC1.B16, ACC1.B16 \
	VEOR	ACC1.B16, ACC0.B16, ACC0.B16     \

// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
TEXT ·gcmAesFinish(SB),NOSPLIT,$0
#define pTbl R0
#define tMsk R1
#define tPtr R2
#define plen R3
#define dlen R4

	MOVD	$0xC2, R1
	LSL	$56, R1
	MOVD	$1, R0
	VMOV	R1, POLY.D[0]
	VMOV	R0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	tagMask+8(FP), tMsk
	MOVD	T+16(FP), tPtr
	MOVD	pLen+24(FP), plen
	MOVD	dLen+32(FP), dlen

	VLD1	(tPtr), [ACC0.B16]
	VLD1	(tMsk), [B1.B16]

	LSL	$3, plen
	LSL	$3, dlen

	VMOV	dlen, B0.D[0]
	VMOV	plen, B0.D[1]

	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]

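	// One more GHASH multiplication folds the length block (the bit
	// lengths of the data, dLen, and the plaintext, pLen) into the
	// running hash, then the result is byte-reversed and masked.
	// In Go terms, approximately (gmul/reverse are hypothetical helpers):
	//
	//	S = gmul(S ^ lenBlock, H)   // lenBlock = dLen*8 : pLen*8
	//	tag = reverse(S) ^ tagMask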
	VEOR	ACC0.B16, B0.B16, B0.B16

	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	VREV64	ACC0.B16, ACC0.B16
	VEOR	B1.B16, ACC0.B16, ACC0.B16

	VST1	[ACC0.B16], (tPtr)
	RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen

// func gcmAesInit(productTable *[256]byte, ks []uint32)
TEXT ·gcmAesInit(SB),NOSPLIT,$0
#define pTbl R0
#define KS R1
#define NR R2
#define I R3
	MOVD	productTable+0(FP), pTbl
	MOVD	ks_base+8(FP), KS
	MOVD	ks_len+16(FP), NR

	MOVD	$0xC2, I
	LSL	$56, I
	VMOV	I, POLY.D[0]
	MOVD	$1, I
	VMOV	I, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	// Encrypt block 0 with the AES key to generate the hash key H
	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	VEOR	B0.B16, B0.B16, B0.B16
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T3.B16, B0.B16
	AESMC	B0.B16, B0.B16
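	// len(ks) is 44, 52 or 60 words (4*(rounds+1)) for AES-128/192/256.
	// Bit 4 of len(ks) is set only for 52 and 60, and among those bit 3
	// is set only for 60, so two TBZ tests select how many extra rounds
	// to run (the encrypt/decrypt functions below use the same trick):
	//
	//	44 = 0b101100  AES-128: no extra rounds
	//	52 = 0b110100  AES-192: two extra rounds
	//	60 = 0b111100  AES-256: four extra rounds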
	TBZ	$4, NR, initEncFinish
	VLD1.P	32(KS), [T0.B16, T1.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	TBZ	$3, NR, initEncFinish
	VLD1.P	32(KS), [T0.B16, T1.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
initEncFinish:
	VLD1	(KS), [T0.B16, T1.B16, T2.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	VEOR	T2.B16, B0.B16, B0.B16

	VREV64	B0.B16, B0.B16

	// Multiply by 2 modulo P
	VMOV	B0.D[0], I
	ASR	$63, I
	VMOV	I, T1.D[0]
	VMOV	I, T1.D[1]
	VAND	POLY.B16, T1.B16, T1.B16
	VUSHR	$63, B0.D2, T2.D2
	VEXT	$8, ZERO.B16, T2.B16, T2.B16
	VSHL	$1, B0.D2, B0.D2
	VEOR	T1.B16, B0.B16, B0.B16
	VEOR	T2.B16, B0.B16, B0.B16 // Can avoid this when VSLI is available
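	// Doubling H here appears to compensate for the one-bit offset that
	// the byte-reflected representation introduces into each carry-less
	// multiply (the usual trick from bit-reflected GHASH). Roughly, in
	// Go terms (the two-lane uint128 notation is hypothetical):
	//
	//	mask := int64(h[0]) >> 63   // all-ones if bit 63 of D[0] is set
	//	h = h << 1                  // per-lane, with the D[0]->D[1] carry
	//	h ^= uint128{mask, mask} & POLY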

	// Karatsuba pre-computation
	VEXT	$8, B0.B16, B0.B16, B1.B16
	VEOR	B0.B16, B1.B16, B1.B16

	ADD	$14*16, pTbl
	VST1	[B0.B16, B1.B16], (pTbl)
	SUB	$2*16, pTbl

	VMOV	B0.B16, B2.B16
	VMOV	B1.B16, B3.B16

	MOVD	$7, I

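// The 256-byte table holds eight powers of H, highest first, each as a
// (value, Karatsuba operand) pair of 16-byte entries:
//
//	offset   0: H^8, karatsuba(H^8)
//	...
//	offset 224: H^1, karatsuba(H^1)
//
// where karatsuba(X) repeats X.lo^X.hi in both halves. Descending order
// lets the 8-block loops walk the table with a post-incremented pointer,
// multiplying the oldest block by the highest power.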
initLoop:
	// Compute powers of H
	SUBS	$1, I

	VPMULL	B0.D1, B2.D1, T1.Q1
	VPMULL2	B0.D2, B2.D2, T0.Q1
	VPMULL	B1.D1, B3.D1, T2.Q1
	VEOR	T0.B16, T2.B16, T2.B16
	VEOR	T1.B16, T2.B16, T2.B16
	VEXT	$8, ZERO.B16, T2.B16, T3.B16
	VEXT	$8, T2.B16, ZERO.B16, T2.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VEOR	T3.B16, T1.B16, T1.B16
	VPMULL	POLY.D1, T0.D1, T2.Q1
	VEXT	$8, T0.B16, T0.B16, T0.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VPMULL	POLY.D1, T0.D1, T2.Q1
	VEXT	$8, T0.B16, T0.B16, T0.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VEOR	T1.B16, T0.B16, B2.B16
	VMOV	B2.B16, B3.B16
	VEXT	$8, B2.B16, B2.B16, B2.B16
	VEOR	B2.B16, B3.B16, B3.B16

	VST1	[B2.B16, B3.B16], (pTbl)
	SUB	$2*16, pTbl

	BNE	initLoop
	RET
#undef I
#undef NR
#undef KS
#undef pTbl

// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
TEXT ·gcmAesData(SB),NOSPLIT,$0
#define pTbl R0
#define aut R1
#define tPtr R2
#define autLen R3
#define H0 R4
#define pTblSave R5

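// mulRound multiplies one block X by the next power of H from the table
// and accumulates the three unreduced Karatsuba products into
// ACC1/ACC0/ACCM, so that eight multiplications can share a single
// reduce() call. Schematically, in Go-ish pseudocode (clmul is a
// hypothetical 64x64->128 carry-less multiply):
//
//	T1, T2 = tbl[0], tbl[1]; tbl += 32  // H^i and its Karatsuba operand
//	X = rev64(X)
//	ACC1 ^= clmul(X[0], T1[0])          // low halves
//	ACC0 ^= clmul(X[1], T1[1])          // high halves
//	ACCM ^= clmul(X[0]^X[1], T2[0])     // middle term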
#define mulRound(X) \
	VLD1.P	32(pTbl), [T1.B16, T2.B16] \
	VREV64	X.B16, X.B16               \
	VEXT	$8, X.B16, X.B16, T0.B16   \
	VEOR	X.B16, T0.B16, T0.B16      \
	VPMULL	X.D1, T1.D1, T3.Q1         \
	VEOR	T3.B16, ACC1.B16, ACC1.B16 \
	VPMULL2	X.D2, T1.D2, T3.Q1         \
	VEOR	T3.B16, ACC0.B16, ACC0.B16 \
	VPMULL	T0.D1, T2.D1, T3.Q1        \
	VEOR	T3.B16, ACCM.B16, ACCM.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	data_base+8(FP), aut
	MOVD	data_len+16(FP), autLen
	MOVD	T+32(FP), tPtr

	VEOR	ACC0.B16, ACC0.B16, ACC0.B16
	CBZ	autLen, dataBail

	MOVD	$0xC2, H0
	LSL	$56, H0
	VMOV	H0, POLY.D[0]
	MOVD	$1, H0
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	MOVD	pTbl, pTblSave

	CMP	$13, autLen
	BEQ	dataTLS
	CMP	$128, autLen
	BLT	startSinglesLoop
	B	octetsLoop

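// Fast path for the 13-byte additional data of a TLS 1.2 record (8-byte
// sequence number, 1-byte type, 2-byte version, 2-byte length): the block
// is assembled with three scalar loads into a zero-padded vector instead
// of going through the byte-by-byte tail loop.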
dataTLS:
	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	VEOR	B0.B16, B0.B16, B0.B16

	MOVD	(aut), H0
	VMOV	H0, B0.D[0]
	MOVW	8(aut), H0
	VMOV	H0, B0.S[2]
	MOVB	12(aut), H0
	VMOV	H0, B0.B[12]

	MOVD	$0, autLen
	B	dataMul

octetsLoop:
		CMP	$128, autLen
		BLT	startSinglesLoop
		SUB	$128, autLen

		VLD1.P	32(aut), [B0.B16, B1.B16]

		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		mulRound(B1)
		VLD1.P	32(aut), [B2.B16, B3.B16]
		mulRound(B2)
		mulRound(B3)
		VLD1.P	32(aut), [B4.B16, B5.B16]
		mulRound(B4)
		mulRound(B5)
		VLD1.P	32(aut), [B6.B16, B7.B16]
		mulRound(B6)
		mulRound(B7)

		MOVD	pTblSave, pTbl
		reduce()
	B	octetsLoop

startSinglesLoop:

	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]

singlesLoop:

		CMP	$16, autLen
		BLT	dataEnd
		SUB	$16, autLen


		VLD1.P	16(aut), [B0.B16]
dataMul:
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16

		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		reduce()

	B	singlesLoop

dataEnd:

	CBZ	autLen, dataBail
	VEOR	B0.B16, B0.B16, B0.B16
	ADD	autLen, aut

dataLoadLoop:
		MOVB.W	-1(aut), H0
		VEXT	$15, B0.B16, ZERO.B16, B0.B16
		VMOV	H0, B0.B[0]
		SUBS	$1, autLen
		BNE	dataLoadLoop
	B	dataMul

dataBail:
	VST1	[ACC0.B16], (tPtr)
	RET

#undef pTbl
#undef aut
#undef tPtr
#undef autLen
#undef H0
#undef pTblSave

// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
TEXT ·gcmAesEnc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define ks R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define aluK R9
#define NR R10
#define H0 R11
#define H1 R12
#define curK R13
#define pTblSave R14

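// aesrndx8 runs one AES round (AESE+AESMC) over eight blocks with the same
// round key, and aesrndlastx8 the final AESE-only round. Each block's
// rounds form a serial dependency chain, so interleaving eight independent
// blocks keeps the AES units busy.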
#define aesrndx8(K) \
	AESE	K.B16, B0.B16    \
	AESMC	B0.B16, B0.B16   \
	AESE	K.B16, B1.B16    \
	AESMC	B1.B16, B1.B16   \
	AESE	K.B16, B2.B16    \
	AESMC	B2.B16, B2.B16   \
	AESE	K.B16, B3.B16    \
	AESMC	B3.B16, B3.B16   \
	AESE	K.B16, B4.B16    \
	AESMC	B4.B16, B4.B16   \
	AESE	K.B16, B5.B16    \
	AESMC	B5.B16, B5.B16   \
	AESE	K.B16, B6.B16    \
	AESMC	B6.B16, B6.B16   \
	AESE	K.B16, B7.B16    \
	AESMC	B7.B16, B7.B16

#define aesrndlastx8(K) \
	AESE	K.B16, B0.B16    \
	AESE	K.B16, B1.B16    \
	AESE	K.B16, B2.B16    \
	AESE	K.B16, B3.B16    \
	AESE	K.B16, B4.B16    \
	AESE	K.B16, B5.B16    \
	AESE	K.B16, B6.B16    \
	AESE	K.B16, B7.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	dst_base+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	ks_base+72(FP), ks
	MOVD	ks_len+80(FP), NR

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	// NR holds len(ks), which determines the number of rounds
	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4
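	// The counter is kept word-byte-swapped (VREV32) so that the 32-bit
	// big-endian GCM counter in the last word can be bumped with a plain
	// vector add; it is swapped back to wire order just before each
	// encryption.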
	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen

	MOVD	ks, H0
	// For AES-128 round keys are stored in: K0 .. K10, KLAST
	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16

	BLT	startSingles
	// There are at least 8 blocks to encrypt
	TBZ	$4, NR, octetsLoop

	// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV	K8.B16, K10.B16
	VMOV	K9.B16, K11.B16
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P	16(H0), [KLAST.B16]
	TBZ	$3, NR, octetsLoop
	// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P	16(H0), [KLAST.B16]
	ADD	$10*16, ks, H0
	MOVD	H0, curK

octetsLoop:
		SUB	$128, srcPtrLen

		VMOV	CTR.B16, B0.B16
		VADD	B0.S4, INC.S4, B1.S4
		VREV32	B0.B16, B0.B16
		VADD	B1.S4, INC.S4, B2.S4
		VREV32	B1.B16, B1.B16
		VADD	B2.S4, INC.S4, B3.S4
		VREV32	B2.B16, B2.B16
		VADD	B3.S4, INC.S4, B4.S4
		VREV32	B3.B16, B3.B16
		VADD	B4.S4, INC.S4, B5.S4
		VREV32	B4.B16, B4.B16
		VADD	B5.S4, INC.S4, B6.S4
		VREV32	B5.B16, B5.B16
		VADD	B6.S4, INC.S4, B7.S4
		VREV32	B6.B16, B6.B16
		VADD	B7.S4, INC.S4, CTR.S4
		VREV32	B7.B16, B7.B16

		aesrndx8(K0)
		aesrndx8(K1)
		aesrndx8(K2)
		aesrndx8(K3)
		aesrndx8(K4)
		aesrndx8(K5)
		aesrndx8(K6)
		aesrndx8(K7)
		TBZ	$4, NR, octetsFinish
		aesrndx8(K10)
		aesrndx8(K11)
		TBZ	$3, NR, octetsFinish
		VLD1.P	32(curK), [T1.B16, T2.B16]
		aesrndx8(T1)
		aesrndx8(T2)
		MOVD	H0, curK
octetsFinish:
		aesrndx8(K8)
		aesrndlastx8(K9)

		VEOR	KLAST.B16, B0.B16, B0.B16
		VEOR	KLAST.B16, B1.B16, B1.B16
		VEOR	KLAST.B16, B2.B16, B2.B16
		VEOR	KLAST.B16, B3.B16, B3.B16
		VEOR	KLAST.B16, B4.B16, B4.B16
		VEOR	KLAST.B16, B5.B16, B5.B16
		VEOR	KLAST.B16, B6.B16, B6.B16
		VEOR	KLAST.B16, B7.B16, B7.B16

		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B0.B16, T1.B16, B0.B16
		VEOR	B1.B16, T2.B16, B1.B16
		VST1.P	[B0.B16, B1.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B2.B16, T1.B16, B2.B16
		VEOR	B3.B16, T2.B16, B3.B16
		VST1.P	[B2.B16, B3.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B4.B16, T1.B16, B4.B16
		VEOR	B5.B16, T2.B16, B5.B16
		VST1.P	[B4.B16, B5.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B6.B16, T1.B16, B6.B16
		VEOR	B7.B16, T2.B16, B7.B16
		VST1.P	[B6.B16, B7.B16], 32(dstPtr)

		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		mulRound(B1)
		mulRound(B2)
		mulRound(B3)
		mulRound(B4)
		mulRound(B5)
		mulRound(B6)
		mulRound(B7)
		MOVD	pTblSave, pTbl
		reduce()

		CMP	$128, srcPtrLen
		BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD	$128, ks
	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16
	TBZ	$4, NR, singlesLoop
	VLD1.P	32(ks), [B1.B16, B2.B16]
	VMOV	B2.B16, KLAST.B16
	TBZ	$3, NR, singlesLoop
	VLD1.P	32(ks), [B3.B16, B4.B16]
	VMOV	B4.B16, KLAST.B16

singlesLoop:
		CMP	$16, srcPtrLen
		BLT	tail
		SUB	$16, srcPtrLen

		VLD1.P	16(srcPtr), [T0.B16]
		VEOR	KLAST.B16, T0.B16, T0.B16

		VREV32	CTR.B16, B0.B16
		VADD	CTR.S4, INC.S4, CTR.S4

		AESE	K0.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K1.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K3.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K4.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K5.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K6.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K7.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K8.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K9.B16, B0.B16
		TBZ	$4, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	K10.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B1.B16, B0.B16
		TBZ	$3, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	B2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B3.B16, B0.B16
singlesLast:
		VEOR	T0.B16, B0.B16, B0.B16
encReduce:
		VST1.P	[B0.B16], 16(dstPtr)

		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16

		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done

	VEOR	T0.B16, T0.B16, T0.B16
	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1
	ADD	srcPtrLen, srcPtr

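	// Assemble the 1..15 remaining bytes into T0 back-to-front, loading
	// 8/4/2/1 bytes at a time and shifting T0 up between loads, while
	// building a matching mask of 0xFF bytes in T3. After encryption the
	// mask clears the keystream bytes beyond the message, so the GHASH
	// update sees the zero-padded final block the spec requires.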
	TBZ	$3, srcPtrLen, ld4
	MOVD.W	-8(srcPtr), H0
	VMOV	H0, T0.D[0]
	VMOV	H1, T3.D[0]
ld4:
	TBZ	$2, srcPtrLen, ld2
	MOVW.W	-4(srcPtr), H0
	VEXT	$12, T0.B16, ZERO.B16, T0.B16
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.S[0]
	VMOV	H1, T3.S[0]
ld2:
	TBZ	$1, srcPtrLen, ld1
	MOVH.W	-2(srcPtr), H0
	VEXT	$14, T0.B16, ZERO.B16, T0.B16
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.H[0]
	VMOV	H1, T3.H[0]
ld1:
	TBZ	$0, srcPtrLen, ld0
	MOVB.W	-1(srcPtr), H0
	VEXT	$15, T0.B16, ZERO.B16, T0.B16
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.B[0]
	VMOV	H1, T3.B[0]
ld0:

	MOVD	ZR, srcPtrLen
	VEOR	KLAST.B16, T0.B16, T0.B16
	VREV32	CTR.B16, B0.B16

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16

tailLast:
	VEOR	T0.B16, B0.B16, B0.B16
	VAND	T3.B16, B0.B16, B0.B16
	B	encReduce

done:
	VST1	[ACC0.B16], (tPtr)
	RET

// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
TEXT ·gcmAesDec(SB),NOSPLIT,$0
	MOVD	productTable+0(FP), pTbl
	MOVD	dst_base+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	ks_base+72(FP), ks
	MOVD	ks_len+80(FP), NR

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	// NR holds len(ks), which determines the number of rounds
	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	MOVD	ks, H0
	// For AES-128 round keys are stored in: K0 .. K10, KLAST
	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16

	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen
	BLT	startSingles
	// There are at least 8 blocks to decrypt
	TBZ	$4, NR, octetsLoop

	// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV	K8.B16, K10.B16
	VMOV	K9.B16, K11.B16
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P	16(H0), [KLAST.B16]
	TBZ	$3, NR, octetsLoop
	// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P	16(H0), [KLAST.B16]
	ADD	$10*16, ks, H0
	MOVD	H0, curK

octetsLoop:
		SUB	$128, srcPtrLen

		VMOV	CTR.B16, B0.B16
		VADD	B0.S4, INC.S4, B1.S4
		VREV32	B0.B16, B0.B16
		VADD	B1.S4, INC.S4, B2.S4
		VREV32	B1.B16, B1.B16
		VADD	B2.S4, INC.S4, B3.S4
		VREV32	B2.B16, B2.B16
		VADD	B3.S4, INC.S4, B4.S4
		VREV32	B3.B16, B3.B16
		VADD	B4.S4, INC.S4, B5.S4
		VREV32	B4.B16, B4.B16
		VADD	B5.S4, INC.S4, B6.S4
		VREV32	B5.B16, B5.B16
		VADD	B6.S4, INC.S4, B7.S4
		VREV32	B6.B16, B6.B16
		VADD	B7.S4, INC.S4, CTR.S4
		VREV32	B7.B16, B7.B16

		aesrndx8(K0)
		aesrndx8(K1)
		aesrndx8(K2)
		aesrndx8(K3)
		aesrndx8(K4)
		aesrndx8(K5)
		aesrndx8(K6)
		aesrndx8(K7)
		TBZ	$4, NR, octetsFinish
		aesrndx8(K10)
		aesrndx8(K11)
		TBZ	$3, NR, octetsFinish
		VLD1.P	32(curK), [T1.B16, T2.B16]
		aesrndx8(T1)
		aesrndx8(T2)
		MOVD	H0, curK
octetsFinish:
		aesrndx8(K8)
		aesrndlastx8(K9)

		VEOR	KLAST.B16, B0.B16, T1.B16
		VEOR	KLAST.B16, B1.B16, T2.B16
		VEOR	KLAST.B16, B2.B16, B2.B16
		VEOR	KLAST.B16, B3.B16, B3.B16
		VEOR	KLAST.B16, B4.B16, B4.B16
		VEOR	KLAST.B16, B5.B16, B5.B16
		VEOR	KLAST.B16, B6.B16, B6.B16
		VEOR	KLAST.B16, B7.B16, B7.B16

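		// Decryption hashes the ciphertext rather than the plaintext,
		// so each pair of source blocks stays in B registers for the
		// mulRound calls while the keystream XOR goes out via T1/T2.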
		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B0.B16, T1.B16, T1.B16
		VEOR	B1.B16, T2.B16, T2.B16
		VST1.P	[T1.B16, T2.B16], 32(dstPtr)

		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B2.B16, B0.B16, T1.B16
		VEOR	B3.B16, B1.B16, T2.B16
		VST1.P	[T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B4.B16, B0.B16, T1.B16
		VEOR	B5.B16, B1.B16, T2.B16
		VST1.P	[T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B6.B16, B0.B16, T1.B16
		VEOR	B7.B16, B1.B16, T2.B16
		VST1.P	[T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		MOVD	pTblSave, pTbl
		reduce()

		CMP	$128, srcPtrLen
		BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD	$128, ks
	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16
	TBZ	$4, NR, singlesLoop
	VLD1.P	32(ks), [B1.B16, B2.B16]
	VMOV	B2.B16, KLAST.B16
	TBZ	$3, NR, singlesLoop
	VLD1.P	32(ks), [B3.B16, B4.B16]
	VMOV	B4.B16, KLAST.B16

singlesLoop:
		CMP	$16, srcPtrLen
		BLT	tail
		SUB	$16, srcPtrLen

		VLD1.P	16(srcPtr), [T0.B16]
		VREV64	T0.B16, B5.B16
		VEOR	KLAST.B16, T0.B16, T0.B16

		VREV32	CTR.B16, B0.B16
		VADD	CTR.S4, INC.S4, CTR.S4

		AESE	K0.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K1.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K3.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K4.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K5.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K6.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K7.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K8.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K9.B16, B0.B16
		TBZ	$4, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	K10.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B1.B16, B0.B16
		TBZ	$3, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	B2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B3.B16, B0.B16
singlesLast:
		VEOR	T0.B16, B0.B16, B0.B16

		VST1.P	[B0.B16], 16(dstPtr)

		VEOR	ACC0.B16, B5.B16, B5.B16
		VEXT	$8, B5.B16, B5.B16, T0.B16
		VEOR	B5.B16, T0.B16, T0.B16
		VPMULL	B5.D1, T1.D1, ACC1.Q1
		VPMULL2	B5.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1
		reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done

	VREV32	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16
tailLast:
	VEOR	KLAST.B16, B0.B16, B0.B16

	// Assuming it is safe to load past srcPtr due to the presence of the tag
	VLD1	(srcPtr), [B5.B16]

	VEOR	B5.B16, B0.B16, B0.B16

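	// Store the decrypted bytes 8/4/2/1 at a time, shifting B0 down as
	// they are consumed, while building a mask of 0xFF bytes in T3. The
	// mask then strips the bytes B5 borrowed from the tag, so only true
	// ciphertext enters the final GHASH update.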
	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1

	TBZ	$3, srcPtrLen, ld4
	VMOV	B0.D[0], H0
	MOVD.P	H0, 8(dstPtr)
	VMOV	H1, T3.D[0]
	VEXT	$8, ZERO.B16, B0.B16, B0.B16
ld4:
	TBZ	$2, srcPtrLen, ld2
	VMOV	B0.S[0], H0
	MOVW.P	H0, 4(dstPtr)
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.S[0]
	VEXT	$4, ZERO.B16, B0.B16, B0.B16
ld2:
	TBZ	$1, srcPtrLen, ld1
	VMOV	B0.H[0], H0
	MOVH.P	H0, 2(dstPtr)
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.H[0]
	VEXT	$2, ZERO.B16, B0.B16, B0.B16
ld1:
	TBZ	$0, srcPtrLen, ld0
	VMOV	B0.B[0], H0
	MOVB.P	H0, 1(dstPtr)
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.B[0]
ld0:

	VAND	T3.B16, B5.B16, B5.B16
	VREV64	B5.B16, B5.B16

	VEOR	ACC0.B16, B5.B16, B5.B16
	VEXT	$8, B5.B16, B5.B16, T0.B16
	VEOR	B5.B16, T0.B16, T0.B16
	VPMULL	B5.D1, T1.D1, ACC1.Q1
	VPMULL2	B5.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	reduce()
done:
	VST1	[ACC0.B16], (tPtr)

	RET