// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"
// TBL byte-permutation masks used by expandKeyAsm.
//
// rotInvSRows combines RotWord with InvShiftRows: applying it before an
// AESE with an all-zero key (AddRoundKey is then a no-op) cancels AESE's
// ShiftRows step, leaving exactly SubBytes(RotWord(word)) — the key
// schedule's core transform computed via the AES hardware S-box.
DATA rotInvSRows<>+0x00(SB)/8, $0x080f0205040b0e01
DATA rotInvSRows<>+0x08(SB)/8, $0x00070a0d0c030609
GLOBL rotInvSRows<>(SB), (NOPTR+RODATA), $16
// invSRows is InvShiftRows alone (no rotation); used for the extra
// SubWord-without-RotWord step in the AES-256 key schedule.
DATA invSRows<>+0x00(SB)/8, $0x0b0e0104070a0d00
DATA invSRows<>+0x08(SB)/8, $0x0306090c0f020508
GLOBL invSRows<>(SB), (NOPTR+RODATA), $16
// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
//
// Encrypts one 16-byte block using the ARMv8 AES instructions.
//   nr:  number of rounds — 10 (AES-128), 12 (AES-192) or 14 (AES-256)
//   xk:  expanded round keys, nr+1 contiguous 16-byte keys
//        (stored in uint128 form; see expandKeyAsm)
//   dst: receives the 16-byte ciphertext
//   src: the 16-byte plaintext
//
// The three key sizes share one straight-line instruction stream:
// AES-256 runs the two extra rounds at enc256, AES-192 joins at enc196
// (the AES-192 entry point; the label name is historical), and all key
// sizes fall through to the common final ten rounds at enc128.
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
	MOVD	nr+0(FP), R9		// R9 = round count
	MOVD	xk+8(FP), R10		// R10 = round-key cursor
	MOVD	dst+16(FP), R11
	MOVD	src+24(FP), R12

	VLD1	(R12), [V0.B16]		// V0 = plaintext state

	CMP	$12, R9
	BLT	enc128			// nr == 10: skip straight to the shared rounds
	BEQ	enc196			// nr == 12
enc256:
	// First two rounds, AES-256 only. AESE performs
	// AddRoundKey+ShiftRows+SubBytes; AESMC performs MixColumns.
	VLD1.P	32(R10), [V1.B16, V2.B16]
	AESE	V1.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V2.B16, V0.B16
	AESMC	V0.B16, V0.B16
enc196:
	// Two more rounds (the first two of AES-192).
	VLD1.P	32(R10), [V3.B16, V4.B16]
	AESE	V3.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V4.B16, V0.B16
	AESMC	V0.B16, V0.B16
enc128:
	// Final ten rounds, common to every key size. Load the remaining
	// eleven round keys into V5-V15 up front.
	VLD1.P	64(R10), [V5.B16, V6.B16, V7.B16, V8.B16]
	VLD1.P	64(R10), [V9.B16, V10.B16, V11.B16, V12.B16]
	VLD1.P	48(R10), [V13.B16, V14.B16, V15.B16]
	AESE	V5.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V6.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V7.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V8.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V9.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V10.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V11.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V12.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V13.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V14.B16, V0.B16		// last round has no MixColumns
	VEOR    V0.B16, V15.B16, V0.B16	// final AddRoundKey
	VST1	[V0.B16], (R11)		// store ciphertext
	RET
64
// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
//
// Decrypts one 16-byte block using the ARMv8 AES instructions.
//   nr:  number of rounds — 10 (AES-128), 12 (AES-192) or 14 (AES-256)
//   xk:  decryption round keys as produced by expandKeyAsm: the
//        encryption keys in reverse order with InvMixColumns applied to
//        the inner keys (equivalent inverse cipher layout)
//   dst: receives the 16-byte plaintext
//   src: the 16-byte ciphertext
//
// Mirror of encryptBlockAsm: AESD performs
// AddRoundKey+InvShiftRows+InvSubBytes and AESIMC performs
// InvMixColumns. dec256/dec196/dec128 follow the same size dispatch as
// the encrypt path (dec196 is the AES-192 entry; label name historical).
TEXT ·decryptBlockAsm(SB),NOSPLIT,$0
	MOVD	nr+0(FP), R9		// R9 = round count
	MOVD	xk+8(FP), R10		// R10 = round-key cursor
	MOVD	dst+16(FP), R11
	MOVD	src+24(FP), R12

	VLD1	(R12), [V0.B16]		// V0 = ciphertext state

	CMP	$12, R9
	BLT	dec128			// nr == 10
	BEQ	dec196			// nr == 12
dec256:
	// First two inverse rounds, AES-256 only.
	VLD1.P	32(R10), [V1.B16, V2.B16]
	AESD	V1.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V2.B16, V0.B16
	AESIMC	V0.B16, V0.B16
dec196:
	// Two more inverse rounds (the first two of AES-192).
	VLD1.P	32(R10), [V3.B16, V4.B16]
	AESD	V3.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V4.B16, V0.B16
	AESIMC	V0.B16, V0.B16
dec128:
	// Final ten inverse rounds, common to every key size. Load the
	// remaining eleven round keys into V5-V15.
	VLD1.P	64(R10), [V5.B16, V6.B16, V7.B16, V8.B16]
	VLD1.P	64(R10), [V9.B16, V10.B16, V11.B16, V12.B16]
	VLD1.P	48(R10), [V13.B16, V14.B16, V15.B16]
	AESD	V5.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V6.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V7.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V8.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V9.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V10.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V11.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V12.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V13.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V14.B16, V0.B16		// last round has no InvMixColumns
	VEOR    V0.B16, V15.B16, V0.B16	// final AddRoundKey
	VST1	[V0.B16], (R11)		// store plaintext
	RET
115
// func expandKeyAsm(nr int, key *byte, enc, dec *uint32)
// Note that round keys are stored in uint128 format, not uint32
//
// Expands the user key into nr+1 encryption round keys at enc and, when
// dec is non-nil, the matching decryption round keys at dec (encryption
// keys in reverse order, with InvMixColumns applied to the inner keys).
//   nr:  10, 12 or 14 → 16-, 24- or 32-byte user key at key
//
// Register roles: R8 = remaining iteration count, R9 = user-key pointer
// (reused as scratch in the 256-bit path once the key is consumed),
// R10 = enc cursor, R11 = dec, R13 = current Rcon, R14 = 0x1b (Rcon
// feedback constant, 128-bit path only), V0 = all-zero AES round key,
// V3/V4 = TBL permutation masks.
//
// SubWord trick: the word is permuted with TBL (rotInvSRows: RotWord
// composed with InvShiftRows) and then run through AESE with the zero
// key in V0 — AddRoundKey is a no-op and the pre-applied InvShiftRows
// cancels AESE's ShiftRows, so only SubBytes takes effect.
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
	MOVD	nr+0(FP), R8
	MOVD	key+8(FP), R9
	MOVD	enc+16(FP), R10
	MOVD	dec+24(FP), R11
	LDP	rotInvSRows<>(SB), (R0, R1)
	VMOV	R0, V3.D[0]
	VMOV	R1, V3.D[1]
	VEOR	V0.B16, V0.B16, V0.B16 // All zeroes
	MOVW	$1, R13			// initial Rcon value
	// Dispatch on nr: bit 1 is clear only for 12 (0b1100), and bit 2 is
	// set for 14 (0b1110) but clear for 10 (0b1010).
	TBZ	$1, R8, ks192
	TBNZ	$2, R8, ks256
	// AES-128: copy the 4-word user key as round key 0, then run 10
	// iterations, each producing one 4-word round key in R4-R7.
	LDPW	(R9), (R4, R5)
	LDPW	8(R9), (R6, R7)
	STPW.P	(R4, R5), 8(R10)
	STPW.P	(R6, R7), 8(R10)
	MOVW	$0x1b, R14		// GF(2^8) reduction constant for Rcon
ks128Loop:
		VMOV	R7, V2.S[0]
		WORD	$0x4E030042       // TBL V3.B16, [V2.B16], V2.B16
		AESE	V0.B16, V2.B16    // Use AES to compute the SBOX
		EORW	R13, R4           // fold in Rcon (XOR order is irrelevant)
		LSLW	$1, R13           // Compute next Rcon
		ANDSW	$0x100, R13, ZR
		CSELW	NE, R14, R13, R13 // Fake modulo: 0x80<<1 overflows to 0x1b
		SUBS	$1, R8            // sets flags for BNE; nothing below clobbers them
		VMOV	V2.S[0], R0       // R0 = SubWord(RotWord(previous last word))
		EORW	R0, R4
		EORW	R4, R5
		EORW	R5, R6
		EORW	R6, R7
		STPW.P	(R4, R5), 8(R10)
		STPW.P	(R6, R7), 8(R10)
	BNE	ks128Loop
	CBZ	R11, ksDone       // If dec is nil we are done
	SUB	$176, R10         // rewind over all 11 keys (11*16 bytes)
	// Decryption keys are encryption keys with InverseMixColumns applied
	// to keys 1..9 and the whole schedule stored in reverse order; the
	// first and last keys are copied unmodified.
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	VMOV	V0.B16, V7.B16
	AESIMC	V1.B16, V6.B16
	AESIMC	V2.B16, V5.B16
	AESIMC	V3.B16, V4.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V11.B16
	AESIMC	V1.B16, V10.B16
	AESIMC	V2.B16, V9.B16
	AESIMC	V3.B16, V8.B16
	VLD1	(R10), [V0.B16, V1.B16, V2.B16]
	AESIMC	V0.B16, V14.B16
	AESIMC	V1.B16, V13.B16
	VMOV	V2.B16, V12.B16
	VST1.P	[V12.B16, V13.B16, V14.B16], 48(R11)
	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
	B	ksDone
ks192:
	// AES-192: 6-word key in R2-R7; 8 iterations of 6 words each
	// (stores are staggered so exactly 52 words = 13 keys are written).
	// Rcon never exceeds 0x80 here, so no 0x1b reduction is needed.
	LDPW	(R9), (R2, R3)
	LDPW	8(R9), (R4, R5)
	LDPW	16(R9), (R6, R7)
	STPW.P	(R2, R3), 8(R10)
	STPW.P	(R4, R5), 8(R10)
	SUB	$4, R8			// 12 - 4 = 8 iterations
ks192Loop:
		STPW.P	(R6, R7), 8(R10)  // flush last 2 words of the previous group
		VMOV	R7, V2.S[0]
		WORD	$0x4E030042 //TBL	V3.B16, [V2.B16], V2.B16
		AESE	V0.B16, V2.B16    // SubWord via hardware S-box (see above)
		EORW	R13, R2
		LSLW	$1, R13           // next Rcon (no overflow possible)
		SUBS	$1, R8
		VMOV	V2.S[0], R0
		EORW	R0, R2
		EORW	R2, R3
		EORW	R3, R4
		EORW	R4, R5
		EORW	R5, R6
		EORW	R6, R7
		STPW.P	(R2, R3), 8(R10)
		STPW.P	(R4, R5), 8(R10)
	BNE	ks192Loop
	CBZ	R11, ksDone
	SUB	$208, R10		// rewind over all 13 keys (13*16 bytes)
	// Build decryption keys: reverse order, InvMixColumns on keys 1..11.
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	VMOV	V0.B16, V7.B16
	AESIMC	V1.B16, V6.B16
	AESIMC	V2.B16, V5.B16
	AESIMC	V3.B16, V4.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V11.B16
	AESIMC	V1.B16, V10.B16
	AESIMC	V2.B16, V9.B16
	AESIMC	V3.B16, V8.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V15.B16
	AESIMC	V1.B16, V14.B16
	AESIMC	V2.B16, V13.B16
	AESIMC	V3.B16, V12.B16
	VLD1	(R10), [V0.B16]
	VST1.P	[V0.B16], 16(R11)	// last enc key, copied unmodified
	VST1.P	[V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
	B	ksDone
ks256:
	// AES-256: 8-word key in R0-R7; 7 iterations of 8 words each
	// (staggered stores write exactly 60 words = 15 keys). Each
	// iteration needs a second SubWord without RotWord, using the
	// invSRows mask in V4. Rcon tops out at 0x40, so no reduction.
	LDP	invSRows<>(SB), (R0, R1)
	VMOV	R0, V4.D[0]
	VMOV	R1, V4.D[1]
	LDPW	(R9), (R0, R1)
	LDPW	8(R9), (R2, R3)
	LDPW	16(R9), (R4, R5)
	LDPW	24(R9), (R6, R7)
	STPW.P	(R0, R1), 8(R10)
	STPW.P	(R2, R3), 8(R10)
	SUB	$7, R8			// 14 - 7 = 7 iterations
ks256Loop:
		STPW.P	(R4, R5), 8(R10)  // flush last 4 words of the previous group
		STPW.P	(R6, R7), 8(R10)
		VMOV	R7, V2.S[0]
		WORD	$0x4E030042 //TBL	V3.B16, [V2.B16], V2.B16
		AESE	V0.B16, V2.B16    // SubWord(RotWord(w)) via hardware S-box
		EORW	R13, R0
		LSLW	$1, R13           // next Rcon (no overflow possible)
		SUBS	$1, R8
		VMOV	V2.S[0], R9       // R9 reused as scratch; key fully consumed
		EORW	R9, R0
		EORW	R0, R1
		EORW	R1, R2
		EORW	R2, R3
		VMOV	R3, V2.S[0]
		WORD	$0x4E040042 //TBL	V4.B16, [V2.B16], V2.B16
		AESE	V0.B16, V2.B16    // SubWord(w), no rotation this time
		VMOV	V2.S[0], R9
		EORW	R9, R4
		EORW	R4, R5
		EORW	R5, R6
		EORW	R6, R7
		STPW.P	(R0, R1), 8(R10)
		STPW.P	(R2, R3), 8(R10)
	BNE	ks256Loop
	CBZ	R11, ksDone
	SUB	$240, R10		// rewind over all 15 keys (15*16 bytes)
	// Build decryption keys: reverse order, InvMixColumns on keys 1..13.
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	VMOV	V0.B16, V7.B16
	AESIMC	V1.B16, V6.B16
	AESIMC	V2.B16, V5.B16
	AESIMC	V3.B16, V4.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V11.B16
	AESIMC	V1.B16, V10.B16
	AESIMC	V2.B16, V9.B16
	AESIMC	V3.B16, V8.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V15.B16
	AESIMC	V1.B16, V14.B16
	AESIMC	V2.B16, V13.B16
	AESIMC	V3.B16, V12.B16
	VLD1	(R10), [V0.B16, V1.B16, V2.B16]
	AESIMC	V0.B16, V18.B16
	AESIMC	V1.B16, V17.B16
	VMOV	V2.B16, V16.B16		// last enc key, copied unmodified
	VST1.P	[V16.B16, V17.B16, V18.B16], 48(R11)
	VST1.P	[V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
ksDone:
	RET
284