xref: /aosp_15_r20/external/boringssl/src/gen/bcm/aesv8-armv8-linux.S (revision 8fb009dc861624b67b6cdb62ea21f0f22d0c584b)
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <openssl/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
7#include <openssl/arm_arch.h>
8
9#if __ARM_MAX_ARCH__>=7
10.text
11.arch	armv8-a+crypto
// Key-expansion constants, loaded below via adrp/add and ld1 {v1.4s,v2.4s}:
//   row 0 (v1): round constant 0x01 per lane; doubled each round with
//               "shl v1.16b,#1" in the expansion loops.
//   row 1 (v2): byte-index mask for "tbl" implementing RotWord on the top
//               word of the previous round key ("rotate-n-splat").
//   row 2:      rcon value 0x1b, reloaded at the tail of .Loop128 to supply
//               the round constants past 0x80 (0x1b, 0x36).
12.section	.rodata
13.align	5
14.Lrcon:
15.long	0x01,0x01,0x01,0x01
16.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
17.long	0x1b,0x1b,0x1b,0x1b
18
19.text
20
// int aes_hw_set_encrypt_key(const uint8_t *user_key, int bits, AES_KEY *key)
//   In:  x0 = user_key bytes, w1 = key size in bits, x2 = schedule output.
//   Out: x0 = 0 on success, -2 if w1 is outside [128,256] or not a multiple
//        of 64 (only 128/192/256 survive the checks below).
// On success the round count (10/12/14, kept in w12) is stored at the end of
// the schedule at .Ldone.
// NOTE(review): argument roles are inferred from register usage here —
// confirm against the C prototype in BoringSSL's headers.
//
// Expansion trick used throughout: SubWord() is performed with "aese" against
// an all-zero round key (v0), and the "tbl" mask in v2 pre-rotates/splats the
// source word so that aese's built-in ShiftRows lands the bytes correctly.
21.globl	aes_hw_set_encrypt_key
22.hidden	aes_hw_set_encrypt_key
23.type	aes_hw_set_encrypt_key,%function
24.align	5
25aes_hw_set_encrypt_key:
26.Lenc_key:
27	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
28	AARCH64_VALID_CALL_TARGET
29	stp	x29,x30,[sp,#-16]!
30	add	x29,sp,#0
	// Preload the error return (-2) into x3; any failed check aborts with it.
31	mov	x3,#-2
32	cmp	w1,#128
33	b.lt	.Lenc_key_abort
34	cmp	w1,#256
35	b.gt	.Lenc_key_abort
36	tst	w1,#0x3f
37	b.ne	.Lenc_key_abort
38
	// x3 = &.Lrcon; v1 = rcon lanes, v2 = RotWord tbl mask, v0 = zero.
39	adrp	x3,.Lrcon
40	add	x3,x3,:lo12:.Lrcon
41	cmp	w1,#192
42
43	eor	v0.16b,v0.16b,v0.16b
44	ld1	{v3.16b},[x0],#16
45	mov	w1,#8		// reuse w1
46	ld1	{v1.4s,v2.4s},[x3],#32
47
48	b.lt	.Loop128
49	b.eq	.L192
50	b	.L256
51
52.align	4
// AES-128: 8 loop rounds here plus 2 unrolled below (10 round keys total).
// Per iteration: v6 = SubWord(RotWord(v3.s[3])) via tbl+aese; v3 is folded
// with three shifted copies of itself (the ext/eor ladder), then xored with
// v6 ^ rcon.  rcon doubles each round via shl.
53.Loop128:
54	tbl	v6.16b,{v3.16b},v2.16b
55	ext	v5.16b,v0.16b,v3.16b,#12
56	st1	{v3.4s},[x2],#16
57	aese	v6.16b,v0.16b
58	subs	w1,w1,#1
59
60	eor	v3.16b,v3.16b,v5.16b
61	ext	v5.16b,v0.16b,v5.16b,#12
62	eor	v3.16b,v3.16b,v5.16b
63	ext	v5.16b,v0.16b,v5.16b,#12
64	eor	v6.16b,v6.16b,v1.16b
65	eor	v3.16b,v3.16b,v5.16b
66	shl	v1.16b,v1.16b,#1
67	eor	v3.16b,v3.16b,v6.16b
68	b.ne	.Loop128
69
	// Reload rcon row 0x1b for the last two (unrolled) rounds.
70	ld1	{v1.4s},[x3]
71
72	tbl	v6.16b,{v3.16b},v2.16b
73	ext	v5.16b,v0.16b,v3.16b,#12
74	st1	{v3.4s},[x2],#16
75	aese	v6.16b,v0.16b
76
77	eor	v3.16b,v3.16b,v5.16b
78	ext	v5.16b,v0.16b,v5.16b,#12
79	eor	v3.16b,v3.16b,v5.16b
80	ext	v5.16b,v0.16b,v5.16b,#12
81	eor	v6.16b,v6.16b,v1.16b
82	eor	v3.16b,v3.16b,v5.16b
83	shl	v1.16b,v1.16b,#1
84	eor	v3.16b,v3.16b,v6.16b
85
86	tbl	v6.16b,{v3.16b},v2.16b
87	ext	v5.16b,v0.16b,v3.16b,#12
88	st1	{v3.4s},[x2],#16
89	aese	v6.16b,v0.16b
90
91	eor	v3.16b,v3.16b,v5.16b
92	ext	v5.16b,v0.16b,v5.16b,#12
93	eor	v3.16b,v3.16b,v5.16b
94	ext	v5.16b,v0.16b,v5.16b,#12
95	eor	v6.16b,v6.16b,v1.16b
96	eor	v3.16b,v3.16b,v5.16b
97	eor	v3.16b,v3.16b,v6.16b
98	st1	{v3.4s},[x2]
	// Skip x2 forward so .Ldone's str lands just past the last round key.
99	add	x2,x2,#0x50
100
101	mov	w12,#10
102	b	.Ldone
103
104.align	4
// AES-192: key is 24 bytes, so the schedule advances in 16+8 byte steps;
// the tbl mask is shifted down by 8 to rotate the correct word of v4.
105.L192:
106	ld1	{v4.8b},[x0],#8
107	movi	v6.16b,#8			// borrow v6.16b
108	st1	{v3.4s},[x2],#16
109	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
110
111.Loop192:
112	tbl	v6.16b,{v4.16b},v2.16b
113	ext	v5.16b,v0.16b,v3.16b,#12
114	st1	{v4.8b},[x2],#8
115	aese	v6.16b,v0.16b
116	subs	w1,w1,#1
117
118	eor	v3.16b,v3.16b,v5.16b
119	ext	v5.16b,v0.16b,v5.16b,#12
120	eor	v3.16b,v3.16b,v5.16b
121	ext	v5.16b,v0.16b,v5.16b,#12
122	eor	v3.16b,v3.16b,v5.16b
123
	// Fold the new top word of v3 into the 8-byte half-key in v4.
124	dup	v5.4s,v3.s[3]
125	eor	v5.16b,v5.16b,v4.16b
126	eor	v6.16b,v6.16b,v1.16b
127	ext	v4.16b,v0.16b,v4.16b,#12
128	shl	v1.16b,v1.16b,#1
129	eor	v4.16b,v4.16b,v5.16b
130	eor	v3.16b,v3.16b,v6.16b
131	eor	v4.16b,v4.16b,v6.16b
132	st1	{v3.4s},[x2],#16
133	b.ne	.Loop192
134
135	mov	w12,#12
136	add	x2,x2,#0x20
137	b	.Ldone
138
139.align	4
// AES-256: two 16-byte key halves (v3, v4); alternating rounds use
// SubWord-with-rotate (top of loop) and SubWord-without-rotate (dup splat).
140.L256:
141	ld1	{v4.16b},[x0]
142	mov	w1,#7
143	mov	w12,#14
144	st1	{v3.4s},[x2],#16
145
146.Loop256:
147	tbl	v6.16b,{v4.16b},v2.16b
148	ext	v5.16b,v0.16b,v3.16b,#12
149	st1	{v4.4s},[x2],#16
150	aese	v6.16b,v0.16b
151	subs	w1,w1,#1
152
153	eor	v3.16b,v3.16b,v5.16b
154	ext	v5.16b,v0.16b,v5.16b,#12
155	eor	v3.16b,v3.16b,v5.16b
156	ext	v5.16b,v0.16b,v5.16b,#12
157	eor	v6.16b,v6.16b,v1.16b
158	eor	v3.16b,v3.16b,v5.16b
159	shl	v1.16b,v1.16b,#1
160	eor	v3.16b,v3.16b,v6.16b
161	st1	{v3.4s},[x2],#16
162	b.eq	.Ldone
163
	// Odd round: SubWord only (no rotate, no rcon) applied to v3.s[3].
164	dup	v6.4s,v3.s[3]		// just splat
165	ext	v5.16b,v0.16b,v4.16b,#12
166	aese	v6.16b,v0.16b
167
168	eor	v4.16b,v4.16b,v5.16b
169	ext	v5.16b,v0.16b,v5.16b,#12
170	eor	v4.16b,v4.16b,v5.16b
171	ext	v5.16b,v0.16b,v5.16b,#12
172	eor	v4.16b,v4.16b,v5.16b
173
174	eor	v4.16b,v4.16b,v6.16b
175	b	.Loop256
176
// Success: store the round count after the schedule, return 0.
177.Ldone:
178	str	w12,[x2]
179	mov	x3,#0
180
181.Lenc_key_abort:
182	mov	x0,x3			// return value
183	ldr	x29,[sp],#16
184	ret
185.size	aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
186
// int aes_hw_set_decrypt_key(const uint8_t *user_key, int bits, AES_KEY *key)
// Builds the encryption schedule via .Lenc_key, then converts it in place
// for the Equivalent Inverse Cipher: round-key order is reversed (x2 walks
// up, x0 walks down by 16) and aesimc (InvMixColumns) is applied to every
// round key except the outermost two, which are only swapped.
// Returns x0: 0 on success, or the non-zero error propagated from .Lenc_key.
// NOTE(review): relies on .Lenc_key leaving x2 just past the schedule
// (240 bytes) and w12 = round count — both established in the code above.
187.globl	aes_hw_set_decrypt_key
188.hidden	aes_hw_set_decrypt_key
189.type	aes_hw_set_decrypt_key,%function
190.align	5
191aes_hw_set_decrypt_key:
192	AARCH64_SIGN_LINK_REGISTER
193	stp	x29,x30,[sp,#-16]!
194	add	x29,sp,#0
195	bl	.Lenc_key
196
197	cmp	x0,#0
198	b.ne	.Ldec_key_abort
199
200	sub	x2,x2,#240		// restore original x2
201	mov	x4,#-16
202	add	x0,x2,x12,lsl#4	// end of key schedule
203
	// Swap first and last round keys without aesimc.
204	ld1	{v0.4s},[x2]
205	ld1	{v1.4s},[x0]
206	st1	{v0.4s},[x0],x4
207	st1	{v1.4s},[x2],#16
208
	// Swap remaining pairs from both ends, applying InvMixColumns.
209.Loop_imc:
210	ld1	{v0.4s},[x2]
211	ld1	{v1.4s},[x0]
212	aesimc	v0.16b,v0.16b
213	aesimc	v1.16b,v1.16b
214	st1	{v0.4s},[x0],x4
215	st1	{v1.4s},[x2],#16
216	cmp	x0,x2
217	b.hi	.Loop_imc
218
	// Middle round key: transform in place (pointers have met).
219	ld1	{v0.4s},[x2]
220	aesimc	v0.16b,v0.16b
221	st1	{v0.4s},[x0]
222
223	eor	x0,x0,x0		// return value
224.Ldec_key_abort:
225	ldp	x29,x30,[sp],#16
226	AARCH64_VALIDATE_LINK_REGISTER
227	ret
228.size	aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
// void aes_hw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key)
//   x0 = in (one 16-byte block), x1 = out, x2 = key schedule.
// w3 = round count, read from byte offset 240 of the schedule.  The loop
// runs two rounds per iteration (round keys v0/v1 preloaded ahead of use);
// the final pair is unrolled below: the last aese has no aesmc, and the
// final round key is applied with a plain eor.
229.globl	aes_hw_encrypt
230.hidden	aes_hw_encrypt
231.type	aes_hw_encrypt,%function
232.align	5
233aes_hw_encrypt:
234	AARCH64_VALID_CALL_TARGET
235	ldr	w3,[x2,#240]
236	ld1	{v0.4s},[x2],#16
237	ld1	{v2.16b},[x0]
238	sub	w3,w3,#2
239	ld1	{v1.4s},[x2],#16
240
241.Loop_enc:
242	aese	v2.16b,v0.16b
243	aesmc	v2.16b,v2.16b
244	ld1	{v0.4s},[x2],#16
245	subs	w3,w3,#2
246	aese	v2.16b,v1.16b
247	aesmc	v2.16b,v2.16b
248	ld1	{v1.4s},[x2],#16
249	b.gt	.Loop_enc
250
251	aese	v2.16b,v0.16b
252	aesmc	v2.16b,v2.16b
253	ld1	{v0.4s},[x2]
254	aese	v2.16b,v1.16b
255	eor	v2.16b,v2.16b,v0.16b
256
257	st1	{v2.16b},[x1]
258	ret
259.size	aes_hw_encrypt,.-aes_hw_encrypt
// void aes_hw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key)
//   x0 = in (one 16-byte block), x1 = out, x2 = inverse key schedule
//   (as produced by aes_hw_set_decrypt_key).
// Mirror image of aes_hw_encrypt using aesd/aesimc: two rounds per loop
// iteration, final pair unrolled with the last round key applied via eor.
260.globl	aes_hw_decrypt
261.hidden	aes_hw_decrypt
262.type	aes_hw_decrypt,%function
263.align	5
264aes_hw_decrypt:
265	AARCH64_VALID_CALL_TARGET
266	ldr	w3,[x2,#240]
267	ld1	{v0.4s},[x2],#16
268	ld1	{v2.16b},[x0]
269	sub	w3,w3,#2
270	ld1	{v1.4s},[x2],#16
271
272.Loop_dec:
273	aesd	v2.16b,v0.16b
274	aesimc	v2.16b,v2.16b
275	ld1	{v0.4s},[x2],#16
276	subs	w3,w3,#2
277	aesd	v2.16b,v1.16b
278	aesimc	v2.16b,v2.16b
279	ld1	{v1.4s},[x2],#16
280	b.gt	.Loop_dec
281
282	aesd	v2.16b,v0.16b
283	aesimc	v2.16b,v2.16b
284	ld1	{v0.4s},[x2]
285	aesd	v2.16b,v1.16b
286	eor	v2.16b,v2.16b,v0.16b
287
288	st1	{v2.16b},[x1]
289	ret
290.size	aes_hw_decrypt,.-aes_hw_decrypt
// void aes_hw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len,
//                         const AES_KEY *key, uint8_t *ivec, int enc)
//   x0=in, x1=out, x2=len in bytes, x3=key schedule, x4=ivec, w5=enc flag
//   (w5==0 selects the decrypt path at .Lcbc_dec).
// len < 16 returns immediately via .Lcbc_abort; otherwise len is masked to a
// multiple of 16.  The final chaining value (v6) is written back to ivec at
// .Lcbc_done.  Encryption is serial (CBC dependency chain); decryption is
// pipelined three blocks at a time in .Loop3x_cbc_dec.
// x8 is the input post-increment: forced to 0 when exactly one block remains
// so the pre-load never reads past the end of the input.
291.globl	aes_hw_cbc_encrypt
292.hidden	aes_hw_cbc_encrypt
293.type	aes_hw_cbc_encrypt,%function
294.align	5
295aes_hw_cbc_encrypt:
296	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
297	AARCH64_VALID_CALL_TARGET
298	stp	x29,x30,[sp,#-16]!
299	add	x29,sp,#0
300	subs	x2,x2,#16
301	mov	x8,#16
302	b.lo	.Lcbc_abort
303	csel	x8,xzr,x8,eq
304
305	cmp	w5,#0			// en- or decrypting?
306	ldr	w5,[x3,#240]
307	and	x2,x2,#-16
308	ld1	{v6.16b},[x4]
309	ld1	{v0.16b},[x0],x8
310
	// Preload first two and last seven round keys; w5 becomes the loop
	// counter for the middle rounds (rounds minus 8).
311	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
312	sub	w5,w5,#6
313	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
314	sub	w5,w5,#2
315	ld1	{v18.4s,v19.4s},[x7],#32
316	ld1	{v20.4s,v21.4s},[x7],#32
317	ld1	{v22.4s,v23.4s},[x7],#32
318	ld1	{v7.4s},[x7]
319
320	add	x7,x3,#32
321	mov	w6,w5
322	b.eq	.Lcbc_dec
323
	// --- Encrypt path.  v5 = rndkey[0]^rndkey[last], so xoring the next
	// plaintext with v5 folds in rndkey[0] one step early.
324	cmp	w5,#2
325	eor	v0.16b,v0.16b,v6.16b
326	eor	v5.16b,v16.16b,v7.16b
327	b.eq	.Lcbc_enc128
328
	// 192/256-bit keys: stash pointers to rounds 1 and 4-7 so they can be
	// reloaded each iteration without re-walking the schedule.
329	ld1	{v2.4s,v3.4s},[x7]
330	add	x7,x3,#16
331	add	x6,x3,#16*4
332	add	x12,x3,#16*5
333	aese	v0.16b,v16.16b
334	aesmc	v0.16b,v0.16b
335	add	x14,x3,#16*6
336	add	x3,x3,#16*7
337	b	.Lenter_cbc_enc
338
339.align	4
340.Loop_cbc_enc:
341	aese	v0.16b,v16.16b
342	aesmc	v0.16b,v0.16b
343	st1	{v6.16b},[x1],#16
344.Lenter_cbc_enc:
345	aese	v0.16b,v17.16b
346	aesmc	v0.16b,v0.16b
347	aese	v0.16b,v2.16b
348	aesmc	v0.16b,v0.16b
349	ld1	{v16.4s},[x6]
350	cmp	w5,#4
351	aese	v0.16b,v3.16b
352	aesmc	v0.16b,v0.16b
353	ld1	{v17.4s},[x12]
354	b.eq	.Lcbc_enc192
355
	// 256-bit only: two extra rounds before the shared tail.
356	aese	v0.16b,v16.16b
357	aesmc	v0.16b,v0.16b
358	ld1	{v16.4s},[x14]
359	aese	v0.16b,v17.16b
360	aesmc	v0.16b,v0.16b
361	ld1	{v17.4s},[x3]
362	nop
363
364.Lcbc_enc192:
365	aese	v0.16b,v16.16b
366	aesmc	v0.16b,v0.16b
367	subs	x2,x2,#16
368	aese	v0.16b,v17.16b
369	aesmc	v0.16b,v0.16b
370	csel	x8,xzr,x8,eq
371	aese	v0.16b,v18.16b
372	aesmc	v0.16b,v0.16b
373	aese	v0.16b,v19.16b
374	aesmc	v0.16b,v0.16b
375	ld1	{v16.16b},[x0],x8
376	aese	v0.16b,v20.16b
377	aesmc	v0.16b,v0.16b
378	eor	v16.16b,v16.16b,v5.16b
379	aese	v0.16b,v21.16b
380	aesmc	v0.16b,v0.16b
381	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
382	aese	v0.16b,v22.16b
383	aesmc	v0.16b,v0.16b
384	aese	v0.16b,v23.16b
385	eor	v6.16b,v0.16b,v7.16b
386	b.hs	.Loop_cbc_enc
387
388	st1	{v6.16b},[x1],#16
389	b	.Lcbc_done
390
	// 128-bit key encrypt: all ten round keys already live in registers.
391.align	5
392.Lcbc_enc128:
393	ld1	{v2.4s,v3.4s},[x7]
394	aese	v0.16b,v16.16b
395	aesmc	v0.16b,v0.16b
396	b	.Lenter_cbc_enc128
397.Loop_cbc_enc128:
398	aese	v0.16b,v16.16b
399	aesmc	v0.16b,v0.16b
400	st1	{v6.16b},[x1],#16
401.Lenter_cbc_enc128:
402	aese	v0.16b,v17.16b
403	aesmc	v0.16b,v0.16b
404	subs	x2,x2,#16
405	aese	v0.16b,v2.16b
406	aesmc	v0.16b,v0.16b
407	csel	x8,xzr,x8,eq
408	aese	v0.16b,v3.16b
409	aesmc	v0.16b,v0.16b
410	aese	v0.16b,v18.16b
411	aesmc	v0.16b,v0.16b
412	aese	v0.16b,v19.16b
413	aesmc	v0.16b,v0.16b
414	ld1	{v16.16b},[x0],x8
415	aese	v0.16b,v20.16b
416	aesmc	v0.16b,v0.16b
417	aese	v0.16b,v21.16b
418	aesmc	v0.16b,v0.16b
419	aese	v0.16b,v22.16b
420	aesmc	v0.16b,v0.16b
421	eor	v16.16b,v16.16b,v5.16b
422	aese	v0.16b,v23.16b
423	eor	v6.16b,v0.16b,v7.16b
424	b.hs	.Loop_cbc_enc128
425
426	st1	{v6.16b},[x1],#16
427	b	.Lcbc_done
	// --- Decrypt path: independent blocks, so three are pipelined.
	// v0/v1/v18 = blocks in flight; v2/v3/v19 keep the ciphertext copies
	// needed as the next chaining values; v6 = current chaining value.
428.align	5
429.Lcbc_dec:
430	ld1	{v18.16b},[x0],#16
431	subs	x2,x2,#32		// bias
432	add	w6,w5,#2
433	orr	v3.16b,v0.16b,v0.16b
434	orr	v1.16b,v0.16b,v0.16b
435	orr	v19.16b,v18.16b,v18.16b
436	b.lo	.Lcbc_dec_tail
437
438	orr	v1.16b,v18.16b,v18.16b
439	ld1	{v18.16b},[x0],#16
440	orr	v2.16b,v0.16b,v0.16b
441	orr	v3.16b,v1.16b,v1.16b
442	orr	v19.16b,v18.16b,v18.16b
443
444.Loop3x_cbc_dec:
445	aesd	v0.16b,v16.16b
446	aesimc	v0.16b,v0.16b
447	aesd	v1.16b,v16.16b
448	aesimc	v1.16b,v1.16b
449	aesd	v18.16b,v16.16b
450	aesimc	v18.16b,v18.16b
451	ld1	{v16.4s},[x7],#16
452	subs	w6,w6,#2
453	aesd	v0.16b,v17.16b
454	aesimc	v0.16b,v0.16b
455	aesd	v1.16b,v17.16b
456	aesimc	v1.16b,v1.16b
457	aesd	v18.16b,v17.16b
458	aesimc	v18.16b,v18.16b
459	ld1	{v17.4s},[x7],#16
460	b.gt	.Loop3x_cbc_dec
461
	// Final rounds interleaved with: preparing chain^lastkey values
	// (v4/v5/v17), adjusting x0 for a short final batch, and preloading
	// the next three ciphertext blocks.
462	aesd	v0.16b,v16.16b
463	aesimc	v0.16b,v0.16b
464	aesd	v1.16b,v16.16b
465	aesimc	v1.16b,v1.16b
466	aesd	v18.16b,v16.16b
467	aesimc	v18.16b,v18.16b
468	eor	v4.16b,v6.16b,v7.16b
469	subs	x2,x2,#0x30
470	eor	v5.16b,v2.16b,v7.16b
471	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
472	aesd	v0.16b,v17.16b
473	aesimc	v0.16b,v0.16b
474	aesd	v1.16b,v17.16b
475	aesimc	v1.16b,v1.16b
476	aesd	v18.16b,v17.16b
477	aesimc	v18.16b,v18.16b
478	eor	v17.16b,v3.16b,v7.16b
479	add	x0,x0,x6		// x0 is adjusted in such way that
480					// at exit from the loop v1.16b-v18.16b
481					// are loaded with last "words"
482	orr	v6.16b,v19.16b,v19.16b
483	mov	x7,x3
484	aesd	v0.16b,v20.16b
485	aesimc	v0.16b,v0.16b
486	aesd	v1.16b,v20.16b
487	aesimc	v1.16b,v1.16b
488	aesd	v18.16b,v20.16b
489	aesimc	v18.16b,v18.16b
490	ld1	{v2.16b},[x0],#16
491	aesd	v0.16b,v21.16b
492	aesimc	v0.16b,v0.16b
493	aesd	v1.16b,v21.16b
494	aesimc	v1.16b,v1.16b
495	aesd	v18.16b,v21.16b
496	aesimc	v18.16b,v18.16b
497	ld1	{v3.16b},[x0],#16
498	aesd	v0.16b,v22.16b
499	aesimc	v0.16b,v0.16b
500	aesd	v1.16b,v22.16b
501	aesimc	v1.16b,v1.16b
502	aesd	v18.16b,v22.16b
503	aesimc	v18.16b,v18.16b
504	ld1	{v19.16b},[x0],#16
505	aesd	v0.16b,v23.16b
506	aesd	v1.16b,v23.16b
507	aesd	v18.16b,v23.16b
508	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
509	add	w6,w5,#2
510	eor	v4.16b,v4.16b,v0.16b
511	eor	v5.16b,v5.16b,v1.16b
512	eor	v18.16b,v18.16b,v17.16b
513	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
514	st1	{v4.16b},[x1],#16
515	orr	v0.16b,v2.16b,v2.16b
516	st1	{v5.16b},[x1],#16
517	orr	v1.16b,v3.16b,v3.16b
518	st1	{v18.16b},[x1],#16
519	orr	v18.16b,v19.16b,v19.16b
520	b.hs	.Loop3x_cbc_dec
521
	// x2 == -0x30 means everything was consumed by the 3x loop.
522	cmn	x2,#0x30
523	b.eq	.Lcbc_done
524	nop
525
	// Tail: one or two remaining blocks, processed in v1/v18.
526.Lcbc_dec_tail:
527	aesd	v1.16b,v16.16b
528	aesimc	v1.16b,v1.16b
529	aesd	v18.16b,v16.16b
530	aesimc	v18.16b,v18.16b
531	ld1	{v16.4s},[x7],#16
532	subs	w6,w6,#2
533	aesd	v1.16b,v17.16b
534	aesimc	v1.16b,v1.16b
535	aesd	v18.16b,v17.16b
536	aesimc	v18.16b,v18.16b
537	ld1	{v17.4s},[x7],#16
538	b.gt	.Lcbc_dec_tail
539
540	aesd	v1.16b,v16.16b
541	aesimc	v1.16b,v1.16b
542	aesd	v18.16b,v16.16b
543	aesimc	v18.16b,v18.16b
544	aesd	v1.16b,v17.16b
545	aesimc	v1.16b,v1.16b
546	aesd	v18.16b,v17.16b
547	aesimc	v18.16b,v18.16b
548	aesd	v1.16b,v20.16b
549	aesimc	v1.16b,v1.16b
550	aesd	v18.16b,v20.16b
551	aesimc	v18.16b,v18.16b
552	cmn	x2,#0x20
553	aesd	v1.16b,v21.16b
554	aesimc	v1.16b,v1.16b
555	aesd	v18.16b,v21.16b
556	aesimc	v18.16b,v18.16b
557	eor	v5.16b,v6.16b,v7.16b
558	aesd	v1.16b,v22.16b
559	aesimc	v1.16b,v1.16b
560	aesd	v18.16b,v22.16b
561	aesimc	v18.16b,v18.16b
562	eor	v17.16b,v3.16b,v7.16b
563	aesd	v1.16b,v23.16b
564	aesd	v18.16b,v23.16b
565	b.eq	.Lcbc_dec_one
566	eor	v5.16b,v5.16b,v1.16b
567	eor	v17.16b,v17.16b,v18.16b
568	orr	v6.16b,v19.16b,v19.16b
569	st1	{v5.16b},[x1],#16
570	st1	{v17.16b},[x1],#16
571	b	.Lcbc_done
572
573.Lcbc_dec_one:
574	eor	v5.16b,v5.16b,v18.16b
575	orr	v6.16b,v19.16b,v19.16b
576	st1	{v5.16b},[x1],#16
577
	// Write the final chaining value back to ivec.
578.Lcbc_done:
579	st1	{v6.16b},[x4]
580.Lcbc_abort:
581	ldr	x29,[sp],#16
582	ret
583.size	aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
// void aes_hw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
//                                  size_t blocks, const AES_KEY *key,
//                                  const uint8_t ivec[16])
//   x0=in, x1=out, x2=number of 16-byte blocks, x3=key schedule, x4=ivec.
// Only the low 32 bits of the counter are incremented: w8 holds the
// big-endian word at ivec[12..15] (byte-reversed on little-endian) and is
// written back into lane 3 of the counter vector per block.  Three blocks
// are processed per .Loop3x_ctr32 iteration, with a 1-2 block tail at
// .Lctr32_tail.  x12 is the tail's input stride, zeroed when only one block
// remains so the second load does not read past the input.
584.globl	aes_hw_ctr32_encrypt_blocks
585.hidden	aes_hw_ctr32_encrypt_blocks
586.type	aes_hw_ctr32_encrypt_blocks,%function
587.align	5
588aes_hw_ctr32_encrypt_blocks:
589	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
590	AARCH64_VALID_CALL_TARGET
591	stp	x29,x30,[sp,#-16]!
592	add	x29,sp,#0
593	ldr	w5,[x3,#240]
594
595	ldr	w8, [x4, #12]
596	ld1	{v0.4s},[x4]
597
	// Preload first two and last five round keys; w5 becomes the loop
	// counter for the middle rounds (rounds minus 6).
598	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
599	sub	w5,w5,#4
600	mov	x12,#16
601	cmp	x2,#2
602	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
603	sub	w5,w5,#2
604	ld1	{v20.4s,v21.4s},[x7],#32
605	ld1	{v22.4s,v23.4s},[x7],#32
606	ld1	{v7.4s},[x7]
607	add	x7,x3,#32
608	mov	w6,w5
609	csel	x12,xzr,x12,lo
610
611	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
612	// affected by silicon errata #1742098 [0] and #1655431 [1],
613	// respectively, where the second instruction of an aese/aesmc
614	// instruction pair may execute twice if an interrupt is taken right
615	// after the first instruction consumes an input register of which a
616	// single 32-bit lane has been updated the last time it was modified.
617	//
618	// This function uses a counter in one 32-bit lane. The vmov lines
619	// could write to v1.16b and v18.16b directly, but that trips these bugs.
620	// We write to v6.16b and copy to the final register as a workaround.
621	//
622	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
623	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
624#ifndef __AARCH64EB__
625	rev	w8, w8
626#endif
	// Seed v0/v1/v18 with counter, counter+1, counter+2 (lane 3 only).
627	add	w10, w8, #1
628	orr	v6.16b,v0.16b,v0.16b
629	rev	w10, w10
630	mov	v6.s[3],w10
631	add	w8, w8, #2
632	orr	v1.16b,v6.16b,v6.16b
633	b.ls	.Lctr32_tail
634	rev	w12, w8
635	mov	v6.s[3],w12
636	sub	x2,x2,#3		// bias
637	orr	v18.16b,v6.16b,v6.16b
638	b	.Loop3x_ctr32
639
640.align	4
641.Loop3x_ctr32:
642	aese	v0.16b,v16.16b
643	aesmc	v0.16b,v0.16b
644	aese	v1.16b,v16.16b
645	aesmc	v1.16b,v1.16b
646	aese	v18.16b,v16.16b
647	aesmc	v18.16b,v18.16b
648	ld1	{v16.4s},[x7],#16
649	subs	w6,w6,#2
650	aese	v0.16b,v17.16b
651	aesmc	v0.16b,v0.16b
652	aese	v1.16b,v17.16b
653	aesmc	v1.16b,v1.16b
654	aese	v18.16b,v17.16b
655	aesmc	v18.16b,v18.16b
656	ld1	{v17.4s},[x7],#16
657	b.gt	.Loop3x_ctr32
658
	// Final rounds: keystream moves into v4/v5/v17 so v0/v1/v18 can be
	// re-seeded with the next three counter values while input blocks are
	// loaded and xored with the last round key.
659	aese	v0.16b,v16.16b
660	aesmc	v4.16b,v0.16b
661	aese	v1.16b,v16.16b
662	aesmc	v5.16b,v1.16b
663	ld1	{v2.16b},[x0],#16
664	add	w9,w8,#1
665	aese	v18.16b,v16.16b
666	aesmc	v18.16b,v18.16b
667	ld1	{v3.16b},[x0],#16
668	rev	w9,w9
669	aese	v4.16b,v17.16b
670	aesmc	v4.16b,v4.16b
671	aese	v5.16b,v17.16b
672	aesmc	v5.16b,v5.16b
673	ld1	{v19.16b},[x0],#16
674	mov	x7,x3
675	aese	v18.16b,v17.16b
676	aesmc	v17.16b,v18.16b
677	aese	v4.16b,v20.16b
678	aesmc	v4.16b,v4.16b
679	aese	v5.16b,v20.16b
680	aesmc	v5.16b,v5.16b
681	eor	v2.16b,v2.16b,v7.16b
682	add	w10,w8,#2
683	aese	v17.16b,v20.16b
684	aesmc	v17.16b,v17.16b
685	eor	v3.16b,v3.16b,v7.16b
686	add	w8,w8,#3
687	aese	v4.16b,v21.16b
688	aesmc	v4.16b,v4.16b
689	aese	v5.16b,v21.16b
690	aesmc	v5.16b,v5.16b
691	 // Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
692	 // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
693	 // 32-bit mode. See the comment above.
694	eor	v19.16b,v19.16b,v7.16b
695	mov	v6.s[3], w9
696	aese	v17.16b,v21.16b
697	aesmc	v17.16b,v17.16b
698	orr	v0.16b,v6.16b,v6.16b
699	rev	w10,w10
700	aese	v4.16b,v22.16b
701	aesmc	v4.16b,v4.16b
702	mov	v6.s[3], w10
703	rev	w12,w8
704	aese	v5.16b,v22.16b
705	aesmc	v5.16b,v5.16b
706	orr	v1.16b,v6.16b,v6.16b
707	mov	v6.s[3], w12
708	aese	v17.16b,v22.16b
709	aesmc	v17.16b,v17.16b
710	orr	v18.16b,v6.16b,v6.16b
711	subs	x2,x2,#3
712	aese	v4.16b,v23.16b
713	aese	v5.16b,v23.16b
714	aese	v17.16b,v23.16b
715
716	eor	v2.16b,v2.16b,v4.16b
717	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
718	st1	{v2.16b},[x1],#16
719	eor	v3.16b,v3.16b,v5.16b
720	mov	w6,w5
721	st1	{v3.16b},[x1],#16
722	eor	v19.16b,v19.16b,v17.16b
723	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
724	st1	{v19.16b},[x1],#16
725	b.hs	.Loop3x_ctr32
726
	// Undo the bias; x2 is now 0, 1, or 2 remaining blocks.
727	adds	x2,x2,#3
728	b.eq	.Lctr32_done
729	cmp	x2,#1
730	mov	x12,#16
731	csel	x12,xzr,x12,eq
732
	// Tail: encrypt v0 (and v1 if two blocks remain) in parallel; the
	// second store is skipped when x2 == 1.
733.Lctr32_tail:
734	aese	v0.16b,v16.16b
735	aesmc	v0.16b,v0.16b
736	aese	v1.16b,v16.16b
737	aesmc	v1.16b,v1.16b
738	ld1	{v16.4s},[x7],#16
739	subs	w6,w6,#2
740	aese	v0.16b,v17.16b
741	aesmc	v0.16b,v0.16b
742	aese	v1.16b,v17.16b
743	aesmc	v1.16b,v1.16b
744	ld1	{v17.4s},[x7],#16
745	b.gt	.Lctr32_tail
746
747	aese	v0.16b,v16.16b
748	aesmc	v0.16b,v0.16b
749	aese	v1.16b,v16.16b
750	aesmc	v1.16b,v1.16b
751	aese	v0.16b,v17.16b
752	aesmc	v0.16b,v0.16b
753	aese	v1.16b,v17.16b
754	aesmc	v1.16b,v1.16b
755	ld1	{v2.16b},[x0],x12
756	aese	v0.16b,v20.16b
757	aesmc	v0.16b,v0.16b
758	aese	v1.16b,v20.16b
759	aesmc	v1.16b,v1.16b
760	ld1	{v3.16b},[x0]
761	aese	v0.16b,v21.16b
762	aesmc	v0.16b,v0.16b
763	aese	v1.16b,v21.16b
764	aesmc	v1.16b,v1.16b
765	eor	v2.16b,v2.16b,v7.16b
766	aese	v0.16b,v22.16b
767	aesmc	v0.16b,v0.16b
768	aese	v1.16b,v22.16b
769	aesmc	v1.16b,v1.16b
770	eor	v3.16b,v3.16b,v7.16b
771	aese	v0.16b,v23.16b
772	aese	v1.16b,v23.16b
773
774	cmp	x2,#1
775	eor	v2.16b,v2.16b,v0.16b
776	eor	v3.16b,v3.16b,v1.16b
777	st1	{v2.16b},[x1],#16
778	b.eq	.Lctr32_done
779	st1	{v3.16b},[x1]
780
781.Lctr32_done:
782	ldr	x29,[sp],#16
783	ret
784.size	aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
785#endif
786#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
787