xref: /aosp_15_r20/external/cronet/third_party/boringssl/src/gen/bcm/aesv8-armv8-win.S (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <openssl/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
7#include <openssl/arm_arch.h>
8
9#if __ARM_MAX_ARCH__>=7
10.text
11.arch	armv8-a+crypto
// Constant table read by the key-expansion code (Lenc_key).
//   row 0: 0x01 in every word. Loaded into v1 as the starting round constant;
//          the expansion loops shift it left by one bit after each use.
//   row 1: tbl index vector, loaded into v2. Its bytes are 13,14,15,12 in
//          every word, so tbl rotates the last source word by one byte and
//          copies it into all four lanes.
//   row 2: 0x1b in every word. Reloaded into v1 after the eight Loop128
//          iterations, for the last two AES-128 round keys.
12.section	.rodata
13.align	5
14Lrcon:
15.long	0x01,0x01,0x01,0x01
16.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
17.long	0x1b,0x1b,0x1b,0x1b
18
19.text
20
// aes_hw_set_encrypt_key / Lenc_key: expand a 128-, 192- or 256-bit AES key
// into an encryption key schedule.
//
// Inputs (as read by the code below):
//   x0 = pointer to the raw key bytes
//   w1 = key length in bits
//   x2 = pointer to the output key schedule
// Result in x0:
//    0  success
//   -1  x0 or x2 was null
//   -2  w1 is below 128, above 256, or not a multiple of 64
// On success the round keys are written consecutively starting at the
// original x2, and the round count (10, 12 or 14) is stored as a 32-bit word
// 240 bytes after it. On return x2 points at that word and w12 still holds
// the round count; aes_hw_set_decrypt_key enters at Lenc_key and relies on
// both. Modifies x0-x3, x12 (success path only), v0-v6 and the flags.
21.globl	aes_hw_set_encrypt_key
22
23.def aes_hw_set_encrypt_key
24   .type 32
25.endef
26.align	5
27aes_hw_set_encrypt_key:
28Lenc_key:
29	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	// (The epilogue reloads only x29; this function makes no calls, so x30
	// still holds the return address at the final ret.)
30	AARCH64_VALID_CALL_TARGET
31	stp	x29,x30,[sp,#-16]!
32	add	x29,sp,#0
	// x3 carries the return code to Lenc_key_abort: -1 for a null pointer,
	// -2 for an unsupported key length.
33	mov	x3,#-1
34	cmp	x0,#0
35	b.eq	Lenc_key_abort
36	cmp	x2,#0
37	b.eq	Lenc_key_abort
38	mov	x3,#-2
39	cmp	w1,#128
40	b.lt	Lenc_key_abort
41	cmp	w1,#256
42	b.gt	Lenc_key_abort
	// Only multiples of 64 bits pass, which leaves 128, 192 and 256.
43	tst	w1,#0x3f
44	b.ne	Lenc_key_abort
45
	// x3 is reused as the Lrcon table pointer. The flags from the compare
	// against 192 stay live until the three-way branch below; none of the
	// instructions in between modify them.
46	adrp	x3,Lrcon
47	add	x3,x3,:lo12:Lrcon
48	cmp	w1,#192
49
	// v0 = 0: used as the all-zero round key for aese and as the zero fill
	// shifted in by ext. v3 = first 16 key bytes. v1 = round constant,
	// v2 = rotate-n-splat tbl mask.
50	eor	v0.16b,v0.16b,v0.16b
51	ld1	{v3.16b},[x0],#16
52	mov	w1,#8		// reuse w1 as the loop counter
53	ld1	{v1.4s,v2.4s},[x3],#32
54
55	b.lt	Loop128
56	b.eq	L192
57	b	L256
58
59.align	4
	// AES-128: eight iterations, each storing the current round key (v3) and
	// deriving the next one.
60Loop128:
	// v6 = last word of v3 rotated by one byte and copied to all lanes; aese
	// with the zero key in v0 then substitutes its bytes.
61	tbl	v6.16b,{v3.16b},v2.16b
	// v5 = v3 shifted up by one 32-bit word, zero filled. The three ext/eor
	// pairs below leave word i of v3 holding the xor of original words 0..i.
62	ext	v5.16b,v0.16b,v3.16b,#12
63	st1	{v3.4s},[x2],#16
64	aese	v6.16b,v0.16b
65	subs	w1,w1,#1
66
67	eor	v3.16b,v3.16b,v5.16b
68	ext	v5.16b,v0.16b,v5.16b,#12
69	eor	v3.16b,v3.16b,v5.16b
70	ext	v5.16b,v0.16b,v5.16b,#12
	// Fold in the round constant, then double it for the next iteration.
71	eor	v6.16b,v6.16b,v1.16b
72	eor	v3.16b,v3.16b,v5.16b
73	shl	v1.16b,v1.16b,#1
74	eor	v3.16b,v3.16b,v6.16b
75	b.ne	Loop128
76
	// Switch to the third Lrcon row (0x1b) for the last two round keys, which
	// are derived by the two unrolled copies of the loop body below.
77	ld1	{v1.4s},[x3]
78
79	tbl	v6.16b,{v3.16b},v2.16b
80	ext	v5.16b,v0.16b,v3.16b,#12
81	st1	{v3.4s},[x2],#16
82	aese	v6.16b,v0.16b
83
84	eor	v3.16b,v3.16b,v5.16b
85	ext	v5.16b,v0.16b,v5.16b,#12
86	eor	v3.16b,v3.16b,v5.16b
87	ext	v5.16b,v0.16b,v5.16b,#12
88	eor	v6.16b,v6.16b,v1.16b
89	eor	v3.16b,v3.16b,v5.16b
90	shl	v1.16b,v1.16b,#1
91	eor	v3.16b,v3.16b,v6.16b
92
93	tbl	v6.16b,{v3.16b},v2.16b
94	ext	v5.16b,v0.16b,v3.16b,#12
95	st1	{v3.4s},[x2],#16
96	aese	v6.16b,v0.16b
97
98	eor	v3.16b,v3.16b,v5.16b
99	ext	v5.16b,v0.16b,v5.16b,#12
100	eor	v3.16b,v3.16b,v5.16b
101	ext	v5.16b,v0.16b,v5.16b,#12
102	eor	v6.16b,v6.16b,v1.16b
103	eor	v3.16b,v3.16b,v5.16b
104	eor	v3.16b,v3.16b,v6.16b
	// Final (eleventh) round key is stored without post-increment at offset
	// 160; adding 0x50 moves x2 to offset 240 where the round count goes.
105	st1	{v3.4s},[x2]
106	add	x2,x2,#0x50
107
108	mov	w12,#10
109	b	Ldone
110
111.align	4
	// AES-192: v4 (low 8 bytes) holds the remaining 64 key bits. The tbl mask
	// is lowered by 8 so that it selects bytes 5,6,7,4, i.e. the last word of
	// that 8-byte part.
112L192:
113	ld1	{v4.8b},[x0],#8
114	movi	v6.16b,#8			// borrow v6.16b
115	st1	{v3.4s},[x2],#16
116	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
117
	// Eight iterations, each emitting 8 bytes from v4 followed by 16 bytes
	// from v3 (24 bytes of schedule per iteration).
118Loop192:
119	tbl	v6.16b,{v4.16b},v2.16b
120	ext	v5.16b,v0.16b,v3.16b,#12
121	st1	{v4.8b},[x2],#8
122	aese	v6.16b,v0.16b
123	subs	w1,w1,#1
124
125	eor	v3.16b,v3.16b,v5.16b
126	ext	v5.16b,v0.16b,v5.16b,#12
127	eor	v3.16b,v3.16b,v5.16b
128	ext	v5.16b,v0.16b,v5.16b,#12
129	eor	v3.16b,v3.16b,v5.16b
130
131	dup	v5.4s,v3.s[3]
132	eor	v5.16b,v5.16b,v4.16b
133	eor	v6.16b,v6.16b,v1.16b
134	ext	v4.16b,v0.16b,v4.16b,#12
135	shl	v1.16b,v1.16b,#1
136	eor	v4.16b,v4.16b,v5.16b
137	eor	v3.16b,v3.16b,v6.16b
138	eor	v4.16b,v4.16b,v6.16b
139	st1	{v3.4s},[x2],#16
140	b.ne	Loop192
141
	// 16 + 8*24 = 208 bytes have been written; skip 0x20 more to offset 240.
142	mov	w12,#12
143	add	x2,x2,#0x20
144	b	Ldone
145
146.align	4
	// AES-256: v3 and v4 hold the two 16-byte key halves. Seven iterations,
	// each storing v4 and then the newly derived v3; the loop leaves through
	// the b.eq in the middle of the body, at which point x2 is at offset 240.
147L256:
148	ld1	{v4.16b},[x0]
149	mov	w1,#7
150	mov	w12,#14
151	st1	{v3.4s},[x2],#16
152
153Loop256:
154	tbl	v6.16b,{v4.16b},v2.16b
155	ext	v5.16b,v0.16b,v3.16b,#12
156	st1	{v4.4s},[x2],#16
157	aese	v6.16b,v0.16b
158	subs	w1,w1,#1
159
160	eor	v3.16b,v3.16b,v5.16b
161	ext	v5.16b,v0.16b,v5.16b,#12
162	eor	v3.16b,v3.16b,v5.16b
163	ext	v5.16b,v0.16b,v5.16b,#12
164	eor	v6.16b,v6.16b,v1.16b
165	eor	v3.16b,v3.16b,v5.16b
166	shl	v1.16b,v1.16b,#1
167	eor	v3.16b,v3.16b,v6.16b
168	st1	{v3.4s},[x2],#16
	// Flags are still those of the subs above (vector ops and st1 leave them
	// alone).
169	b.eq	Ldone
170
	// Second half-step: the last word of the new v3 is splatted without the
	// byte rotation and no round constant is applied.
171	dup	v6.4s,v3.s[3]		// just splat
172	ext	v5.16b,v0.16b,v4.16b,#12
173	aese	v6.16b,v0.16b
174
175	eor	v4.16b,v4.16b,v5.16b
176	ext	v5.16b,v0.16b,v5.16b,#12
177	eor	v4.16b,v4.16b,v5.16b
178	ext	v5.16b,v0.16b,v5.16b,#12
179	eor	v4.16b,v4.16b,v5.16b
180
181	eor	v4.16b,v4.16b,v6.16b
182	b	Loop256
183
	// Common success exit: x2 is at schedule offset 240; record the round
	// count there and select return code 0.
184Ldone:
185	str	w12,[x2]
186	mov	x3,#0
187
188Lenc_key_abort:
189	mov	x0,x3			// return value
	// Pop the 16-byte frame, restoring x29 only; x30 was never modified.
190	ldr	x29,[sp],#16
191	ret
192
193
// aes_hw_set_decrypt_key: build a decryption key schedule.
//
// Takes the same x0/w1/x2 arguments as aes_hw_set_encrypt_key and first runs
// that expansion through its Lenc_key entry. If it reports an error, the
// nonzero code is returned unchanged in x0. Otherwise the round keys are
// reversed in place: the first and last keys are swapped as they are, and
// every key in between is additionally passed through aesimc. Returns 0 in x0.
//
// Relies on Lenc_key leaving x2 pointing 240 bytes past the start of the
// schedule and the round count in x12. Modifies x0, x2, x4, v0, v1 and the
// flags, in addition to whatever Lenc_key modifies.
194.globl	aes_hw_set_decrypt_key
195
196.def aes_hw_set_decrypt_key
197   .type 32
198.endef
199.align	5
200aes_hw_set_decrypt_key:
	// The SIGN/VALIDATE macros come from <openssl/arm_arch.h>; presumably they
	// sign and authenticate x30 when pointer authentication is enabled -
	// confirm against that header.
201	AARCH64_SIGN_LINK_REGISTER
202	stp	x29,x30,[sp,#-16]!
203	add	x29,sp,#0
204	bl	Lenc_key
205
206	cmp	x0,#0
207	b.ne	Ldec_key_abort
208
209	sub	x2,x2,#240		// restore original x2
	// x4 = -16 is the post-index step that walks x0 downwards.
210	mov	x4,#-16
211	add	x0,x2,x12,lsl#4	// end of key schedule
212
	// Swap the first and last round keys without transforming them.
213	ld1	{v0.4s},[x2]
214	ld1	{v1.4s},[x0]
215	st1	{v0.4s},[x0],x4
216	st1	{v1.4s},[x2],#16
217
	// Walk inwards from both ends, applying aesimc to each pair and storing
	// them swapped, until the two pointers meet.
218Loop_imc:
219	ld1	{v0.4s},[x2]
220	ld1	{v1.4s},[x0]
221	aesimc	v0.16b,v0.16b
222	aesimc	v1.16b,v1.16b
223	st1	{v0.4s},[x0],x4
224	st1	{v1.4s},[x2],#16
225	cmp	x0,x2
226	b.hi	Loop_imc
227
	// The number of round keys (round count + 1) is odd, so x0 and x2 now
	// both address the middle key; transform it in place.
228	ld1	{v0.4s},[x2]
229	aesimc	v0.16b,v0.16b
230	st1	{v0.4s},[x0]
231
232	eor	x0,x0,x0		// return value
233Ldec_key_abort:
234	ldp	x29,x30,[sp],#16
235	AARCH64_VALIDATE_LINK_REGISTER
236	ret
237
// aes_hw_encrypt: encrypt one 16-byte block.
//
// Inputs (as read by the code below):
//   x0 = pointer to the 16-byte input block
//   x1 = pointer to the 16-byte output block
//   x2 = key schedule; round keys from offset 0, round count at offset 240
// Leaf routine without a stack frame. Modifies x2, w3, v0-v2 and the flags.
238.globl	aes_hw_encrypt
239
240.def aes_hw_encrypt
241   .type 32
242.endef
243.align	5
244aes_hw_encrypt:
245	AARCH64_VALID_CALL_TARGET
	// w3 = round count; v0/v1 are preloaded with the first two round keys and
	// v2 holds the block being processed.
246	ldr	w3,[x2,#240]
247	ld1	{v0.4s},[x2],#16
248	ld1	{v2.16b},[x0]
249	sub	w3,w3,#2
250	ld1	{v1.4s},[x2],#16
251
	// Two rounds per iteration; each round key is replaced by the next one
	// from the schedule right after it has been used.
252Loop_enc:
253	aese	v2.16b,v0.16b
254	aesmc	v2.16b,v2.16b
255	ld1	{v0.4s},[x2],#16
256	subs	w3,w3,#2
257	aese	v2.16b,v1.16b
258	aesmc	v2.16b,v2.16b
259	ld1	{v1.4s},[x2],#16
260	b.gt	Loop_enc
261
	// Last two rounds: the final aese is not followed by aesmc, and the last
	// round key is applied with a plain xor.
262	aese	v2.16b,v0.16b
263	aesmc	v2.16b,v2.16b
264	ld1	{v0.4s},[x2]
265	aese	v2.16b,v1.16b
266	eor	v2.16b,v2.16b,v0.16b
267
268	st1	{v2.16b},[x1]
269	ret
270
// aes_hw_decrypt: decrypt one 16-byte block.
//
// Inputs (as read by the code below):
//   x0 = pointer to the 16-byte input block
//   x1 = pointer to the 16-byte output block
//   x2 = key schedule; round keys from offset 0, round count at offset 240
// The round keys are consumed in ascending address order with aesd/aesimc,
// so the schedule is presumably one prepared by aes_hw_set_decrypt_key
// (reversed, inner keys already passed through aesimc) - confirm at callers.
// Leaf routine without a stack frame. Modifies x2, w3, v0-v2 and the flags.
271.globl	aes_hw_decrypt
272
273.def aes_hw_decrypt
274   .type 32
275.endef
276.align	5
277aes_hw_decrypt:
278	AARCH64_VALID_CALL_TARGET
	// w3 = round count; v0/v1 are preloaded with the first two round keys and
	// v2 holds the block being processed.
279	ldr	w3,[x2,#240]
280	ld1	{v0.4s},[x2],#16
281	ld1	{v2.16b},[x0]
282	sub	w3,w3,#2
283	ld1	{v1.4s},[x2],#16
284
	// Two rounds per iteration; each round key is replaced by the next one
	// from the schedule right after it has been used.
285Loop_dec:
286	aesd	v2.16b,v0.16b
287	aesimc	v2.16b,v2.16b
288	ld1	{v0.4s},[x2],#16
289	subs	w3,w3,#2
290	aesd	v2.16b,v1.16b
291	aesimc	v2.16b,v2.16b
292	ld1	{v1.4s},[x2],#16
293	b.gt	Loop_dec
294
	// Last two rounds: the final aesd is not followed by aesimc, and the last
	// round key is applied with a plain xor.
295	aesd	v2.16b,v0.16b
296	aesimc	v2.16b,v2.16b
297	ld1	{v0.4s},[x2]
298	aesd	v2.16b,v1.16b
299	eor	v2.16b,v2.16b,v0.16b
300
301	st1	{v2.16b},[x1]
302	ret
303
// aes_hw_cbc_encrypt: CBC-mode encryption or decryption of whole 16-byte
// blocks.
//
// Inputs (as read by the code below):
//   x0 = input pointer
//   x1 = output pointer
//   x2 = length in bytes. It is rounded down to a multiple of 16; when it is
//        below 16 the routine returns at once without touching the output
//        or the IV.
//   x3 = key schedule, round count read from offset 240
//   x4 = pointer to the 16-byte IV. Overwritten on exit with the last
//        ciphertext block processed.
//   w5 = zero selects decryption, nonzero selects encryption
// No return value is set up. Encryption handles one block per iteration;
// decryption handles three blocks per iteration plus a tail of one or two.
// Modifies x0-x3, x5-x8, x12, x14, v0-v7, v16-v23 and the flags.
304.globl	aes_hw_cbc_encrypt
305
306.def aes_hw_cbc_encrypt
307   .type 32
308.endef
309.align	5
310aes_hw_cbc_encrypt:
311	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
312	AARCH64_VALID_CALL_TARGET
313	stp	x29,x30,[sp,#-16]!
314	add	x29,sp,#0
	// x8 is the post-index step for input loads: 16 normally, 0 when exactly
	// one block is left, so the pointer is not advanced past the last block.
315	subs	x2,x2,#16
316	mov	x8,#16
317	b.lo	Lcbc_abort
318	csel	x8,xzr,x8,eq
319
	// The result of this compare is consumed by the b.eq Lcbc_dec further
	// down; nothing in between modifies the flags. w5 is then reused for the
	// round count. v6 = IV, v0 = first input block.
320	cmp	w5,#0			// en- or decrypting?
321	ldr	w5,[x3,#240]
322	and	x2,x2,#-16
323	ld1	{v6.16b},[x4]
324	ld1	{v0.16b},[x0],x8
325
	// Round-key registers: v16,v17 = first two round keys; v18-v23 and v7 =
	// last seven round keys (v7 is the final one). w5 ends up as the round
	// count minus 8, i.e. 2, 4 or 6.
326	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
327	sub	w5,w5,#6
328	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
329	sub	w5,w5,#2
330	ld1	{v18.4s,v19.4s},[x7],#32
331	ld1	{v20.4s,v21.4s},[x7],#32
332	ld1	{v22.4s,v23.4s},[x7],#32
333	ld1	{v7.4s},[x7]
334
	// x7 = address of the third round key.
335	add	x7,x3,#32
336	mov	w6,w5
337	b.eq	Lcbc_dec
338
	// Encryption. v0 = first plaintext block xor IV. v5 = first round key xor
	// last round key: each following plaintext block is xored with v5 and used
	// as the key operand of the first aese, which in one step applies the
	// last round key to the previous block, chains that ciphertext into the
	// new block and applies the first round key.
339	cmp	w5,#2
340	eor	v0.16b,v0.16b,v6.16b
341	eor	v5.16b,v16.16b,v7.16b
342	b.eq	Lcbc_enc128
343
	// 192/256-bit keys: v2,v3 = round keys 2 and 3. The middle round keys do
	// not fit in registers, so pointers to them are kept instead:
	// x7 -> key 1, x6 -> key 4, x12 -> key 5, x14 -> key 6, x3 -> key 7.
344	ld1	{v2.4s,v3.4s},[x7]
345	add	x7,x3,#16
346	add	x6,x3,#16*4
347	add	x12,x3,#16*5
348	aese	v0.16b,v16.16b
349	aesmc	v0.16b,v0.16b
350	add	x14,x3,#16*6
351	add	x3,x3,#16*7
352	b	Lenter_cbc_enc
353
354.align	4
	// On loop re-entry v16 holds the next plaintext block xor v5 and v6 holds
	// the ciphertext block just produced.
355Loop_cbc_enc:
356	aese	v0.16b,v16.16b
357	aesmc	v0.16b,v0.16b
358	st1	{v6.16b},[x1],#16
359Lenter_cbc_enc:
360	aese	v0.16b,v17.16b
361	aesmc	v0.16b,v0.16b
362	aese	v0.16b,v2.16b
363	aesmc	v0.16b,v0.16b
364	ld1	{v16.4s},[x6]
	// w5 == 4 means a 12-round key: skip the two extra rounds below.
365	cmp	w5,#4
366	aese	v0.16b,v3.16b
367	aesmc	v0.16b,v0.16b
368	ld1	{v17.4s},[x12]
369	b.eq	Lcbc_enc192
370
371	aese	v0.16b,v16.16b
372	aesmc	v0.16b,v0.16b
373	ld1	{v16.4s},[x14]
374	aese	v0.16b,v17.16b
375	aesmc	v0.16b,v0.16b
376	ld1	{v17.4s},[x3]
377	nop
378
379Lcbc_enc192:
380	aese	v0.16b,v16.16b
381	aesmc	v0.16b,v0.16b
	// Count down the remaining length; the resulting flags drive both the
	// csel below and the b.hs at the end of the loop body.
382	subs	x2,x2,#16
383	aese	v0.16b,v17.16b
384	aesmc	v0.16b,v0.16b
385	csel	x8,xzr,x8,eq
386	aese	v0.16b,v18.16b
387	aesmc	v0.16b,v0.16b
388	aese	v0.16b,v19.16b
389	aesmc	v0.16b,v0.16b
	// v16 = next plaintext block (step x8 is 0 on the last block).
390	ld1	{v16.16b},[x0],x8
391	aese	v0.16b,v20.16b
392	aesmc	v0.16b,v0.16b
393	eor	v16.16b,v16.16b,v5.16b
394	aese	v0.16b,v21.16b
395	aesmc	v0.16b,v0.16b
396	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
397	aese	v0.16b,v22.16b
398	aesmc	v0.16b,v0.16b
399	aese	v0.16b,v23.16b
	// v6 = finished ciphertext block; v0 itself stays without the last round
	// key so that the next iteration can fold it in through v16.
400	eor	v6.16b,v0.16b,v7.16b
401	b.hs	Loop_cbc_enc
402
403	st1	{v6.16b},[x1],#16
404	b	Lcbc_done
405
406.align	5
	// 10-round variant: every round key stays in registers (v16/v17, v2/v3,
	// v18-v23, v7), so the loop only loads input data.
407Lcbc_enc128:
408	ld1	{v2.4s,v3.4s},[x7]
409	aese	v0.16b,v16.16b
410	aesmc	v0.16b,v0.16b
411	b	Lenter_cbc_enc128
412Loop_cbc_enc128:
413	aese	v0.16b,v16.16b
414	aesmc	v0.16b,v0.16b
415	st1	{v6.16b},[x1],#16
416Lenter_cbc_enc128:
417	aese	v0.16b,v17.16b
418	aesmc	v0.16b,v0.16b
419	subs	x2,x2,#16
420	aese	v0.16b,v2.16b
421	aesmc	v0.16b,v0.16b
422	csel	x8,xzr,x8,eq
423	aese	v0.16b,v3.16b
424	aesmc	v0.16b,v0.16b
425	aese	v0.16b,v18.16b
426	aesmc	v0.16b,v0.16b
427	aese	v0.16b,v19.16b
428	aesmc	v0.16b,v0.16b
429	ld1	{v16.16b},[x0],x8
430	aese	v0.16b,v20.16b
431	aesmc	v0.16b,v0.16b
432	aese	v0.16b,v21.16b
433	aesmc	v0.16b,v0.16b
434	aese	v0.16b,v22.16b
435	aesmc	v0.16b,v0.16b
436	eor	v16.16b,v16.16b,v5.16b
437	aese	v0.16b,v23.16b
438	eor	v6.16b,v0.16b,v7.16b
439	b.hs	Loop_cbc_enc128
440
441	st1	{v6.16b},[x1],#16
442	b	Lcbc_done
443.align	5
	// Decryption. v0, v1 and v18 carry up to three ciphertext blocks through
	// the rounds; v2, v3 and v19 keep untouched copies of those blocks for
	// the chaining xor; v6 is the IV or previous ciphertext block. Fewer than
	// three blocks in total go straight to the tail.
444Lcbc_dec:
445	ld1	{v18.16b},[x0],#16
446	subs	x2,x2,#32		// bias
	// w6 = inner round-loop counter (round count minus 6).
447	add	w6,w5,#2
448	orr	v3.16b,v0.16b,v0.16b
449	orr	v1.16b,v0.16b,v0.16b
450	orr	v19.16b,v18.16b,v18.16b
451	b.lo	Lcbc_dec_tail
452
453	orr	v1.16b,v18.16b,v18.16b
454	ld1	{v18.16b},[x0],#16
455	orr	v2.16b,v0.16b,v0.16b
456	orr	v3.16b,v1.16b,v1.16b
457	orr	v19.16b,v18.16b,v18.16b
458
	// Inner loop: two rounds per pass on all three blocks, streaming the
	// middle round keys through v16/v17 from x7.
459Loop3x_cbc_dec:
460	aesd	v0.16b,v16.16b
461	aesimc	v0.16b,v0.16b
462	aesd	v1.16b,v16.16b
463	aesimc	v1.16b,v1.16b
464	aesd	v18.16b,v16.16b
465	aesimc	v18.16b,v18.16b
466	ld1	{v16.4s},[x7],#16
467	subs	w6,w6,#2
468	aesd	v0.16b,v17.16b
469	aesimc	v0.16b,v0.16b
470	aesd	v1.16b,v17.16b
471	aesimc	v1.16b,v1.16b
472	aesd	v18.16b,v17.16b
473	aesimc	v18.16b,v18.16b
474	ld1	{v17.4s},[x7],#16
475	b.gt	Loop3x_cbc_dec
476
477	aesd	v0.16b,v16.16b
478	aesimc	v0.16b,v0.16b
479	aesd	v1.16b,v16.16b
480	aesimc	v1.16b,v1.16b
481	aesd	v18.16b,v16.16b
482	aesimc	v18.16b,v18.16b
	// v4, v5 and v17 become (chaining value xor last round key) for blocks
	// 0, 1 and 2, so one xor per block finishes both steps at the end.
483	eor	v4.16b,v6.16b,v7.16b
	// Flags from this subs survive until the b.hs Loop3x_cbc_dec below.
484	subs	x2,x2,#0x30
485	eor	v5.16b,v2.16b,v7.16b
486	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
487	aesd	v0.16b,v17.16b
488	aesimc	v0.16b,v0.16b
489	aesd	v1.16b,v17.16b
490	aesimc	v1.16b,v1.16b
491	aesd	v18.16b,v17.16b
492	aesimc	v18.16b,v18.16b
493	eor	v17.16b,v3.16b,v7.16b
494	add	x0,x0,x6		// x0 is adjusted in such way that
495					// at exit from the loop v1.16b-v18.16b
496					// are loaded with last "words"
	// The last ciphertext block of this group becomes the next chaining value.
497	orr	v6.16b,v19.16b,v19.16b
498	mov	x7,x3
499	aesd	v0.16b,v20.16b
500	aesimc	v0.16b,v0.16b
501	aesd	v1.16b,v20.16b
502	aesimc	v1.16b,v1.16b
503	aesd	v18.16b,v20.16b
504	aesimc	v18.16b,v18.16b
505	ld1	{v2.16b},[x0],#16
506	aesd	v0.16b,v21.16b
507	aesimc	v0.16b,v0.16b
508	aesd	v1.16b,v21.16b
509	aesimc	v1.16b,v1.16b
510	aesd	v18.16b,v21.16b
511	aesimc	v18.16b,v18.16b
512	ld1	{v3.16b},[x0],#16
513	aesd	v0.16b,v22.16b
514	aesimc	v0.16b,v0.16b
515	aesd	v1.16b,v22.16b
516	aesimc	v1.16b,v1.16b
517	aesd	v18.16b,v22.16b
518	aesimc	v18.16b,v18.16b
519	ld1	{v19.16b},[x0],#16
520	aesd	v0.16b,v23.16b
521	aesd	v1.16b,v23.16b
522	aesd	v18.16b,v23.16b
523	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
524	add	w6,w5,#2
525	eor	v4.16b,v4.16b,v0.16b
526	eor	v5.16b,v5.16b,v1.16b
527	eor	v18.16b,v18.16b,v17.16b
528	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
529	st1	{v4.16b},[x1],#16
530	orr	v0.16b,v2.16b,v2.16b
531	st1	{v5.16b},[x1],#16
532	orr	v1.16b,v3.16b,v3.16b
533	st1	{v18.16b},[x1],#16
534	orr	v18.16b,v19.16b,v19.16b
535	b.hs	Loop3x_cbc_dec
536
	// x2 == -0x30 means nothing is left; otherwise one or two blocks remain.
537	cmn	x2,#0x30
538	b.eq	Lcbc_done
539	nop
540
	// Tail: run the rounds on v1 and v18 only. Whether one or two blocks are
	// left is decided by the cmn against 0x20 further down.
541Lcbc_dec_tail:
542	aesd	v1.16b,v16.16b
543	aesimc	v1.16b,v1.16b
544	aesd	v18.16b,v16.16b
545	aesimc	v18.16b,v18.16b
546	ld1	{v16.4s},[x7],#16
547	subs	w6,w6,#2
548	aesd	v1.16b,v17.16b
549	aesimc	v1.16b,v1.16b
550	aesd	v18.16b,v17.16b
551	aesimc	v18.16b,v18.16b
552	ld1	{v17.4s},[x7],#16
553	b.gt	Lcbc_dec_tail
554
555	aesd	v1.16b,v16.16b
556	aesimc	v1.16b,v1.16b
557	aesd	v18.16b,v16.16b
558	aesimc	v18.16b,v18.16b
559	aesd	v1.16b,v17.16b
560	aesimc	v1.16b,v1.16b
561	aesd	v18.16b,v17.16b
562	aesimc	v18.16b,v18.16b
563	aesd	v1.16b,v20.16b
564	aesimc	v1.16b,v1.16b
565	aesd	v18.16b,v20.16b
566	aesimc	v18.16b,v18.16b
	// Flags from this cmn are consumed by the b.eq Lcbc_dec_one below.
567	cmn	x2,#0x20
568	aesd	v1.16b,v21.16b
569	aesimc	v1.16b,v1.16b
570	aesd	v18.16b,v21.16b
571	aesimc	v18.16b,v18.16b
572	eor	v5.16b,v6.16b,v7.16b
573	aesd	v1.16b,v22.16b
574	aesimc	v1.16b,v1.16b
575	aesd	v18.16b,v22.16b
576	aesimc	v18.16b,v18.16b
577	eor	v17.16b,v3.16b,v7.16b
578	aesd	v1.16b,v23.16b
579	aesd	v18.16b,v23.16b
580	b.eq	Lcbc_dec_one
	// Two blocks left: v1 and v18 are both valid.
581	eor	v5.16b,v5.16b,v1.16b
582	eor	v17.16b,v17.16b,v18.16b
583	orr	v6.16b,v19.16b,v19.16b
584	st1	{v5.16b},[x1],#16
585	st1	{v17.16b},[x1],#16
586	b	Lcbc_done
587
	// One block left: only v18 is used.
588Lcbc_dec_one:
589	eor	v5.16b,v5.16b,v18.16b
590	orr	v6.16b,v19.16b,v19.16b
591	st1	{v5.16b},[x1],#16
592
	// Write the chaining value back so that a following call can continue.
593Lcbc_done:
594	st1	{v6.16b},[x4]
595Lcbc_abort:
	// Pop the 16-byte frame, restoring x29 only; x30 was never modified.
596	ldr	x29,[sp],#16
597	ret
598
// aes_hw_ctr32_encrypt_blocks: CTR-mode processing with a 32-bit counter.
//
// Inputs (as read by the code below):
//   x0 = input pointer
//   x1 = output pointer
//   x2 = number of 16-byte blocks
//   x3 = key schedule, round count read from offset 240
//   x4 = pointer to the 16-byte counter block. Its last 32-bit word is the
//        big-endian block counter. The block is only read, never written
//        back.
// The counter is incremented with 32-bit arithmetic only, so it wraps without
// carrying into the rest of the counter block. Three blocks are processed
// per main-loop iteration, with a tail for one or two blocks.
// NOTE(review): x2 == 0 is not special-cased; the tail path would still read
// one input block and write two output blocks. Callers presumably never pass
// zero - confirm.
// Modifies x0-x2, x5-x10, x12, v0-v7, v16-v23 and the flags.
599.globl	aes_hw_ctr32_encrypt_blocks
600
601.def aes_hw_ctr32_encrypt_blocks
602   .type 32
603.endef
604.align	5
605aes_hw_ctr32_encrypt_blocks:
606	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
607	AARCH64_VALID_CALL_TARGET
608	stp	x29,x30,[sp,#-16]!
609	add	x29,sp,#0
610	ldr	w5,[x3,#240]
611
	// w8 = counter word as stored in memory; v0 = whole counter block.
612	ldr	w8, [x4, #12]
613	ld1	{v0.4s},[x4]
614
	// Round-key registers: v16,v17 = first two round keys; v20-v23 and v7 =
	// last five (v7 is the final one). w5 and w6 end up as the round count
	// minus 6. x12 is the input step used by the tail: 16, or 0 when fewer
	// than two blocks were requested. The flags set by cmp x2,#2 are consumed
	// by the csel below and by the b.ls Lctr32_tail after the counter setup;
	// nothing in between modifies them.
615	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
616	sub	w5,w5,#4
617	mov	x12,#16
618	cmp	x2,#2
619	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
620	sub	w5,w5,#2
621	ld1	{v20.4s,v21.4s},[x7],#32
622	ld1	{v22.4s,v23.4s},[x7],#32
623	ld1	{v7.4s},[x7]
624	add	x7,x3,#32
625	mov	w6,w5
626	csel	x12,xzr,x12,lo
627
628	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
629	// affected by silicon errata #1742098 [0] and #1655431 [1],
630	// respectively, where the second instruction of an aese/aesmc
631	// instruction pair may execute twice if an interrupt is taken right
632	// after the first instruction consumes an input register of which a
633	// single 32-bit lane has been updated the last time it was modified.
634	//
635	// This function uses a counter in one 32-bit lane. The lane-insert lines
636	// could write to v1.16b and v18.16b directly, but that trips these bugs.
637	// We write to v6.16b and copy to the final register as a workaround.
638	//
639	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
640	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
641#ifndef __AARCH64EB__
642	rev	w8, w8
643#endif
	// w8 now holds the counter in host byte order. v1 = counter block + 1,
	// and on the three-block path v18 = counter block + 2.
644	add	w10, w8, #1
645	orr	v6.16b,v0.16b,v0.16b
646	rev	w10, w10
647	mov	v6.s[3],w10
648	add	w8, w8, #2
649	orr	v1.16b,v6.16b,v6.16b
650	b.ls	Lctr32_tail
	// w12 is only a scratch register here; x12 is set up again before the
	// tail is entered from the main loop.
651	rev	w12, w8
652	mov	v6.s[3],w12
653	sub	x2,x2,#3		// bias
654	orr	v18.16b,v6.16b,v6.16b
655	b	Loop3x_ctr32
656
657.align	4
	// Inner loop: two rounds per pass on the three counter blocks, streaming
	// the middle round keys through v16/v17 from x7.
658Loop3x_ctr32:
659	aese	v0.16b,v16.16b
660	aesmc	v0.16b,v0.16b
661	aese	v1.16b,v16.16b
662	aesmc	v1.16b,v1.16b
663	aese	v18.16b,v16.16b
664	aesmc	v18.16b,v18.16b
665	ld1	{v16.4s},[x7],#16
666	subs	w6,w6,#2
667	aese	v0.16b,v17.16b
668	aesmc	v0.16b,v0.16b
669	aese	v1.16b,v17.16b
670	aesmc	v1.16b,v1.16b
671	aese	v18.16b,v17.16b
672	aesmc	v18.16b,v18.16b
673	ld1	{v17.4s},[x7],#16
674	b.gt	Loop3x_ctr32
675
	// Remaining rounds. The three states move to v4, v5 and v17 so that v0,
	// v1 and v18 can be refilled with the next three counter blocks while
	// these rounds are still running. v2, v3 and v19 receive the input
	// blocks, each pre-xored with the last round key (v7).
676	aese	v0.16b,v16.16b
677	aesmc	v4.16b,v0.16b
678	aese	v1.16b,v16.16b
679	aesmc	v5.16b,v1.16b
680	ld1	{v2.16b},[x0],#16
681	add	w9,w8,#1
682	aese	v18.16b,v16.16b
683	aesmc	v18.16b,v18.16b
684	ld1	{v3.16b},[x0],#16
685	rev	w9,w9
686	aese	v4.16b,v17.16b
687	aesmc	v4.16b,v4.16b
688	aese	v5.16b,v17.16b
689	aesmc	v5.16b,v5.16b
690	ld1	{v19.16b},[x0],#16
691	mov	x7,x3
692	aese	v18.16b,v17.16b
693	aesmc	v17.16b,v18.16b
694	aese	v4.16b,v20.16b
695	aesmc	v4.16b,v4.16b
696	aese	v5.16b,v20.16b
697	aesmc	v5.16b,v5.16b
698	eor	v2.16b,v2.16b,v7.16b
699	add	w10,w8,#2
700	aese	v17.16b,v20.16b
701	aesmc	v17.16b,v17.16b
702	eor	v3.16b,v3.16b,v7.16b
703	add	w8,w8,#3
704	aese	v4.16b,v21.16b
705	aesmc	v4.16b,v4.16b
706	aese	v5.16b,v21.16b
707	aesmc	v5.16b,v5.16b
708	 // Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
709	 // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
710	 // 32-bit mode. See the comment above.
711	eor	v19.16b,v19.16b,v7.16b
712	mov	v6.s[3], w9
713	aese	v17.16b,v21.16b
714	aesmc	v17.16b,v17.16b
715	orr	v0.16b,v6.16b,v6.16b
716	rev	w10,w10
717	aese	v4.16b,v22.16b
718	aesmc	v4.16b,v4.16b
719	mov	v6.s[3], w10
720	rev	w12,w8
721	aese	v5.16b,v22.16b
722	aesmc	v5.16b,v5.16b
723	orr	v1.16b,v6.16b,v6.16b
724	mov	v6.s[3], w12
725	aese	v17.16b,v22.16b
726	aesmc	v17.16b,v17.16b
727	orr	v18.16b,v6.16b,v6.16b
	// Flags from this subs survive until the b.hs Loop3x_ctr32 below.
728	subs	x2,x2,#3
729	aese	v4.16b,v23.16b
730	aese	v5.16b,v23.16b
731	aese	v17.16b,v23.16b
732
733	eor	v2.16b,v2.16b,v4.16b
734	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
735	st1	{v2.16b},[x1],#16
736	eor	v3.16b,v3.16b,v5.16b
737	mov	w6,w5
738	st1	{v3.16b},[x1],#16
739	eor	v19.16b,v19.16b,v17.16b
740	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
741	st1	{v19.16b},[x1],#16
742	b.hs	Loop3x_ctr32
743
	// Undo the bias: x2 = blocks still to do (0, 1 or 2). Recompute the tail
	// step x12: 0 when exactly one block is left, 16 otherwise.
744	adds	x2,x2,#3
745	b.eq	Lctr32_done
746	cmp	x2,#1
747	mov	x12,#16
748	csel	x12,xzr,x12,eq
749
	// Tail: v0 and v1 are always run through all rounds; the second result is
	// simply not stored when only one block is left.
750Lctr32_tail:
751	aese	v0.16b,v16.16b
752	aesmc	v0.16b,v0.16b
753	aese	v1.16b,v16.16b
754	aesmc	v1.16b,v1.16b
755	ld1	{v16.4s},[x7],#16
756	subs	w6,w6,#2
757	aese	v0.16b,v17.16b
758	aesmc	v0.16b,v0.16b
759	aese	v1.16b,v17.16b
760	aesmc	v1.16b,v1.16b
761	ld1	{v17.4s},[x7],#16
762	b.gt	Lctr32_tail
763
764	aese	v0.16b,v16.16b
765	aesmc	v0.16b,v0.16b
766	aese	v1.16b,v16.16b
767	aesmc	v1.16b,v1.16b
768	aese	v0.16b,v17.16b
769	aesmc	v0.16b,v0.16b
770	aese	v1.16b,v17.16b
771	aesmc	v1.16b,v1.16b
	// With x12 == 0 the second load below re-reads the same block instead of
	// reading past the end of the input.
772	ld1	{v2.16b},[x0],x12
773	aese	v0.16b,v20.16b
774	aesmc	v0.16b,v0.16b
775	aese	v1.16b,v20.16b
776	aesmc	v1.16b,v1.16b
777	ld1	{v3.16b},[x0]
778	aese	v0.16b,v21.16b
779	aesmc	v0.16b,v0.16b
780	aese	v1.16b,v21.16b
781	aesmc	v1.16b,v1.16b
782	eor	v2.16b,v2.16b,v7.16b
783	aese	v0.16b,v22.16b
784	aesmc	v0.16b,v0.16b
785	aese	v1.16b,v22.16b
786	aesmc	v1.16b,v1.16b
787	eor	v3.16b,v3.16b,v7.16b
788	aese	v0.16b,v23.16b
789	aese	v1.16b,v23.16b
790
791	cmp	x2,#1
792	eor	v2.16b,v2.16b,v0.16b
793	eor	v3.16b,v3.16b,v1.16b
794	st1	{v2.16b},[x1],#16
795	b.eq	Lctr32_done
796	st1	{v3.16b},[x1]
797
798Lctr32_done:
	// Pop the 16-byte frame, restoring x29 only; x30 was never modified.
799	ldr	x29,[sp],#16
800	ret
801
802#endif
803#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
804