// xref: /aosp_15_r20/external/boringssl/src/gen/bcm/aesv8-armv8-win.S (revision 8fb009dc861624b67b6cdb62ea21f0f22d0c584b)
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
10.text
11.arch	armv8-a+crypto
12.section	.rodata
13.align	5
14Lrcon:
15.long	0x01,0x01,0x01,0x01
16.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
17.long	0x1b,0x1b,0x1b,0x1b
18
19.text
20
21.globl	aes_hw_set_encrypt_key
22
23.def aes_hw_set_encrypt_key
24   .type 32
25.endef
26.align	5
27aes_hw_set_encrypt_key:
28Lenc_key:
29	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
30	AARCH64_VALID_CALL_TARGET
31	stp	x29,x30,[sp,#-16]!
32	add	x29,sp,#0
33	mov	x3,#-2
34	cmp	w1,#128
35	b.lt	Lenc_key_abort
36	cmp	w1,#256
37	b.gt	Lenc_key_abort
38	tst	w1,#0x3f
39	b.ne	Lenc_key_abort
40
41	adrp	x3,Lrcon
42	add	x3,x3,:lo12:Lrcon
43	cmp	w1,#192
44
45	eor	v0.16b,v0.16b,v0.16b
46	ld1	{v3.16b},[x0],#16
47	mov	w1,#8		// reuse w1
48	ld1	{v1.4s,v2.4s},[x3],#32
49
50	b.lt	Loop128
51	b.eq	L192
52	b	L256
53
54.align	4
55Loop128:
56	tbl	v6.16b,{v3.16b},v2.16b
57	ext	v5.16b,v0.16b,v3.16b,#12
58	st1	{v3.4s},[x2],#16
59	aese	v6.16b,v0.16b
60	subs	w1,w1,#1
61
62	eor	v3.16b,v3.16b,v5.16b
63	ext	v5.16b,v0.16b,v5.16b,#12
64	eor	v3.16b,v3.16b,v5.16b
65	ext	v5.16b,v0.16b,v5.16b,#12
66	eor	v6.16b,v6.16b,v1.16b
67	eor	v3.16b,v3.16b,v5.16b
68	shl	v1.16b,v1.16b,#1
69	eor	v3.16b,v3.16b,v6.16b
70	b.ne	Loop128
71
72	ld1	{v1.4s},[x3]
73
74	tbl	v6.16b,{v3.16b},v2.16b
75	ext	v5.16b,v0.16b,v3.16b,#12
76	st1	{v3.4s},[x2],#16
77	aese	v6.16b,v0.16b
78
79	eor	v3.16b,v3.16b,v5.16b
80	ext	v5.16b,v0.16b,v5.16b,#12
81	eor	v3.16b,v3.16b,v5.16b
82	ext	v5.16b,v0.16b,v5.16b,#12
83	eor	v6.16b,v6.16b,v1.16b
84	eor	v3.16b,v3.16b,v5.16b
85	shl	v1.16b,v1.16b,#1
86	eor	v3.16b,v3.16b,v6.16b
87
88	tbl	v6.16b,{v3.16b},v2.16b
89	ext	v5.16b,v0.16b,v3.16b,#12
90	st1	{v3.4s},[x2],#16
91	aese	v6.16b,v0.16b
92
93	eor	v3.16b,v3.16b,v5.16b
94	ext	v5.16b,v0.16b,v5.16b,#12
95	eor	v3.16b,v3.16b,v5.16b
96	ext	v5.16b,v0.16b,v5.16b,#12
97	eor	v6.16b,v6.16b,v1.16b
98	eor	v3.16b,v3.16b,v5.16b
99	eor	v3.16b,v3.16b,v6.16b
100	st1	{v3.4s},[x2]
101	add	x2,x2,#0x50
102
103	mov	w12,#10
104	b	Ldone
105
106.align	4
107L192:
108	ld1	{v4.8b},[x0],#8
109	movi	v6.16b,#8			// borrow v6.16b
110	st1	{v3.4s},[x2],#16
111	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
112
113Loop192:
114	tbl	v6.16b,{v4.16b},v2.16b
115	ext	v5.16b,v0.16b,v3.16b,#12
116	st1	{v4.8b},[x2],#8
117	aese	v6.16b,v0.16b
118	subs	w1,w1,#1
119
120	eor	v3.16b,v3.16b,v5.16b
121	ext	v5.16b,v0.16b,v5.16b,#12
122	eor	v3.16b,v3.16b,v5.16b
123	ext	v5.16b,v0.16b,v5.16b,#12
124	eor	v3.16b,v3.16b,v5.16b
125
126	dup	v5.4s,v3.s[3]
127	eor	v5.16b,v5.16b,v4.16b
128	eor	v6.16b,v6.16b,v1.16b
129	ext	v4.16b,v0.16b,v4.16b,#12
130	shl	v1.16b,v1.16b,#1
131	eor	v4.16b,v4.16b,v5.16b
132	eor	v3.16b,v3.16b,v6.16b
133	eor	v4.16b,v4.16b,v6.16b
134	st1	{v3.4s},[x2],#16
135	b.ne	Loop192
136
137	mov	w12,#12
138	add	x2,x2,#0x20
139	b	Ldone
140
141.align	4
142L256:
143	ld1	{v4.16b},[x0]
144	mov	w1,#7
145	mov	w12,#14
146	st1	{v3.4s},[x2],#16
147
148Loop256:
149	tbl	v6.16b,{v4.16b},v2.16b
150	ext	v5.16b,v0.16b,v3.16b,#12
151	st1	{v4.4s},[x2],#16
152	aese	v6.16b,v0.16b
153	subs	w1,w1,#1
154
155	eor	v3.16b,v3.16b,v5.16b
156	ext	v5.16b,v0.16b,v5.16b,#12
157	eor	v3.16b,v3.16b,v5.16b
158	ext	v5.16b,v0.16b,v5.16b,#12
159	eor	v6.16b,v6.16b,v1.16b
160	eor	v3.16b,v3.16b,v5.16b
161	shl	v1.16b,v1.16b,#1
162	eor	v3.16b,v3.16b,v6.16b
163	st1	{v3.4s},[x2],#16
164	b.eq	Ldone
165
166	dup	v6.4s,v3.s[3]		// just splat
167	ext	v5.16b,v0.16b,v4.16b,#12
168	aese	v6.16b,v0.16b
169
170	eor	v4.16b,v4.16b,v5.16b
171	ext	v5.16b,v0.16b,v5.16b,#12
172	eor	v4.16b,v4.16b,v5.16b
173	ext	v5.16b,v0.16b,v5.16b,#12
174	eor	v4.16b,v4.16b,v5.16b
175
176	eor	v4.16b,v4.16b,v6.16b
177	b	Loop256
178
179Ldone:
180	str	w12,[x2]
181	mov	x3,#0
182
183Lenc_key_abort:
184	mov	x0,x3			// return value
185	ldr	x29,[sp],#16
186	ret
187
188
189.globl	aes_hw_set_decrypt_key
190
191.def aes_hw_set_decrypt_key
192   .type 32
193.endef
194.align	5
195aes_hw_set_decrypt_key:
196	AARCH64_SIGN_LINK_REGISTER
197	stp	x29,x30,[sp,#-16]!
198	add	x29,sp,#0
199	bl	Lenc_key
200
201	cmp	x0,#0
202	b.ne	Ldec_key_abort
203
204	sub	x2,x2,#240		// restore original x2
205	mov	x4,#-16
206	add	x0,x2,x12,lsl#4	// end of key schedule
207
208	ld1	{v0.4s},[x2]
209	ld1	{v1.4s},[x0]
210	st1	{v0.4s},[x0],x4
211	st1	{v1.4s},[x2],#16
212
213Loop_imc:
214	ld1	{v0.4s},[x2]
215	ld1	{v1.4s},[x0]
216	aesimc	v0.16b,v0.16b
217	aesimc	v1.16b,v1.16b
218	st1	{v0.4s},[x0],x4
219	st1	{v1.4s},[x2],#16
220	cmp	x0,x2
221	b.hi	Loop_imc
222
223	ld1	{v0.4s},[x2]
224	aesimc	v0.16b,v0.16b
225	st1	{v0.4s},[x0]
226
227	eor	x0,x0,x0		// return value
228Ldec_key_abort:
229	ldp	x29,x30,[sp],#16
230	AARCH64_VALIDATE_LINK_REGISTER
231	ret
232
233.globl	aes_hw_encrypt
234
235.def aes_hw_encrypt
236   .type 32
237.endef
238.align	5
239aes_hw_encrypt:
240	AARCH64_VALID_CALL_TARGET
241	ldr	w3,[x2,#240]
242	ld1	{v0.4s},[x2],#16
243	ld1	{v2.16b},[x0]
244	sub	w3,w3,#2
245	ld1	{v1.4s},[x2],#16
246
247Loop_enc:
248	aese	v2.16b,v0.16b
249	aesmc	v2.16b,v2.16b
250	ld1	{v0.4s},[x2],#16
251	subs	w3,w3,#2
252	aese	v2.16b,v1.16b
253	aesmc	v2.16b,v2.16b
254	ld1	{v1.4s},[x2],#16
255	b.gt	Loop_enc
256
257	aese	v2.16b,v0.16b
258	aesmc	v2.16b,v2.16b
259	ld1	{v0.4s},[x2]
260	aese	v2.16b,v1.16b
261	eor	v2.16b,v2.16b,v0.16b
262
263	st1	{v2.16b},[x1]
264	ret
265
266.globl	aes_hw_decrypt
267
268.def aes_hw_decrypt
269   .type 32
270.endef
271.align	5
272aes_hw_decrypt:
273	AARCH64_VALID_CALL_TARGET
274	ldr	w3,[x2,#240]
275	ld1	{v0.4s},[x2],#16
276	ld1	{v2.16b},[x0]
277	sub	w3,w3,#2
278	ld1	{v1.4s},[x2],#16
279
280Loop_dec:
281	aesd	v2.16b,v0.16b
282	aesimc	v2.16b,v2.16b
283	ld1	{v0.4s},[x2],#16
284	subs	w3,w3,#2
285	aesd	v2.16b,v1.16b
286	aesimc	v2.16b,v2.16b
287	ld1	{v1.4s},[x2],#16
288	b.gt	Loop_dec
289
290	aesd	v2.16b,v0.16b
291	aesimc	v2.16b,v2.16b
292	ld1	{v0.4s},[x2]
293	aesd	v2.16b,v1.16b
294	eor	v2.16b,v2.16b,v0.16b
295
296	st1	{v2.16b},[x1]
297	ret
298
299.globl	aes_hw_cbc_encrypt
300
301.def aes_hw_cbc_encrypt
302   .type 32
303.endef
304.align	5
305aes_hw_cbc_encrypt:
306	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
307	AARCH64_VALID_CALL_TARGET
308	stp	x29,x30,[sp,#-16]!
309	add	x29,sp,#0
310	subs	x2,x2,#16
311	mov	x8,#16
312	b.lo	Lcbc_abort
313	csel	x8,xzr,x8,eq
314
315	cmp	w5,#0			// en- or decrypting?
316	ldr	w5,[x3,#240]
317	and	x2,x2,#-16
318	ld1	{v6.16b},[x4]
319	ld1	{v0.16b},[x0],x8
320
321	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
322	sub	w5,w5,#6
323	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
324	sub	w5,w5,#2
325	ld1	{v18.4s,v19.4s},[x7],#32
326	ld1	{v20.4s,v21.4s},[x7],#32
327	ld1	{v22.4s,v23.4s},[x7],#32
328	ld1	{v7.4s},[x7]
329
330	add	x7,x3,#32
331	mov	w6,w5
332	b.eq	Lcbc_dec
333
334	cmp	w5,#2
335	eor	v0.16b,v0.16b,v6.16b
336	eor	v5.16b,v16.16b,v7.16b
337	b.eq	Lcbc_enc128
338
339	ld1	{v2.4s,v3.4s},[x7]
340	add	x7,x3,#16
341	add	x6,x3,#16*4
342	add	x12,x3,#16*5
343	aese	v0.16b,v16.16b
344	aesmc	v0.16b,v0.16b
345	add	x14,x3,#16*6
346	add	x3,x3,#16*7
347	b	Lenter_cbc_enc
348
349.align	4
350Loop_cbc_enc:
351	aese	v0.16b,v16.16b
352	aesmc	v0.16b,v0.16b
353	st1	{v6.16b},[x1],#16
354Lenter_cbc_enc:
355	aese	v0.16b,v17.16b
356	aesmc	v0.16b,v0.16b
357	aese	v0.16b,v2.16b
358	aesmc	v0.16b,v0.16b
359	ld1	{v16.4s},[x6]
360	cmp	w5,#4
361	aese	v0.16b,v3.16b
362	aesmc	v0.16b,v0.16b
363	ld1	{v17.4s},[x12]
364	b.eq	Lcbc_enc192
365
366	aese	v0.16b,v16.16b
367	aesmc	v0.16b,v0.16b
368	ld1	{v16.4s},[x14]
369	aese	v0.16b,v17.16b
370	aesmc	v0.16b,v0.16b
371	ld1	{v17.4s},[x3]
372	nop
373
374Lcbc_enc192:
375	aese	v0.16b,v16.16b
376	aesmc	v0.16b,v0.16b
377	subs	x2,x2,#16
378	aese	v0.16b,v17.16b
379	aesmc	v0.16b,v0.16b
380	csel	x8,xzr,x8,eq
381	aese	v0.16b,v18.16b
382	aesmc	v0.16b,v0.16b
383	aese	v0.16b,v19.16b
384	aesmc	v0.16b,v0.16b
385	ld1	{v16.16b},[x0],x8
386	aese	v0.16b,v20.16b
387	aesmc	v0.16b,v0.16b
388	eor	v16.16b,v16.16b,v5.16b
389	aese	v0.16b,v21.16b
390	aesmc	v0.16b,v0.16b
391	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
392	aese	v0.16b,v22.16b
393	aesmc	v0.16b,v0.16b
394	aese	v0.16b,v23.16b
395	eor	v6.16b,v0.16b,v7.16b
396	b.hs	Loop_cbc_enc
397
398	st1	{v6.16b},[x1],#16
399	b	Lcbc_done
400
401.align	5
402Lcbc_enc128:
403	ld1	{v2.4s,v3.4s},[x7]
404	aese	v0.16b,v16.16b
405	aesmc	v0.16b,v0.16b
406	b	Lenter_cbc_enc128
407Loop_cbc_enc128:
408	aese	v0.16b,v16.16b
409	aesmc	v0.16b,v0.16b
410	st1	{v6.16b},[x1],#16
411Lenter_cbc_enc128:
412	aese	v0.16b,v17.16b
413	aesmc	v0.16b,v0.16b
414	subs	x2,x2,#16
415	aese	v0.16b,v2.16b
416	aesmc	v0.16b,v0.16b
417	csel	x8,xzr,x8,eq
418	aese	v0.16b,v3.16b
419	aesmc	v0.16b,v0.16b
420	aese	v0.16b,v18.16b
421	aesmc	v0.16b,v0.16b
422	aese	v0.16b,v19.16b
423	aesmc	v0.16b,v0.16b
424	ld1	{v16.16b},[x0],x8
425	aese	v0.16b,v20.16b
426	aesmc	v0.16b,v0.16b
427	aese	v0.16b,v21.16b
428	aesmc	v0.16b,v0.16b
429	aese	v0.16b,v22.16b
430	aesmc	v0.16b,v0.16b
431	eor	v16.16b,v16.16b,v5.16b
432	aese	v0.16b,v23.16b
433	eor	v6.16b,v0.16b,v7.16b
434	b.hs	Loop_cbc_enc128
435
436	st1	{v6.16b},[x1],#16
437	b	Lcbc_done
438.align	5
439Lcbc_dec:
440	ld1	{v18.16b},[x0],#16
441	subs	x2,x2,#32		// bias
442	add	w6,w5,#2
443	orr	v3.16b,v0.16b,v0.16b
444	orr	v1.16b,v0.16b,v0.16b
445	orr	v19.16b,v18.16b,v18.16b
446	b.lo	Lcbc_dec_tail
447
448	orr	v1.16b,v18.16b,v18.16b
449	ld1	{v18.16b},[x0],#16
450	orr	v2.16b,v0.16b,v0.16b
451	orr	v3.16b,v1.16b,v1.16b
452	orr	v19.16b,v18.16b,v18.16b
453
454Loop3x_cbc_dec:
455	aesd	v0.16b,v16.16b
456	aesimc	v0.16b,v0.16b
457	aesd	v1.16b,v16.16b
458	aesimc	v1.16b,v1.16b
459	aesd	v18.16b,v16.16b
460	aesimc	v18.16b,v18.16b
461	ld1	{v16.4s},[x7],#16
462	subs	w6,w6,#2
463	aesd	v0.16b,v17.16b
464	aesimc	v0.16b,v0.16b
465	aesd	v1.16b,v17.16b
466	aesimc	v1.16b,v1.16b
467	aesd	v18.16b,v17.16b
468	aesimc	v18.16b,v18.16b
469	ld1	{v17.4s},[x7],#16
470	b.gt	Loop3x_cbc_dec
471
472	aesd	v0.16b,v16.16b
473	aesimc	v0.16b,v0.16b
474	aesd	v1.16b,v16.16b
475	aesimc	v1.16b,v1.16b
476	aesd	v18.16b,v16.16b
477	aesimc	v18.16b,v18.16b
478	eor	v4.16b,v6.16b,v7.16b
479	subs	x2,x2,#0x30
480	eor	v5.16b,v2.16b,v7.16b
481	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
482	aesd	v0.16b,v17.16b
483	aesimc	v0.16b,v0.16b
484	aesd	v1.16b,v17.16b
485	aesimc	v1.16b,v1.16b
486	aesd	v18.16b,v17.16b
487	aesimc	v18.16b,v18.16b
488	eor	v17.16b,v3.16b,v7.16b
489	add	x0,x0,x6		// x0 is adjusted in such way that
490					// at exit from the loop v1.16b-v18.16b
491					// are loaded with last "words"
492	orr	v6.16b,v19.16b,v19.16b
493	mov	x7,x3
494	aesd	v0.16b,v20.16b
495	aesimc	v0.16b,v0.16b
496	aesd	v1.16b,v20.16b
497	aesimc	v1.16b,v1.16b
498	aesd	v18.16b,v20.16b
499	aesimc	v18.16b,v18.16b
500	ld1	{v2.16b},[x0],#16
501	aesd	v0.16b,v21.16b
502	aesimc	v0.16b,v0.16b
503	aesd	v1.16b,v21.16b
504	aesimc	v1.16b,v1.16b
505	aesd	v18.16b,v21.16b
506	aesimc	v18.16b,v18.16b
507	ld1	{v3.16b},[x0],#16
508	aesd	v0.16b,v22.16b
509	aesimc	v0.16b,v0.16b
510	aesd	v1.16b,v22.16b
511	aesimc	v1.16b,v1.16b
512	aesd	v18.16b,v22.16b
513	aesimc	v18.16b,v18.16b
514	ld1	{v19.16b},[x0],#16
515	aesd	v0.16b,v23.16b
516	aesd	v1.16b,v23.16b
517	aesd	v18.16b,v23.16b
518	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
519	add	w6,w5,#2
520	eor	v4.16b,v4.16b,v0.16b
521	eor	v5.16b,v5.16b,v1.16b
522	eor	v18.16b,v18.16b,v17.16b
523	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
524	st1	{v4.16b},[x1],#16
525	orr	v0.16b,v2.16b,v2.16b
526	st1	{v5.16b},[x1],#16
527	orr	v1.16b,v3.16b,v3.16b
528	st1	{v18.16b},[x1],#16
529	orr	v18.16b,v19.16b,v19.16b
530	b.hs	Loop3x_cbc_dec
531
532	cmn	x2,#0x30
533	b.eq	Lcbc_done
534	nop
535
536Lcbc_dec_tail:
537	aesd	v1.16b,v16.16b
538	aesimc	v1.16b,v1.16b
539	aesd	v18.16b,v16.16b
540	aesimc	v18.16b,v18.16b
541	ld1	{v16.4s},[x7],#16
542	subs	w6,w6,#2
543	aesd	v1.16b,v17.16b
544	aesimc	v1.16b,v1.16b
545	aesd	v18.16b,v17.16b
546	aesimc	v18.16b,v18.16b
547	ld1	{v17.4s},[x7],#16
548	b.gt	Lcbc_dec_tail
549
550	aesd	v1.16b,v16.16b
551	aesimc	v1.16b,v1.16b
552	aesd	v18.16b,v16.16b
553	aesimc	v18.16b,v18.16b
554	aesd	v1.16b,v17.16b
555	aesimc	v1.16b,v1.16b
556	aesd	v18.16b,v17.16b
557	aesimc	v18.16b,v18.16b
558	aesd	v1.16b,v20.16b
559	aesimc	v1.16b,v1.16b
560	aesd	v18.16b,v20.16b
561	aesimc	v18.16b,v18.16b
562	cmn	x2,#0x20
563	aesd	v1.16b,v21.16b
564	aesimc	v1.16b,v1.16b
565	aesd	v18.16b,v21.16b
566	aesimc	v18.16b,v18.16b
567	eor	v5.16b,v6.16b,v7.16b
568	aesd	v1.16b,v22.16b
569	aesimc	v1.16b,v1.16b
570	aesd	v18.16b,v22.16b
571	aesimc	v18.16b,v18.16b
572	eor	v17.16b,v3.16b,v7.16b
573	aesd	v1.16b,v23.16b
574	aesd	v18.16b,v23.16b
575	b.eq	Lcbc_dec_one
576	eor	v5.16b,v5.16b,v1.16b
577	eor	v17.16b,v17.16b,v18.16b
578	orr	v6.16b,v19.16b,v19.16b
579	st1	{v5.16b},[x1],#16
580	st1	{v17.16b},[x1],#16
581	b	Lcbc_done
582
583Lcbc_dec_one:
584	eor	v5.16b,v5.16b,v18.16b
585	orr	v6.16b,v19.16b,v19.16b
586	st1	{v5.16b},[x1],#16
587
588Lcbc_done:
589	st1	{v6.16b},[x4]
590Lcbc_abort:
591	ldr	x29,[sp],#16
592	ret
593
594.globl	aes_hw_ctr32_encrypt_blocks
595
596.def aes_hw_ctr32_encrypt_blocks
597   .type 32
598.endef
599.align	5
600aes_hw_ctr32_encrypt_blocks:
601	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
602	AARCH64_VALID_CALL_TARGET
603	stp	x29,x30,[sp,#-16]!
604	add	x29,sp,#0
605	ldr	w5,[x3,#240]
606
607	ldr	w8, [x4, #12]
608	ld1	{v0.4s},[x4]
609
610	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
611	sub	w5,w5,#4
612	mov	x12,#16
613	cmp	x2,#2
614	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
615	sub	w5,w5,#2
616	ld1	{v20.4s,v21.4s},[x7],#32
617	ld1	{v22.4s,v23.4s},[x7],#32
618	ld1	{v7.4s},[x7]
619	add	x7,x3,#32
620	mov	w6,w5
621	csel	x12,xzr,x12,lo
622
623	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
624	// affected by silicon errata #1742098 [0] and #1655431 [1],
625	// respectively, where the second instruction of an aese/aesmc
626	// instruction pair may execute twice if an interrupt is taken right
627	// after the first instruction consumes an input register of which a
628	// single 32-bit lane has been updated the last time it was modified.
629	//
630	// This function uses a counter in one 32-bit lane. The vmov lines
631	// could write to v1.16b and v18.16b directly, but that trips this bugs.
632	// We write to v6.16b and copy to the final register as a workaround.
633	//
634	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
635	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
636#ifndef __AARCH64EB__
637	rev	w8, w8
638#endif
639	add	w10, w8, #1
640	orr	v6.16b,v0.16b,v0.16b
641	rev	w10, w10
642	mov	v6.s[3],w10
643	add	w8, w8, #2
644	orr	v1.16b,v6.16b,v6.16b
645	b.ls	Lctr32_tail
646	rev	w12, w8
647	mov	v6.s[3],w12
648	sub	x2,x2,#3		// bias
649	orr	v18.16b,v6.16b,v6.16b
650	b	Loop3x_ctr32
651
652.align	4
653Loop3x_ctr32:
654	aese	v0.16b,v16.16b
655	aesmc	v0.16b,v0.16b
656	aese	v1.16b,v16.16b
657	aesmc	v1.16b,v1.16b
658	aese	v18.16b,v16.16b
659	aesmc	v18.16b,v18.16b
660	ld1	{v16.4s},[x7],#16
661	subs	w6,w6,#2
662	aese	v0.16b,v17.16b
663	aesmc	v0.16b,v0.16b
664	aese	v1.16b,v17.16b
665	aesmc	v1.16b,v1.16b
666	aese	v18.16b,v17.16b
667	aesmc	v18.16b,v18.16b
668	ld1	{v17.4s},[x7],#16
669	b.gt	Loop3x_ctr32
670
671	aese	v0.16b,v16.16b
672	aesmc	v4.16b,v0.16b
673	aese	v1.16b,v16.16b
674	aesmc	v5.16b,v1.16b
675	ld1	{v2.16b},[x0],#16
676	add	w9,w8,#1
677	aese	v18.16b,v16.16b
678	aesmc	v18.16b,v18.16b
679	ld1	{v3.16b},[x0],#16
680	rev	w9,w9
681	aese	v4.16b,v17.16b
682	aesmc	v4.16b,v4.16b
683	aese	v5.16b,v17.16b
684	aesmc	v5.16b,v5.16b
685	ld1	{v19.16b},[x0],#16
686	mov	x7,x3
687	aese	v18.16b,v17.16b
688	aesmc	v17.16b,v18.16b
689	aese	v4.16b,v20.16b
690	aesmc	v4.16b,v4.16b
691	aese	v5.16b,v20.16b
692	aesmc	v5.16b,v5.16b
693	eor	v2.16b,v2.16b,v7.16b
694	add	w10,w8,#2
695	aese	v17.16b,v20.16b
696	aesmc	v17.16b,v17.16b
697	eor	v3.16b,v3.16b,v7.16b
698	add	w8,w8,#3
699	aese	v4.16b,v21.16b
700	aesmc	v4.16b,v4.16b
701	aese	v5.16b,v21.16b
702	aesmc	v5.16b,v5.16b
703	 // Note the logic to update v0.16b, v1.16b, and v1.16b is written to work
704	 // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
705	 // 32-bit mode. See the comment above.
706	eor	v19.16b,v19.16b,v7.16b
707	mov	v6.s[3], w9
708	aese	v17.16b,v21.16b
709	aesmc	v17.16b,v17.16b
710	orr	v0.16b,v6.16b,v6.16b
711	rev	w10,w10
712	aese	v4.16b,v22.16b
713	aesmc	v4.16b,v4.16b
714	mov	v6.s[3], w10
715	rev	w12,w8
716	aese	v5.16b,v22.16b
717	aesmc	v5.16b,v5.16b
718	orr	v1.16b,v6.16b,v6.16b
719	mov	v6.s[3], w12
720	aese	v17.16b,v22.16b
721	aesmc	v17.16b,v17.16b
722	orr	v18.16b,v6.16b,v6.16b
723	subs	x2,x2,#3
724	aese	v4.16b,v23.16b
725	aese	v5.16b,v23.16b
726	aese	v17.16b,v23.16b
727
728	eor	v2.16b,v2.16b,v4.16b
729	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
730	st1	{v2.16b},[x1],#16
731	eor	v3.16b,v3.16b,v5.16b
732	mov	w6,w5
733	st1	{v3.16b},[x1],#16
734	eor	v19.16b,v19.16b,v17.16b
735	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
736	st1	{v19.16b},[x1],#16
737	b.hs	Loop3x_ctr32
738
739	adds	x2,x2,#3
740	b.eq	Lctr32_done
741	cmp	x2,#1
742	mov	x12,#16
743	csel	x12,xzr,x12,eq
744
745Lctr32_tail:
746	aese	v0.16b,v16.16b
747	aesmc	v0.16b,v0.16b
748	aese	v1.16b,v16.16b
749	aesmc	v1.16b,v1.16b
750	ld1	{v16.4s},[x7],#16
751	subs	w6,w6,#2
752	aese	v0.16b,v17.16b
753	aesmc	v0.16b,v0.16b
754	aese	v1.16b,v17.16b
755	aesmc	v1.16b,v1.16b
756	ld1	{v17.4s},[x7],#16
757	b.gt	Lctr32_tail
758
759	aese	v0.16b,v16.16b
760	aesmc	v0.16b,v0.16b
761	aese	v1.16b,v16.16b
762	aesmc	v1.16b,v1.16b
763	aese	v0.16b,v17.16b
764	aesmc	v0.16b,v0.16b
765	aese	v1.16b,v17.16b
766	aesmc	v1.16b,v1.16b
767	ld1	{v2.16b},[x0],x12
768	aese	v0.16b,v20.16b
769	aesmc	v0.16b,v0.16b
770	aese	v1.16b,v20.16b
771	aesmc	v1.16b,v1.16b
772	ld1	{v3.16b},[x0]
773	aese	v0.16b,v21.16b
774	aesmc	v0.16b,v0.16b
775	aese	v1.16b,v21.16b
776	aesmc	v1.16b,v1.16b
777	eor	v2.16b,v2.16b,v7.16b
778	aese	v0.16b,v22.16b
779	aesmc	v0.16b,v0.16b
780	aese	v1.16b,v22.16b
781	aesmc	v1.16b,v1.16b
782	eor	v3.16b,v3.16b,v7.16b
783	aese	v0.16b,v23.16b
784	aese	v1.16b,v23.16b
785
786	cmp	x2,#1
787	eor	v2.16b,v2.16b,v0.16b
788	eor	v3.16b,v3.16b,v1.16b
789	st1	{v2.16b},[x1],#16
790	b.eq	Lctr32_done
791	st1	{v3.16b},[x1]
792
793Lctr32_done:
794	ldr	x29,[sp],#16
795	ret
796
#endif
#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
