// xref: /aosp_15_r20/external/cronet/third_party/boringssl/src/gen/bcm/aesv8-gcm-armv8-win.S (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <openssl/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
7#include <openssl/arm_arch.h>
8#if __ARM_MAX_ARCH__ >= 8
9
10.arch	armv8-a+crypto
11.text
12.globl	aes_gcm_enc_kernel
13
14.def aes_gcm_enc_kernel
15   .type 32
16.endef
17.align	4
18aes_gcm_enc_kernel:
19	AARCH64_SIGN_LINK_REGISTER
20	stp	x29, x30, [sp, #-128]!
21	mov	x29, sp
22	stp	x19, x20, [sp, #16]
23	mov	x16, x4
24	mov	x8, x5
25	stp	x21, x22, [sp, #32]
26	stp	x23, x24, [sp, #48]
27	stp	d8, d9, [sp, #64]
28	stp	d10, d11, [sp, #80]
29	stp	d12, d13, [sp, #96]
30	stp	d14, d15, [sp, #112]
31	ldr	w17, [x8, #240]
32	add	x19, x8, x17, lsl #4                   // borrow input_l1 for last key
33	ldp	x13, x14, [x19]                       // load round N keys
34	ldr	q31, [x19, #-16]                        // load round N-1 keys
35	add	x4, x0, x1, lsr #3   // end_input_ptr
36	lsr	x5, x1, #3              // byte_len
37	mov	x15, x5
38	ldp	x10, x11, [x16]              // ctr96_b64, ctr96_t32
39	ld1	{ v0.16b}, [x16]                             // special case vector load initial counter so we can start first AES block as quickly as possible
40	sub	x5, x5, #1      // byte_len - 1
41	ldr	q18, [x8, #0]                                  // load rk0
42	and	x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
43	ldr	q25, [x8, #112]                                // load rk7
44	add	x5, x5, x0
45	lsr	x12, x11, #32
46	fmov	d2, x10                               // CTR block 2
47	orr	w11, w11, w11
48	rev	w12, w12                                // rev_ctr32
49	fmov	d1, x10                               // CTR block 1
50	aese	v0.16b, v18.16b
51	aesmc	v0.16b, v0.16b          // AES block 0 - round 0
52	add	w12, w12, #1                            // increment rev_ctr32
53	rev	w9, w12                                 // CTR block 1
54	fmov	d3, x10                               // CTR block 3
55	orr	x9, x11, x9, lsl #32            // CTR block 1
56	add	w12, w12, #1                            // CTR block 1
57	ldr	q19, [x8, #16]                                 // load rk1
58	fmov	v1.d[1], x9                               // CTR block 1
59	rev	w9, w12                                 // CTR block 2
60	add	w12, w12, #1                            // CTR block 2
61	orr	x9, x11, x9, lsl #32            // CTR block 2
62	ldr	q20, [x8, #32]                                 // load rk2
63	fmov	v2.d[1], x9                               // CTR block 2
64	rev	w9, w12                                 // CTR block 3
65	aese	v0.16b, v19.16b
66	aesmc	v0.16b, v0.16b          // AES block 0 - round 1
67	orr	x9, x11, x9, lsl #32            // CTR block 3
68	fmov	v3.d[1], x9                               // CTR block 3
69	aese	v1.16b, v18.16b
70	aesmc	v1.16b, v1.16b          // AES block 1 - round 0
71	ldr	q21, [x8, #48]                                 // load rk3
72	aese	v0.16b, v20.16b
73	aesmc	v0.16b, v0.16b          // AES block 0 - round 2
74	ldr	q24, [x8, #96]                                 // load rk6
75	aese	v2.16b, v18.16b
76	aesmc	v2.16b, v2.16b          // AES block 2 - round 0
77	ldr	q23, [x8, #80]                                 // load rk5
78	aese	v1.16b, v19.16b
79	aesmc	v1.16b, v1.16b          // AES block 1 - round 1
80	ldr	q14, [x6, #48]                              // load h3l | h3h
81	ext	v14.16b, v14.16b, v14.16b, #8
82	aese	v3.16b, v18.16b
83	aesmc	v3.16b, v3.16b          // AES block 3 - round 0
84	aese	v2.16b, v19.16b
85	aesmc	v2.16b, v2.16b          // AES block 2 - round 1
86	ldr	q22, [x8, #64]                                 // load rk4
87	aese	v1.16b, v20.16b
88	aesmc	v1.16b, v1.16b          // AES block 1 - round 2
89	ldr	q13, [x6, #32]                              // load h2l | h2h
90	ext	v13.16b, v13.16b, v13.16b, #8
91	aese	v3.16b, v19.16b
92	aesmc	v3.16b, v3.16b          // AES block 3 - round 1
93	ldr	q30, [x8, #192]                               // load rk12
94	aese	v2.16b, v20.16b
95	aesmc	v2.16b, v2.16b          // AES block 2 - round 2
96	ldr	q15, [x6, #80]                              // load h4l | h4h
97	ext	v15.16b, v15.16b, v15.16b, #8
98	aese	v1.16b, v21.16b
99	aesmc	v1.16b, v1.16b          // AES block 1 - round 3
100	ldr	q29, [x8, #176]                               // load rk11
101	aese	v3.16b, v20.16b
102	aesmc	v3.16b, v3.16b          // AES block 3 - round 2
103	ldr	q26, [x8, #128]                                // load rk8
104	aese	v2.16b, v21.16b
105	aesmc	v2.16b, v2.16b          // AES block 2 - round 3
106	add	w12, w12, #1                            // CTR block 3
107	aese	v0.16b, v21.16b
108	aesmc	v0.16b, v0.16b          // AES block 0 - round 3
109	aese	v3.16b, v21.16b
110	aesmc	v3.16b, v3.16b          // AES block 3 - round 3
111	ld1	{ v11.16b}, [x3]
112	ext	v11.16b, v11.16b, v11.16b, #8
113	rev64	v11.16b, v11.16b
114	aese	v2.16b, v22.16b
115	aesmc	v2.16b, v2.16b          // AES block 2 - round 4
116	aese	v0.16b, v22.16b
117	aesmc	v0.16b, v0.16b          // AES block 0 - round 4
118	aese	v1.16b, v22.16b
119	aesmc	v1.16b, v1.16b          // AES block 1 - round 4
120	aese	v3.16b, v22.16b
121	aesmc	v3.16b, v3.16b          // AES block 3 - round 4
122	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
123	aese	v0.16b, v23.16b
124	aesmc	v0.16b, v0.16b          // AES block 0 - round 5
125	aese	v1.16b, v23.16b
126	aesmc	v1.16b, v1.16b          // AES block 1 - round 5
127	aese	v3.16b, v23.16b
128	aesmc	v3.16b, v3.16b          // AES block 3 - round 5
129	aese	v2.16b, v23.16b
130	aesmc	v2.16b, v2.16b          // AES block 2 - round 5
131	aese	v1.16b, v24.16b
132	aesmc	v1.16b, v1.16b          // AES block 1 - round 6
133	trn2	v17.2d,  v14.2d,    v15.2d                      // h4l | h3l
134	aese	v3.16b, v24.16b
135	aesmc	v3.16b, v3.16b          // AES block 3 - round 6
136	ldr	q27, [x8, #144]                                // load rk9
137	aese	v0.16b, v24.16b
138	aesmc	v0.16b, v0.16b          // AES block 0 - round 6
139	ldr	q12, [x6]                                   // load h1l | h1h
140	ext	v12.16b, v12.16b, v12.16b, #8
141	aese	v2.16b, v24.16b
142	aesmc	v2.16b, v2.16b          // AES block 2 - round 6
143	ldr	q28, [x8, #160]                               // load rk10
144	aese	v1.16b, v25.16b
145	aesmc	v1.16b, v1.16b          // AES block 1 - round 7
146	trn1	v9.2d, v14.2d,    v15.2d                      // h4h | h3h
147	aese	v0.16b, v25.16b
148	aesmc	v0.16b, v0.16b          // AES block 0 - round 7
149	aese	v2.16b, v25.16b
150	aesmc	v2.16b, v2.16b          // AES block 2 - round 7
151	aese	v3.16b, v25.16b
152	aesmc	v3.16b, v3.16b          // AES block 3 - round 7
153	trn2	v16.2d,  v12.2d,    v13.2d                      // h2l | h1l
154	aese	v1.16b, v26.16b
155	aesmc	v1.16b, v1.16b          // AES block 1 - round 8
156	aese	v2.16b, v26.16b
157	aesmc	v2.16b, v2.16b          // AES block 2 - round 8
158	aese	v3.16b, v26.16b
159	aesmc	v3.16b, v3.16b          // AES block 3 - round 8
160	aese	v0.16b, v26.16b
161	aesmc	v0.16b, v0.16b          // AES block 0 - round 8
162	b.lt	Lenc_finish_first_blocks                         // branch if AES-128
163
164	aese	v1.16b, v27.16b
165	aesmc	v1.16b, v1.16b          // AES block 1 - round 9
166	aese	v2.16b, v27.16b
167	aesmc	v2.16b, v2.16b          // AES block 2 - round 9
168	aese	v3.16b, v27.16b
169	aesmc	v3.16b, v3.16b          // AES block 3 - round 9
170	aese	v0.16b, v27.16b
171	aesmc	v0.16b, v0.16b          // AES block 0 - round 9
172	aese	v1.16b, v28.16b
173	aesmc	v1.16b, v1.16b          // AES block 1 - round 10
174	aese	v2.16b, v28.16b
175	aesmc	v2.16b, v2.16b          // AES block 2 - round 10
176	aese	v3.16b, v28.16b
177	aesmc	v3.16b, v3.16b          // AES block 3 - round 10
178	aese	v0.16b, v28.16b
179	aesmc	v0.16b, v0.16b          // AES block 0 - round 10
180	b.eq	Lenc_finish_first_blocks                         // branch if AES-192
181
182	aese	v1.16b, v29.16b
183	aesmc	v1.16b, v1.16b          // AES block 1 - round 11
184	aese	v2.16b, v29.16b
185	aesmc	v2.16b, v2.16b          // AES block 2 - round 11
186	aese	v0.16b, v29.16b
187	aesmc	v0.16b, v0.16b          // AES block 0 - round 11
188	aese	v3.16b, v29.16b
189	aesmc	v3.16b, v3.16b          // AES block 3 - round 11
190	aese	v1.16b, v30.16b
191	aesmc	v1.16b, v1.16b          // AES block 1 - round 12
192	aese	v2.16b, v30.16b
193	aesmc	v2.16b, v2.16b          // AES block 2 - round 12
194	aese	v0.16b, v30.16b
195	aesmc	v0.16b, v0.16b          // AES block 0 - round 12
196	aese	v3.16b, v30.16b
197	aesmc	v3.16b, v3.16b          // AES block 3 - round 12
198
199Lenc_finish_first_blocks:
200	cmp	x0, x5                   // check if we have <= 4 blocks
201	eor	v17.16b, v17.16b, v9.16b                  // h4k | h3k
202	aese	v2.16b, v31.16b                                    // AES block 2 - round N-1
203	trn1	v8.2d,    v12.2d,    v13.2d                      // h2h | h1h
204	aese	v1.16b, v31.16b                                    // AES block 1 - round N-1
205	aese	v0.16b, v31.16b                                    // AES block 0 - round N-1
206	aese	v3.16b, v31.16b                                    // AES block 3 - round N-1
207	eor	v16.16b, v16.16b, v8.16b                     // h2k | h1k
208	b.ge	Lenc_tail                                        // handle tail
209
210	ldp	x19, x20, [x0, #16]           // AES block 1 - load plaintext
211	rev	w9, w12                                 // CTR block 4
212	ldp	x6, x7, [x0, #0]            // AES block 0 - load plaintext
213	ldp	x23, x24, [x0, #48]           // AES block 3 - load plaintext
214	ldp	x21, x22, [x0, #32]           // AES block 2 - load plaintext
215	add	x0, x0, #64                       // AES input_ptr update
216	eor	x19, x19, x13                      // AES block 1 - round N low
217	eor	x20, x20, x14                      // AES block 1 - round N high
218	fmov	d5, x19                               // AES block 1 - mov low
219	eor	x6, x6, x13                      // AES block 0 - round N low
220	eor	x7, x7, x14                      // AES block 0 - round N high
221	eor	x24, x24, x14                      // AES block 3 - round N high
222	fmov	d4, x6                               // AES block 0 - mov low
223	cmp	x0, x5                   // check if we have <= 8 blocks
224	fmov	v4.d[1], x7                           // AES block 0 - mov high
225	eor	x23, x23, x13                      // AES block 3 - round N low
226	eor	x21, x21, x13                      // AES block 2 - round N low
227	fmov	v5.d[1], x20                           // AES block 1 - mov high
228	fmov	d6, x21                               // AES block 2 - mov low
229	add	w12, w12, #1                            // CTR block 4
230	orr	x9, x11, x9, lsl #32            // CTR block 4
231	fmov	d7, x23                               // AES block 3 - mov low
232	eor	x22, x22, x14                      // AES block 2 - round N high
233	fmov	v6.d[1], x22                           // AES block 2 - mov high
234	eor	v4.16b, v4.16b, v0.16b                          // AES block 0 - result
235	fmov	d0, x10                               // CTR block 4
236	fmov	v0.d[1], x9                               // CTR block 4
237	rev	w9, w12                                 // CTR block 5
238	add	w12, w12, #1                            // CTR block 5
239	eor	v5.16b, v5.16b, v1.16b                          // AES block 1 - result
240	fmov	d1, x10                               // CTR block 5
241	orr	x9, x11, x9, lsl #32            // CTR block 5
242	fmov	v1.d[1], x9                               // CTR block 5
243	rev	w9, w12                                 // CTR block 6
244	st1	{ v4.16b}, [x2], #16                     // AES block 0 - store result
245	fmov	v7.d[1], x24                           // AES block 3 - mov high
246	orr	x9, x11, x9, lsl #32            // CTR block 6
247	eor	v6.16b, v6.16b, v2.16b                          // AES block 2 - result
248	st1	{ v5.16b}, [x2], #16                     // AES block 1 - store result
249	add	w12, w12, #1                            // CTR block 6
250	fmov	d2, x10                               // CTR block 6
251	fmov	v2.d[1], x9                               // CTR block 6
252	st1	{ v6.16b}, [x2], #16                     // AES block 2 - store result
253	rev	w9, w12                                 // CTR block 7
254	orr	x9, x11, x9, lsl #32            // CTR block 7
255	eor	v7.16b, v7.16b, v3.16b                          // AES block 3 - result
256	st1	{ v7.16b}, [x2], #16                     // AES block 3 - store result
257	b.ge	Lenc_prepretail                                  // do prepretail
258
259Lenc_main_loop:	//	main loop start
260	aese	v0.16b, v18.16b
261	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
262	rev64	v4.16b, v4.16b                                    // GHASH block 4k (only t0 is free)
263	aese	v1.16b, v18.16b
264	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
265	fmov	d3, x10                               // CTR block 4k+3
266	aese	v2.16b, v18.16b
267	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
268	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
269	aese	v0.16b, v19.16b
270	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
271	fmov	v3.d[1], x9                               // CTR block 4k+3
272	aese	v1.16b, v19.16b
273	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
274	ldp	x23, x24, [x0, #48]           // AES block 4k+7 - load plaintext
275	aese	v2.16b, v19.16b
276	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
277	ldp	x21, x22, [x0, #32]           // AES block 4k+6 - load plaintext
278	aese	v0.16b, v20.16b
279	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
280	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
281	aese	v1.16b, v20.16b
282	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
283	aese	v3.16b, v18.16b
284	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
285	eor	x23, x23, x13                      // AES block 4k+7 - round N low
286	aese	v0.16b, v21.16b
287	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
288	mov	d10, v17.d[1]                               // GHASH block 4k - mid
289	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
290	eor	x22, x22, x14                      // AES block 4k+6 - round N high
291	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
292	aese	v3.16b, v19.16b
293	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
294	rev64	v5.16b, v5.16b                                    // GHASH block 4k+1 (t0 and t1 free)
295	aese	v0.16b, v22.16b
296	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
297	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
298	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
299	aese	v2.16b, v20.16b
300	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
301	aese	v0.16b, v23.16b
302	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
303	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3 (t0, t1, t2 and t3 free)
304	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
305	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
306	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2 (t0, t1, and t2 free)
307	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
308	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
309	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
310	aese	v1.16b, v21.16b
311	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
312	aese	v3.16b, v20.16b
313	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
314	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
315	aese	v2.16b, v21.16b
316	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
317	aese	v1.16b, v22.16b
318	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
319	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
320	aese	v3.16b, v21.16b
321	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
322	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
323	aese	v2.16b, v22.16b
324	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
325	aese	v0.16b, v24.16b
326	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
327	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
328	aese	v3.16b, v22.16b
329	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
330	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
331	aese	v0.16b, v25.16b
332	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
333	aese	v3.16b, v23.16b
334	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
335	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
336	aese	v1.16b, v23.16b
337	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
338	aese	v0.16b, v26.16b
339	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
340	aese	v2.16b, v23.16b
341	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
342	aese	v1.16b, v24.16b
343	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
344	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
345	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
346	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
347	aese	v1.16b, v25.16b
348	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
349	pmull	v6.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
350	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
351	aese	v3.16b, v24.16b
352	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
353	ldp	x19, x20, [x0, #16]           // AES block 4k+5 - load plaintext
354	aese	v1.16b, v26.16b
355	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
356	mov	d4, v7.d[1]                                  // GHASH block 4k+3 - mid
357	aese	v2.16b, v24.16b
358	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
359	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
360	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
361	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
362	eor	v4.8b, v4.8b, v7.8b                          // GHASH block 4k+3 - mid
363	aese	v2.16b, v25.16b
364	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
365	eor	x19, x19, x13                      // AES block 4k+5 - round N low
366	aese	v2.16b, v26.16b
367	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
368	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
369	aese	v3.16b, v25.16b
370	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
371	eor	x21, x21, x13                      // AES block 4k+6 - round N low
372	aese	v3.16b, v26.16b
373	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
374	movi	v8.8b, #0xc2
375	pmull	v4.1q, v4.1d, v16.1d                          // GHASH block 4k+3 - mid
376	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
377	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
378	fmov	d5, x19                               // AES block 4k+5 - mov low
379	ldp	x6, x7, [x0, #0]            // AES block 4k+4 - load plaintext
380	b.lt	Lenc_main_loop_continue                          // branch if AES-128
381
382	aese	v1.16b, v27.16b
383	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
384	aese	v0.16b, v27.16b
385	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
386	aese	v2.16b, v27.16b
387	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
388	aese	v3.16b, v27.16b
389	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
390	aese	v0.16b, v28.16b
391	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
392	aese	v1.16b, v28.16b
393	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
394	aese	v2.16b, v28.16b
395	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
396	aese	v3.16b, v28.16b
397	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
398	b.eq	Lenc_main_loop_continue                          // branch if AES-192
399
400	aese	v0.16b, v29.16b
401	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
402	aese	v1.16b, v29.16b
403	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
404	aese	v2.16b, v29.16b
405	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
406	aese	v3.16b, v29.16b
407	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
408	aese	v1.16b, v30.16b
409	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
410	aese	v0.16b, v30.16b
411	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
412	aese	v2.16b, v30.16b
413	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
414	aese	v3.16b, v30.16b
415	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
416
417Lenc_main_loop_continue:
418	shl	d8, d8, #56               // mod_constant
419	eor	v11.16b, v11.16b, v6.16b                         // GHASH block 4k+3 - low
420	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+3 - mid
421	add	w12, w12, #1                            // CTR block 4k+3
422	eor	v4.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
423	add	x0, x0, #64                       // AES input_ptr update
424	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
425	rev	w9, w12                                 // CTR block 4k+8
426	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
427	eor	x6, x6, x13                      // AES block 4k+4 - round N low
428	eor	v10.16b, v10.16b, v4.16b                         // MODULO - karatsuba tidy up
429	eor	x7, x7, x14                      // AES block 4k+4 - round N high
430	fmov	d4, x6                               // AES block 4k+4 - mov low
431	orr	x9, x11, x9, lsl #32            // CTR block 4k+8
432	eor	v7.16b, v9.16b, v7.16b                   // MODULO - fold into mid
433	eor	x20, x20, x14                      // AES block 4k+5 - round N high
434	eor	x24, x24, x14                      // AES block 4k+7 - round N high
435	add	w12, w12, #1                            // CTR block 4k+8
436	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
437	fmov	v4.d[1], x7                           // AES block 4k+4 - mov high
438	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
439	fmov	d7, x23                               // AES block 4k+7 - mov low
440	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
441	fmov	v5.d[1], x20                           // AES block 4k+5 - mov high
442	fmov	d6, x21                               // AES block 4k+6 - mov low
443	cmp	x0, x5                   // LOOP CONTROL
444	fmov	v6.d[1], x22                           // AES block 4k+6 - mov high
445	pmull	v9.1q, v10.1d, v8.1d            // MODULO - mid 64b align with low
446	eor	v4.16b, v4.16b, v0.16b                          // AES block 4k+4 - result
447	fmov	d0, x10                               // CTR block 4k+8
448	fmov	v0.d[1], x9                               // CTR block 4k+8
449	rev	w9, w12                                 // CTR block 4k+9
450	add	w12, w12, #1                            // CTR block 4k+9
451	eor	v5.16b, v5.16b, v1.16b                          // AES block 4k+5 - result
452	fmov	d1, x10                               // CTR block 4k+9
453	orr	x9, x11, x9, lsl #32            // CTR block 4k+9
454	fmov	v1.d[1], x9                               // CTR block 4k+9
455	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
456	rev	w9, w12                                 // CTR block 4k+10
457	st1	{ v4.16b}, [x2], #16                     // AES block 4k+4 - store result
458	orr	x9, x11, x9, lsl #32            // CTR block 4k+10
459	eor	v11.16b, v11.16b, v9.16b                         // MODULO - fold into low
460	fmov	v7.d[1], x24                           // AES block 4k+7 - mov high
461	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
462	st1	{ v5.16b}, [x2], #16                     // AES block 4k+5 - store result
463	add	w12, w12, #1                            // CTR block 4k+10
464	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
465	eor	v6.16b, v6.16b, v2.16b                          // AES block 4k+6 - result
466	fmov	d2, x10                               // CTR block 4k+10
467	st1	{ v6.16b}, [x2], #16                     // AES block 4k+6 - store result
468	fmov	v2.d[1], x9                               // CTR block 4k+10
469	rev	w9, w12                                 // CTR block 4k+11
470	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
471	orr	x9, x11, x9, lsl #32            // CTR block 4k+11
472	eor	v7.16b, v7.16b, v3.16b                          // AES block 4k+7 - result
473	st1	{ v7.16b}, [x2], #16                     // AES block 4k+7 - store result
474	b.lt	Lenc_main_loop
475
476Lenc_prepretail:	//	PREPRETAIL
477	aese	v1.16b, v18.16b
478	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
479	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2 (t0, t1, and t2 free)
480	aese	v2.16b, v18.16b
481	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
482	fmov	d3, x10                               // CTR block 4k+3
483	aese	v0.16b, v18.16b
484	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
485	rev64	v4.16b, v4.16b                                    // GHASH block 4k (only t0 is free)
486	fmov	v3.d[1], x9                               // CTR block 4k+3
487	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
488	aese	v2.16b, v19.16b
489	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
490	aese	v0.16b, v19.16b
491	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
492	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
493	rev64	v5.16b, v5.16b                                    // GHASH block 4k+1 (t0 and t1 free)
494	aese	v2.16b, v20.16b
495	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
496	aese	v3.16b, v18.16b
497	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
498	mov	d10, v17.d[1]                               // GHASH block 4k - mid
499	aese	v1.16b, v19.16b
500	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
501	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
502	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
503	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
504	aese	v2.16b, v21.16b
505	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
506	aese	v1.16b, v20.16b
507	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
508	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
509	aese	v0.16b, v20.16b
510	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
511	aese	v3.16b, v19.16b
512	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
513	aese	v1.16b, v21.16b
514	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
515	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
516	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
517	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
518	aese	v3.16b, v20.16b
519	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
520	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
521	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
522	aese	v0.16b, v21.16b
523	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
524	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
525	aese	v3.16b, v21.16b
526	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
527	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
528	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
529	aese	v0.16b, v22.16b
530	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
531	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3 (t0, t1, t2 and t3 free)
532	aese	v3.16b, v22.16b
533	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
534	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
535	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
536	add	w12, w12, #1                            // CTR block 4k+3
537	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
538	aese	v3.16b, v23.16b
539	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
540	aese	v2.16b, v22.16b
541	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
542	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
543	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
544	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
545	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
546	aese	v2.16b, v23.16b
547	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
548	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
549	mov	d4, v7.d[1]                                  // GHASH block 4k+3 - mid
550	aese	v1.16b, v22.16b
551	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
552	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
553	eor	v4.8b, v4.8b, v7.8b                          // GHASH block 4k+3 - mid
554	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
555	aese	v1.16b, v23.16b
556	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
557	pmull	v4.1q, v4.1d, v16.1d                          // GHASH block 4k+3 - mid
558	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
559	aese	v0.16b, v23.16b
560	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
561	aese	v1.16b, v24.16b
562	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
563	aese	v2.16b, v24.16b
564	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
565	aese	v0.16b, v24.16b
566	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
567	movi	v8.8b, #0xc2
568	aese	v3.16b, v24.16b
569	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
570	aese	v1.16b, v25.16b
571	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
572	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
573	aese	v0.16b, v25.16b
574	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
575	aese	v3.16b, v25.16b
576	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
577	shl	d8, d8, #56               // mod_constant
578	aese	v1.16b, v26.16b
579	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
580	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+3 - mid
581	pmull	v6.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
582	aese	v3.16b, v26.16b
583	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
584	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
585	aese	v0.16b, v26.16b
586	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
587	eor	v11.16b, v11.16b, v6.16b                         // GHASH block 4k+3 - low
588	aese	v2.16b, v25.16b
589	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
590	eor	v10.16b, v10.16b, v9.16b                         // karatsuba tidy up
591	aese	v2.16b, v26.16b
592	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
593	pmull	v4.1q, v9.1d, v8.1d
594	ext	v9.16b, v9.16b, v9.16b, #8
595	eor	v10.16b, v10.16b, v11.16b
596	b.lt	Lenc_finish_prepretail                           // branch if AES-128
597
598	aese	v1.16b, v27.16b
599	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
600	aese	v3.16b, v27.16b
601	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
602	aese	v0.16b, v27.16b
603	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
604	aese	v2.16b, v27.16b
605	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
606	aese	v3.16b, v28.16b
607	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
608	aese	v1.16b, v28.16b
609	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
610	aese	v0.16b, v28.16b
611	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
612	aese	v2.16b, v28.16b
613	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
614	b.eq	Lenc_finish_prepretail                           // branch if AES-192
615
616	aese	v1.16b, v29.16b
617	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
618	aese	v0.16b, v29.16b
619	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
620	aese	v3.16b, v29.16b
621	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
622	aese	v2.16b, v29.16b
623	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
624	aese	v1.16b, v30.16b
625	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
626	aese	v0.16b, v30.16b
627	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
628	aese	v3.16b, v30.16b
629	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
630	aese	v2.16b, v30.16b
631	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
632
// Common join point for the AES-128/192/256 paths of the prepretail code.
// Completes the GHASH reduction started just before the key-size branches
// (v8 holds the mod_constant, v4 holds v9.d[0] * mod_constant) and applies
// the round N-1 key to the four pending counter blocks v0..v3 with a bare
// aese (no aesmc). Falls through into the tail code.
633Lenc_finish_prepretail:
634	eor	v10.16b, v10.16b, v4.16b                         // MODULO - fold into mid
635	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
636	pmull	v4.1q, v10.1d, v8.1d                          // MODULO - mid 64b align with low
637	ext	v10.16b, v10.16b, v10.16b, #8                    // MODULO - other mid alignment
638	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
639	eor	v11.16b, v11.16b, v4.16b                         // MODULO - fold into low
640	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
641	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
642	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
643	eor	v11.16b, v11.16b, v10.16b                        // MODULO - fold into low
644
// Tail dispatch. x4 - x0 is the number of input bytes still to process.
// The first remaining block is always encrypted here: the plaintext is
// XORed with the round N key in x13/x14 and then with the keystream in v0,
// leaving the result in v5. The code then jumps to the label matching the
// number of blocks left. On the shorter paths the unused keystream blocks
// are shuffled down (v1 -> v2 -> v3) so the later labels find them where
// they expect, the GHASH accumulators v9/v10/v11 are zeroed, and w12 is
// decremented once per skipped block.
// NOTE(review): the w12 decrements presumably undo counter increments made
// for keystream blocks that end up unused - confirm against the main loop.
645Lenc_tail:	//	TAIL
646	ext	v8.16b, v11.16b, v11.16b, #8                     // prepare final partial tag
647	sub	x5, x4, x0   // main_end_input_ptr is number of bytes left to process
648	ldp	x6, x7, [x0], #16           // AES block 4k+4 - load plaintext
649	eor	x6, x6, x13                      // AES block 4k+4 - round N low
650	eor	x7, x7, x14                      // AES block 4k+4 - round N high
651	cmp	x5, #48                         // more than 3 blocks (48 bytes) left?
652	fmov	d4, x6                               // AES block 4k+4 - mov low
653	fmov	v4.d[1], x7                           // AES block 4k+4 - mov high
654	eor	v5.16b, v4.16b, v0.16b                          // AES block 4k+4 - result
655	b.gt	Lenc_blocks_more_than_3
656	cmp	x5, #32                         // more than 2 blocks left? (flags consumed by the b.gt below)
657	mov	v3.16b, v2.16b                  // shift keystream down one slot
658	movi	v11.8b, #0                      // clear GHASH low accumulator
659	movi	v9.8b, #0                       // clear GHASH high accumulator
660	sub	w12, w12, #1                    // rev_ctr32 -= 1
661	mov	v2.16b, v1.16b                  // shift keystream down one slot
662	movi	v10.8b, #0                      // clear GHASH mid accumulator
663	b.gt	Lenc_blocks_more_than_2
664	mov	v3.16b, v1.16b                  // shift keystream down again
665	sub	w12, w12, #1                    // rev_ctr32 -= 1
666	cmp	x5, #16                         // more than 1 block left?
667	b.gt	Lenc_blocks_more_than_1
668	sub	w12, w12, #1                    // rev_ctr32 -= 1
669	b	Lenc_blocks_less_than_1
// More than three blocks remain. Stores the ciphertext block in v5
// ("final-3"), starts the GHASH accumulators v11 (low), v9 (high) and
// v10 (mid) from that block using the hash key in v15 and the upper half
// of v17, and prepares the next block: plaintext XOR round N key XOR the
// keystream in v1. Falls through to the next label.
670Lenc_blocks_more_than_3:	//	blocks left >  3
671	st1	{ v5.16b}, [x2], #16                    // AES final-3 block  - store result
672	ldp	x6, x7, [x0], #16          // AES final-2 block - load input low & high
673	rev64	v4.16b, v5.16b                                   // GHASH final-3 block
674	eor	x6, x6, x13                     // AES final-2 block - round N low
675	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
676	eor	x7, x7, x14                     // AES final-2 block - round N high
677	mov	d22, v4.d[1]                                // GHASH final-3 block - mid
678	fmov	d5, x6                                // AES final-2 block - mov low
679	fmov	v5.d[1], x7                            // AES final-2 block - mov high
680	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-3 block - mid
681	movi	v8.8b, #0                                       // suppress further partial tag feed in
682	mov	d10, v17.d[1]                              // GHASH final-3 block - mid
683	pmull	v11.1q, v4.1d, v15.1d                      // GHASH final-3 block - low
684	pmull2	v9.1q, v4.2d, v15.2d                      // GHASH final-3 block - high
685	pmull	v10.1q, v22.1d, v10.1d                   // GHASH final-3 block - mid
686	eor	v5.16b, v5.16b, v1.16b                           // AES final-2 block - result
// More than two blocks remain. Stores the ciphertext block in v5
// ("final-2"), accumulates its GHASH contribution (hash key in v14, mid
// term multiplied by the lower half of v17) into v9/v10/v11 through the
// temporaries v20/v21/v22, and prepares the next block using the keystream
// in v2. v8 carries the partial tag only until the first block that
// consumes it; after that it is zero. Falls through to the next label.
687Lenc_blocks_more_than_2:	//	blocks left >  2
688	st1	{ v5.16b}, [x2], #16                    // AES final-2 block - store result
689	ldp	x6, x7, [x0], #16          // AES final-1 block - load input low & high
690	rev64	v4.16b, v5.16b                                   // GHASH final-2 block
691	eor	x6, x6, x13                     // AES final-1 block - round N low
692	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
693	fmov	d5, x6                                // AES final-1 block - mov low
694	eor	x7, x7, x14                     // AES final-1 block - round N high
695	fmov	v5.d[1], x7                            // AES final-1 block - mov high
696	movi	v8.8b, #0                                       // suppress further partial tag feed in
697	pmull2	v20.1q, v4.2d, v14.2d                         // GHASH final-2 block - high
698	mov	d22, v4.d[1]                                // GHASH final-2 block - mid
699	pmull	v21.1q, v4.1d, v14.1d                         // GHASH final-2 block - low
700	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-2 block - mid
701	eor	v5.16b, v5.16b, v2.16b                           // AES final-1 block - result
702	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-2 block - high
703	pmull	v22.1q, v22.1d, v17.1d                     // GHASH final-2 block - mid
704	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-2 block - low
705	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-2 block - mid
// More than one block remains. Stores the ciphertext block in v5
// ("final-1"), accumulates its GHASH contribution (hash key in v13) into
// v9/v10/v11, and prepares the final block using the keystream in v3.
// The mid term is duplicated into the upper lane of v22 so that pmull2 can
// multiply it by the upper half of v16. Falls through to the next label.
706Lenc_blocks_more_than_1:	//	blocks left >  1
707	st1	{ v5.16b}, [x2], #16                    // AES final-1 block - store result
708	rev64	v4.16b, v5.16b                                   // GHASH final-1 block
709	ldp	x6, x7, [x0], #16          // AES final block - load input low & high
710	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
711	movi	v8.8b, #0                                       // suppress further partial tag feed in
712	eor	x6, x6, x13                     // AES final block - round N low
713	mov	d22, v4.d[1]                                // GHASH final-1 block - mid
714	pmull2	v20.1q, v4.2d, v13.2d                         // GHASH final-1 block - high
715	eor	x7, x7, x14                     // AES final block - round N high
716	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-1 block - mid
717	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-1 block - high
718	ins	v22.d[1], v22.d[0]                           // GHASH final-1 block - mid
719	fmov	d5, x6                                // AES final block - mov low
720	fmov	v5.d[1], x7                            // AES final block - mov high
721	pmull2	v22.1q, v22.2d, v16.2d                     // GHASH final-1 block - mid
722	pmull	v21.1q, v4.1d, v13.1d                         // GHASH final-1 block - low
723	eor	v5.16b, v5.16b, v3.16b                           // AES final block - result
724	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-1 block - mid
725	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-1 block - low
// Final (possibly partial) block, tag finalisation and function epilogue.
//  1. Builds a 128-bit byte mask in v0 from the bit length in x1. x13/x14
//     are reused as scratch here; the round N key they held is no longer
//     needed.
//  2. Masks the final ciphertext block in v5, hashes the masked block
//     (hash key in v12, mid term multiplied by the lower half of v16), then
//     merges the bytes already present at [x2] into the masked-off
//     positions of v5 so that the 16-byte store does not change output
//     bytes past the end of the message.
//  3. Reduces the accumulators v9:v10:v11 with the mod_constant in v8.
//  4. Writes the byte-reversed counter word from w12 to [x16, #12], stores
//     the final block to [x2] and the tag to [x3], and returns the value
//     saved in x15 in x0.
//  5. Restores x19-x24, d8-d15, x29 and x30 and releases the 128-byte frame.
726Lenc_blocks_less_than_1:	//	blocks left <= 1
727	and	x1, x1, #127                   // bit_length %= 128
728	mvn	x13, xzr                                      // rkN_l = 0xffffffffffffffff
729	sub	x1, x1, #128                   // bit_length -= 128
730	neg	x1, x1                         // bit_length = 128 - #bits in input (in range [1,128])
731	ld1	{ v18.16b}, [x2]                           // load existing bytes where the possibly partial last block is to be stored
732	mvn	x14, xzr                                      // rkN_h = 0xffffffffffffffff
733	and	x1, x1, #127                   // bit_length %= 128
734	lsr	x14, x14, x1                      // rkN_h is mask for top 64b of last block
735	cmp	x1, #64                        // fewer than 64 bits to mask off?
736	csel	x6, x13, x14, lt               // low 64b of mask: all ones if so, else the shifted mask
737	csel	x7, x14, xzr, lt               // high 64b of mask: the shifted mask if so, else zero
738	fmov	d0, x6                                // ctr0b is mask for last block
739	fmov	v0.d[1], x7
740	and	v5.16b, v5.16b, v0.16b                           // possibly partial last block has zeroes in highest bits
741	rev64	v4.16b, v5.16b                                   // GHASH final block
742	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
743	bif	v5.16b, v18.16b, v0.16b                             // insert existing bytes in top end of result before storing
744	pmull2	v20.1q, v4.2d, v12.2d                         // GHASH final block - high
745	mov	d8, v4.d[1]                                 // GHASH final block - mid
746	rev	w9, w12                        // byte-reverse rev_ctr32 for the counter store below
747	pmull	v21.1q, v4.1d, v12.1d                         // GHASH final block - low
748	eor	v9.16b, v9.16b, v20.16b                           // GHASH final block - high
749	eor	v8.8b, v8.8b, v4.8b                         // GHASH final block - mid
750	pmull	v8.1q, v8.1d, v16.1d                         // GHASH final block - mid
751	eor	v11.16b, v11.16b, v21.16b                           // GHASH final block - low
752	eor	v10.16b, v10.16b, v8.16b                        // GHASH final block - mid
753	movi	v8.8b, #0xc2
754	eor	v4.16b, v11.16b, v9.16b                        // MODULO - karatsuba tidy up
755	shl	d8, d8, #56              // mod_constant
756	eor	v10.16b, v10.16b, v4.16b                        // MODULO - karatsuba tidy up
757	pmull	v7.1q, v9.1d, v8.1d           // MODULO - top 64b align with mid
758	ext	v9.16b, v9.16b, v9.16b, #8                    // MODULO - other top alignment
759	eor	v10.16b, v10.16b, v7.16b                     // MODULO - fold into mid
760	eor	v10.16b, v10.16b, v9.16b                        // MODULO - fold into mid
761	pmull	v9.1q, v10.1d, v8.1d           // MODULO - mid 64b align with low
762	ext	v10.16b, v10.16b, v10.16b, #8                    // MODULO - other mid alignment
763	str	w9, [x16, #12]                         // store the updated counter
764	st1	{ v5.16b}, [x2]                         // store all 16B
765	eor	v11.16b, v11.16b, v9.16b                        // MODULO - fold into low
766	eor	v11.16b, v11.16b, v10.16b                        // MODULO - fold into low
767	ext	v11.16b, v11.16b, v11.16b, #8  // swap the 64-bit halves of the tag
768	rev64	v11.16b, v11.16b               // byte-reverse each half before the store
769	mov	x0, x15                        // return value: the value saved in x15
770	st1	{ v11.16b }, [x3]              // write the updated tag back
771	ldp	x19, x20, [sp, #16]            // restore callee-saved general registers
772	ldp	x21, x22, [sp, #32]
773	ldp	x23, x24, [sp, #48]
774	ldp	d8, d9, [sp, #64]              // restore callee-saved low halves of v8-v15
775	ldp	d10, d11, [sp, #80]
776	ldp	d12, d13, [sp, #96]
777	ldp	d14, d15, [sp, #112]
778	ldp	x29, x30, [sp], #128           // restore frame pointer and link register, pop the frame
779	AARCH64_VALIDATE_LINK_REGISTER
780	ret
781
// aes_gcm_dec_kernel - entry and first-blocks setup of the AES-GCM
// decryption kernel.
//
// Register use on entry, as read by the code below:
//   x0 = input pointer (blocks are later loaded from it as ciphertext)
//   x1 = length in bits (byte_len = x1 >> 3)
//   x2 = output pointer (written by the later sections)
//   x3 = pointer to 16 bytes loaded into v11 as the running GHASH value
//   x4 = pointer to the 16-byte counter block (copied to x16)
//   x5 = pointer to the AES key schedule (copied to x8); the round count
//        is read from offset 240
//   x6 = pointer to the hash-key table; entries are read at offsets 0, 32,
//        48 and 80
//
// Register roles established here:
//   x13/x14 = round N key (low/high), v31 = round N-1 key, v18..v30 = rk0..rk12
//   x10/x11 = counter block halves, w12 = byte-reversed 32-bit counter
//   x4 = end_input_ptr, x5 = end of the region handled by the main loop
//   x15 = byte_len
//   v0..v3 = counter blocks 0..3, v12..v15 = hash keys h1..h4
//
// The section runs AES rounds 0..8 on the four counter blocks, then rounds
// 9..10 (AES-192/256) and 11..12 (AES-256) selected by comparing x17 with
// 12, and joins at Ldec_finish_first_blocks. Callee-saved x19-x24 and
// d8-d15 are spilled to a 128-byte frame.
782.globl	aes_gcm_dec_kernel
783
784.def aes_gcm_dec_kernel
785   .type 32
786.endef
787.align	4
788aes_gcm_dec_kernel:
789	AARCH64_SIGN_LINK_REGISTER
790	stp	x29, x30, [sp, #-128]!
791	mov	x29, sp
792	stp	x19, x20, [sp, #16]
793	mov	x16, x4                        // x16 = counter block pointer
794	mov	x8, x5                         // x8 = key schedule pointer
795	stp	x21, x22, [sp, #32]
796	stp	x23, x24, [sp, #48]
797	stp	d8, d9, [sp, #64]
798	stp	d10, d11, [sp, #80]
799	stp	d12, d13, [sp, #96]
800	stp	d14, d15, [sp, #112]
801	ldr	w17, [x8, #240]                // w17 = number of AES rounds
802	add	x19, x8, x17, lsl #4                   // borrow input_l1 for last key
803	ldp	x13, x14, [x19]                       // load round N keys
804	ldr	q31, [x19, #-16]                        // load round N-1 keys
805	lsr	x5, x1, #3              // byte_len
806	mov	x15, x5                        // keep byte_len in x15
807	ldp	x10, x11, [x16]              // ctr96_b64, ctr96_t32
808	ldr	q26, [x8, #128]                                // load rk8
809	sub	x5, x5, #1      // byte_len - 1
810	ldr	q25, [x8, #112]                                // load rk7
811	and	x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
812	add	x4, x0, x1, lsr #3   // end_input_ptr
813	ldr	q24, [x8, #96]                                 // load rk6
814	lsr	x12, x11, #32                  // x12 = upper 32 bits of x11 (the 32-bit counter word)
815	ldr	q23, [x8, #80]                                 // load rk5
816	orr	w11, w11, w11                  // clear the upper 32 bits of x11 (32-bit write zero-extends)
817	ldr	q21, [x8, #48]                                 // load rk3
818	add	x5, x5, x0                     // x5 = input pointer at which the main loop stops
819	rev	w12, w12                                // rev_ctr32
820	add	w12, w12, #1                            // increment rev_ctr32
821	fmov	d3, x10                               // CTR block 3
822	rev	w9, w12                                 // CTR block 1
823	add	w12, w12, #1                            // CTR block 1
824	fmov	d1, x10                               // CTR block 1
825	orr	x9, x11, x9, lsl #32            // CTR block 1
826	ld1	{ v0.16b}, [x16]                             // special case vector load initial counter so we can start first AES block as quickly as possible
827	fmov	v1.d[1], x9                               // CTR block 1
828	rev	w9, w12                                 // CTR block 2
829	add	w12, w12, #1                            // CTR block 2
830	fmov	d2, x10                               // CTR block 2
831	orr	x9, x11, x9, lsl #32            // CTR block 2
832	fmov	v2.d[1], x9                               // CTR block 2
833	rev	w9, w12                                 // CTR block 3
834	orr	x9, x11, x9, lsl #32            // CTR block 3
835	ldr	q18, [x8, #0]                                  // load rk0
836	fmov	v3.d[1], x9                               // CTR block 3
837	add	w12, w12, #1                            // CTR block 3
838	ldr	q22, [x8, #64]                                 // load rk4
839	ldr	q19, [x8, #16]                                 // load rk1
840	aese	v0.16b, v18.16b
841	aesmc	v0.16b, v0.16b          // AES block 0 - round 0
842	ldr	q14, [x6, #48]                              // load h3l | h3h
843	ext	v14.16b, v14.16b, v14.16b, #8
844	aese	v3.16b, v18.16b
845	aesmc	v3.16b, v3.16b          // AES block 3 - round 0
846	ldr	q15, [x6, #80]                              // load h4l | h4h
847	ext	v15.16b, v15.16b, v15.16b, #8
848	aese	v1.16b, v18.16b
849	aesmc	v1.16b, v1.16b          // AES block 1 - round 0
850	ldr	q13, [x6, #32]                              // load h2l | h2h
851	ext	v13.16b, v13.16b, v13.16b, #8
852	aese	v2.16b, v18.16b
853	aesmc	v2.16b, v2.16b          // AES block 2 - round 0
854	ldr	q20, [x8, #32]                                 // load rk2
855	aese	v0.16b, v19.16b
856	aesmc	v0.16b, v0.16b          // AES block 0 - round 1
857	aese	v1.16b, v19.16b
858	aesmc	v1.16b, v1.16b          // AES block 1 - round 1
// Load the running GHASH value from [x3]: swap the 64-bit halves and
// byte-reverse each half (the inverse of the sequence used before the tag
// store at the end of aes_gcm_enc_kernel).
859	ld1	{ v11.16b}, [x3]
860	ext	v11.16b, v11.16b, v11.16b, #8
861	rev64	v11.16b, v11.16b
862	aese	v2.16b, v19.16b
863	aesmc	v2.16b, v2.16b          // AES block 2 - round 1
864	ldr	q27, [x8, #144]                                // load rk9
865	aese	v3.16b, v19.16b
866	aesmc	v3.16b, v3.16b          // AES block 3 - round 1
867	ldr	q30, [x8, #192]                               // load rk12
868	aese	v0.16b, v20.16b
869	aesmc	v0.16b, v0.16b          // AES block 0 - round 2
870	ldr	q12, [x6]                                   // load h1l | h1h
871	ext	v12.16b, v12.16b, v12.16b, #8
872	aese	v2.16b, v20.16b
873	aesmc	v2.16b, v2.16b          // AES block 2 - round 2
874	ldr	q28, [x8, #160]                               // load rk10
875	aese	v3.16b, v20.16b
876	aesmc	v3.16b, v3.16b          // AES block 3 - round 2
877	aese	v0.16b, v21.16b
878	aesmc	v0.16b, v0.16b          // AES block 0 - round 3
879	aese	v1.16b, v20.16b
880	aesmc	v1.16b, v1.16b          // AES block 1 - round 2
881	aese	v3.16b, v21.16b
882	aesmc	v3.16b, v3.16b          // AES block 3 - round 3
883	aese	v0.16b, v22.16b
884	aesmc	v0.16b, v0.16b          // AES block 0 - round 4
885	aese	v2.16b, v21.16b
886	aesmc	v2.16b, v2.16b          // AES block 2 - round 3
887	aese	v1.16b, v21.16b
888	aesmc	v1.16b, v1.16b          // AES block 1 - round 3
889	aese	v3.16b, v22.16b
890	aesmc	v3.16b, v3.16b          // AES block 3 - round 4
891	aese	v2.16b, v22.16b
892	aesmc	v2.16b, v2.16b          // AES block 2 - round 4
893	aese	v1.16b, v22.16b
894	aesmc	v1.16b, v1.16b          // AES block 1 - round 4
895	aese	v3.16b, v23.16b
896	aesmc	v3.16b, v3.16b          // AES block 3 - round 5
897	aese	v0.16b, v23.16b
898	aesmc	v0.16b, v0.16b          // AES block 0 - round 5
899	aese	v1.16b, v23.16b
900	aesmc	v1.16b, v1.16b          // AES block 1 - round 5
901	aese	v2.16b, v23.16b
902	aesmc	v2.16b, v2.16b          // AES block 2 - round 5
903	aese	v0.16b, v24.16b
904	aesmc	v0.16b, v0.16b          // AES block 0 - round 6
905	aese	v3.16b, v24.16b
906	aesmc	v3.16b, v3.16b          // AES block 3 - round 6
// The flags set here are consumed by both the b.lt and the b.eq below;
// nothing in between writes the condition flags.
907	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
908	aese	v1.16b, v24.16b
909	aesmc	v1.16b, v1.16b          // AES block 1 - round 6
910	aese	v2.16b, v24.16b
911	aesmc	v2.16b, v2.16b          // AES block 2 - round 6
912	aese	v0.16b, v25.16b
913	aesmc	v0.16b, v0.16b          // AES block 0 - round 7
914	aese	v1.16b, v25.16b
915	aesmc	v1.16b, v1.16b          // AES block 1 - round 7
916	aese	v3.16b, v25.16b
917	aesmc	v3.16b, v3.16b          // AES block 3 - round 7
918	aese	v0.16b, v26.16b
919	aesmc	v0.16b, v0.16b          // AES block 0 - round 8
920	aese	v2.16b, v25.16b
921	aesmc	v2.16b, v2.16b          // AES block 2 - round 7
922	aese	v3.16b, v26.16b
923	aesmc	v3.16b, v3.16b          // AES block 3 - round 8
924	aese	v1.16b, v26.16b
925	aesmc	v1.16b, v1.16b          // AES block 1 - round 8
926	ldr	q29, [x8, #176]                               // load rk11
927	aese	v2.16b, v26.16b
928	aesmc	v2.16b, v2.16b          // AES block 2 - round 8
929	b.lt	Ldec_finish_first_blocks                         // branch if AES-128
930
931	aese	v0.16b, v27.16b
932	aesmc	v0.16b, v0.16b          // AES block 0 - round 9
933	aese	v1.16b, v27.16b
934	aesmc	v1.16b, v1.16b          // AES block 1 - round 9
935	aese	v3.16b, v27.16b
936	aesmc	v3.16b, v3.16b          // AES block 3 - round 9
937	aese	v2.16b, v27.16b
938	aesmc	v2.16b, v2.16b          // AES block 2 - round 9
939	aese	v0.16b, v28.16b
940	aesmc	v0.16b, v0.16b          // AES block 0 - round 10
941	aese	v1.16b, v28.16b
942	aesmc	v1.16b, v1.16b          // AES block 1 - round 10
943	aese	v3.16b, v28.16b
944	aesmc	v3.16b, v3.16b          // AES block 3 - round 10
945	aese	v2.16b, v28.16b
946	aesmc	v2.16b, v2.16b          // AES block 2 - round 10
947	b.eq	Ldec_finish_first_blocks                         // branch if AES-192
948
949	aese	v0.16b, v29.16b
950	aesmc	v0.16b, v0.16b          // AES block 0 - round 11
951	aese	v3.16b, v29.16b
952	aesmc	v3.16b, v3.16b          // AES block 3 - round 11
953	aese	v1.16b, v29.16b
954	aesmc	v1.16b, v1.16b          // AES block 1 - round 11
955	aese	v2.16b, v29.16b
956	aesmc	v2.16b, v2.16b          // AES block 2 - round 11
957	aese	v1.16b, v30.16b
958	aesmc	v1.16b, v1.16b          // AES block 1 - round 12
959	aese	v0.16b, v30.16b
960	aesmc	v0.16b, v0.16b          // AES block 0 - round 12
961	aese	v2.16b, v30.16b
962	aesmc	v2.16b, v2.16b          // AES block 2 - round 12
963	aese	v3.16b, v30.16b
964	aesmc	v3.16b, v3.16b          // AES block 3 - round 12
965
// Join point for the three key sizes after the first-blocks AES rounds.
//  - Builds the combined hash-key vectors used for the GHASH mid terms:
//    v17 = h4k | h3k and v16 = h2k | h1k (v8/v9 are scratch).
//  - Applies the round N-1 key to counter blocks 0..3 with a bare aese.
//  - If the input pointer has already reached x5, branches to the tail.
//    The flags come from the cmp at the top; none of the vector
//    instructions in between writes the condition flags.
//  - Otherwise loads four ciphertext blocks into v4..v7, produces and
//    stores the first two plaintext blocks (the round N key is applied in
//    general registers via x13/x14), byte-reverses ciphertext blocks 0 and
//    1 for GHASH, prepares counter blocks 4 and 5 plus the upper half of
//    counter block 6 in x9, and leaves block 2 XOR keystream in v2 for the
//    next section.
//  - Branches to the prepretail if x0 has reached x5 after advancing by 64
//    bytes; otherwise falls through to the main loop.
966Ldec_finish_first_blocks:
967	cmp	x0, x5                   // check if we have <= 4 blocks
968	trn1	v9.2d, v14.2d,    v15.2d                      // h4h | h3h
969	trn2	v17.2d,  v14.2d,    v15.2d                      // h4l | h3l
970	trn1	v8.2d,    v12.2d,    v13.2d                      // h2h | h1h
971	trn2	v16.2d,  v12.2d,    v13.2d                      // h2l | h1l
972	eor	v17.16b, v17.16b, v9.16b                  // h4k | h3k
973	aese	v1.16b, v31.16b                                    // AES block 1 - round N-1
974	aese	v2.16b, v31.16b                                    // AES block 2 - round N-1
975	eor	v16.16b, v16.16b, v8.16b                     // h2k | h1k
976	aese	v3.16b, v31.16b                                    // AES block 3 - round N-1
977	aese	v0.16b, v31.16b                                    // AES block 0 - round N-1
978	b.ge	Ldec_tail                                        // handle tail
979
980	ldr	q4, [x0, #0]                          // AES block 0 - load ciphertext
981	ldr	q5, [x0, #16]                         // AES block 1 - load ciphertext
982	rev	w9, w12                                 // CTR block 4
983	eor	v0.16b, v4.16b, v0.16b                            // AES block 0 - result
984	eor	v1.16b, v5.16b, v1.16b                            // AES block 1 - result
985	rev64	v5.16b, v5.16b                                    // GHASH block 1
986	ldr	q7, [x0, #48]                         // AES block 3 - load ciphertext
987	mov	x7, v0.d[1]                            // AES block 0 - mov high
988	mov	x6, v0.d[0]                            // AES block 0 - mov low
989	rev64	v4.16b, v4.16b                                    // GHASH block 0
990	add	w12, w12, #1                            // CTR block 4
991	fmov	d0, x10                               // CTR block 4
992	orr	x9, x11, x9, lsl #32            // CTR block 4
993	fmov	v0.d[1], x9                               // CTR block 4
994	rev	w9, w12                                 // CTR block 5
995	add	w12, w12, #1                            // CTR block 5
996	mov	x19, v1.d[0]                            // AES block 1 - mov low
997	orr	x9, x11, x9, lsl #32            // CTR block 5
998	mov	x20, v1.d[1]                            // AES block 1 - mov high
999	eor	x7, x7, x14                    // AES block 0 - round N high
1000	eor	x6, x6, x13                    // AES block 0 - round N low
1001	stp	x6, x7, [x2], #16        // AES block 0 - store result
1002	fmov	d1, x10                               // CTR block 5
1003	ldr	q6, [x0, #32]                         // AES block 2 - load ciphertext
1004	add	x0, x0, #64                       // AES input_ptr update
1005	fmov	v1.d[1], x9                               // CTR block 5
1006	rev	w9, w12                                 // CTR block 6
1007	add	w12, w12, #1                            // CTR block 6
1008	eor	x19, x19, x13                    // AES block 1 - round N low
1009	orr	x9, x11, x9, lsl #32            // CTR block 6
1010	eor	x20, x20, x14                    // AES block 1 - round N high
1011	stp	x19, x20, [x2], #16        // AES block 1 - store result
1012	eor	v2.16b, v6.16b, v2.16b                            // AES block 2 - result
1013	cmp	x0, x5                   // check if we have <= 8 blocks
1014	b.ge	Ldec_prepretail                                  // do prepretail
1015
// Main decryption loop body, first part. Three strands of work are
// interleaved instruction by instruction:
//  - Output: block 4k+2 (already XORed with its keystream, in v2) and block
//    4k+3 (v7 XOR v3) are moved to general registers and XORed with the
//    round N key in x13/x14. Block 4k+2 is stored here; x23/x24 carry block
//    4k+3 into Ldec_main_loop_continue.
//  - GHASH: the four ciphertext blocks in v4..v7 are multiplied by the hash
//    keys in v15, v14, v13 and v12 respectively and accumulated into
//    v9 (high), v10 (mid) and v11 (low). The incoming v11 is folded into
//    block 4k first (PRE 0 / PRE 1). v8 ends up holding the mod_constant.
//  - AES: counter blocks 4k+4..4k+7 are built in v0..v3 and run through
//    rounds 0..8, plus rounds 9..12 as selected by comparing x17 with 12.
// On entry x9 holds the upper 64 bits of counter block 4k+6; on exit it
// holds those of counter block 4k+8. Falls through to
// Ldec_main_loop_continue.
1016Ldec_main_loop:	//	main loop start
1017	mov	x21, v2.d[0]                            // AES block 4k+2 - mov low
1018	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
1019	eor	v3.16b, v7.16b, v3.16b                            // AES block 4k+3 - result
1020	aese	v0.16b, v18.16b
1021	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
1022	mov	x22, v2.d[1]                            // AES block 4k+2 - mov high
1023	aese	v1.16b, v18.16b
1024	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
1025	fmov	d2, x10                               // CTR block 4k+6
1026	fmov	v2.d[1], x9                               // CTR block 4k+6
1027	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
1028	rev	w9, w12                                 // CTR block 4k+7
1029	aese	v0.16b, v19.16b
1030	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
1031	mov	x24, v3.d[1]                            // AES block 4k+3 - mov high
1032	aese	v1.16b, v19.16b
1033	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
1034	mov	x23, v3.d[0]                            // AES block 4k+3 - mov low
1035	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
1036	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
1037	fmov	d3, x10                               // CTR block 4k+7
1038	aese	v0.16b, v20.16b
1039	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
1040	orr	x9, x11, x9, lsl #32            // CTR block 4k+7
1041	aese	v2.16b, v18.16b
1042	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
1043	fmov	v3.d[1], x9                               // CTR block 4k+7
1044	aese	v1.16b, v20.16b
1045	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
1046	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
1047	aese	v0.16b, v21.16b
1048	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
1049	eor	x22, x22, x14                    // AES block 4k+2 - round N high
1050	aese	v2.16b, v19.16b
1051	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
1052	mov	d10, v17.d[1]                               // GHASH block 4k - mid
1053	aese	v1.16b, v21.16b
1054	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
1055	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2
1056	aese	v3.16b, v18.16b
1057	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
1058	eor	x21, x21, x13                    // AES block 4k+2 - round N low
1059	aese	v2.16b, v20.16b
1060	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
1061	stp	x21, x22, [x2], #16        // AES block 4k+2 - store result
1062	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
1063	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
1064	aese	v2.16b, v21.16b
1065	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
1066	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3
1067	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
1068	eor	x23, x23, x13                    // AES block 4k+3 - round N low
1069	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
1070	eor	x24, x24, x14                    // AES block 4k+3 - round N high
1071	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
1072	aese	v2.16b, v22.16b
1073	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
1074	aese	v3.16b, v19.16b
1075	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
1076	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
1077	aese	v0.16b, v22.16b
1078	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
1079	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
1080	aese	v2.16b, v23.16b
1081	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
1082	add	w12, w12, #1                            // CTR block 4k+7
1083	aese	v3.16b, v20.16b
1084	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
1085	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
1086	aese	v1.16b, v22.16b
1087	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
1088	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
1089	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
1090	aese	v3.16b, v21.16b
1091	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
1092	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
1093	aese	v1.16b, v23.16b
1094	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
1095	aese	v0.16b, v23.16b
1096	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
1097	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
1098	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
1099	rev	w9, w12                                 // CTR block 4k+8
1100	aese	v1.16b, v24.16b
1101	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
1102	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
1103	aese	v0.16b, v24.16b
1104	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
1105	add	w12, w12, #1                            // CTR block 4k+8
1106	aese	v3.16b, v22.16b
1107	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
1108	aese	v1.16b, v25.16b
1109	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
1110	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
1111	aese	v0.16b, v25.16b
1112	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
1113	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
1114	mov	d6, v7.d[1]                                  // GHASH block 4k+3 - mid
1115	aese	v3.16b, v23.16b
1116	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
1117	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
1118	aese	v0.16b, v26.16b
1119	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
1120	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
1121	aese	v3.16b, v24.16b
1122	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
1123	pmull	v4.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
1124	orr	x9, x11, x9, lsl #32            // CTR block 4k+8
1125	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
1126	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
// The flags set here are consumed by both the b.lt and the b.eq below;
// nothing in between writes the condition flags.
1127	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
1128	eor	v6.8b, v6.8b, v7.8b                          // GHASH block 4k+3 - mid
1129	aese	v1.16b, v26.16b
1130	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
1131	aese	v2.16b, v24.16b
1132	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
1133	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
1134	pmull	v6.1q, v6.1d, v16.1d                          // GHASH block 4k+3 - mid
1135	movi	v8.8b, #0xc2                    // 0xc2 in each of the low 8 bytes; shifted into the mod_constant below
1136	aese	v2.16b, v25.16b
1137	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
1138	eor	v11.16b, v11.16b, v4.16b                         // GHASH block 4k+3 - low
1139	aese	v3.16b, v25.16b
1140	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
1141	shl	d8, d8, #56               // mod_constant
1142	aese	v2.16b, v26.16b
1143	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
1144	eor	v10.16b, v10.16b, v6.16b                         // GHASH block 4k+3 - mid
1145	aese	v3.16b, v26.16b
1146	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
1147	b.lt	Ldec_main_loop_continue                          // branch if AES-128
1148
1149	aese	v0.16b, v27.16b
1150	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
1151	aese	v2.16b, v27.16b
1152	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
1153	aese	v1.16b, v27.16b
1154	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
1155	aese	v3.16b, v27.16b
1156	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
1157	aese	v0.16b, v28.16b
1158	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
1159	aese	v1.16b, v28.16b
1160	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
1161	aese	v2.16b, v28.16b
1162	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
1163	aese	v3.16b, v28.16b
1164	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
1165	b.eq	Ldec_main_loop_continue                          // branch if AES-192
1166
1167	aese	v0.16b, v29.16b
1168	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
1169	aese	v1.16b, v29.16b
1170	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
1171	aese	v2.16b, v29.16b
1172	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
1173	aese	v3.16b, v29.16b
1174	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
1175	aese	v0.16b, v30.16b
1176	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
1177	aese	v1.16b, v30.16b
1178	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
1179	aese	v2.16b, v30.16b
1180	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
1181	aese	v3.16b, v30.16b
1182	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
1183
// Ldec_main_loop_continue: shared end of one main-loop iteration. The b.lt / b.eq
// branches just above land here once the AES rounds required by the key size are
// done. Three strands are interleaved for scheduling reasons:
//   (a) the GHASH modular reduction of the v9 (high) / v10 (mid) / v11 (low)
//       accumulators, using the constant previously built in v8;
//   (b) the last AES steps: aese with v31, XOR with the freshly loaded ciphertext,
//       then (for blocks 4k+4 and 4k+5) a scalar XOR with x13/x14 and a store to [x2];
//   (c) rebuilding counter blocks in v0/v1 from x10, x11 and the running counter w12.
// The loop repeats while the `cmp x0, x5` below yields "less than".
1184Ldec_main_loop_continue:
1185	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
1186	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
	// The next four ciphertext blocks are read from [x0] into q4-q7; x0 is advanced by 64 further down.
1187	ldr	q4, [x0, #0]                          // AES block 4k+4 - load ciphertext
1188	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
1189	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
1190	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
1191	ldr	q5, [x0, #16]                         // AES block 4k+5 - load ciphertext
1192	eor	v0.16b, v4.16b, v0.16b                            // AES block 4k+4 - result
	// x23/x24 are not written in this block; they carry a result prepared before this label.
1193	stp	x23, x24, [x2], #16        // AES block 4k+3 - store result
1194	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
1195	ldr	q7, [x0, #48]                         // AES block 4k+7 - load ciphertext
1196	ldr	q6, [x0, #32]                         // AES block 4k+6 - load ciphertext
1197	mov	x7, v0.d[1]                            // AES block 4k+4 - mov high
1198	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
1199	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
1200	add	x0, x0, #64                       // AES input_ptr update
1201	mov	x6, v0.d[0]                            // AES block 4k+4 - mov low
	// v0 has been copied out to x6/x7, so it can be reloaded with the next counter block.
1202	fmov	d0, x10                               // CTR block 4k+8
1203	fmov	v0.d[1], x9                               // CTR block 4k+8
1204	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
1205	eor	v1.16b, v5.16b, v1.16b                            // AES block 4k+5 - result
1206	rev	w9, w12                                 // CTR block 4k+9
1207	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
1208	orr	x9, x11, x9, lsl #32            // CTR block 4k+9
	// The flags from this cmp are consumed by the b.lt that closes the block;
	// none of the instructions in between write the condition flags.
1209	cmp	x0, x5                   // LOOP CONTROL
1210	add	w12, w12, #1                            // CTR block 4k+9
1211	eor	x6, x6, x13                    // AES block 4k+4 - round N low
1212	eor	x7, x7, x14                    // AES block 4k+4 - round N high
1213	mov	x20, v1.d[1]                            // AES block 4k+5 - mov high
1214	eor	v2.16b, v6.16b, v2.16b                            // AES block 4k+6 - result
1215	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
1216	mov	x19, v1.d[0]                            // AES block 4k+5 - mov low
1217	fmov	d1, x10                               // CTR block 4k+9
1218	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
1219	fmov	v1.d[1], x9                               // CTR block 4k+9
1220	rev	w9, w12                                 // CTR block 4k+10
1221	add	w12, w12, #1                            // CTR block 4k+10
1222	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
1223	orr	x9, x11, x9, lsl #32            // CTR block 4k+10
	// The ciphertext blocks are byte-reversed within each 64-bit lane before being fed to GHASH.
1224	rev64	v5.16b, v5.16b                                    // GHASH block 4k+5
1225	eor	x20, x20, x14                    // AES block 4k+5 - round N high
1226	stp	x6, x7, [x2], #16        // AES block 4k+4 - store result
1227	eor	x19, x19, x13                    // AES block 4k+5 - round N low
1228	stp	x19, x20, [x2], #16        // AES block 4k+5 - store result
1229	rev64	v4.16b, v4.16b                                    // GHASH block 4k+4
1230	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
1231	b.lt	Ldec_main_loop
1232
// Ldec_prepretail: entered by fall-through when the `b.lt Ldec_main_loop` above is
// not taken (NOTE(review): it may also be branched to from code outside this
// section). It folds the four ciphertext blocks held in v4-v7 into the GHASH
// accumulators (v9 high, v10 mid, v11 low) while running AES rounds 0-8 on the
// counter blocks in v0-v3, followed by rounds 9-10 or 9-12 depending on the flags
// produced by `cmp x17, #12`. The results of blocks 4k+2/4k+3 from the previous
// group are moved to x21-x24 here and stored in Ldec_finish_prepretail.
1233Ldec_prepretail:	//	PREPRETAIL
1234	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
	// v2 and v3 still hold the previous group's blocks; copy them to scalar registers
	// before v2/v3 are overwritten with new counter blocks.
1235	mov	x21, v2.d[0]                            // AES block 4k+2 - mov low
1236	eor	v3.16b, v7.16b, v3.16b                            // AES block 4k+3 - result
1237	aese	v0.16b, v18.16b
1238	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
1239	mov	x22, v2.d[1]                            // AES block 4k+2 - mov high
1240	aese	v1.16b, v18.16b
1241	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
1242	fmov	d2, x10                               // CTR block 4k+6
1243	fmov	v2.d[1], x9                               // CTR block 4k+6
1244	rev	w9, w12                                 // CTR block 4k+7
	// The running hash in v11 (halves swapped by the ext above) is folded into the first block.
1245	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
1246	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2
1247	orr	x9, x11, x9, lsl #32            // CTR block 4k+7
1248	mov	x23, v3.d[0]                            // AES block 4k+3 - mov low
1249	aese	v1.16b, v19.16b
1250	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
1251	mov	x24, v3.d[1]                            // AES block 4k+3 - mov high
	// The pmull/pmull2 writes below start fresh v11/v9/v10 accumulators for this group.
1252	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
1253	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
1254	fmov	d3, x10                               // CTR block 4k+7
1255	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
1256	fmov	v3.d[1], x9                               // CTR block 4k+7
1257	aese	v2.16b, v18.16b
1258	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
1259	mov	d10, v17.d[1]                               // GHASH block 4k - mid
1260	aese	v0.16b, v19.16b
1261	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
1262	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
1263	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
1264	aese	v2.16b, v19.16b
1265	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
1266	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3
1267	aese	v3.16b, v18.16b
1268	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
1269	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
1270	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
1271	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
1272	aese	v3.16b, v19.16b
1273	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
1274	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
1275	aese	v0.16b, v20.16b
1276	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
1277	aese	v1.16b, v20.16b
1278	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
1279	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
1280	aese	v2.16b, v20.16b
1281	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
1282	aese	v0.16b, v21.16b
1283	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
1284	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
1285	aese	v3.16b, v20.16b
1286	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
1287	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
1288	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
1289	aese	v0.16b, v22.16b
1290	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
1291	aese	v3.16b, v21.16b
1292	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
1293	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
1294	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
1295	aese	v0.16b, v23.16b
1296	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
1297	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
1298	aese	v3.16b, v22.16b
1299	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
1300	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
1301	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
1302	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
1303	aese	v3.16b, v23.16b
1304	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
	// The mid term is duplicated into the upper lane so that pmull2 can pair it with the upper half of v16.
1305	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
1306	aese	v2.16b, v21.16b
1307	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
1308	aese	v1.16b, v21.16b
1309	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
1310	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
1311	pmull	v4.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
1312	aese	v2.16b, v22.16b
1313	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
1314	mov	d6, v7.d[1]                                  // GHASH block 4k+3 - mid
1315	aese	v1.16b, v22.16b
1316	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
1317	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
1318	aese	v2.16b, v23.16b
1319	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
1320	eor	v6.8b, v6.8b, v7.8b                          // GHASH block 4k+3 - mid
1321	aese	v1.16b, v23.16b
1322	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
1323	aese	v3.16b, v24.16b
1324	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
1325	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
1326	aese	v2.16b, v24.16b
1327	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
1328	aese	v0.16b, v24.16b
1329	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
	// Together with the `shl d8, d8, #56` further down this leaves 0xc2 << 56 in d8,
	// the constant used by the MODULO steps.
1330	movi	v8.8b, #0xc2
1331	aese	v1.16b, v24.16b
1332	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
1333	eor	v11.16b, v11.16b, v4.16b                         // GHASH block 4k+3 - low
1334	pmull	v6.1q, v6.1d, v16.1d                          // GHASH block 4k+3 - mid
1335	aese	v3.16b, v25.16b
1336	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
	// These flags feed both the b.lt and the b.eq below; only SIMD/crypto instructions
	// sit in between, and they leave the condition flags untouched.
	// NOTE(review): x17 is presumably the round count loaded at function entry - not visible here.
1337	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
1338	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
1339	aese	v1.16b, v25.16b
1340	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
1341	aese	v0.16b, v25.16b
1342	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
1343	eor	v10.16b, v10.16b, v6.16b                         // GHASH block 4k+3 - mid
1344	aese	v3.16b, v26.16b
1345	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
1346	aese	v2.16b, v25.16b
1347	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
1348	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
1349	aese	v1.16b, v26.16b
1350	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
1351	aese	v0.16b, v26.16b
1352	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
1353	shl	d8, d8, #56               // mod_constant
1354	aese	v2.16b, v26.16b
1355	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
1356	b.lt	Ldec_finish_prepretail                           // branch if AES-128
1357
1358	aese	v1.16b, v27.16b
1359	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
1360	aese	v2.16b, v27.16b
1361	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
1362	aese	v3.16b, v27.16b
1363	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
1364	aese	v0.16b, v27.16b
1365	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
1366	aese	v2.16b, v28.16b
1367	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
1368	aese	v3.16b, v28.16b
1369	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
1370	aese	v0.16b, v28.16b
1371	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
1372	aese	v1.16b, v28.16b
1373	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
1374	b.eq	Ldec_finish_prepretail                           // branch if AES-192
1375
1376	aese	v2.16b, v29.16b
1377	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
1378	aese	v0.16b, v29.16b
1379	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
1380	aese	v1.16b, v29.16b
1381	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
1382	aese	v2.16b, v30.16b
1383	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
1384	aese	v3.16b, v29.16b
1385	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
1386	aese	v1.16b, v30.16b
1387	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
1388	aese	v0.16b, v30.16b
1389	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
1390	aese	v3.16b, v30.16b
1391	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
1392
// Ldec_finish_prepretail: join point for all key sizes at the end of the
// prepretail. It (a) applies the scalar round-N XOR (x13 low / x14 high) to the
// two blocks parked in x21-x24 and stores them to [x2], (b) completes the GHASH
// modular reduction of v9/v10 into v11 using the constant in v8, and (c) performs
// the round N-1 aese (key in v31) on the four counter blocks v0-v3, leaving them
// for the tail code. Execution falls through into Ldec_tail.
1393Ldec_finish_prepretail:
1394	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
1395	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
1396	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
1397	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
1398	eor	x22, x22, x14                    // AES block 4k+2 - round N high
1399	eor	x23, x23, x13                    // AES block 4k+3 - round N low
1400	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
	// w12 ends up one past the counter used for block 4k+7; Ldec_tail subtracts one
	// for every counter block that turns out not to be needed.
1401	add	w12, w12, #1                            // CTR block 4k+7
1402	eor	x21, x21, x13                    // AES block 4k+2 - round N low
1403	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
1404	eor	x24, x24, x14                    // AES block 4k+3 - round N high
1405	stp	x21, x22, [x2], #16        // AES block 4k+2 - store result
1406	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
1407	stp	x23, x24, [x2], #16        // AES block 4k+3 - store result
1408
1409	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
	// aese without aesmc: the last of these AES steps has no MixColumns. The remaining
	// round-N key XOR is done later with scalar eor against x13/x14.
1410	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
1411	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
1412	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
1413	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
1414	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
1415
// Ldec_tail: handles the remaining 1..4 blocks (the last may be partial).
// x5 is recomputed as x4 - x0, the number of input bytes still to process. The
// first remaining ciphertext block is decrypted with the keystream in v0 into
// x6/x7, then control is dispatched on x5 (> 48, > 32, > 16, otherwise).
// For each counter block that will not be used, w12 is decremented once, and the
// keystream registers are shuffled so that the stages below always find the
// keystream for "final-1" in v2 and for "final" in v3.
// NOTE(review): this label may also be reached by a branch from code outside this section.
1416Ldec_tail:	//	TAIL
1417	sub	x5, x4, x0   // main_end_input_ptr is number of bytes left to process
1418	ld1	{ v5.16b}, [x0], #16                      // AES block 4k+4 - load ciphertext
1419	eor	v0.16b, v5.16b, v0.16b                            // AES block 4k+4 - result
1420	mov	x6, v0.d[0]                            // AES block 4k+4 - mov low
1421	mov	x7, v0.d[1]                            // AES block 4k+4 - mov high
	// v8 takes the running hash with its halves swapped; it must be captured here because
	// v11 is zeroed below on the paths with three or fewer blocks.
1422	ext	v8.16b, v11.16b, v11.16b, #8                     // prepare final partial tag
1423	cmp	x5, #48
1424	eor	x6, x6, x13                    // AES block 4k+4 - round N low
1425	eor	x7, x7, x14                    // AES block 4k+4 - round N high
1426	b.gt	Ldec_blocks_more_than_3
	// At most 3 blocks left: one counter block is unused; move the keystream up one
	// slot (v2 -> v3, v1 -> v2) and clear the GHASH accumulators, because the stage
	// that would otherwise initialise them (Ldec_blocks_more_than_3) is skipped.
1427	sub	w12, w12, #1
1428	mov	v3.16b, v2.16b
1429	movi	v10.8b, #0
1430	movi	v11.8b, #0
1431	cmp	x5, #32
1432	movi	v9.8b, #0
1433	mov	v2.16b, v1.16b
1434	b.gt	Ldec_blocks_more_than_2
	// At most 2 blocks left: the keystream for the final block is the original v1.
1435	sub	w12, w12, #1
1436	mov	v3.16b, v1.16b
1437	cmp	x5, #16
1438	b.gt	Ldec_blocks_more_than_1
	// At most 1 block left: x6/x7 already hold the (possibly partial) final block.
1439	sub	w12, w12, #1
1440	b	Ldec_blocks_less_than_1
// Ldec_blocks_more_than_3: more than three blocks remain. On entry x6/x7 hold the
// decrypted "final-3" block and v5 its ciphertext. This stage stores that block,
// loads the next ciphertext block and decrypts it with the keystream in v1, and
// starts the GHASH accumulators from the final-3 ciphertext: v9 (high) and v11
// (low) are products with v15, v10 (mid) is a product with the upper half of v17.
// These are plain pmull writes, so no prior zeroing of v9/v10/v11 is needed on
// this path. Falls through into Ldec_blocks_more_than_2.
// v22 is used as scratch here although the sections above use it as an aese
// operand; no AES rounds are executed from the tail onwards.
1441Ldec_blocks_more_than_3:	//	blocks left >  3
1442	rev64	v4.16b, v5.16b                                   // GHASH final-3 block
1443	ld1	{ v5.16b}, [x0], #16                     // AES final-2 block - load ciphertext
1444	stp	x6, x7, [x2], #16       // AES final-3 block  - store result
1445	mov	d10, v17.d[1]                              // GHASH final-3 block - mid
1446	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
1447	eor	v0.16b, v5.16b, v1.16b                           // AES final-2 block - result
1448	mov	d22, v4.d[1]                                // GHASH final-3 block - mid
1449	mov	x6, v0.d[0]                           // AES final-2 block - mov low
1450	mov	x7, v0.d[1]                           // AES final-2 block - mov high
1451	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-3 block - mid
	// The running hash must be folded in exactly once; clearing v8 turns the
	// "feed in partial tag" XORs of the later stages into no-ops.
1452	movi	v8.8b, #0                                       // suppress further partial tag feed in
1453	pmull2	v9.1q, v4.2d, v15.2d                      // GHASH final-3 block - high
1454	pmull	v10.1q, v22.1d, v10.1d                   // GHASH final-3 block - mid
1455	eor	x6, x6, x13                   // AES final-2 block - round N low
1456	pmull	v11.1q, v4.1d, v15.1d                      // GHASH final-3 block - low
1457	eor	x7, x7, x14                   // AES final-2 block - round N high
// Ldec_blocks_more_than_2: more than two blocks remain. On entry x6/x7 hold the
// decrypted "final-2" block and v5 its ciphertext. This stage stores that block,
// loads the next ciphertext block and decrypts it with the keystream in v2, and
// accumulates the final-2 ciphertext into GHASH: high/low products use v14, the
// mid product uses the lower half of v17; results are XORed into v9/v10/v11.
// Falls through into Ldec_blocks_more_than_1.
// v20-v22 serve as scratch here; no AES rounds are executed from the tail onwards.
1458Ldec_blocks_more_than_2:	//	blocks left >  2
1459	rev64	v4.16b, v5.16b                                   // GHASH final-2 block
1460	ld1	{ v5.16b}, [x0], #16                     // AES final-1 block - load ciphertext
	// v8 is the running hash if this is the first tail block processed, otherwise zero.
1461	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
1462	stp	x6, x7, [x2], #16       // AES final-2 block  - store result
1463	eor	v0.16b, v5.16b, v2.16b                           // AES final-1 block - result
1464	mov	d22, v4.d[1]                                // GHASH final-2 block - mid
1465	pmull	v21.1q, v4.1d, v14.1d                         // GHASH final-2 block - low
1466	pmull2	v20.1q, v4.2d, v14.2d                         // GHASH final-2 block - high
1467	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-2 block - mid
1468	mov	x6, v0.d[0]                           // AES final-1 block - mov low
1469	mov	x7, v0.d[1]                           // AES final-1 block - mov high
1470	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-2 block - low
1471	movi	v8.8b, #0                                       // suppress further partial tag feed in
1472	pmull	v22.1q, v22.1d, v17.1d                     // GHASH final-2 block - mid
1473	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-2 block - high
1474	eor	x6, x6, x13                   // AES final-1 block - round N low
1475	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-2 block - mid
1476	eor	x7, x7, x14                   // AES final-1 block - round N high
// Ldec_blocks_more_than_1: more than one block remains. On entry x6/x7 hold the
// decrypted "final-1" block and v5 its ciphertext. This stage stores that block,
// loads the final ciphertext block and decrypts it with the keystream in v3, and
// accumulates the final-1 ciphertext into GHASH: high/low products use v13, the
// mid product uses the upper half of v16 (hence the `ins` that duplicates the mid
// term into the upper lane before pmull2). Falls through into
// Ldec_blocks_less_than_1, which masks and stores the final block.
// v20-v22 serve as scratch here; no AES rounds are executed from the tail onwards.
1477Ldec_blocks_more_than_1:	//	blocks left >  1
1478	stp	x6, x7, [x2], #16       // AES final-1 block  - store result
1479	rev64	v4.16b, v5.16b                                   // GHASH final-1 block
1480	ld1	{ v5.16b}, [x0], #16                     // AES final block - load ciphertext
	// v8 is the running hash if this is the first tail block processed, otherwise zero.
1481	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
1482	movi	v8.8b, #0                                       // suppress further partial tag feed in
1483	mov	d22, v4.d[1]                                // GHASH final-1 block - mid
1484	eor	v0.16b, v5.16b, v3.16b                           // AES final block - result
1485	pmull2	v20.1q, v4.2d, v13.2d                         // GHASH final-1 block - high
1486	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-1 block - mid
1487	pmull	v21.1q, v4.1d, v13.1d                         // GHASH final-1 block - low
1488	mov	x6, v0.d[0]                           // AES final block - mov low
1489	ins	v22.d[1], v22.d[0]                           // GHASH final-1 block - mid
1490	mov	x7, v0.d[1]                           // AES final block - mov high
1491	pmull2	v22.1q, v22.2d, v16.2d                     // GHASH final-1 block - mid
1492	eor	x6, x6, x13                   // AES final block - round N low
1493	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-1 block - low
1494	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-1 block - high
1495	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-1 block - mid
1496	eor	x7, x7, x14                   // AES final block - round N high
// Ldec_blocks_less_than_1: final (possibly partial) block, tag finalisation and
// function epilogue. On entry x6/x7 hold the decrypted final block and v5 its
// ciphertext. Steps:
//   1. Build a 128-bit mask (x9 low half, x10 high half, also copied to v0) that
//      covers only the valid bits of the last block, derived from the bit length in x1.
//   2. Merge the masked plaintext with the bytes already present at [x2] so that
//      output bytes beyond the message are preserved, and store the result.
//   3. Mask the ciphertext, fold it into GHASH (products with v12 and the lower
//      half of v16), then run the modular reduction into v11.
//   4. Write the byte-reversed counter w12 to [x16, #12] and the tag to [x3].
//   5. Return x15 in x0, restore x19-x24, d8-d15, x29/x30 and release the 128-byte frame.
// x13/x14 are reused as all-ones scratch here; their previous contents are no
// longer read after this point.
1497Ldec_blocks_less_than_1:	//	blocks left <= 1
1498	and	x1, x1, #127                   // bit_length %= 128
1499	mvn	x14, xzr                                      // rkN_h = 0xffffffffffffffff
1500	sub	x1, x1, #128                   // bit_length -= 128
1501	mvn	x13, xzr                                      // rkN_l = 0xffffffffffffffff
1502	ldp	x4, x5, [x2] // load existing bytes we need to not overwrite
1503	neg	x1, x1                         // bit_length = 128 - #bits in input (in range [1,128])
	// After this, x1 is the number of unused bits in the last block (0 when it is full).
1504	and	x1, x1, #127                   // bit_length %= 128
	// A register-specified shift uses the amount modulo 64, so this one shift serves
	// both cases distinguished by the cmp/csel pair below.
1505	lsr	x14, x14, x1                      // rkN_h is mask for top 64b of last block
1506	cmp	x1, #64
	// Fewer than 64 unused bits: low half fully valid (all ones), high half partially.
	// Otherwise: low half partially valid, high half entirely unused (zero).
1507	csel	x9, x13, x14, lt
1508	csel	x10, x14, xzr, lt
1509	fmov	d0, x9                                  // ctr0b is mask for last block
1510	and	x6, x6, x9
1511	mov	v0.d[1], x10
1512	bic	x4, x4, x9          // mask out low existing bytes
	// x9 is free again once the mask has been applied; it now carries the counter word to store.
1513	rev	w9, w12
1514	bic	x5, x5, x10      // mask out high existing bytes
1515	orr	x6, x6, x4
1516	and	x7, x7, x10
1517	orr	x7, x7, x5
1518	and	v5.16b, v5.16b, v0.16b                            // possibly partial last block has zeroes in highest bits
1519	rev64	v4.16b, v5.16b                                    // GHASH final block
	// v8 is the running hash if this is the only tail block, otherwise zero.
1520	eor	v4.16b, v4.16b, v8.16b                           // feed in partial tag
1521	pmull	v21.1q, v4.1d, v12.1d                          // GHASH final block - low
1522	mov	d8, v4.d[1]                                  // GHASH final block - mid
1523	eor	v8.8b, v8.8b, v4.8b                          // GHASH final block - mid
1524	pmull2	v20.1q, v4.2d, v12.2d                          // GHASH final block - high
1525	pmull	v8.1q, v8.1d, v16.1d                          // GHASH final block - mid
1526	eor	v9.16b, v9.16b, v20.16b                            // GHASH final block - high
1527	eor	v11.16b, v11.16b, v21.16b                            // GHASH final block - low
1528	eor	v10.16b, v10.16b, v8.16b                         // GHASH final block - mid
	// movi + shl leave 0xc2 << 56 in d8, the constant used by the MODULO steps.
1529	movi	v8.8b, #0xc2
1530	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
1531	shl	d8, d8, #56               // mod_constant
1532	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
1533	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
1534	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
1535	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
1536	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
1537	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
1538	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
1539	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
	// No post-increment: this is the last output block.
1540	stp	x6, x7, [x2]
1541	str	w9, [x16, #12]                          // store the updated counter
1542	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
	// Swap the halves and byte-reverse each lane before writing the tag out.
1543	ext	v11.16b, v11.16b, v11.16b, #8
1544	rev64	v11.16b, v11.16b
	// NOTE(review): x15 is set before this section; presumably the byte length, as in
	// the encrypt kernel at the top of the file - confirm against the function entry.
1545	mov	x0, x15
1546	st1	{ v11.16b }, [x3]
	// Epilogue: reload the callee-saved registers from the frame and pop it.
1547	ldp	x19, x20, [sp, #16]
1548	ldp	x21, x22, [sp, #32]
1549	ldp	x23, x24, [sp, #48]
1550	ldp	d8, d9, [sp, #64]
1551	ldp	d10, d11, [sp, #80]
1552	ldp	d12, d13, [sp, #96]
1553	ldp	d14, d15, [sp, #112]
1554	ldp	x29, x30, [sp], #128
1555	AARCH64_VALIDATE_LINK_REGISTER
1556	ret
1557
1558#endif
1559#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
1560