xref: /aosp_15_r20/external/cronet/third_party/boringssl/src/gen/bcm/aesv8-gcm-armv8-linux.S (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <openssl/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
7#include <openssl/arm_arch.h>
8#if __ARM_MAX_ARCH__ >= 8
9
10.arch	armv8-a+crypto
11.text
12.globl	aes_gcm_enc_kernel
13.hidden	aes_gcm_enc_kernel
14.type	aes_gcm_enc_kernel,%function
15.align	4
16aes_gcm_enc_kernel:
17	AARCH64_SIGN_LINK_REGISTER
18	stp	x29, x30, [sp, #-128]!
19	mov	x29, sp
20	stp	x19, x20, [sp, #16]
21	mov	x16, x4
22	mov	x8, x5
23	stp	x21, x22, [sp, #32]
24	stp	x23, x24, [sp, #48]
25	stp	d8, d9, [sp, #64]
26	stp	d10, d11, [sp, #80]
27	stp	d12, d13, [sp, #96]
28	stp	d14, d15, [sp, #112]
29	ldr	w17, [x8, #240]
30	add	x19, x8, x17, lsl #4                   // borrow input_l1 for last key
31	ldp	x13, x14, [x19]                       // load round N keys
32	ldr	q31, [x19, #-16]                        // load round N-1 keys
33	add	x4, x0, x1, lsr #3   // end_input_ptr
34	lsr	x5, x1, #3              // byte_len
35	mov	x15, x5
36	ldp	x10, x11, [x16]              // ctr96_b64, ctr96_t32
37	ld1	{ v0.16b}, [x16]                             // special case vector load initial counter so we can start first AES block as quickly as possible
38	sub	x5, x5, #1      // byte_len - 1
39	ldr	q18, [x8, #0]                                  // load rk0
40	and	x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
41	ldr	q25, [x8, #112]                                // load rk7
42	add	x5, x5, x0
43	lsr	x12, x11, #32
44	fmov	d2, x10                               // CTR block 2
45	orr	w11, w11, w11
46	rev	w12, w12                                // rev_ctr32
47	fmov	d1, x10                               // CTR block 1
48	aese	v0.16b, v18.16b
49	aesmc	v0.16b, v0.16b          // AES block 0 - round 0
50	add	w12, w12, #1                            // increment rev_ctr32
51	rev	w9, w12                                 // CTR block 1
52	fmov	d3, x10                               // CTR block 3
53	orr	x9, x11, x9, lsl #32            // CTR block 1
54	add	w12, w12, #1                            // CTR block 1
55	ldr	q19, [x8, #16]                                 // load rk1
56	fmov	v1.d[1], x9                               // CTR block 1
57	rev	w9, w12                                 // CTR block 2
58	add	w12, w12, #1                            // CTR block 2
59	orr	x9, x11, x9, lsl #32            // CTR block 2
60	ldr	q20, [x8, #32]                                 // load rk2
61	fmov	v2.d[1], x9                               // CTR block 2
62	rev	w9, w12                                 // CTR block 3
63	aese	v0.16b, v19.16b
64	aesmc	v0.16b, v0.16b          // AES block 0 - round 1
65	orr	x9, x11, x9, lsl #32            // CTR block 3
66	fmov	v3.d[1], x9                               // CTR block 3
67	aese	v1.16b, v18.16b
68	aesmc	v1.16b, v1.16b          // AES block 1 - round 0
69	ldr	q21, [x8, #48]                                 // load rk3
70	aese	v0.16b, v20.16b
71	aesmc	v0.16b, v0.16b          // AES block 0 - round 2
72	ldr	q24, [x8, #96]                                 // load rk6
73	aese	v2.16b, v18.16b
74	aesmc	v2.16b, v2.16b          // AES block 2 - round 0
75	ldr	q23, [x8, #80]                                 // load rk5
76	aese	v1.16b, v19.16b
77	aesmc	v1.16b, v1.16b          // AES block 1 - round 1
78	ldr	q14, [x6, #48]                              // load h3l | h3h
79	ext	v14.16b, v14.16b, v14.16b, #8
80	aese	v3.16b, v18.16b
81	aesmc	v3.16b, v3.16b          // AES block 3 - round 0
82	aese	v2.16b, v19.16b
83	aesmc	v2.16b, v2.16b          // AES block 2 - round 1
84	ldr	q22, [x8, #64]                                 // load rk4
85	aese	v1.16b, v20.16b
86	aesmc	v1.16b, v1.16b          // AES block 1 - round 2
87	ldr	q13, [x6, #32]                              // load h2l | h2h
88	ext	v13.16b, v13.16b, v13.16b, #8
89	aese	v3.16b, v19.16b
90	aesmc	v3.16b, v3.16b          // AES block 3 - round 1
91	ldr	q30, [x8, #192]                               // load rk12
92	aese	v2.16b, v20.16b
93	aesmc	v2.16b, v2.16b          // AES block 2 - round 2
94	ldr	q15, [x6, #80]                              // load h4l | h4h
95	ext	v15.16b, v15.16b, v15.16b, #8
96	aese	v1.16b, v21.16b
97	aesmc	v1.16b, v1.16b          // AES block 1 - round 3
98	ldr	q29, [x8, #176]                               // load rk11
99	aese	v3.16b, v20.16b
100	aesmc	v3.16b, v3.16b          // AES block 3 - round 2
101	ldr	q26, [x8, #128]                                // load rk8
102	aese	v2.16b, v21.16b
103	aesmc	v2.16b, v2.16b          // AES block 2 - round 3
104	add	w12, w12, #1                            // CTR block 3
105	aese	v0.16b, v21.16b
106	aesmc	v0.16b, v0.16b          // AES block 0 - round 3
107	aese	v3.16b, v21.16b
108	aesmc	v3.16b, v3.16b          // AES block 3 - round 3
109	ld1	{ v11.16b}, [x3]
110	ext	v11.16b, v11.16b, v11.16b, #8
111	rev64	v11.16b, v11.16b
112	aese	v2.16b, v22.16b
113	aesmc	v2.16b, v2.16b          // AES block 2 - round 4
114	aese	v0.16b, v22.16b
115	aesmc	v0.16b, v0.16b          // AES block 0 - round 4
116	aese	v1.16b, v22.16b
117	aesmc	v1.16b, v1.16b          // AES block 1 - round 4
118	aese	v3.16b, v22.16b
119	aesmc	v3.16b, v3.16b          // AES block 3 - round 4
120	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
121	aese	v0.16b, v23.16b
122	aesmc	v0.16b, v0.16b          // AES block 0 - round 5
123	aese	v1.16b, v23.16b
124	aesmc	v1.16b, v1.16b          // AES block 1 - round 5
125	aese	v3.16b, v23.16b
126	aesmc	v3.16b, v3.16b          // AES block 3 - round 5
127	aese	v2.16b, v23.16b
128	aesmc	v2.16b, v2.16b          // AES block 2 - round 5
129	aese	v1.16b, v24.16b
130	aesmc	v1.16b, v1.16b          // AES block 1 - round 6
131	trn2	v17.2d,  v14.2d,    v15.2d                      // h4l | h3l
132	aese	v3.16b, v24.16b
133	aesmc	v3.16b, v3.16b          // AES block 3 - round 6
134	ldr	q27, [x8, #144]                                // load rk9
135	aese	v0.16b, v24.16b
136	aesmc	v0.16b, v0.16b          // AES block 0 - round 6
137	ldr	q12, [x6]                                   // load h1l | h1h
138	ext	v12.16b, v12.16b, v12.16b, #8
139	aese	v2.16b, v24.16b
140	aesmc	v2.16b, v2.16b          // AES block 2 - round 6
141	ldr	q28, [x8, #160]                               // load rk10
142	aese	v1.16b, v25.16b
143	aesmc	v1.16b, v1.16b          // AES block 1 - round 7
144	trn1	v9.2d, v14.2d,    v15.2d                      // h4h | h3h
145	aese	v0.16b, v25.16b
146	aesmc	v0.16b, v0.16b          // AES block 0 - round 7
147	aese	v2.16b, v25.16b
148	aesmc	v2.16b, v2.16b          // AES block 2 - round 7
149	aese	v3.16b, v25.16b
150	aesmc	v3.16b, v3.16b          // AES block 3 - round 7
151	trn2	v16.2d,  v12.2d,    v13.2d                      // h2l | h1l
152	aese	v1.16b, v26.16b
153	aesmc	v1.16b, v1.16b          // AES block 1 - round 8
154	aese	v2.16b, v26.16b
155	aesmc	v2.16b, v2.16b          // AES block 2 - round 8
156	aese	v3.16b, v26.16b
157	aesmc	v3.16b, v3.16b          // AES block 3 - round 8
158	aese	v0.16b, v26.16b
159	aesmc	v0.16b, v0.16b          // AES block 0 - round 8
160	b.lt	.Lenc_finish_first_blocks                         // branch if AES-128
161
162	aese	v1.16b, v27.16b
163	aesmc	v1.16b, v1.16b          // AES block 1 - round 9
164	aese	v2.16b, v27.16b
165	aesmc	v2.16b, v2.16b          // AES block 2 - round 9
166	aese	v3.16b, v27.16b
167	aesmc	v3.16b, v3.16b          // AES block 3 - round 9
168	aese	v0.16b, v27.16b
169	aesmc	v0.16b, v0.16b          // AES block 0 - round 9
170	aese	v1.16b, v28.16b
171	aesmc	v1.16b, v1.16b          // AES block 1 - round 10
172	aese	v2.16b, v28.16b
173	aesmc	v2.16b, v2.16b          // AES block 2 - round 10
174	aese	v3.16b, v28.16b
175	aesmc	v3.16b, v3.16b          // AES block 3 - round 10
176	aese	v0.16b, v28.16b
177	aesmc	v0.16b, v0.16b          // AES block 0 - round 10
178	b.eq	.Lenc_finish_first_blocks                         // branch if AES-192
179
180	aese	v1.16b, v29.16b
181	aesmc	v1.16b, v1.16b          // AES block 1 - round 11
182	aese	v2.16b, v29.16b
183	aesmc	v2.16b, v2.16b          // AES block 2 - round 11
184	aese	v0.16b, v29.16b
185	aesmc	v0.16b, v0.16b          // AES block 0 - round 11
186	aese	v3.16b, v29.16b
187	aesmc	v3.16b, v3.16b          // AES block 3 - round 11
188	aese	v1.16b, v30.16b
189	aesmc	v1.16b, v1.16b          // AES block 1 - round 12
190	aese	v2.16b, v30.16b
191	aesmc	v2.16b, v2.16b          // AES block 2 - round 12
192	aese	v0.16b, v30.16b
193	aesmc	v0.16b, v0.16b          // AES block 0 - round 12
194	aese	v3.16b, v30.16b
195	aesmc	v3.16b, v3.16b          // AES block 3 - round 12
196
197.Lenc_finish_first_blocks:
198	cmp	x0, x5                   // check if we have <= 4 blocks
199	eor	v17.16b, v17.16b, v9.16b                  // h4k | h3k
200	aese	v2.16b, v31.16b                                    // AES block 2 - round N-1
201	trn1	v8.2d,    v12.2d,    v13.2d                      // h2h | h1h
202	aese	v1.16b, v31.16b                                    // AES block 1 - round N-1
203	aese	v0.16b, v31.16b                                    // AES block 0 - round N-1
204	aese	v3.16b, v31.16b                                    // AES block 3 - round N-1
205	eor	v16.16b, v16.16b, v8.16b                     // h2k | h1k
206	b.ge	.Lenc_tail                                        // handle tail
207
208	ldp	x19, x20, [x0, #16]           // AES block 1 - load plaintext
209	rev	w9, w12                                 // CTR block 4
210	ldp	x6, x7, [x0, #0]            // AES block 0 - load plaintext
211	ldp	x23, x24, [x0, #48]           // AES block 3 - load plaintext
212	ldp	x21, x22, [x0, #32]           // AES block 2 - load plaintext
213	add	x0, x0, #64                       // AES input_ptr update
214	eor	x19, x19, x13                      // AES block 1 - round N low
215	eor	x20, x20, x14                      // AES block 1 - round N high
216	fmov	d5, x19                               // AES block 1 - mov low
217	eor	x6, x6, x13                      // AES block 0 - round N low
218	eor	x7, x7, x14                      // AES block 0 - round N high
219	eor	x24, x24, x14                      // AES block 3 - round N high
220	fmov	d4, x6                               // AES block 0 - mov low
221	cmp	x0, x5                   // check if we have <= 8 blocks
222	fmov	v4.d[1], x7                           // AES block 0 - mov high
223	eor	x23, x23, x13                      // AES block 3 - round N low
224	eor	x21, x21, x13                      // AES block 2 - round N low
225	fmov	v5.d[1], x20                           // AES block 1 - mov high
226	fmov	d6, x21                               // AES block 2 - mov low
227	add	w12, w12, #1                            // CTR block 4
228	orr	x9, x11, x9, lsl #32            // CTR block 4
229	fmov	d7, x23                               // AES block 3 - mov low
230	eor	x22, x22, x14                      // AES block 2 - round N high
231	fmov	v6.d[1], x22                           // AES block 2 - mov high
232	eor	v4.16b, v4.16b, v0.16b                          // AES block 0 - result
233	fmov	d0, x10                               // CTR block 4
234	fmov	v0.d[1], x9                               // CTR block 4
235	rev	w9, w12                                 // CTR block 5
236	add	w12, w12, #1                            // CTR block 5
237	eor	v5.16b, v5.16b, v1.16b                          // AES block 1 - result
238	fmov	d1, x10                               // CTR block 5
239	orr	x9, x11, x9, lsl #32            // CTR block 5
240	fmov	v1.d[1], x9                               // CTR block 5
241	rev	w9, w12                                 // CTR block 6
242	st1	{ v4.16b}, [x2], #16                     // AES block 0 - store result
243	fmov	v7.d[1], x24                           // AES block 3 - mov high
244	orr	x9, x11, x9, lsl #32            // CTR block 6
245	eor	v6.16b, v6.16b, v2.16b                          // AES block 2 - result
246	st1	{ v5.16b}, [x2], #16                     // AES block 1 - store result
247	add	w12, w12, #1                            // CTR block 6
248	fmov	d2, x10                               // CTR block 6
249	fmov	v2.d[1], x9                               // CTR block 6
250	st1	{ v6.16b}, [x2], #16                     // AES block 2 - store result
251	rev	w9, w12                                 // CTR block 7
252	orr	x9, x11, x9, lsl #32            // CTR block 7
253	eor	v7.16b, v7.16b, v3.16b                          // AES block 3 - result
254	st1	{ v7.16b}, [x2], #16                     // AES block 3 - store result
255	b.ge	.Lenc_prepretail                                  // do prepretail
256
257.Lenc_main_loop:	//	main loop start
258	aese	v0.16b, v18.16b
259	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
260	rev64	v4.16b, v4.16b                                    // GHASH block 4k (only t0 is free)
261	aese	v1.16b, v18.16b
262	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
263	fmov	d3, x10                               // CTR block 4k+3
264	aese	v2.16b, v18.16b
265	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
266	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
267	aese	v0.16b, v19.16b
268	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
269	fmov	v3.d[1], x9                               // CTR block 4k+3
270	aese	v1.16b, v19.16b
271	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
272	ldp	x23, x24, [x0, #48]           // AES block 4k+7 - load plaintext
273	aese	v2.16b, v19.16b
274	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
275	ldp	x21, x22, [x0, #32]           // AES block 4k+6 - load plaintext
276	aese	v0.16b, v20.16b
277	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
278	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
279	aese	v1.16b, v20.16b
280	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
281	aese	v3.16b, v18.16b
282	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
283	eor	x23, x23, x13                      // AES block 4k+7 - round N low
284	aese	v0.16b, v21.16b
285	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
286	mov	d10, v17.d[1]                               // GHASH block 4k - mid
287	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
288	eor	x22, x22, x14                      // AES block 4k+6 - round N high
289	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
290	aese	v3.16b, v19.16b
291	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
292	rev64	v5.16b, v5.16b                                    // GHASH block 4k+1 (t0 and t1 free)
293	aese	v0.16b, v22.16b
294	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
295	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
296	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
297	aese	v2.16b, v20.16b
298	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
299	aese	v0.16b, v23.16b
300	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
301	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3 (t0, t1, t2 and t3 free)
302	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
303	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
304	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2 (t0, t1, and t2 free)
305	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
306	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
307	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
308	aese	v1.16b, v21.16b
309	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
310	aese	v3.16b, v20.16b
311	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
312	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
313	aese	v2.16b, v21.16b
314	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
315	aese	v1.16b, v22.16b
316	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
317	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
318	aese	v3.16b, v21.16b
319	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
320	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
321	aese	v2.16b, v22.16b
322	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
323	aese	v0.16b, v24.16b
324	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
325	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
326	aese	v3.16b, v22.16b
327	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
328	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
329	aese	v0.16b, v25.16b
330	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
331	aese	v3.16b, v23.16b
332	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
333	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
334	aese	v1.16b, v23.16b
335	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
336	aese	v0.16b, v26.16b
337	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
338	aese	v2.16b, v23.16b
339	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
340	aese	v1.16b, v24.16b
341	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
342	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
343	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
344	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
345	aese	v1.16b, v25.16b
346	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
347	pmull	v6.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
348	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
349	aese	v3.16b, v24.16b
350	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
351	ldp	x19, x20, [x0, #16]           // AES block 4k+5 - load plaintext
352	aese	v1.16b, v26.16b
353	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
354	mov	d4, v7.d[1]                                  // GHASH block 4k+3 - mid
355	aese	v2.16b, v24.16b
356	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
357	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
358	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
359	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
360	eor	v4.8b, v4.8b, v7.8b                          // GHASH block 4k+3 - mid
361	aese	v2.16b, v25.16b
362	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
363	eor	x19, x19, x13                      // AES block 4k+5 - round N low
364	aese	v2.16b, v26.16b
365	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
366	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
367	aese	v3.16b, v25.16b
368	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
369	eor	x21, x21, x13                      // AES block 4k+6 - round N low
370	aese	v3.16b, v26.16b
371	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
372	movi	v8.8b, #0xc2
373	pmull	v4.1q, v4.1d, v16.1d                          // GHASH block 4k+3 - mid
374	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
375	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
376	fmov	d5, x19                               // AES block 4k+5 - mov low
377	ldp	x6, x7, [x0, #0]            // AES block 4k+4 - load plaintext
378	b.lt	.Lenc_main_loop_continue                          // branch if AES-128
379
380	aese	v1.16b, v27.16b
381	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
382	aese	v0.16b, v27.16b
383	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
384	aese	v2.16b, v27.16b
385	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
386	aese	v3.16b, v27.16b
387	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
388	aese	v0.16b, v28.16b
389	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
390	aese	v1.16b, v28.16b
391	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
392	aese	v2.16b, v28.16b
393	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
394	aese	v3.16b, v28.16b
395	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
396	b.eq	.Lenc_main_loop_continue                          // branch if AES-192
397
398	aese	v0.16b, v29.16b
399	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
400	aese	v1.16b, v29.16b
401	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
402	aese	v2.16b, v29.16b
403	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
404	aese	v3.16b, v29.16b
405	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
406	aese	v1.16b, v30.16b
407	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
408	aese	v0.16b, v30.16b
409	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
410	aese	v2.16b, v30.16b
411	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
412	aese	v3.16b, v30.16b
413	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
414
415.Lenc_main_loop_continue:
416	shl	d8, d8, #56               // mod_constant
417	eor	v11.16b, v11.16b, v6.16b                         // GHASH block 4k+3 - low
418	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+3 - mid
419	add	w12, w12, #1                            // CTR block 4k+3
420	eor	v4.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
421	add	x0, x0, #64                       // AES input_ptr update
422	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
423	rev	w9, w12                                 // CTR block 4k+8
424	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
425	eor	x6, x6, x13                      // AES block 4k+4 - round N low
426	eor	v10.16b, v10.16b, v4.16b                         // MODULO - karatsuba tidy up
427	eor	x7, x7, x14                      // AES block 4k+4 - round N high
428	fmov	d4, x6                               // AES block 4k+4 - mov low
429	orr	x9, x11, x9, lsl #32            // CTR block 4k+8
430	eor	v7.16b, v9.16b, v7.16b                   // MODULO - fold into mid
431	eor	x20, x20, x14                      // AES block 4k+5 - round N high
432	eor	x24, x24, x14                      // AES block 4k+7 - round N high
433	add	w12, w12, #1                            // CTR block 4k+8
434	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
435	fmov	v4.d[1], x7                           // AES block 4k+4 - mov high
436	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
437	fmov	d7, x23                               // AES block 4k+7 - mov low
438	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
439	fmov	v5.d[1], x20                           // AES block 4k+5 - mov high
440	fmov	d6, x21                               // AES block 4k+6 - mov low
441	cmp	x0, x5                   // .LOOP CONTROL
442	fmov	v6.d[1], x22                           // AES block 4k+6 - mov high
443	pmull	v9.1q, v10.1d, v8.1d            // MODULO - mid 64b align with low
444	eor	v4.16b, v4.16b, v0.16b                          // AES block 4k+4 - result
445	fmov	d0, x10                               // CTR block 4k+8
446	fmov	v0.d[1], x9                               // CTR block 4k+8
447	rev	w9, w12                                 // CTR block 4k+9
448	add	w12, w12, #1                            // CTR block 4k+9
449	eor	v5.16b, v5.16b, v1.16b                          // AES block 4k+5 - result
450	fmov	d1, x10                               // CTR block 4k+9
451	orr	x9, x11, x9, lsl #32            // CTR block 4k+9
452	fmov	v1.d[1], x9                               // CTR block 4k+9
453	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
454	rev	w9, w12                                 // CTR block 4k+10
455	st1	{ v4.16b}, [x2], #16                     // AES block 4k+4 - store result
456	orr	x9, x11, x9, lsl #32            // CTR block 4k+10
457	eor	v11.16b, v11.16b, v9.16b                         // MODULO - fold into low
458	fmov	v7.d[1], x24                           // AES block 4k+7 - mov high
459	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
460	st1	{ v5.16b}, [x2], #16                     // AES block 4k+5 - store result
461	add	w12, w12, #1                            // CTR block 4k+10
462	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
463	eor	v6.16b, v6.16b, v2.16b                          // AES block 4k+6 - result
464	fmov	d2, x10                               // CTR block 4k+10
465	st1	{ v6.16b}, [x2], #16                     // AES block 4k+6 - store result
466	fmov	v2.d[1], x9                               // CTR block 4k+10
467	rev	w9, w12                                 // CTR block 4k+11
468	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
469	orr	x9, x11, x9, lsl #32            // CTR block 4k+11
470	eor	v7.16b, v7.16b, v3.16b                          // AES block 4k+7 - result
471	st1	{ v7.16b}, [x2], #16                     // AES block 4k+7 - store result
472	b.lt	.Lenc_main_loop
473
474.Lenc_prepretail:	//	PREPRETAIL
475	aese	v1.16b, v18.16b
476	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
477	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2 (t0, t1, and t2 free)
478	aese	v2.16b, v18.16b
479	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
480	fmov	d3, x10                               // CTR block 4k+3
481	aese	v0.16b, v18.16b
482	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
483	rev64	v4.16b, v4.16b                                    // GHASH block 4k (only t0 is free)
484	fmov	v3.d[1], x9                               // CTR block 4k+3
485	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
486	aese	v2.16b, v19.16b
487	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
488	aese	v0.16b, v19.16b
489	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
490	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
491	rev64	v5.16b, v5.16b                                    // GHASH block 4k+1 (t0 and t1 free)
492	aese	v2.16b, v20.16b
493	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
494	aese	v3.16b, v18.16b
495	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
496	mov	d10, v17.d[1]                               // GHASH block 4k - mid
497	aese	v1.16b, v19.16b
498	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
499	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
500	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
501	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
502	aese	v2.16b, v21.16b
503	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
504	aese	v1.16b, v20.16b
505	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
506	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
507	aese	v0.16b, v20.16b
508	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
509	aese	v3.16b, v19.16b
510	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
511	aese	v1.16b, v21.16b
512	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
513	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
514	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
515	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
516	aese	v3.16b, v20.16b
517	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
518	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
519	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
520	aese	v0.16b, v21.16b
521	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
522	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
523	aese	v3.16b, v21.16b
524	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
525	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
526	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
527	aese	v0.16b, v22.16b
528	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
529	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3 (t0, t1, t2 and t3 free)
530	aese	v3.16b, v22.16b
531	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
532	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
533	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
534	add	w12, w12, #1                            // CTR block 4k+3
535	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
536	aese	v3.16b, v23.16b
537	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
538	aese	v2.16b, v22.16b
539	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
540	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
541	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
542	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
543	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
544	aese	v2.16b, v23.16b
545	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
546	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
547	mov	d4, v7.d[1]                                  // GHASH block 4k+3 - mid
548	aese	v1.16b, v22.16b
549	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
550	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
551	eor	v4.8b, v4.8b, v7.8b                          // GHASH block 4k+3 - mid
552	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
553	aese	v1.16b, v23.16b
554	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
555	pmull	v4.1q, v4.1d, v16.1d                          // GHASH block 4k+3 - mid
556	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
557	aese	v0.16b, v23.16b
558	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
559	aese	v1.16b, v24.16b
560	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
561	aese	v2.16b, v24.16b
562	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
563	aese	v0.16b, v24.16b
564	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
565	movi	v8.8b, #0xc2
566	aese	v3.16b, v24.16b
567	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
568	aese	v1.16b, v25.16b
569	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
570	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
571	aese	v0.16b, v25.16b
572	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
573	aese	v3.16b, v25.16b
574	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
575	shl	d8, d8, #56               // mod_constant
576	aese	v1.16b, v26.16b
577	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
578	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+3 - mid
579	pmull	v6.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
580	aese	v3.16b, v26.16b
581	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
582	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
583	aese	v0.16b, v26.16b
584	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
585	eor	v11.16b, v11.16b, v6.16b                         // GHASH block 4k+3 - low
586	aese	v2.16b, v25.16b
587	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
588	eor	v10.16b, v10.16b, v9.16b                         // karatsuba tidy up
589	aese	v2.16b, v26.16b
590	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
591	pmull	v4.1q, v9.1d, v8.1d
592	ext	v9.16b, v9.16b, v9.16b, #8
593	eor	v10.16b, v10.16b, v11.16b
594	b.lt	.Lenc_finish_prepretail                           // branch if AES-128
595
596	aese	v1.16b, v27.16b
597	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
598	aese	v3.16b, v27.16b
599	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
600	aese	v0.16b, v27.16b
601	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
602	aese	v2.16b, v27.16b
603	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
604	aese	v3.16b, v28.16b
605	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
606	aese	v1.16b, v28.16b
607	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
608	aese	v0.16b, v28.16b
609	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
610	aese	v2.16b, v28.16b
611	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
612	b.eq	.Lenc_finish_prepretail                           // branch if AES-192
613
614	aese	v1.16b, v29.16b
615	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
616	aese	v0.16b, v29.16b
617	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
618	aese	v3.16b, v29.16b
619	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
620	aese	v2.16b, v29.16b
621	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
622	aese	v1.16b, v30.16b
623	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
624	aese	v0.16b, v30.16b
625	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
626	aese	v3.16b, v30.16b
627	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
628	aese	v2.16b, v30.16b
629	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
630
631.Lenc_finish_prepretail:
632	eor	v10.16b, v10.16b, v4.16b
633	eor	v10.16b, v10.16b, v9.16b
634	pmull	v4.1q, v10.1d, v8.1d
635	ext	v10.16b, v10.16b, v10.16b, #8
636	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
637	eor	v11.16b, v11.16b, v4.16b
638	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
639	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
640	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
641	eor	v11.16b, v11.16b, v10.16b
642
643.Lenc_tail:	//	TAIL
644	ext	v8.16b, v11.16b, v11.16b, #8                     // prepare final partial tag
645	sub	x5, x4, x0   // main_end_input_ptr is number of bytes left to process
646	ldp	x6, x7, [x0], #16           // AES block 4k+4 - load plaintext
647	eor	x6, x6, x13                      // AES block 4k+4 - round N low
648	eor	x7, x7, x14                      // AES block 4k+4 - round N high
649	cmp	x5, #48
650	fmov	d4, x6                               // AES block 4k+4 - mov low
651	fmov	v4.d[1], x7                           // AES block 4k+4 - mov high
652	eor	v5.16b, v4.16b, v0.16b                          // AES block 4k+4 - result
653	b.gt	.Lenc_blocks_more_than_3
654	cmp	x5, #32
655	mov	v3.16b, v2.16b
656	movi	v11.8b, #0
657	movi	v9.8b, #0
658	sub	w12, w12, #1
659	mov	v2.16b, v1.16b
660	movi	v10.8b, #0
661	b.gt	.Lenc_blocks_more_than_2
662	mov	v3.16b, v1.16b
663	sub	w12, w12, #1
664	cmp	x5, #16
665	b.gt	.Lenc_blocks_more_than_1
666	sub	w12, w12, #1
667	b	.Lenc_blocks_less_than_1
668.Lenc_blocks_more_than_3:	//	blocks left >  3
669	st1	{ v5.16b}, [x2], #16                    // AES final-3 block  - store result
670	ldp	x6, x7, [x0], #16          // AES final-2 block - load input low & high
671	rev64	v4.16b, v5.16b                                   // GHASH final-3 block
672	eor	x6, x6, x13                     // AES final-2 block - round N low
673	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
674	eor	x7, x7, x14                     // AES final-2 block - round N high
675	mov	d22, v4.d[1]                                // GHASH final-3 block - mid
676	fmov	d5, x6                                // AES final-2 block - mov low
677	fmov	v5.d[1], x7                            // AES final-2 block - mov high
678	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-3 block - mid
679	movi	v8.8b, #0                                       // suppress further partial tag feed in
680	mov	d10, v17.d[1]                              // GHASH final-3 block - mid
681	pmull	v11.1q, v4.1d, v15.1d                      // GHASH final-3 block - low
682	pmull2	v9.1q, v4.2d, v15.2d                      // GHASH final-3 block - high
683	pmull	v10.1q, v22.1d, v10.1d                   // GHASH final-3 block - mid
684	eor	v5.16b, v5.16b, v1.16b                           // AES final-2 block - result
685.Lenc_blocks_more_than_2:	//	blocks left >  2
686	st1	{ v5.16b}, [x2], #16                    // AES final-2 block - store result
687	ldp	x6, x7, [x0], #16          // AES final-1 block - load input low & high
688	rev64	v4.16b, v5.16b                                   // GHASH final-2 block
689	eor	x6, x6, x13                     // AES final-1 block - round N low
690	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
691	fmov	d5, x6                                // AES final-1 block - mov low
692	eor	x7, x7, x14                     // AES final-1 block - round N high
693	fmov	v5.d[1], x7                            // AES final-1 block - mov high
694	movi	v8.8b, #0                                       // suppress further partial tag feed in
695	pmull2	v20.1q, v4.2d, v14.2d                         // GHASH final-2 block - high
696	mov	d22, v4.d[1]                                // GHASH final-2 block - mid
697	pmull	v21.1q, v4.1d, v14.1d                         // GHASH final-2 block - low
698	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-2 block - mid
699	eor	v5.16b, v5.16b, v2.16b                           // AES final-1 block - result
700	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-2 block - high
701	pmull	v22.1q, v22.1d, v17.1d                     // GHASH final-2 block - mid
702	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-2 block - low
703	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-2 block - mid
704.Lenc_blocks_more_than_1:	//	blocks left >  1
705	st1	{ v5.16b}, [x2], #16                    // AES final-1 block - store result
706	rev64	v4.16b, v5.16b                                   // GHASH final-1 block
707	ldp	x6, x7, [x0], #16          // AES final block - load input low & high
708	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
709	movi	v8.8b, #0                                       // suppress further partial tag feed in
710	eor	x6, x6, x13                     // AES final block - round N low
711	mov	d22, v4.d[1]                                // GHASH final-1 block - mid
712	pmull2	v20.1q, v4.2d, v13.2d                         // GHASH final-1 block - high
713	eor	x7, x7, x14                     // AES final block - round N high
714	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-1 block - mid
715	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-1 block - high
716	ins	v22.d[1], v22.d[0]                           // GHASH final-1 block - mid
717	fmov	d5, x6                                // AES final block - mov low
718	fmov	v5.d[1], x7                            // AES final block - mov high
719	pmull2	v22.1q, v22.2d, v16.2d                     // GHASH final-1 block - mid
720	pmull	v21.1q, v4.1d, v13.1d                         // GHASH final-1 block - low
721	eor	v5.16b, v5.16b, v3.16b                           // AES final block - result
722	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-1 block - mid
723	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-1 block - low
724.Lenc_blocks_less_than_1:	//	blocks left <= 1
725	and	x1, x1, #127                   // bit_length %= 128
726	mvn	x13, xzr                                      // rkN_l = 0xffffffffffffffff
727	sub	x1, x1, #128                   // bit_length -= 128
728	neg	x1, x1                         // bit_length = 128 - #bits in input (in range [1,128])
729	ld1	{ v18.16b}, [x2]                           // load existing bytes where the possibly partial last block is to be stored
730	mvn	x14, xzr                                      // rkN_h = 0xffffffffffffffff
731	and	x1, x1, #127                   // bit_length %= 128
732	lsr	x14, x14, x1                      // rkN_h is mask for top 64b of last block
733	cmp	x1, #64
734	csel	x6, x13, x14, lt
735	csel	x7, x14, xzr, lt
736	fmov	d0, x6                                // ctr0b is mask for last block
737	fmov	v0.d[1], x7
738	and	v5.16b, v5.16b, v0.16b                           // possibly partial last block has zeroes in highest bits
739	rev64	v4.16b, v5.16b                                   // GHASH final block
740	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
741	bif	v5.16b, v18.16b, v0.16b                             // insert existing bytes in top end of result before storing
742	pmull2	v20.1q, v4.2d, v12.2d                         // GHASH final block - high
743	mov	d8, v4.d[1]                                 // GHASH final block - mid
744	rev	w9, w12
745	pmull	v21.1q, v4.1d, v12.1d                         // GHASH final block - low
746	eor	v9.16b, v9.16b, v20.16b                           // GHASH final block - high
747	eor	v8.8b, v8.8b, v4.8b                         // GHASH final block - mid
748	pmull	v8.1q, v8.1d, v16.1d                         // GHASH final block - mid
749	eor	v11.16b, v11.16b, v21.16b                           // GHASH final block - low
750	eor	v10.16b, v10.16b, v8.16b                        // GHASH final block - mid
751	movi	v8.8b, #0xc2
752	eor	v4.16b, v11.16b, v9.16b                        // MODULO - karatsuba tidy up
753	shl	d8, d8, #56              // mod_constant
754	eor	v10.16b, v10.16b, v4.16b                        // MODULO - karatsuba tidy up
755	pmull	v7.1q, v9.1d, v8.1d           // MODULO - top 64b align with mid
756	ext	v9.16b, v9.16b, v9.16b, #8                    // MODULO - other top alignment
757	eor	v10.16b, v10.16b, v7.16b                     // MODULO - fold into mid
758	eor	v10.16b, v10.16b, v9.16b                        // MODULO - fold into mid
759	pmull	v9.1q, v10.1d, v8.1d           // MODULO - mid 64b align with low
760	ext	v10.16b, v10.16b, v10.16b, #8                    // MODULO - other mid alignment
761	str	w9, [x16, #12]                         // store the updated counter
762	st1	{ v5.16b}, [x2]                         // store all 16B
763	eor	v11.16b, v11.16b, v9.16b                        // MODULO - fold into low
764	eor	v11.16b, v11.16b, v10.16b                        // MODULO - fold into low
765	ext	v11.16b, v11.16b, v11.16b, #8
766	rev64	v11.16b, v11.16b
767	mov	x0, x15
768	st1	{ v11.16b }, [x3]
769	ldp	x19, x20, [sp, #16]
770	ldp	x21, x22, [sp, #32]
771	ldp	x23, x24, [sp, #48]
772	ldp	d8, d9, [sp, #64]
773	ldp	d10, d11, [sp, #80]
774	ldp	d12, d13, [sp, #96]
775	ldp	d14, d15, [sp, #112]
776	ldp	x29, x30, [sp], #128
777	AARCH64_VALIDATE_LINK_REGISTER
778	ret
779.size	aes_gcm_enc_kernel,.-aes_gcm_enc_kernel
780.globl	aes_gcm_dec_kernel
781.hidden	aes_gcm_dec_kernel
782.type	aes_gcm_dec_kernel,%function
783.align	4
784aes_gcm_dec_kernel:
785	AARCH64_SIGN_LINK_REGISTER
786	stp	x29, x30, [sp, #-128]!
787	mov	x29, sp
788	stp	x19, x20, [sp, #16]
789	mov	x16, x4
790	mov	x8, x5
791	stp	x21, x22, [sp, #32]
792	stp	x23, x24, [sp, #48]
793	stp	d8, d9, [sp, #64]
794	stp	d10, d11, [sp, #80]
795	stp	d12, d13, [sp, #96]
796	stp	d14, d15, [sp, #112]
797	ldr	w17, [x8, #240]
798	add	x19, x8, x17, lsl #4                   // borrow input_l1 for last key
799	ldp	x13, x14, [x19]                       // load round N keys
800	ldr	q31, [x19, #-16]                        // load round N-1 keys
801	lsr	x5, x1, #3              // byte_len
802	mov	x15, x5
803	ldp	x10, x11, [x16]              // ctr96_b64, ctr96_t32
804	ldr	q26, [x8, #128]                                // load rk8
805	sub	x5, x5, #1      // byte_len - 1
806	ldr	q25, [x8, #112]                                // load rk7
807	and	x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
808	add	x4, x0, x1, lsr #3   // end_input_ptr
809	ldr	q24, [x8, #96]                                 // load rk6
810	lsr	x12, x11, #32
811	ldr	q23, [x8, #80]                                 // load rk5
812	orr	w11, w11, w11
813	ldr	q21, [x8, #48]                                 // load rk3
814	add	x5, x5, x0
815	rev	w12, w12                                // rev_ctr32
816	add	w12, w12, #1                            // increment rev_ctr32
817	fmov	d3, x10                               // CTR block 3
818	rev	w9, w12                                 // CTR block 1
819	add	w12, w12, #1                            // CTR block 1
820	fmov	d1, x10                               // CTR block 1
821	orr	x9, x11, x9, lsl #32            // CTR block 1
822	ld1	{ v0.16b}, [x16]                             // special case vector load initial counter so we can start first AES block as quickly as possible
823	fmov	v1.d[1], x9                               // CTR block 1
824	rev	w9, w12                                 // CTR block 2
825	add	w12, w12, #1                            // CTR block 2
826	fmov	d2, x10                               // CTR block 2
827	orr	x9, x11, x9, lsl #32            // CTR block 2
828	fmov	v2.d[1], x9                               // CTR block 2
829	rev	w9, w12                                 // CTR block 3
830	orr	x9, x11, x9, lsl #32            // CTR block 3
831	ldr	q18, [x8, #0]                                  // load rk0
832	fmov	v3.d[1], x9                               // CTR block 3
833	add	w12, w12, #1                            // CTR block 3
834	ldr	q22, [x8, #64]                                 // load rk4
835	ldr	q19, [x8, #16]                                 // load rk1
836	aese	v0.16b, v18.16b
837	aesmc	v0.16b, v0.16b          // AES block 0 - round 0
838	ldr	q14, [x6, #48]                              // load h3l | h3h
839	ext	v14.16b, v14.16b, v14.16b, #8
840	aese	v3.16b, v18.16b
841	aesmc	v3.16b, v3.16b          // AES block 3 - round 0
842	ldr	q15, [x6, #80]                              // load h4l | h4h
843	ext	v15.16b, v15.16b, v15.16b, #8
844	aese	v1.16b, v18.16b
845	aesmc	v1.16b, v1.16b          // AES block 1 - round 0
846	ldr	q13, [x6, #32]                              // load h2l | h2h
847	ext	v13.16b, v13.16b, v13.16b, #8
848	aese	v2.16b, v18.16b
849	aesmc	v2.16b, v2.16b          // AES block 2 - round 0
850	ldr	q20, [x8, #32]                                 // load rk2
851	aese	v0.16b, v19.16b
852	aesmc	v0.16b, v0.16b          // AES block 0 - round 1
853	aese	v1.16b, v19.16b
854	aesmc	v1.16b, v1.16b          // AES block 1 - round 1
855	ld1	{ v11.16b}, [x3]
856	ext	v11.16b, v11.16b, v11.16b, #8
857	rev64	v11.16b, v11.16b
858	aese	v2.16b, v19.16b
859	aesmc	v2.16b, v2.16b          // AES block 2 - round 1
860	ldr	q27, [x8, #144]                                // load rk9
861	aese	v3.16b, v19.16b
862	aesmc	v3.16b, v3.16b          // AES block 3 - round 1
863	ldr	q30, [x8, #192]                               // load rk12
864	aese	v0.16b, v20.16b
865	aesmc	v0.16b, v0.16b          // AES block 0 - round 2
866	ldr	q12, [x6]                                   // load h1l | h1h
867	ext	v12.16b, v12.16b, v12.16b, #8
868	aese	v2.16b, v20.16b
869	aesmc	v2.16b, v2.16b          // AES block 2 - round 2
870	ldr	q28, [x8, #160]                               // load rk10
871	aese	v3.16b, v20.16b
872	aesmc	v3.16b, v3.16b          // AES block 3 - round 2
873	aese	v0.16b, v21.16b
874	aesmc	v0.16b, v0.16b          // AES block 0 - round 3
875	aese	v1.16b, v20.16b
876	aesmc	v1.16b, v1.16b          // AES block 1 - round 2
877	aese	v3.16b, v21.16b
878	aesmc	v3.16b, v3.16b          // AES block 3 - round 3
879	aese	v0.16b, v22.16b
880	aesmc	v0.16b, v0.16b          // AES block 0 - round 4
881	aese	v2.16b, v21.16b
882	aesmc	v2.16b, v2.16b          // AES block 2 - round 3
883	aese	v1.16b, v21.16b
884	aesmc	v1.16b, v1.16b          // AES block 1 - round 3
885	aese	v3.16b, v22.16b
886	aesmc	v3.16b, v3.16b          // AES block 3 - round 4
887	aese	v2.16b, v22.16b
888	aesmc	v2.16b, v2.16b          // AES block 2 - round 4
889	aese	v1.16b, v22.16b
890	aesmc	v1.16b, v1.16b          // AES block 1 - round 4
891	aese	v3.16b, v23.16b
892	aesmc	v3.16b, v3.16b          // AES block 3 - round 5
893	aese	v0.16b, v23.16b
894	aesmc	v0.16b, v0.16b          // AES block 0 - round 5
895	aese	v1.16b, v23.16b
896	aesmc	v1.16b, v1.16b          // AES block 1 - round 5
897	aese	v2.16b, v23.16b
898	aesmc	v2.16b, v2.16b          // AES block 2 - round 5
899	aese	v0.16b, v24.16b
900	aesmc	v0.16b, v0.16b          // AES block 0 - round 6
901	aese	v3.16b, v24.16b
902	aesmc	v3.16b, v3.16b          // AES block 3 - round 6
903	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
904	aese	v1.16b, v24.16b
905	aesmc	v1.16b, v1.16b          // AES block 1 - round 6
906	aese	v2.16b, v24.16b
907	aesmc	v2.16b, v2.16b          // AES block 2 - round 6
908	aese	v0.16b, v25.16b
909	aesmc	v0.16b, v0.16b          // AES block 0 - round 7
910	aese	v1.16b, v25.16b
911	aesmc	v1.16b, v1.16b          // AES block 1 - round 7
912	aese	v3.16b, v25.16b
913	aesmc	v3.16b, v3.16b          // AES block 3 - round 7
914	aese	v0.16b, v26.16b
915	aesmc	v0.16b, v0.16b          // AES block 0 - round 8
916	aese	v2.16b, v25.16b
917	aesmc	v2.16b, v2.16b          // AES block 2 - round 7
918	aese	v3.16b, v26.16b
919	aesmc	v3.16b, v3.16b          // AES block 3 - round 8
920	aese	v1.16b, v26.16b
921	aesmc	v1.16b, v1.16b          // AES block 1 - round 8
922	ldr	q29, [x8, #176]                               // load rk11
923	aese	v2.16b, v26.16b
924	aesmc	v2.16b, v2.16b          // AES block 2 - round 8
925	b.lt	.Ldec_finish_first_blocks                         // branch if AES-128
926
927	aese	v0.16b, v27.16b
928	aesmc	v0.16b, v0.16b          // AES block 0 - round 9
929	aese	v1.16b, v27.16b
930	aesmc	v1.16b, v1.16b          // AES block 1 - round 9
931	aese	v3.16b, v27.16b
932	aesmc	v3.16b, v3.16b          // AES block 3 - round 9
933	aese	v2.16b, v27.16b
934	aesmc	v2.16b, v2.16b          // AES block 2 - round 9
935	aese	v0.16b, v28.16b
936	aesmc	v0.16b, v0.16b          // AES block 0 - round 10
937	aese	v1.16b, v28.16b
938	aesmc	v1.16b, v1.16b          // AES block 1 - round 10
939	aese	v3.16b, v28.16b
940	aesmc	v3.16b, v3.16b          // AES block 3 - round 10
941	aese	v2.16b, v28.16b
942	aesmc	v2.16b, v2.16b          // AES block 2 - round 10
943	b.eq	.Ldec_finish_first_blocks                         // branch if AES-192
944
945	aese	v0.16b, v29.16b
946	aesmc	v0.16b, v0.16b          // AES block 0 - round 11
947	aese	v3.16b, v29.16b
948	aesmc	v3.16b, v3.16b          // AES block 3 - round 11
949	aese	v1.16b, v29.16b
950	aesmc	v1.16b, v1.16b          // AES block 1 - round 11
951	aese	v2.16b, v29.16b
952	aesmc	v2.16b, v2.16b          // AES block 2 - round 11
953	aese	v1.16b, v30.16b
954	aesmc	v1.16b, v1.16b          // AES block 1 - round 12
955	aese	v0.16b, v30.16b
956	aesmc	v0.16b, v0.16b          // AES block 0 - round 12
957	aese	v2.16b, v30.16b
958	aesmc	v2.16b, v2.16b          // AES block 2 - round 12
959	aese	v3.16b, v30.16b
960	aesmc	v3.16b, v3.16b          // AES block 3 - round 12
961
962.Ldec_finish_first_blocks:
963	cmp	x0, x5                   // check if we have <= 4 blocks
964	trn1	v9.2d, v14.2d,    v15.2d                      // h4h | h3h
965	trn2	v17.2d,  v14.2d,    v15.2d                      // h4l | h3l
966	trn1	v8.2d,    v12.2d,    v13.2d                      // h2h | h1h
967	trn2	v16.2d,  v12.2d,    v13.2d                      // h2l | h1l
968	eor	v17.16b, v17.16b, v9.16b                  // h4k | h3k
969	aese	v1.16b, v31.16b                                    // AES block 1 - round N-1
970	aese	v2.16b, v31.16b                                    // AES block 2 - round N-1
971	eor	v16.16b, v16.16b, v8.16b                     // h2k | h1k
972	aese	v3.16b, v31.16b                                    // AES block 3 - round N-1
973	aese	v0.16b, v31.16b                                    // AES block 0 - round N-1
974	b.ge	.Ldec_tail                                        // handle tail
975
976	ldr	q4, [x0, #0]                          // AES block 0 - load ciphertext
977	ldr	q5, [x0, #16]                         // AES block 1 - load ciphertext
978	rev	w9, w12                                 // CTR block 4
979	eor	v0.16b, v4.16b, v0.16b                            // AES block 0 - result
980	eor	v1.16b, v5.16b, v1.16b                            // AES block 1 - result
981	rev64	v5.16b, v5.16b                                    // GHASH block 1
982	ldr	q7, [x0, #48]                         // AES block 3 - load ciphertext
983	mov	x7, v0.d[1]                            // AES block 0 - mov high
984	mov	x6, v0.d[0]                            // AES block 0 - mov low
985	rev64	v4.16b, v4.16b                                    // GHASH block 0
986	add	w12, w12, #1                            // CTR block 4
987	fmov	d0, x10                               // CTR block 4
988	orr	x9, x11, x9, lsl #32            // CTR block 4
989	fmov	v0.d[1], x9                               // CTR block 4
990	rev	w9, w12                                 // CTR block 5
991	add	w12, w12, #1                            // CTR block 5
992	mov	x19, v1.d[0]                            // AES block 1 - mov low
993	orr	x9, x11, x9, lsl #32            // CTR block 5
994	mov	x20, v1.d[1]                            // AES block 1 - mov high
995	eor	x7, x7, x14                    // AES block 0 - round N high
996	eor	x6, x6, x13                    // AES block 0 - round N low
997	stp	x6, x7, [x2], #16        // AES block 0 - store result
998	fmov	d1, x10                               // CTR block 5
999	ldr	q6, [x0, #32]                         // AES block 2 - load ciphertext
1000	add	x0, x0, #64                       // AES input_ptr update
1001	fmov	v1.d[1], x9                               // CTR block 5
1002	rev	w9, w12                                 // CTR block 6
1003	add	w12, w12, #1                            // CTR block 6
1004	eor	x19, x19, x13                    // AES block 1 - round N low
1005	orr	x9, x11, x9, lsl #32            // CTR block 6
1006	eor	x20, x20, x14                    // AES block 1 - round N high
1007	stp	x19, x20, [x2], #16        // AES block 1 - store result
1008	eor	v2.16b, v6.16b, v2.16b                            // AES block 2 - result
1009	cmp	x0, x5                   // check if we have <= 8 blocks
1010	b.ge	.Ldec_prepretail                                  // do prepretail
1011
1012.Ldec_main_loop:	//	main loop start
1013	mov	x21, v2.d[0]                            // AES block 4k+2 - mov low
1014	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
1015	eor	v3.16b, v7.16b, v3.16b                            // AES block 4k+3 - result
1016	aese	v0.16b, v18.16b
1017	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
1018	mov	x22, v2.d[1]                            // AES block 4k+2 - mov high
1019	aese	v1.16b, v18.16b
1020	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
1021	fmov	d2, x10                               // CTR block 4k+6
1022	fmov	v2.d[1], x9                               // CTR block 4k+6
1023	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
1024	rev	w9, w12                                 // CTR block 4k+7
1025	aese	v0.16b, v19.16b
1026	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
1027	mov	x24, v3.d[1]                            // AES block 4k+3 - mov high
1028	aese	v1.16b, v19.16b
1029	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
1030	mov	x23, v3.d[0]                            // AES block 4k+3 - mov low
1031	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
1032	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
1033	fmov	d3, x10                               // CTR block 4k+7
1034	aese	v0.16b, v20.16b
1035	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
1036	orr	x9, x11, x9, lsl #32            // CTR block 4k+7
1037	aese	v2.16b, v18.16b
1038	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
1039	fmov	v3.d[1], x9                               // CTR block 4k+7
1040	aese	v1.16b, v20.16b
1041	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
1042	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
1043	aese	v0.16b, v21.16b
1044	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
1045	eor	x22, x22, x14                    // AES block 4k+2 - round N high
1046	aese	v2.16b, v19.16b
1047	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
1048	mov	d10, v17.d[1]                               // GHASH block 4k - mid
1049	aese	v1.16b, v21.16b
1050	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
1051	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2
1052	aese	v3.16b, v18.16b
1053	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
1054	eor	x21, x21, x13                    // AES block 4k+2 - round N low
1055	aese	v2.16b, v20.16b
1056	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
1057	stp	x21, x22, [x2], #16        // AES block 4k+2 - store result
1058	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
1059	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
1060	aese	v2.16b, v21.16b
1061	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
1062	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3
1063	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
1064	eor	x23, x23, x13                    // AES block 4k+3 - round N low
1065	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
1066	eor	x24, x24, x14                    // AES block 4k+3 - round N high
1067	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
1068	aese	v2.16b, v22.16b
1069	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
1070	aese	v3.16b, v19.16b
1071	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
1072	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
1073	aese	v0.16b, v22.16b
1074	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
1075	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
1076	aese	v2.16b, v23.16b
1077	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
1078	add	w12, w12, #1                            // CTR block 4k+7
1079	aese	v3.16b, v20.16b
1080	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
1081	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
1082	aese	v1.16b, v22.16b
1083	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
1084	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
1085	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
1086	aese	v3.16b, v21.16b
1087	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
1088	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
1089	aese	v1.16b, v23.16b
1090	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
1091	aese	v0.16b, v23.16b
1092	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
1093	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
1094	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
1095	rev	w9, w12                                 // CTR block 4k+8
1096	aese	v1.16b, v24.16b
1097	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
1098	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
1099	aese	v0.16b, v24.16b
1100	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
1101	add	w12, w12, #1                            // CTR block 4k+8
1102	aese	v3.16b, v22.16b
1103	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
1104	aese	v1.16b, v25.16b
1105	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
1106	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
1107	aese	v0.16b, v25.16b
1108	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
1109	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
1110	mov	d6, v7.d[1]                                  // GHASH block 4k+3 - mid
1111	aese	v3.16b, v23.16b
1112	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
1113	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
1114	aese	v0.16b, v26.16b
1115	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
1116	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
1117	aese	v3.16b, v24.16b
1118	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
1119	pmull	v4.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
1120	orr	x9, x11, x9, lsl #32            // CTR block 4k+8
1121	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
1122	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
1123	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
1124	eor	v6.8b, v6.8b, v7.8b                          // GHASH block 4k+3 - mid
1125	aese	v1.16b, v26.16b
1126	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
1127	aese	v2.16b, v24.16b
1128	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
1129	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
1130	pmull	v6.1q, v6.1d, v16.1d                          // GHASH block 4k+3 - mid
1131	movi	v8.8b, #0xc2
1132	aese	v2.16b, v25.16b
1133	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
1134	eor	v11.16b, v11.16b, v4.16b                         // GHASH block 4k+3 - low
1135	aese	v3.16b, v25.16b
1136	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
1137	shl	d8, d8, #56               // mod_constant
1138	aese	v2.16b, v26.16b
1139	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
1140	eor	v10.16b, v10.16b, v6.16b                         // GHASH block 4k+3 - mid
1141	aese	v3.16b, v26.16b
1142	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
1143	b.lt	.Ldec_main_loop_continue                          // branch if AES-128
1144
1145	aese	v0.16b, v27.16b
1146	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
1147	aese	v2.16b, v27.16b
1148	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
1149	aese	v1.16b, v27.16b
1150	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
1151	aese	v3.16b, v27.16b
1152	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
1153	aese	v0.16b, v28.16b
1154	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
1155	aese	v1.16b, v28.16b
1156	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
1157	aese	v2.16b, v28.16b
1158	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
1159	aese	v3.16b, v28.16b
1160	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
1161	b.eq	.Ldec_main_loop_continue                          // branch if AES-192
1162
1163	aese	v0.16b, v29.16b
1164	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
1165	aese	v1.16b, v29.16b
1166	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
1167	aese	v2.16b, v29.16b
1168	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
1169	aese	v3.16b, v29.16b
1170	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
1171	aese	v0.16b, v30.16b
1172	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
1173	aese	v1.16b, v30.16b
1174	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
1175	aese	v2.16b, v30.16b
1176	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
1177	aese	v3.16b, v30.16b
1178	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
1179
1180.Ldec_main_loop_continue:
1181	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
1182	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
1183	ldr	q4, [x0, #0]                          // AES block 4k+4 - load ciphertext
1184	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
1185	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
1186	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
1187	ldr	q5, [x0, #16]                         // AES block 4k+5 - load ciphertext
1188	eor	v0.16b, v4.16b, v0.16b                            // AES block 4k+4 - result
1189	stp	x23, x24, [x2], #16        // AES block 4k+3 - store result
1190	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
1191	ldr	q7, [x0, #48]                         // AES block 4k+7 - load ciphertext
1192	ldr	q6, [x0, #32]                         // AES block 4k+6 - load ciphertext
1193	mov	x7, v0.d[1]                            // AES block 4k+4 - mov high
1194	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
1195	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
1196	add	x0, x0, #64                       // AES input_ptr update
1197	mov	x6, v0.d[0]                            // AES block 4k+4 - mov low
1198	fmov	d0, x10                               // CTR block 4k+8
1199	fmov	v0.d[1], x9                               // CTR block 4k+8
1200	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
1201	eor	v1.16b, v5.16b, v1.16b                            // AES block 4k+5 - result
1202	rev	w9, w12                                 // CTR block 4k+9
1203	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
1204	orr	x9, x11, x9, lsl #32            // CTR block 4k+9
1205	cmp	x0, x5                   // .LOOP CONTROL
1206	add	w12, w12, #1                            // CTR block 4k+9
1207	eor	x6, x6, x13                    // AES block 4k+4 - round N low
1208	eor	x7, x7, x14                    // AES block 4k+4 - round N high
1209	mov	x20, v1.d[1]                            // AES block 4k+5 - mov high
1210	eor	v2.16b, v6.16b, v2.16b                            // AES block 4k+6 - result
1211	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
1212	mov	x19, v1.d[0]                            // AES block 4k+5 - mov low
1213	fmov	d1, x10                               // CTR block 4k+9
1214	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
1215	fmov	v1.d[1], x9                               // CTR block 4k+9
1216	rev	w9, w12                                 // CTR block 4k+10
1217	add	w12, w12, #1                            // CTR block 4k+10
1218	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
1219	orr	x9, x11, x9, lsl #32            // CTR block 4k+10
1220	rev64	v5.16b, v5.16b                                    // GHASH block 4k+5
1221	eor	x20, x20, x14                    // AES block 4k+5 - round N high
1222	stp	x6, x7, [x2], #16        // AES block 4k+4 - store result
1223	eor	x19, x19, x13                    // AES block 4k+5 - round N low
1224	stp	x19, x20, [x2], #16        // AES block 4k+5 - store result
1225	rev64	v4.16b, v4.16b                                    // GHASH block 4k+4
1226	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
1227	b.lt	.Ldec_main_loop
1228
1229.Ldec_prepretail:	//	PREPRETAIL
1230	ext	v11.16b, v11.16b, v11.16b, #8                     // PRE 0
1231	mov	x21, v2.d[0]                            // AES block 4k+2 - mov low
1232	eor	v3.16b, v7.16b, v3.16b                            // AES block 4k+3 - result
1233	aese	v0.16b, v18.16b
1234	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 0
1235	mov	x22, v2.d[1]                            // AES block 4k+2 - mov high
1236	aese	v1.16b, v18.16b
1237	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 0
1238	fmov	d2, x10                               // CTR block 4k+6
1239	fmov	v2.d[1], x9                               // CTR block 4k+6
1240	rev	w9, w12                                 // CTR block 4k+7
1241	eor	v4.16b, v4.16b, v11.16b                           // PRE 1
1242	rev64	v6.16b, v6.16b                                    // GHASH block 4k+2
1243	orr	x9, x11, x9, lsl #32            // CTR block 4k+7
1244	mov	x23, v3.d[0]                            // AES block 4k+3 - mov low
1245	aese	v1.16b, v19.16b
1246	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 1
1247	mov	x24, v3.d[1]                            // AES block 4k+3 - mov high
1248	pmull	v11.1q, v4.1d, v15.1d                       // GHASH block 4k - low
1249	mov	d8, v4.d[1]                                  // GHASH block 4k - mid
1250	fmov	d3, x10                               // CTR block 4k+7
1251	pmull2	v9.1q, v4.2d, v15.2d                       // GHASH block 4k - high
1252	fmov	v3.d[1], x9                               // CTR block 4k+7
1253	aese	v2.16b, v18.16b
1254	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 0
1255	mov	d10, v17.d[1]                               // GHASH block 4k - mid
1256	aese	v0.16b, v19.16b
1257	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 1
1258	eor	v8.8b, v8.8b, v4.8b                          // GHASH block 4k - mid
1259	pmull2	v4.1q, v5.2d, v14.2d                          // GHASH block 4k+1 - high
1260	aese	v2.16b, v19.16b
1261	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 1
1262	rev64	v7.16b, v7.16b                                    // GHASH block 4k+3
1263	aese	v3.16b, v18.16b
1264	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 0
1265	pmull	v10.1q, v8.1d, v10.1d                      // GHASH block 4k - mid
1266	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+1 - high
1267	pmull	v8.1q, v5.1d, v14.1d                          // GHASH block 4k+1 - low
1268	aese	v3.16b, v19.16b
1269	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 1
1270	mov	d4, v5.d[1]                                  // GHASH block 4k+1 - mid
1271	aese	v0.16b, v20.16b
1272	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 2
1273	aese	v1.16b, v20.16b
1274	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 2
1275	eor	v11.16b, v11.16b, v8.16b                         // GHASH block 4k+1 - low
1276	aese	v2.16b, v20.16b
1277	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 2
1278	aese	v0.16b, v21.16b
1279	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 3
1280	mov	d8, v6.d[1]                                  // GHASH block 4k+2 - mid
1281	aese	v3.16b, v20.16b
1282	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 2
1283	eor	v4.8b, v4.8b, v5.8b                          // GHASH block 4k+1 - mid
1284	pmull	v5.1q, v6.1d, v13.1d                          // GHASH block 4k+2 - low
1285	aese	v0.16b, v22.16b
1286	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 4
1287	aese	v3.16b, v21.16b
1288	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 3
1289	eor	v8.8b, v8.8b, v6.8b                          // GHASH block 4k+2 - mid
1290	pmull	v4.1q, v4.1d, v17.1d                          // GHASH block 4k+1 - mid
1291	aese	v0.16b, v23.16b
1292	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 5
1293	eor	v11.16b, v11.16b, v5.16b                         // GHASH block 4k+2 - low
1294	aese	v3.16b, v22.16b
1295	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 4
1296	pmull2	v5.1q, v7.2d, v12.2d                          // GHASH block 4k+3 - high
1297	eor	v10.16b, v10.16b, v4.16b                         // GHASH block 4k+1 - mid
1298	pmull2	v4.1q, v6.2d, v13.2d                          // GHASH block 4k+2 - high
1299	aese	v3.16b, v23.16b
1300	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 5
1301	ins	v8.d[1], v8.d[0]                                // GHASH block 4k+2 - mid
1302	aese	v2.16b, v21.16b
1303	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 3
1304	aese	v1.16b, v21.16b
1305	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 3
1306	eor	v9.16b, v9.16b, v4.16b                         // GHASH block 4k+2 - high
1307	pmull	v4.1q, v7.1d, v12.1d                          // GHASH block 4k+3 - low
1308	aese	v2.16b, v22.16b
1309	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 4
1310	mov	d6, v7.d[1]                                  // GHASH block 4k+3 - mid
1311	aese	v1.16b, v22.16b
1312	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 4
1313	pmull2	v8.1q, v8.2d, v16.2d                          // GHASH block 4k+2 - mid
1314	aese	v2.16b, v23.16b
1315	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 5
1316	eor	v6.8b, v6.8b, v7.8b                          // GHASH block 4k+3 - mid
1317	aese	v1.16b, v23.16b
1318	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 5
1319	aese	v3.16b, v24.16b
1320	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 6
1321	eor	v10.16b, v10.16b, v8.16b                         // GHASH block 4k+2 - mid
1322	aese	v2.16b, v24.16b
1323	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 6
1324	aese	v0.16b, v24.16b
1325	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 6
1326	movi	v8.8b, #0xc2
1327	aese	v1.16b, v24.16b
1328	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 6
1329	eor	v11.16b, v11.16b, v4.16b                         // GHASH block 4k+3 - low
1330	pmull	v6.1q, v6.1d, v16.1d                          // GHASH block 4k+3 - mid
1331	aese	v3.16b, v25.16b
1332	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 7
1333	cmp	x17, #12                                      // setup flags for AES-128/192/256 check
1334	eor	v9.16b, v9.16b, v5.16b                         // GHASH block 4k+3 - high
1335	aese	v1.16b, v25.16b
1336	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 7
1337	aese	v0.16b, v25.16b
1338	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 7
1339	eor	v10.16b, v10.16b, v6.16b                         // GHASH block 4k+3 - mid
1340	aese	v3.16b, v26.16b
1341	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 8
1342	aese	v2.16b, v25.16b
1343	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 7
1344	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
1345	aese	v1.16b, v26.16b
1346	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 8
1347	aese	v0.16b, v26.16b
1348	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 8
1349	shl	d8, d8, #56               // mod_constant
1350	aese	v2.16b, v26.16b
1351	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 8
1352	b.lt	.Ldec_finish_prepretail                           // branch if AES-128
1353
1354	aese	v1.16b, v27.16b
1355	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 9
1356	aese	v2.16b, v27.16b
1357	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 9
1358	aese	v3.16b, v27.16b
1359	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 9
1360	aese	v0.16b, v27.16b
1361	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 9
1362	aese	v2.16b, v28.16b
1363	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 10
1364	aese	v3.16b, v28.16b
1365	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 10
1366	aese	v0.16b, v28.16b
1367	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 10
1368	aese	v1.16b, v28.16b
1369	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 10
1370	b.eq	.Ldec_finish_prepretail                           // branch if AES-192
1371
1372	aese	v2.16b, v29.16b
1373	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 11
1374	aese	v0.16b, v29.16b
1375	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 11
1376	aese	v1.16b, v29.16b
1377	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 11
1378	aese	v2.16b, v30.16b
1379	aesmc	v2.16b, v2.16b          // AES block 4k+6 - round 12
1380	aese	v3.16b, v29.16b
1381	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 11
1382	aese	v1.16b, v30.16b
1383	aesmc	v1.16b, v1.16b          // AES block 4k+5 - round 12
1384	aese	v0.16b, v30.16b
1385	aesmc	v0.16b, v0.16b          // AES block 4k+4 - round 12
1386	aese	v3.16b, v30.16b
1387	aesmc	v3.16b, v3.16b          // AES block 4k+7 - round 12
1388
1389.Ldec_finish_prepretail:
1390	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
1391	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
1392	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
1393	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
1394	eor	x22, x22, x14                    // AES block 4k+2 - round N high
1395	eor	x23, x23, x13                    // AES block 4k+3 - round N low
1396	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
1397	add	w12, w12, #1                            // CTR block 4k+7
1398	eor	x21, x21, x13                    // AES block 4k+2 - round N low
1399	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
1400	eor	x24, x24, x14                    // AES block 4k+3 - round N high
1401	stp	x21, x22, [x2], #16        // AES block 4k+2 - store result
1402	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
1403	stp	x23, x24, [x2], #16        // AES block 4k+3 - store result
1404
1405	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
1406	aese	v1.16b, v31.16b                                    // AES block 4k+5 - round N-1
1407	aese	v0.16b, v31.16b                                    // AES block 4k+4 - round N-1
1408	aese	v3.16b, v31.16b                                    // AES block 4k+7 - round N-1
1409	aese	v2.16b, v31.16b                                    // AES block 4k+6 - round N-1
1410	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
1411
1412.Ldec_tail:	//	TAIL
1413	sub	x5, x4, x0   // main_end_input_ptr is number of bytes left to process
1414	ld1	{ v5.16b}, [x0], #16                      // AES block 4k+4 - load ciphertext
1415	eor	v0.16b, v5.16b, v0.16b                            // AES block 4k+4 - result
1416	mov	x6, v0.d[0]                            // AES block 4k+4 - mov low
1417	mov	x7, v0.d[1]                            // AES block 4k+4 - mov high
1418	ext	v8.16b, v11.16b, v11.16b, #8                     // prepare final partial tag
1419	cmp	x5, #48
1420	eor	x6, x6, x13                    // AES block 4k+4 - round N low
1421	eor	x7, x7, x14                    // AES block 4k+4 - round N high
1422	b.gt	.Ldec_blocks_more_than_3
1423	sub	w12, w12, #1
1424	mov	v3.16b, v2.16b
1425	movi	v10.8b, #0
1426	movi	v11.8b, #0
1427	cmp	x5, #32
1428	movi	v9.8b, #0
1429	mov	v2.16b, v1.16b
1430	b.gt	.Ldec_blocks_more_than_2
1431	sub	w12, w12, #1
1432	mov	v3.16b, v1.16b
1433	cmp	x5, #16
1434	b.gt	.Ldec_blocks_more_than_1
1435	sub	w12, w12, #1
1436	b	.Ldec_blocks_less_than_1
1437.Ldec_blocks_more_than_3:	//	blocks left >  3
1438	rev64	v4.16b, v5.16b                                   // GHASH final-3 block
1439	ld1	{ v5.16b}, [x0], #16                     // AES final-2 block - load ciphertext
1440	stp	x6, x7, [x2], #16       // AES final-3 block  - store result
1441	mov	d10, v17.d[1]                              // GHASH final-3 block - mid
1442	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
1443	eor	v0.16b, v5.16b, v1.16b                           // AES final-2 block - result
1444	mov	d22, v4.d[1]                                // GHASH final-3 block - mid
1445	mov	x6, v0.d[0]                           // AES final-2 block - mov low
1446	mov	x7, v0.d[1]                           // AES final-2 block - mov high
1447	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-3 block - mid
1448	movi	v8.8b, #0                                       // suppress further partial tag feed in
1449	pmull2	v9.1q, v4.2d, v15.2d                      // GHASH final-3 block - high
1450	pmull	v10.1q, v22.1d, v10.1d                   // GHASH final-3 block - mid
1451	eor	x6, x6, x13                   // AES final-2 block - round N low
1452	pmull	v11.1q, v4.1d, v15.1d                      // GHASH final-3 block - low
1453	eor	x7, x7, x14                   // AES final-2 block - round N high
1454.Ldec_blocks_more_than_2:	//	blocks left >  2
1455	rev64	v4.16b, v5.16b                                   // GHASH final-2 block
1456	ld1	{ v5.16b}, [x0], #16                     // AES final-1 block - load ciphertext
1457	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
1458	stp	x6, x7, [x2], #16       // AES final-2 block  - store result
1459	eor	v0.16b, v5.16b, v2.16b                           // AES final-1 block - result
1460	mov	d22, v4.d[1]                                // GHASH final-2 block - mid
1461	pmull	v21.1q, v4.1d, v14.1d                         // GHASH final-2 block - low
1462	pmull2	v20.1q, v4.2d, v14.2d                         // GHASH final-2 block - high
1463	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-2 block - mid
1464	mov	x6, v0.d[0]                           // AES final-1 block - mov low
1465	mov	x7, v0.d[1]                           // AES final-1 block - mov high
1466	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-2 block - low
1467	movi	v8.8b, #0                                       // suppress further partial tag feed in
1468	pmull	v22.1q, v22.1d, v17.1d                     // GHASH final-2 block - mid
1469	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-2 block - high
1470	eor	x6, x6, x13                   // AES final-1 block - round N low
1471	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-2 block - mid
1472	eor	x7, x7, x14                   // AES final-1 block - round N high
1473.Ldec_blocks_more_than_1:	//	blocks left >  1
1474	stp	x6, x7, [x2], #16       // AES final-1 block  - store result
1475	rev64	v4.16b, v5.16b                                   // GHASH final-1 block
1476	ld1	{ v5.16b}, [x0], #16                     // AES final block - load ciphertext
1477	eor	v4.16b, v4.16b, v8.16b                          // feed in partial tag
1478	movi	v8.8b, #0                                       // suppress further partial tag feed in
1479	mov	d22, v4.d[1]                                // GHASH final-1 block - mid
1480	eor	v0.16b, v5.16b, v3.16b                           // AES final block - result
1481	pmull2	v20.1q, v4.2d, v13.2d                         // GHASH final-1 block - high
1482	eor	v22.8b, v22.8b, v4.8b                     // GHASH final-1 block - mid
1483	pmull	v21.1q, v4.1d, v13.1d                         // GHASH final-1 block - low
1484	mov	x6, v0.d[0]                           // AES final block - mov low
1485	ins	v22.d[1], v22.d[0]                           // GHASH final-1 block - mid
1486	mov	x7, v0.d[1]                           // AES final block - mov high
1487	pmull2	v22.1q, v22.2d, v16.2d                     // GHASH final-1 block - mid
1488	eor	x6, x6, x13                   // AES final block - round N low
1489	eor	v11.16b, v11.16b, v21.16b                           // GHASH final-1 block - low
1490	eor	v9.16b, v9.16b, v20.16b                           // GHASH final-1 block - high
1491	eor	v10.16b, v10.16b, v22.16b                      // GHASH final-1 block - mid
1492	eor	x7, x7, x14                   // AES final block - round N high
1493.Ldec_blocks_less_than_1:	//	blocks left <= 1
1494	and	x1, x1, #127                   // bit_length %= 128
1495	mvn	x14, xzr                                      // rkN_h = 0xffffffffffffffff
1496	sub	x1, x1, #128                   // bit_length -= 128
1497	mvn	x13, xzr                                      // rkN_l = 0xffffffffffffffff
1498	ldp	x4, x5, [x2] // load existing bytes we need to not overwrite
1499	neg	x1, x1                         // bit_length = 128 - #bits in input (in range [1,128])
1500	and	x1, x1, #127                   // bit_length %= 128
1501	lsr	x14, x14, x1                      // rkN_h is mask for top 64b of last block
1502	cmp	x1, #64
1503	csel	x9, x13, x14, lt
1504	csel	x10, x14, xzr, lt
1505	fmov	d0, x9                                  // ctr0b is mask for last block
1506	and	x6, x6, x9
1507	mov	v0.d[1], x10
1508	bic	x4, x4, x9          // mask out low existing bytes
1509	rev	w9, w12
1510	bic	x5, x5, x10      // mask out high existing bytes
1511	orr	x6, x6, x4
1512	and	x7, x7, x10
1513	orr	x7, x7, x5
1514	and	v5.16b, v5.16b, v0.16b                            // possibly partial last block has zeroes in highest bits
1515	rev64	v4.16b, v5.16b                                    // GHASH final block
1516	eor	v4.16b, v4.16b, v8.16b                           // feed in partial tag
1517	pmull	v21.1q, v4.1d, v12.1d                          // GHASH final block - low
1518	mov	d8, v4.d[1]                                  // GHASH final block - mid
1519	eor	v8.8b, v8.8b, v4.8b                          // GHASH final block - mid
1520	pmull2	v20.1q, v4.2d, v12.2d                          // GHASH final block - high
1521	pmull	v8.1q, v8.1d, v16.1d                          // GHASH final block - mid
1522	eor	v9.16b, v9.16b, v20.16b                            // GHASH final block - high
1523	eor	v11.16b, v11.16b, v21.16b                            // GHASH final block - low
1524	eor	v10.16b, v10.16b, v8.16b                         // GHASH final block - mid
1525	movi	v8.8b, #0xc2
1526	eor	v6.16b, v11.16b, v9.16b                         // MODULO - karatsuba tidy up
1527	shl	d8, d8, #56               // mod_constant
1528	eor	v10.16b, v10.16b, v6.16b                         // MODULO - karatsuba tidy up
1529	pmull	v7.1q, v9.1d, v8.1d            // MODULO - top 64b align with mid
1530	ext	v9.16b, v9.16b, v9.16b, #8                     // MODULO - other top alignment
1531	eor	v10.16b, v10.16b, v7.16b                      // MODULO - fold into mid
1532	eor	v10.16b, v10.16b, v9.16b                         // MODULO - fold into mid
1533	pmull	v8.1q, v10.1d, v8.1d     // MODULO - mid 64b align with low
1534	ext	v10.16b, v10.16b, v10.16b, #8                     // MODULO - other mid alignment
1535	eor	v11.16b, v11.16b, v8.16b               // MODULO - fold into low
1536	stp	x6, x7, [x2]
1537	str	w9, [x16, #12]                          // store the updated counter
1538	eor	v11.16b, v11.16b, v10.16b                         // MODULO - fold into low
1539	ext	v11.16b, v11.16b, v11.16b, #8
1540	rev64	v11.16b, v11.16b
1541	mov	x0, x15
1542	st1	{ v11.16b }, [x3]
1543	ldp	x19, x20, [sp, #16]
1544	ldp	x21, x22, [sp, #32]
1545	ldp	x23, x24, [sp, #48]
1546	ldp	d8, d9, [sp, #64]
1547	ldp	d10, d11, [sp, #80]
1548	ldp	d12, d13, [sp, #96]
1549	ldp	d14, d15, [sp, #112]
1550	ldp	x29, x30, [sp], #128
1551	AARCH64_VALIDATE_LINK_REGISTER
1552	ret
1553.size	aes_gcm_dec_kernel,.-aes_gcm_dec_kernel
1554#endif
1555#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
1556