1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <ring-core/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__)
7#include <ring-core/arm_arch.h>
8.section	__TEXT,__const
9
10.align	7
11Lchacha20_consts:
12.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
13Linc:
14.long	1,2,3,4
15Lrol8:
16.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
17Lclamp:
18.quad	0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
19
20.text
21
22
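// Poly1305 helper used by both seal and open to absorb the AAD:
//   x3/x4   - pointer to and length of the AAD
//   x8-x10  - Poly1305 accumulator (acc0, acc1, acc2)
//   x16/x17 - clamped key halves r0, r1
//   x15     - constant 1, the 2^128 padding bit added above each 16-byte block
// Whole 16-byte blocks are hashed directly; a trailing partial block is
// zero-padded in a vector register before being absorbed.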
23.align	6
24Lpoly_hash_ad_internal:
25.cfi_startproc
26	cbnz	x4, Lpoly_hash_intro
27	ret
28
29Lpoly_hash_intro:
30	cmp	x4, #16
31	b.lt	Lpoly_hash_ad_tail
32	ldp	x11, x12, [x3], 16
33	adds	x8, x8, x11
34	adcs	x9, x9, x12
35	adc	x10, x10, x15
36	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
37	umulh	x12, x8, x16
38	mul	x13, x9, x16
39	umulh	x14, x9, x16
40	adds	x12, x12, x13
41	mul	x13, x10, x16
42	adc	x13, x13, x14
43	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
44	umulh	x8, x8, x17
45	adds	x12, x12, x14
46	mul	x14, x9, x17
47	umulh	x9, x9, x17
48	adcs	x14, x14, x8
49	mul	x10, x10, x17
50	adc	x10, x10, x9
51	adds	x13, x13, x14
52	adc	x14, x10, xzr
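    // Reduce modulo 2^130 - 5: fold the bits above 2^130 back into the low
    // limbs multiplied by 5 (added once as 4*c and once as c).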
53	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
54	and	x8, x13, #-4
55	extr	x13, x14, x13, #2
56	adds	x8, x8, x11
57	lsr	x11, x14, #2
58	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
59	adds	x8, x8, x13
60	adcs	x9, x9, x12
61	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
62	sub	x4, x4, #16
63	b	Lpoly_hash_ad_internal
64
65Lpoly_hash_ad_tail:
66	cbz	x4, Lpoly_hash_ad_ret
67
68	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the AAD
69	sub	x4, x4, #1
70
71Lpoly_hash_tail_16_compose:
72	ext	v20.16b, v20.16b, v20.16b, #15
73	ldrb	w11, [x3, x4]
74	mov	v20.b[0], w11
75	subs	x4, x4, #1
76	b.ge	Lpoly_hash_tail_16_compose
77	mov	x11, v20.d[0]
78	mov	x12, v20.d[1]
79	adds	x8, x8, x11
80	adcs	x9, x9, x12
81	adc	x10, x10, x15
82	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
83	umulh	x12, x8, x16
84	mul	x13, x9, x16
85	umulh	x14, x9, x16
86	adds	x12, x12, x13
87	mul	x13, x10, x16
88	adc	x13, x13, x14
89	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
90	umulh	x8, x8, x17
91	adds	x12, x12, x14
92	mul	x14, x9, x17
93	umulh	x9, x9, x17
94	adcs	x14, x14, x8
95	mul	x10, x10, x17
96	adc	x10, x10, x9
97	adds	x13, x13, x14
98	adc	x14, x10, xzr
99	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
100	and	x8, x13, #-4
101	extr	x13, x14, x13, #2
102	adds	x8, x8, x11
103	lsr	x11, x14, #2
104	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
105	adds	x8, x8, x13
106	adcs	x9, x9, x12
107	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
108
109Lpoly_hash_ad_ret:
110	ret
111.cfi_endproc
112
113
114/////////////////////////////////
115//
116// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data);
117//
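// The seal_data block pointed to by x5 is read as: 32 key bytes followed by
// 16 counter/nonce bytes (loaded into v28-v30 below), then a pointer to extra
// ciphertext to authenticate at offset 48 and its length at offset 56 (see the
// ldr/ldp below); the computed tag is written back through x5.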
118.globl	_chacha20_poly1305_seal
119.private_extern	_chacha20_poly1305_seal
120
121.align	6
122_chacha20_poly1305_seal:
123	AARCH64_SIGN_LINK_REGISTER
124.cfi_startproc
125	stp	x29, x30, [sp, #-80]!
126.cfi_def_cfa_offset	80
127.cfi_offset	w30, -72
128.cfi_offset	w29, -80
129	mov	x29, sp
130    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
131    // we don't actually use the frame pointer like that, it's probably not
132    // worth bothering.
133	stp	d8, d9, [sp, #16]
134	stp	d10, d11, [sp, #32]
135	stp	d12, d13, [sp, #48]
136	stp	d14, d15, [sp, #64]
137.cfi_offset	b15, -8
138.cfi_offset	b14, -16
139.cfi_offset	b13, -24
140.cfi_offset	b12, -32
141.cfi_offset	b11, -40
142.cfi_offset	b10, -48
143.cfi_offset	b9, -56
144.cfi_offset	b8, -64
145
146	adrp	x11, Lchacha20_consts@PAGE
147	add	x11, x11, Lchacha20_consts@PAGEOFF
148
149	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
150	ld1	{v28.16b - v30.16b}, [x5]
151
152	mov	x15, #1 // Prepare the Poly1305 state
153	mov	x8, #0
154	mov	x9, #0
155	mov	x10, #0
156
157	ldr	x12, [x5, #56]   // The total cipher text length includes extra_in_len
158	add	x12, x12, x2
159	mov	v31.d[0], x4  // Store the input and aad lengths
160	mov	v31.d[1], x12
161
162	cmp	x2, #128
163	b.le	Lseal_128 // Optimization for smaller buffers
164
    // Initially we prepare 5 ChaCha20 blocks: four to encrypt up to 256 bytes (4 blocks) of plaintext,
    // and one for the Poly1305 R and S keys. The first four blocks (A0-A3 ... D0-D3) are computed
    // vertically, the fifth block (A4-D4) horizontally.
168	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
169	mov	v4.16b, v24.16b
170
171	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
172	mov	v9.16b, v28.16b
173
174	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
175	mov	v14.16b, v29.16b
176
177	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
178	add	v15.4s, v15.4s, v25.4s
179	mov	v19.16b, v30.16b
180
181	sub	x5, x5, #32
182
183	mov	x6, #10
184
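    // Each iteration below is one ChaCha20 double round (a column round
    // followed by a diagonal round) applied to all five blocks; x6 counts
    // down the 10 double rounds.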
185.align	5
186Lseal_init_rounds:
187	add	v0.4s, v0.4s, v5.4s
188	add	v1.4s, v1.4s, v6.4s
189	add	v2.4s, v2.4s, v7.4s
190	add	v3.4s, v3.4s, v8.4s
191	add	v4.4s, v4.4s, v9.4s
192
193	eor	v15.16b, v15.16b, v0.16b
194	eor	v16.16b, v16.16b, v1.16b
195	eor	v17.16b, v17.16b, v2.16b
196	eor	v18.16b, v18.16b, v3.16b
197	eor	v19.16b, v19.16b, v4.16b
198
199	rev32	v15.8h, v15.8h
200	rev32	v16.8h, v16.8h
201	rev32	v17.8h, v17.8h
202	rev32	v18.8h, v18.8h
203	rev32	v19.8h, v19.8h
204
205	add	v10.4s, v10.4s, v15.4s
206	add	v11.4s, v11.4s, v16.4s
207	add	v12.4s, v12.4s, v17.4s
208	add	v13.4s, v13.4s, v18.4s
209	add	v14.4s, v14.4s, v19.4s
210
211	eor	v5.16b, v5.16b, v10.16b
212	eor	v6.16b, v6.16b, v11.16b
213	eor	v7.16b, v7.16b, v12.16b
214	eor	v8.16b, v8.16b, v13.16b
215	eor	v9.16b, v9.16b, v14.16b
216
217	ushr	v20.4s, v5.4s, #20
218	sli	v20.4s, v5.4s, #12
219	ushr	v5.4s, v6.4s, #20
220	sli	v5.4s, v6.4s, #12
221	ushr	v6.4s, v7.4s, #20
222	sli	v6.4s, v7.4s, #12
223	ushr	v7.4s, v8.4s, #20
224	sli	v7.4s, v8.4s, #12
225	ushr	v8.4s, v9.4s, #20
226	sli	v8.4s, v9.4s, #12
227
228	add	v0.4s, v0.4s, v20.4s
229	add	v1.4s, v1.4s, v5.4s
230	add	v2.4s, v2.4s, v6.4s
231	add	v3.4s, v3.4s, v7.4s
232	add	v4.4s, v4.4s, v8.4s
233
234	eor	v15.16b, v15.16b, v0.16b
235	eor	v16.16b, v16.16b, v1.16b
236	eor	v17.16b, v17.16b, v2.16b
237	eor	v18.16b, v18.16b, v3.16b
238	eor	v19.16b, v19.16b, v4.16b
239
240	tbl	v15.16b, {v15.16b}, v26.16b
241	tbl	v16.16b, {v16.16b}, v26.16b
242	tbl	v17.16b, {v17.16b}, v26.16b
243	tbl	v18.16b, {v18.16b}, v26.16b
244	tbl	v19.16b, {v19.16b}, v26.16b
245
246	add	v10.4s, v10.4s, v15.4s
247	add	v11.4s, v11.4s, v16.4s
248	add	v12.4s, v12.4s, v17.4s
249	add	v13.4s, v13.4s, v18.4s
250	add	v14.4s, v14.4s, v19.4s
251
252	eor	v20.16b, v20.16b, v10.16b
253	eor	v5.16b, v5.16b, v11.16b
254	eor	v6.16b, v6.16b, v12.16b
255	eor	v7.16b, v7.16b, v13.16b
256	eor	v8.16b, v8.16b, v14.16b
257
258	ushr	v9.4s, v8.4s, #25
259	sli	v9.4s, v8.4s, #7
260	ushr	v8.4s, v7.4s, #25
261	sli	v8.4s, v7.4s, #7
262	ushr	v7.4s, v6.4s, #25
263	sli	v7.4s, v6.4s, #7
264	ushr	v6.4s, v5.4s, #25
265	sli	v6.4s, v5.4s, #7
266	ushr	v5.4s, v20.4s, #25
267	sli	v5.4s, v20.4s, #7
268
269	ext	v9.16b, v9.16b, v9.16b, #4
270	ext	v14.16b, v14.16b, v14.16b, #8
271	ext	v19.16b, v19.16b, v19.16b, #12
272	add	v0.4s, v0.4s, v6.4s
273	add	v1.4s, v1.4s, v7.4s
274	add	v2.4s, v2.4s, v8.4s
275	add	v3.4s, v3.4s, v5.4s
276	add	v4.4s, v4.4s, v9.4s
277
278	eor	v18.16b, v18.16b, v0.16b
279	eor	v15.16b, v15.16b, v1.16b
280	eor	v16.16b, v16.16b, v2.16b
281	eor	v17.16b, v17.16b, v3.16b
282	eor	v19.16b, v19.16b, v4.16b
283
284	rev32	v18.8h, v18.8h
285	rev32	v15.8h, v15.8h
286	rev32	v16.8h, v16.8h
287	rev32	v17.8h, v17.8h
288	rev32	v19.8h, v19.8h
289
290	add	v12.4s, v12.4s, v18.4s
291	add	v13.4s, v13.4s, v15.4s
292	add	v10.4s, v10.4s, v16.4s
293	add	v11.4s, v11.4s, v17.4s
294	add	v14.4s, v14.4s, v19.4s
295
296	eor	v6.16b, v6.16b, v12.16b
297	eor	v7.16b, v7.16b, v13.16b
298	eor	v8.16b, v8.16b, v10.16b
299	eor	v5.16b, v5.16b, v11.16b
300	eor	v9.16b, v9.16b, v14.16b
301
302	ushr	v20.4s, v6.4s, #20
303	sli	v20.4s, v6.4s, #12
304	ushr	v6.4s, v7.4s, #20
305	sli	v6.4s, v7.4s, #12
306	ushr	v7.4s, v8.4s, #20
307	sli	v7.4s, v8.4s, #12
308	ushr	v8.4s, v5.4s, #20
309	sli	v8.4s, v5.4s, #12
310	ushr	v5.4s, v9.4s, #20
311	sli	v5.4s, v9.4s, #12
312
313	add	v0.4s, v0.4s, v20.4s
314	add	v1.4s, v1.4s, v6.4s
315	add	v2.4s, v2.4s, v7.4s
316	add	v3.4s, v3.4s, v8.4s
317	add	v4.4s, v4.4s, v5.4s
318
319	eor	v18.16b, v18.16b, v0.16b
320	eor	v15.16b, v15.16b, v1.16b
321	eor	v16.16b, v16.16b, v2.16b
322	eor	v17.16b, v17.16b, v3.16b
323	eor	v19.16b, v19.16b, v4.16b
324
325	tbl	v18.16b, {v18.16b}, v26.16b
326	tbl	v15.16b, {v15.16b}, v26.16b
327	tbl	v16.16b, {v16.16b}, v26.16b
328	tbl	v17.16b, {v17.16b}, v26.16b
329	tbl	v19.16b, {v19.16b}, v26.16b
330
331	add	v12.4s, v12.4s, v18.4s
332	add	v13.4s, v13.4s, v15.4s
333	add	v10.4s, v10.4s, v16.4s
334	add	v11.4s, v11.4s, v17.4s
335	add	v14.4s, v14.4s, v19.4s
336
337	eor	v20.16b, v20.16b, v12.16b
338	eor	v6.16b, v6.16b, v13.16b
339	eor	v7.16b, v7.16b, v10.16b
340	eor	v8.16b, v8.16b, v11.16b
341	eor	v5.16b, v5.16b, v14.16b
342
343	ushr	v9.4s, v5.4s, #25
344	sli	v9.4s, v5.4s, #7
345	ushr	v5.4s, v8.4s, #25
346	sli	v5.4s, v8.4s, #7
347	ushr	v8.4s, v7.4s, #25
348	sli	v8.4s, v7.4s, #7
349	ushr	v7.4s, v6.4s, #25
350	sli	v7.4s, v6.4s, #7
351	ushr	v6.4s, v20.4s, #25
352	sli	v6.4s, v20.4s, #7
353
354	ext	v9.16b, v9.16b, v9.16b, #12
355	ext	v14.16b, v14.16b, v14.16b, #8
356	ext	v19.16b, v19.16b, v19.16b, #4
357	subs	x6, x6, #1
358	b.hi	Lseal_init_rounds
359
360	add	v15.4s, v15.4s, v25.4s
361	mov	x11, #4
362	dup	v20.4s, w11
363	add	v25.4s, v25.4s, v20.4s
364
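    // During the rounds each vector held one state word across the four
    // vertically-computed blocks; the zips below transpose them so every
    // vector again holds four consecutive words of a single block.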
365	zip1	v20.4s, v0.4s, v1.4s
366	zip2	v21.4s, v0.4s, v1.4s
367	zip1	v22.4s, v2.4s, v3.4s
368	zip2	v23.4s, v2.4s, v3.4s
369
370	zip1	v0.2d, v20.2d, v22.2d
371	zip2	v1.2d, v20.2d, v22.2d
372	zip1	v2.2d, v21.2d, v23.2d
373	zip2	v3.2d, v21.2d, v23.2d
374
375	zip1	v20.4s, v5.4s, v6.4s
376	zip2	v21.4s, v5.4s, v6.4s
377	zip1	v22.4s, v7.4s, v8.4s
378	zip2	v23.4s, v7.4s, v8.4s
379
380	zip1	v5.2d, v20.2d, v22.2d
381	zip2	v6.2d, v20.2d, v22.2d
382	zip1	v7.2d, v21.2d, v23.2d
383	zip2	v8.2d, v21.2d, v23.2d
384
385	zip1	v20.4s, v10.4s, v11.4s
386	zip2	v21.4s, v10.4s, v11.4s
387	zip1	v22.4s, v12.4s, v13.4s
388	zip2	v23.4s, v12.4s, v13.4s
389
390	zip1	v10.2d, v20.2d, v22.2d
391	zip2	v11.2d, v20.2d, v22.2d
392	zip1	v12.2d, v21.2d, v23.2d
393	zip2	v13.2d, v21.2d, v23.2d
394
395	zip1	v20.4s, v15.4s, v16.4s
396	zip2	v21.4s, v15.4s, v16.4s
397	zip1	v22.4s, v17.4s, v18.4s
398	zip2	v23.4s, v17.4s, v18.4s
399
400	zip1	v15.2d, v20.2d, v22.2d
401	zip2	v16.2d, v20.2d, v22.2d
402	zip1	v17.2d, v21.2d, v23.2d
403	zip2	v18.2d, v21.2d, v23.2d
404
405	add	v4.4s, v4.4s, v24.4s
406	add	v9.4s, v9.4s, v28.4s
407	and	v4.16b, v4.16b, v27.16b
408
409	add	v0.4s, v0.4s, v24.4s
410	add	v5.4s, v5.4s, v28.4s
411	add	v10.4s, v10.4s, v29.4s
412	add	v15.4s, v15.4s, v30.4s
413
414	add	v1.4s, v1.4s, v24.4s
415	add	v6.4s, v6.4s, v28.4s
416	add	v11.4s, v11.4s, v29.4s
417	add	v16.4s, v16.4s, v30.4s
418
419	add	v2.4s, v2.4s, v24.4s
420	add	v7.4s, v7.4s, v28.4s
421	add	v12.4s, v12.4s, v29.4s
422	add	v17.4s, v17.4s, v30.4s
423
424	add	v3.4s, v3.4s, v24.4s
425	add	v8.4s, v8.4s, v28.4s
426	add	v13.4s, v13.4s, v29.4s
427	add	v18.4s, v18.4s, v30.4s
428
429	mov	x16, v4.d[0] // Move the R key to GPRs
430	mov	x17, v4.d[1]
431	mov	v27.16b, v9.16b // Store the S key
432
433	bl	Lpoly_hash_ad_internal
434
435	mov	x3, x0
436	cmp	x2, #256
437	b.le	Lseal_tail
438
439	ld1	{v20.16b - v23.16b}, [x1], #64
440	eor	v20.16b, v20.16b, v0.16b
441	eor	v21.16b, v21.16b, v5.16b
442	eor	v22.16b, v22.16b, v10.16b
443	eor	v23.16b, v23.16b, v15.16b
444	st1	{v20.16b - v23.16b}, [x0], #64
445
446	ld1	{v20.16b - v23.16b}, [x1], #64
447	eor	v20.16b, v20.16b, v1.16b
448	eor	v21.16b, v21.16b, v6.16b
449	eor	v22.16b, v22.16b, v11.16b
450	eor	v23.16b, v23.16b, v16.16b
451	st1	{v20.16b - v23.16b}, [x0], #64
452
453	ld1	{v20.16b - v23.16b}, [x1], #64
454	eor	v20.16b, v20.16b, v2.16b
455	eor	v21.16b, v21.16b, v7.16b
456	eor	v22.16b, v22.16b, v12.16b
457	eor	v23.16b, v23.16b, v17.16b
458	st1	{v20.16b - v23.16b}, [x0], #64
459
460	ld1	{v20.16b - v23.16b}, [x1], #64
461	eor	v20.16b, v20.16b, v3.16b
462	eor	v21.16b, v21.16b, v8.16b
463	eor	v22.16b, v22.16b, v13.16b
464	eor	v23.16b, v23.16b, v18.16b
465	st1	{v20.16b - v23.16b}, [x0], #64
466
467	sub	x2, x2, #256
468
469	mov	x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds
470	mov	x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256
471
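// Main seal loop: each iteration prepares 320 bytes of key stream (five
// ChaCha20 blocks) while hashing the ciphertext written out previously.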
472Lseal_main_loop:
473	adrp	x11, Lchacha20_consts@PAGE
474	add	x11, x11, Lchacha20_consts@PAGEOFF
475
476	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
477	mov	v4.16b, v24.16b
478
479	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
480	mov	v9.16b, v28.16b
481
482	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
483	mov	v14.16b, v29.16b
484
485	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
486	add	v15.4s, v15.4s, v25.4s
487	mov	v19.16b, v30.16b
488
489	eor	v20.16b, v20.16b, v20.16b //zero
490	not	v21.16b, v20.16b // -1
491	sub	v21.4s, v25.4s, v21.4s // Add +1
492	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
493	add	v19.4s, v19.4s, v20.4s
494
495	sub	x5, x5, #32
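    // The ChaCha20 double rounds below are interleaved with Poly1305 block
    // updates (the ldp loads from x3) so scalar and vector work overlap.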
496.align	5
497Lseal_main_loop_rounds:
498	add	v0.4s, v0.4s, v5.4s
499	add	v1.4s, v1.4s, v6.4s
500	add	v2.4s, v2.4s, v7.4s
501	add	v3.4s, v3.4s, v8.4s
502	add	v4.4s, v4.4s, v9.4s
503
504	eor	v15.16b, v15.16b, v0.16b
505	eor	v16.16b, v16.16b, v1.16b
506	eor	v17.16b, v17.16b, v2.16b
507	eor	v18.16b, v18.16b, v3.16b
508	eor	v19.16b, v19.16b, v4.16b
509
510	rev32	v15.8h, v15.8h
511	rev32	v16.8h, v16.8h
512	rev32	v17.8h, v17.8h
513	rev32	v18.8h, v18.8h
514	rev32	v19.8h, v19.8h
515
516	add	v10.4s, v10.4s, v15.4s
517	add	v11.4s, v11.4s, v16.4s
518	add	v12.4s, v12.4s, v17.4s
519	add	v13.4s, v13.4s, v18.4s
520	add	v14.4s, v14.4s, v19.4s
521
522	eor	v5.16b, v5.16b, v10.16b
523	eor	v6.16b, v6.16b, v11.16b
524	eor	v7.16b, v7.16b, v12.16b
525	eor	v8.16b, v8.16b, v13.16b
526	eor	v9.16b, v9.16b, v14.16b
527
528	ushr	v20.4s, v5.4s, #20
529	sli	v20.4s, v5.4s, #12
530	ushr	v5.4s, v6.4s, #20
531	sli	v5.4s, v6.4s, #12
532	ushr	v6.4s, v7.4s, #20
533	sli	v6.4s, v7.4s, #12
534	ushr	v7.4s, v8.4s, #20
535	sli	v7.4s, v8.4s, #12
536	ushr	v8.4s, v9.4s, #20
537	sli	v8.4s, v9.4s, #12
538
539	add	v0.4s, v0.4s, v20.4s
540	add	v1.4s, v1.4s, v5.4s
541	add	v2.4s, v2.4s, v6.4s
542	add	v3.4s, v3.4s, v7.4s
543	add	v4.4s, v4.4s, v8.4s
544
545	eor	v15.16b, v15.16b, v0.16b
546	eor	v16.16b, v16.16b, v1.16b
547	eor	v17.16b, v17.16b, v2.16b
548	eor	v18.16b, v18.16b, v3.16b
549	eor	v19.16b, v19.16b, v4.16b
550
551	tbl	v15.16b, {v15.16b}, v26.16b
552	tbl	v16.16b, {v16.16b}, v26.16b
553	tbl	v17.16b, {v17.16b}, v26.16b
554	tbl	v18.16b, {v18.16b}, v26.16b
555	tbl	v19.16b, {v19.16b}, v26.16b
556
557	add	v10.4s, v10.4s, v15.4s
558	add	v11.4s, v11.4s, v16.4s
559	add	v12.4s, v12.4s, v17.4s
560	add	v13.4s, v13.4s, v18.4s
561	add	v14.4s, v14.4s, v19.4s
562
563	eor	v20.16b, v20.16b, v10.16b
564	eor	v5.16b, v5.16b, v11.16b
565	eor	v6.16b, v6.16b, v12.16b
566	eor	v7.16b, v7.16b, v13.16b
567	eor	v8.16b, v8.16b, v14.16b
568
569	ushr	v9.4s, v8.4s, #25
570	sli	v9.4s, v8.4s, #7
571	ushr	v8.4s, v7.4s, #25
572	sli	v8.4s, v7.4s, #7
573	ushr	v7.4s, v6.4s, #25
574	sli	v7.4s, v6.4s, #7
575	ushr	v6.4s, v5.4s, #25
576	sli	v6.4s, v5.4s, #7
577	ushr	v5.4s, v20.4s, #25
578	sli	v5.4s, v20.4s, #7
579
580	ext	v9.16b, v9.16b, v9.16b, #4
581	ext	v14.16b, v14.16b, v14.16b, #8
582	ext	v19.16b, v19.16b, v19.16b, #12
583	ldp	x11, x12, [x3], 16
584	adds	x8, x8, x11
585	adcs	x9, x9, x12
586	adc	x10, x10, x15
587	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
588	umulh	x12, x8, x16
589	mul	x13, x9, x16
590	umulh	x14, x9, x16
591	adds	x12, x12, x13
592	mul	x13, x10, x16
593	adc	x13, x13, x14
594	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
595	umulh	x8, x8, x17
596	adds	x12, x12, x14
597	mul	x14, x9, x17
598	umulh	x9, x9, x17
599	adcs	x14, x14, x8
600	mul	x10, x10, x17
601	adc	x10, x10, x9
602	adds	x13, x13, x14
603	adc	x14, x10, xzr
604	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
605	and	x8, x13, #-4
606	extr	x13, x14, x13, #2
607	adds	x8, x8, x11
608	lsr	x11, x14, #2
609	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
610	adds	x8, x8, x13
611	adcs	x9, x9, x12
612	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
613	add	v0.4s, v0.4s, v6.4s
614	add	v1.4s, v1.4s, v7.4s
615	add	v2.4s, v2.4s, v8.4s
616	add	v3.4s, v3.4s, v5.4s
617	add	v4.4s, v4.4s, v9.4s
618
619	eor	v18.16b, v18.16b, v0.16b
620	eor	v15.16b, v15.16b, v1.16b
621	eor	v16.16b, v16.16b, v2.16b
622	eor	v17.16b, v17.16b, v3.16b
623	eor	v19.16b, v19.16b, v4.16b
624
625	rev32	v18.8h, v18.8h
626	rev32	v15.8h, v15.8h
627	rev32	v16.8h, v16.8h
628	rev32	v17.8h, v17.8h
629	rev32	v19.8h, v19.8h
630
631	add	v12.4s, v12.4s, v18.4s
632	add	v13.4s, v13.4s, v15.4s
633	add	v10.4s, v10.4s, v16.4s
634	add	v11.4s, v11.4s, v17.4s
635	add	v14.4s, v14.4s, v19.4s
636
637	eor	v6.16b, v6.16b, v12.16b
638	eor	v7.16b, v7.16b, v13.16b
639	eor	v8.16b, v8.16b, v10.16b
640	eor	v5.16b, v5.16b, v11.16b
641	eor	v9.16b, v9.16b, v14.16b
642
643	ushr	v20.4s, v6.4s, #20
644	sli	v20.4s, v6.4s, #12
645	ushr	v6.4s, v7.4s, #20
646	sli	v6.4s, v7.4s, #12
647	ushr	v7.4s, v8.4s, #20
648	sli	v7.4s, v8.4s, #12
649	ushr	v8.4s, v5.4s, #20
650	sli	v8.4s, v5.4s, #12
651	ushr	v5.4s, v9.4s, #20
652	sli	v5.4s, v9.4s, #12
653
654	add	v0.4s, v0.4s, v20.4s
655	add	v1.4s, v1.4s, v6.4s
656	add	v2.4s, v2.4s, v7.4s
657	add	v3.4s, v3.4s, v8.4s
658	add	v4.4s, v4.4s, v5.4s
659
660	eor	v18.16b, v18.16b, v0.16b
661	eor	v15.16b, v15.16b, v1.16b
662	eor	v16.16b, v16.16b, v2.16b
663	eor	v17.16b, v17.16b, v3.16b
664	eor	v19.16b, v19.16b, v4.16b
665
666	tbl	v18.16b, {v18.16b}, v26.16b
667	tbl	v15.16b, {v15.16b}, v26.16b
668	tbl	v16.16b, {v16.16b}, v26.16b
669	tbl	v17.16b, {v17.16b}, v26.16b
670	tbl	v19.16b, {v19.16b}, v26.16b
671
672	add	v12.4s, v12.4s, v18.4s
673	add	v13.4s, v13.4s, v15.4s
674	add	v10.4s, v10.4s, v16.4s
675	add	v11.4s, v11.4s, v17.4s
676	add	v14.4s, v14.4s, v19.4s
677
678	eor	v20.16b, v20.16b, v12.16b
679	eor	v6.16b, v6.16b, v13.16b
680	eor	v7.16b, v7.16b, v10.16b
681	eor	v8.16b, v8.16b, v11.16b
682	eor	v5.16b, v5.16b, v14.16b
683
684	ushr	v9.4s, v5.4s, #25
685	sli	v9.4s, v5.4s, #7
686	ushr	v5.4s, v8.4s, #25
687	sli	v5.4s, v8.4s, #7
688	ushr	v8.4s, v7.4s, #25
689	sli	v8.4s, v7.4s, #7
690	ushr	v7.4s, v6.4s, #25
691	sli	v7.4s, v6.4s, #7
692	ushr	v6.4s, v20.4s, #25
693	sli	v6.4s, v20.4s, #7
694
695	ext	v9.16b, v9.16b, v9.16b, #12
696	ext	v14.16b, v14.16b, v14.16b, #8
697	ext	v19.16b, v19.16b, v19.16b, #4
698	subs	x6, x6, #1
699	b.ge	Lseal_main_loop_rounds
700	ldp	x11, x12, [x3], 16
701	adds	x8, x8, x11
702	adcs	x9, x9, x12
703	adc	x10, x10, x15
704	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
705	umulh	x12, x8, x16
706	mul	x13, x9, x16
707	umulh	x14, x9, x16
708	adds	x12, x12, x13
709	mul	x13, x10, x16
710	adc	x13, x13, x14
711	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
712	umulh	x8, x8, x17
713	adds	x12, x12, x14
714	mul	x14, x9, x17
715	umulh	x9, x9, x17
716	adcs	x14, x14, x8
717	mul	x10, x10, x17
718	adc	x10, x10, x9
719	adds	x13, x13, x14
720	adc	x14, x10, xzr
721	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
722	and	x8, x13, #-4
723	extr	x13, x14, x13, #2
724	adds	x8, x8, x11
725	lsr	x11, x14, #2
726	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
727	adds	x8, x8, x13
728	adcs	x9, x9, x12
729	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
730	subs	x7, x7, #1
731	b.gt	Lseal_main_loop_rounds
732
733	eor	v20.16b, v20.16b, v20.16b //zero
734	not	v21.16b, v20.16b // -1
735	sub	v21.4s, v25.4s, v21.4s // Add +1
736	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
737	add	v19.4s, v19.4s, v20.4s
738
739	add	v15.4s, v15.4s, v25.4s
740	mov	x11, #5
741	dup	v20.4s, w11
742	add	v25.4s, v25.4s, v20.4s
743
744	zip1	v20.4s, v0.4s, v1.4s
745	zip2	v21.4s, v0.4s, v1.4s
746	zip1	v22.4s, v2.4s, v3.4s
747	zip2	v23.4s, v2.4s, v3.4s
748
749	zip1	v0.2d, v20.2d, v22.2d
750	zip2	v1.2d, v20.2d, v22.2d
751	zip1	v2.2d, v21.2d, v23.2d
752	zip2	v3.2d, v21.2d, v23.2d
753
754	zip1	v20.4s, v5.4s, v6.4s
755	zip2	v21.4s, v5.4s, v6.4s
756	zip1	v22.4s, v7.4s, v8.4s
757	zip2	v23.4s, v7.4s, v8.4s
758
759	zip1	v5.2d, v20.2d, v22.2d
760	zip2	v6.2d, v20.2d, v22.2d
761	zip1	v7.2d, v21.2d, v23.2d
762	zip2	v8.2d, v21.2d, v23.2d
763
764	zip1	v20.4s, v10.4s, v11.4s
765	zip2	v21.4s, v10.4s, v11.4s
766	zip1	v22.4s, v12.4s, v13.4s
767	zip2	v23.4s, v12.4s, v13.4s
768
769	zip1	v10.2d, v20.2d, v22.2d
770	zip2	v11.2d, v20.2d, v22.2d
771	zip1	v12.2d, v21.2d, v23.2d
772	zip2	v13.2d, v21.2d, v23.2d
773
774	zip1	v20.4s, v15.4s, v16.4s
775	zip2	v21.4s, v15.4s, v16.4s
776	zip1	v22.4s, v17.4s, v18.4s
777	zip2	v23.4s, v17.4s, v18.4s
778
779	zip1	v15.2d, v20.2d, v22.2d
780	zip2	v16.2d, v20.2d, v22.2d
781	zip1	v17.2d, v21.2d, v23.2d
782	zip2	v18.2d, v21.2d, v23.2d
783
784	add	v0.4s, v0.4s, v24.4s
785	add	v5.4s, v5.4s, v28.4s
786	add	v10.4s, v10.4s, v29.4s
787	add	v15.4s, v15.4s, v30.4s
788
789	add	v1.4s, v1.4s, v24.4s
790	add	v6.4s, v6.4s, v28.4s
791	add	v11.4s, v11.4s, v29.4s
792	add	v16.4s, v16.4s, v30.4s
793
794	add	v2.4s, v2.4s, v24.4s
795	add	v7.4s, v7.4s, v28.4s
796	add	v12.4s, v12.4s, v29.4s
797	add	v17.4s, v17.4s, v30.4s
798
799	add	v3.4s, v3.4s, v24.4s
800	add	v8.4s, v8.4s, v28.4s
801	add	v13.4s, v13.4s, v29.4s
802	add	v18.4s, v18.4s, v30.4s
803
804	add	v4.4s, v4.4s, v24.4s
805	add	v9.4s, v9.4s, v28.4s
806	add	v14.4s, v14.4s, v29.4s
807	add	v19.4s, v19.4s, v30.4s
808
809	cmp	x2, #320
810	b.le	Lseal_tail
811
812	ld1	{v20.16b - v23.16b}, [x1], #64
813	eor	v20.16b, v20.16b, v0.16b
814	eor	v21.16b, v21.16b, v5.16b
815	eor	v22.16b, v22.16b, v10.16b
816	eor	v23.16b, v23.16b, v15.16b
817	st1	{v20.16b - v23.16b}, [x0], #64
818
819	ld1	{v20.16b - v23.16b}, [x1], #64
820	eor	v20.16b, v20.16b, v1.16b
821	eor	v21.16b, v21.16b, v6.16b
822	eor	v22.16b, v22.16b, v11.16b
823	eor	v23.16b, v23.16b, v16.16b
824	st1	{v20.16b - v23.16b}, [x0], #64
825
826	ld1	{v20.16b - v23.16b}, [x1], #64
827	eor	v20.16b, v20.16b, v2.16b
828	eor	v21.16b, v21.16b, v7.16b
829	eor	v22.16b, v22.16b, v12.16b
830	eor	v23.16b, v23.16b, v17.16b
831	st1	{v20.16b - v23.16b}, [x0], #64
832
833	ld1	{v20.16b - v23.16b}, [x1], #64
834	eor	v20.16b, v20.16b, v3.16b
835	eor	v21.16b, v21.16b, v8.16b
836	eor	v22.16b, v22.16b, v13.16b
837	eor	v23.16b, v23.16b, v18.16b
838	st1	{v20.16b - v23.16b}, [x0], #64
839
840	ld1	{v20.16b - v23.16b}, [x1], #64
841	eor	v20.16b, v20.16b, v4.16b
842	eor	v21.16b, v21.16b, v9.16b
843	eor	v22.16b, v22.16b, v14.16b
844	eor	v23.16b, v23.16b, v19.16b
845	st1	{v20.16b - v23.16b}, [x0], #64
846
847	sub	x2, x2, #320
848
849	mov	x6, #0
850	mov	x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration
851
852	b	Lseal_main_loop
853
854Lseal_tail:
    // This part of the function handles the storage and authentication of the last [0,320) bytes.
    // We assume A0-A4 ... D0-D4 hold at least inl (at most 320) bytes of key stream data.
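    // Each full 64-byte chunk below is XORed with the key stream and its four
    // 16-byte words are hashed into the Poly1305 state before being stored.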
857	cmp	x2, #64
858	b.lt	Lseal_tail_64
859
860    // Store and authenticate 64B blocks per iteration
861	ld1	{v20.16b - v23.16b}, [x1], #64
862
863	eor	v20.16b, v20.16b, v0.16b
864	eor	v21.16b, v21.16b, v5.16b
865	eor	v22.16b, v22.16b, v10.16b
866	eor	v23.16b, v23.16b, v15.16b
867	mov	x11, v20.d[0]
868	mov	x12, v20.d[1]
869	adds	x8, x8, x11
870	adcs	x9, x9, x12
871	adc	x10, x10, x15
872	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
873	umulh	x12, x8, x16
874	mul	x13, x9, x16
875	umulh	x14, x9, x16
876	adds	x12, x12, x13
877	mul	x13, x10, x16
878	adc	x13, x13, x14
879	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
880	umulh	x8, x8, x17
881	adds	x12, x12, x14
882	mul	x14, x9, x17
883	umulh	x9, x9, x17
884	adcs	x14, x14, x8
885	mul	x10, x10, x17
886	adc	x10, x10, x9
887	adds	x13, x13, x14
888	adc	x14, x10, xzr
889	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
890	and	x8, x13, #-4
891	extr	x13, x14, x13, #2
892	adds	x8, x8, x11
893	lsr	x11, x14, #2
894	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
895	adds	x8, x8, x13
896	adcs	x9, x9, x12
897	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
898	mov	x11, v21.d[0]
899	mov	x12, v21.d[1]
900	adds	x8, x8, x11
901	adcs	x9, x9, x12
902	adc	x10, x10, x15
903	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
904	umulh	x12, x8, x16
905	mul	x13, x9, x16
906	umulh	x14, x9, x16
907	adds	x12, x12, x13
908	mul	x13, x10, x16
909	adc	x13, x13, x14
910	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
911	umulh	x8, x8, x17
912	adds	x12, x12, x14
913	mul	x14, x9, x17
914	umulh	x9, x9, x17
915	adcs	x14, x14, x8
916	mul	x10, x10, x17
917	adc	x10, x10, x9
918	adds	x13, x13, x14
919	adc	x14, x10, xzr
920	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
921	and	x8, x13, #-4
922	extr	x13, x14, x13, #2
923	adds	x8, x8, x11
924	lsr	x11, x14, #2
925	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
926	adds	x8, x8, x13
927	adcs	x9, x9, x12
928	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
929	mov	x11, v22.d[0]
930	mov	x12, v22.d[1]
931	adds	x8, x8, x11
932	adcs	x9, x9, x12
933	adc	x10, x10, x15
934	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
935	umulh	x12, x8, x16
936	mul	x13, x9, x16
937	umulh	x14, x9, x16
938	adds	x12, x12, x13
939	mul	x13, x10, x16
940	adc	x13, x13, x14
941	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
942	umulh	x8, x8, x17
943	adds	x12, x12, x14
944	mul	x14, x9, x17
945	umulh	x9, x9, x17
946	adcs	x14, x14, x8
947	mul	x10, x10, x17
948	adc	x10, x10, x9
949	adds	x13, x13, x14
950	adc	x14, x10, xzr
951	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
952	and	x8, x13, #-4
953	extr	x13, x14, x13, #2
954	adds	x8, x8, x11
955	lsr	x11, x14, #2
956	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
957	adds	x8, x8, x13
958	adcs	x9, x9, x12
959	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
960	mov	x11, v23.d[0]
961	mov	x12, v23.d[1]
962	adds	x8, x8, x11
963	adcs	x9, x9, x12
964	adc	x10, x10, x15
965	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
966	umulh	x12, x8, x16
967	mul	x13, x9, x16
968	umulh	x14, x9, x16
969	adds	x12, x12, x13
970	mul	x13, x10, x16
971	adc	x13, x13, x14
972	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
973	umulh	x8, x8, x17
974	adds	x12, x12, x14
975	mul	x14, x9, x17
976	umulh	x9, x9, x17
977	adcs	x14, x14, x8
978	mul	x10, x10, x17
979	adc	x10, x10, x9
980	adds	x13, x13, x14
981	adc	x14, x10, xzr
982	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
983	and	x8, x13, #-4
984	extr	x13, x14, x13, #2
985	adds	x8, x8, x11
986	lsr	x11, x14, #2
987	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
988	adds	x8, x8, x13
989	adcs	x9, x9, x12
990	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
991	st1	{v20.16b - v23.16b}, [x0], #64
992	sub	x2, x2, #64
993
994    // Shift the state left by 64 bytes for the next iteration of the loop
995	mov	v0.16b, v1.16b
996	mov	v5.16b, v6.16b
997	mov	v10.16b, v11.16b
998	mov	v15.16b, v16.16b
999
1000	mov	v1.16b, v2.16b
1001	mov	v6.16b, v7.16b
1002	mov	v11.16b, v12.16b
1003	mov	v16.16b, v17.16b
1004
1005	mov	v2.16b, v3.16b
1006	mov	v7.16b, v8.16b
1007	mov	v12.16b, v13.16b
1008	mov	v17.16b, v18.16b
1009
1010	mov	v3.16b, v4.16b
1011	mov	v8.16b, v9.16b
1012	mov	v13.16b, v14.16b
1013	mov	v18.16b, v19.16b
1014
1015	b	Lseal_tail
1016
1017Lseal_tail_64:
	ldp	x3, x4, [x5, #48] // x3 = extra_in pointer, x4 = extra_in length
1019
1020    // Here we handle the last [0,64) bytes of plaintext
1021	cmp	x2, #16
1022	b.lt	Lseal_tail_16
    // Each iteration encrypts and authenticates a 16B block
1024	ld1	{v20.16b}, [x1], #16
1025	eor	v20.16b, v20.16b, v0.16b
1026	mov	x11, v20.d[0]
1027	mov	x12, v20.d[1]
1028	adds	x8, x8, x11
1029	adcs	x9, x9, x12
1030	adc	x10, x10, x15
1031	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1032	umulh	x12, x8, x16
1033	mul	x13, x9, x16
1034	umulh	x14, x9, x16
1035	adds	x12, x12, x13
1036	mul	x13, x10, x16
1037	adc	x13, x13, x14
1038	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1039	umulh	x8, x8, x17
1040	adds	x12, x12, x14
1041	mul	x14, x9, x17
1042	umulh	x9, x9, x17
1043	adcs	x14, x14, x8
1044	mul	x10, x10, x17
1045	adc	x10, x10, x9
1046	adds	x13, x13, x14
1047	adc	x14, x10, xzr
1048	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1049	and	x8, x13, #-4
1050	extr	x13, x14, x13, #2
1051	adds	x8, x8, x11
1052	lsr	x11, x14, #2
1053	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1054	adds	x8, x8, x13
1055	adcs	x9, x9, x12
1056	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1057	st1	{v20.16b}, [x0], #16
1058
1059	sub	x2, x2, #16
1060
1061    // Shift the state left by 16 bytes for the next iteration of the loop
1062	mov	v0.16b, v5.16b
1063	mov	v5.16b, v10.16b
1064	mov	v10.16b, v15.16b
1065
1066	b	Lseal_tail_64
1067
1068Lseal_tail_16:
1069    // Here we handle the last [0,16) bytes of ciphertext that require a padded block
1070	cbz	x2, Lseal_hash_extra
1071
1072	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in
1073	eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes
1074	not	v22.16b, v20.16b
1075
1076	mov	x6, x2
1077	add	x1, x1, x2
1078
1079	cbz	x4, Lseal_tail_16_compose // No extra data to pad with, zero padding
1080
1081	mov	x7, #16          // We need to load some extra_in first for padding
1082	sub	x7, x7, x2
1083	cmp	x4, x7
1084	csel	x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register
1085	mov	x12, x7
1086	add	x3, x3, x7
1087	sub	x4, x4, x7
1088
1089Lseal_tail16_compose_extra_in:
1090	ext	v20.16b, v20.16b, v20.16b, #15
1091	ldrb	w11, [x3, #-1]!
1092	mov	v20.b[0], w11
1093	subs	x7, x7, #1
1094	b.gt	Lseal_tail16_compose_extra_in
1095
1096	add	x3, x3, x12
1097
1098Lseal_tail_16_compose:
1099	ext	v20.16b, v20.16b, v20.16b, #15
1100	ldrb	w11, [x1, #-1]!
1101	mov	v20.b[0], w11
1102	ext	v21.16b, v22.16b, v21.16b, #15
1103	subs	x2, x2, #1
1104	b.gt	Lseal_tail_16_compose
1105
1106	and	v0.16b, v0.16b, v21.16b
1107	eor	v20.16b, v20.16b, v0.16b
1108	mov	v21.16b, v20.16b
1109
1110Lseal_tail_16_store:
1111	umov	w11, v20.b[0]
1112	strb	w11, [x0], #1
1113	ext	v20.16b, v20.16b, v20.16b, #1
1114	subs	x6, x6, #1
1115	b.gt	Lseal_tail_16_store
1116
1117    // Hash in the final ct block concatenated with extra_in
1118	mov	x11, v21.d[0]
1119	mov	x12, v21.d[1]
1120	adds	x8, x8, x11
1121	adcs	x9, x9, x12
1122	adc	x10, x10, x15
1123	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1124	umulh	x12, x8, x16
1125	mul	x13, x9, x16
1126	umulh	x14, x9, x16
1127	adds	x12, x12, x13
1128	mul	x13, x10, x16
1129	adc	x13, x13, x14
1130	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1131	umulh	x8, x8, x17
1132	adds	x12, x12, x14
1133	mul	x14, x9, x17
1134	umulh	x9, x9, x17
1135	adcs	x14, x14, x8
1136	mul	x10, x10, x17
1137	adc	x10, x10, x9
1138	adds	x13, x13, x14
1139	adc	x14, x10, xzr
1140	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1141	and	x8, x13, #-4
1142	extr	x13, x14, x13, #2
1143	adds	x8, x8, x11
1144	lsr	x11, x14, #2
1145	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1146	adds	x8, x8, x13
1147	adcs	x9, x9, x12
1148	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1149
1150Lseal_hash_extra:
1151	cbz	x4, Lseal_finalize
1152
1153Lseal_hash_extra_loop:
1154	cmp	x4, #16
1155	b.lt	Lseal_hash_extra_tail
1156	ld1	{v20.16b}, [x3], #16
1157	mov	x11, v20.d[0]
1158	mov	x12, v20.d[1]
1159	adds	x8, x8, x11
1160	adcs	x9, x9, x12
1161	adc	x10, x10, x15
1162	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1163	umulh	x12, x8, x16
1164	mul	x13, x9, x16
1165	umulh	x14, x9, x16
1166	adds	x12, x12, x13
1167	mul	x13, x10, x16
1168	adc	x13, x13, x14
1169	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1170	umulh	x8, x8, x17
1171	adds	x12, x12, x14
1172	mul	x14, x9, x17
1173	umulh	x9, x9, x17
1174	adcs	x14, x14, x8
1175	mul	x10, x10, x17
1176	adc	x10, x10, x9
1177	adds	x13, x13, x14
1178	adc	x14, x10, xzr
1179	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1180	and	x8, x13, #-4
1181	extr	x13, x14, x13, #2
1182	adds	x8, x8, x11
1183	lsr	x11, x14, #2
1184	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1185	adds	x8, x8, x13
1186	adcs	x9, x9, x12
1187	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1188	sub	x4, x4, #16
1189	b	Lseal_hash_extra_loop
1190
1191Lseal_hash_extra_tail:
1192	cbz	x4, Lseal_finalize
1193	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext
1194	add	x3, x3, x4
1195
1196Lseal_hash_extra_load:
1197	ext	v20.16b, v20.16b, v20.16b, #15
1198	ldrb	w11, [x3, #-1]!
1199	mov	v20.b[0], w11
1200	subs	x4, x4, #1
1201	b.gt	Lseal_hash_extra_load
1202
    // Hash in the final padded extra_in block
1204	mov	x11, v20.d[0]
1205	mov	x12, v20.d[1]
1206	adds	x8, x8, x11
1207	adcs	x9, x9, x12
1208	adc	x10, x10, x15
1209	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1210	umulh	x12, x8, x16
1211	mul	x13, x9, x16
1212	umulh	x14, x9, x16
1213	adds	x12, x12, x13
1214	mul	x13, x10, x16
1215	adc	x13, x13, x14
1216	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1217	umulh	x8, x8, x17
1218	adds	x12, x12, x14
1219	mul	x14, x9, x17
1220	umulh	x9, x9, x17
1221	adcs	x14, x14, x8
1222	mul	x10, x10, x17
1223	adc	x10, x10, x9
1224	adds	x13, x13, x14
1225	adc	x14, x10, xzr
1226	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1227	and	x8, x13, #-4
1228	extr	x13, x14, x13, #2
1229	adds	x8, x8, x11
1230	lsr	x11, x14, #2
1231	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1232	adds	x8, x8, x13
1233	adcs	x9, x9, x12
1234	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1235
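// Finalize the tag: hash in the AAD and ciphertext lengths saved in v31,
// perform the final reduction modulo 2^130 - 5, add the S key kept in v27,
// and store the 16-byte tag through x5.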
1236Lseal_finalize:
1237	mov	x11, v31.d[0]
1238	mov	x12, v31.d[1]
1239	adds	x8, x8, x11
1240	adcs	x9, x9, x12
1241	adc	x10, x10, x15
1242	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1243	umulh	x12, x8, x16
1244	mul	x13, x9, x16
1245	umulh	x14, x9, x16
1246	adds	x12, x12, x13
1247	mul	x13, x10, x16
1248	adc	x13, x13, x14
1249	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1250	umulh	x8, x8, x17
1251	adds	x12, x12, x14
1252	mul	x14, x9, x17
1253	umulh	x9, x9, x17
1254	adcs	x14, x14, x8
1255	mul	x10, x10, x17
1256	adc	x10, x10, x9
1257	adds	x13, x13, x14
1258	adc	x14, x10, xzr
1259	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1260	and	x8, x13, #-4
1261	extr	x13, x14, x13, #2
1262	adds	x8, x8, x11
1263	lsr	x11, x14, #2
1264	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1265	adds	x8, x8, x13
1266	adcs	x9, x9, x12
1267	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1268    // Final reduction step
1269	sub	x12, xzr, x15
1270	orr	x13, xzr, #3
1271	subs	x11, x8, #-5
1272	sbcs	x12, x9, x12
1273	sbcs	x13, x10, x13
1274	csel	x8, x11, x8, cs
1275	csel	x9, x12, x9, cs
1276	csel	x10, x13, x10, cs
1277	mov	x11, v27.d[0]
1278	mov	x12, v27.d[1]
1279	adds	x8, x8, x11
1280	adcs	x9, x9, x12
1281	adc	x10, x10, x15
1282
1283	stp	x8, x9, [x5]
1284
1285	ldp	d8, d9, [sp, #16]
1286	ldp	d10, d11, [sp, #32]
1287	ldp	d12, d13, [sp, #48]
1288	ldp	d14, d15, [sp, #64]
1289.cfi_restore	b15
1290.cfi_restore	b14
1291.cfi_restore	b13
1292.cfi_restore	b12
1293.cfi_restore	b11
1294.cfi_restore	b10
1295.cfi_restore	b9
1296.cfi_restore	b8
1297	ldp	x29, x30, [sp], 80
1298.cfi_restore	w29
1299.cfi_restore	w30
1300.cfi_def_cfa_offset	0
1301	AARCH64_VALIDATE_LINK_REGISTER
1302	ret
1303
1304Lseal_128:
1305    // On some architectures preparing 5 blocks for small buffers is wasteful
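    // Only three ChaCha20 blocks are prepared: the counter-0 block supplies
    // the Poly1305 R and S keys, and the counter-1 and counter-2 blocks give
    // up to 128 bytes of key stream.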
1306	eor	v25.16b, v25.16b, v25.16b
1307	mov	x11, #1
1308	mov	v25.s[0], w11
1309	mov	v0.16b, v24.16b
1310	mov	v1.16b, v24.16b
1311	mov	v2.16b, v24.16b
1312	mov	v5.16b, v28.16b
1313	mov	v6.16b, v28.16b
1314	mov	v7.16b, v28.16b
1315	mov	v10.16b, v29.16b
1316	mov	v11.16b, v29.16b
1317	mov	v12.16b, v29.16b
1318	mov	v17.16b, v30.16b
1319	add	v15.4s, v17.4s, v25.4s
1320	add	v16.4s, v15.4s, v25.4s
1321
1322	mov	x6, #10
1323
1324Lseal_128_rounds:
1325	add	v0.4s, v0.4s, v5.4s
1326	add	v1.4s, v1.4s, v6.4s
1327	add	v2.4s, v2.4s, v7.4s
1328	eor	v15.16b, v15.16b, v0.16b
1329	eor	v16.16b, v16.16b, v1.16b
1330	eor	v17.16b, v17.16b, v2.16b
1331	rev32	v15.8h, v15.8h
1332	rev32	v16.8h, v16.8h
1333	rev32	v17.8h, v17.8h
1334
1335	add	v10.4s, v10.4s, v15.4s
1336	add	v11.4s, v11.4s, v16.4s
1337	add	v12.4s, v12.4s, v17.4s
1338	eor	v5.16b, v5.16b, v10.16b
1339	eor	v6.16b, v6.16b, v11.16b
1340	eor	v7.16b, v7.16b, v12.16b
1341	ushr	v20.4s, v5.4s, #20
1342	sli	v20.4s, v5.4s, #12
1343	ushr	v5.4s, v6.4s, #20
1344	sli	v5.4s, v6.4s, #12
1345	ushr	v6.4s, v7.4s, #20
1346	sli	v6.4s, v7.4s, #12
1347
1348	add	v0.4s, v0.4s, v20.4s
1349	add	v1.4s, v1.4s, v5.4s
1350	add	v2.4s, v2.4s, v6.4s
1351	eor	v15.16b, v15.16b, v0.16b
1352	eor	v16.16b, v16.16b, v1.16b
1353	eor	v17.16b, v17.16b, v2.16b
1354	tbl	v15.16b, {v15.16b}, v26.16b
1355	tbl	v16.16b, {v16.16b}, v26.16b
1356	tbl	v17.16b, {v17.16b}, v26.16b
1357
1358	add	v10.4s, v10.4s, v15.4s
1359	add	v11.4s, v11.4s, v16.4s
1360	add	v12.4s, v12.4s, v17.4s
1361	eor	v20.16b, v20.16b, v10.16b
1362	eor	v5.16b, v5.16b, v11.16b
1363	eor	v6.16b, v6.16b, v12.16b
1364	ushr	v7.4s, v6.4s, #25
1365	sli	v7.4s, v6.4s, #7
1366	ushr	v6.4s, v5.4s, #25
1367	sli	v6.4s, v5.4s, #7
1368	ushr	v5.4s, v20.4s, #25
1369	sli	v5.4s, v20.4s, #7
1370
1371	ext	v5.16b, v5.16b, v5.16b, #4
1372	ext	v6.16b, v6.16b, v6.16b, #4
1373	ext	v7.16b, v7.16b, v7.16b, #4
1374
1375	ext	v10.16b, v10.16b, v10.16b, #8
1376	ext	v11.16b, v11.16b, v11.16b, #8
1377	ext	v12.16b, v12.16b, v12.16b, #8
1378
1379	ext	v15.16b, v15.16b, v15.16b, #12
1380	ext	v16.16b, v16.16b, v16.16b, #12
1381	ext	v17.16b, v17.16b, v17.16b, #12
1382	add	v0.4s, v0.4s, v5.4s
1383	add	v1.4s, v1.4s, v6.4s
1384	add	v2.4s, v2.4s, v7.4s
1385	eor	v15.16b, v15.16b, v0.16b
1386	eor	v16.16b, v16.16b, v1.16b
1387	eor	v17.16b, v17.16b, v2.16b
1388	rev32	v15.8h, v15.8h
1389	rev32	v16.8h, v16.8h
1390	rev32	v17.8h, v17.8h
1391
1392	add	v10.4s, v10.4s, v15.4s
1393	add	v11.4s, v11.4s, v16.4s
1394	add	v12.4s, v12.4s, v17.4s
1395	eor	v5.16b, v5.16b, v10.16b
1396	eor	v6.16b, v6.16b, v11.16b
1397	eor	v7.16b, v7.16b, v12.16b
1398	ushr	v20.4s, v5.4s, #20
1399	sli	v20.4s, v5.4s, #12
1400	ushr	v5.4s, v6.4s, #20
1401	sli	v5.4s, v6.4s, #12
1402	ushr	v6.4s, v7.4s, #20
1403	sli	v6.4s, v7.4s, #12
1404
1405	add	v0.4s, v0.4s, v20.4s
1406	add	v1.4s, v1.4s, v5.4s
1407	add	v2.4s, v2.4s, v6.4s
1408	eor	v15.16b, v15.16b, v0.16b
1409	eor	v16.16b, v16.16b, v1.16b
1410	eor	v17.16b, v17.16b, v2.16b
1411	tbl	v15.16b, {v15.16b}, v26.16b
1412	tbl	v16.16b, {v16.16b}, v26.16b
1413	tbl	v17.16b, {v17.16b}, v26.16b
1414
1415	add	v10.4s, v10.4s, v15.4s
1416	add	v11.4s, v11.4s, v16.4s
1417	add	v12.4s, v12.4s, v17.4s
1418	eor	v20.16b, v20.16b, v10.16b
1419	eor	v5.16b, v5.16b, v11.16b
1420	eor	v6.16b, v6.16b, v12.16b
1421	ushr	v7.4s, v6.4s, #25
1422	sli	v7.4s, v6.4s, #7
1423	ushr	v6.4s, v5.4s, #25
1424	sli	v6.4s, v5.4s, #7
1425	ushr	v5.4s, v20.4s, #25
1426	sli	v5.4s, v20.4s, #7
1427
1428	ext	v5.16b, v5.16b, v5.16b, #12
1429	ext	v6.16b, v6.16b, v6.16b, #12
1430	ext	v7.16b, v7.16b, v7.16b, #12
1431
1432	ext	v10.16b, v10.16b, v10.16b, #8
1433	ext	v11.16b, v11.16b, v11.16b, #8
1434	ext	v12.16b, v12.16b, v12.16b, #8
1435
1436	ext	v15.16b, v15.16b, v15.16b, #4
1437	ext	v16.16b, v16.16b, v16.16b, #4
1438	ext	v17.16b, v17.16b, v17.16b, #4
1439	subs	x6, x6, #1
1440	b.hi	Lseal_128_rounds
1441
1442	add	v0.4s, v0.4s, v24.4s
1443	add	v1.4s, v1.4s, v24.4s
1444	add	v2.4s, v2.4s, v24.4s
1445
1446	add	v5.4s, v5.4s, v28.4s
1447	add	v6.4s, v6.4s, v28.4s
1448	add	v7.4s, v7.4s, v28.4s
1449
1450    // Only the first 32 bytes of the third block (counter = 0) are needed,
1451    // so skip updating v12 and v17.
1452	add	v10.4s, v10.4s, v29.4s
1453	add	v11.4s, v11.4s, v29.4s
1454
1455	add	v30.4s, v30.4s, v25.4s
1456	add	v15.4s, v15.4s, v30.4s
1457	add	v30.4s, v30.4s, v25.4s
1458	add	v16.4s, v16.4s, v30.4s
1459
1460	and	v2.16b, v2.16b, v27.16b
1461	mov	x16, v2.d[0] // Move the R key to GPRs
1462	mov	x17, v2.d[1]
1463	mov	v27.16b, v7.16b // Store the S key
1464
1465	bl	Lpoly_hash_ad_internal
1466	b	Lseal_tail
1467.cfi_endproc
1468
1469
1470/////////////////////////////////
1471//
1472// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data);
1473//
1474.globl	_chacha20_poly1305_open
1475.private_extern	_chacha20_poly1305_open
1476
1477.align	6
1478_chacha20_poly1305_open:
1479	AARCH64_SIGN_LINK_REGISTER
1480.cfi_startproc
1481	stp	x29, x30, [sp, #-80]!
1482.cfi_def_cfa_offset	80
1483.cfi_offset	w30, -72
1484.cfi_offset	w29, -80
1485	mov	x29, sp
1486    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
1487    // we don't actually use the frame pointer like that, it's probably not
1488    // worth bothering.
1489	stp	d8, d9, [sp, #16]
1490	stp	d10, d11, [sp, #32]
1491	stp	d12, d13, [sp, #48]
1492	stp	d14, d15, [sp, #64]
1493.cfi_offset	b15, -8
1494.cfi_offset	b14, -16
1495.cfi_offset	b13, -24
1496.cfi_offset	b12, -32
1497.cfi_offset	b11, -40
1498.cfi_offset	b10, -48
1499.cfi_offset	b9, -56
1500.cfi_offset	b8, -64
1501
1502	adrp	x11, Lchacha20_consts@PAGE
1503	add	x11, x11, Lchacha20_consts@PAGEOFF
1504
1505	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
1506	ld1	{v28.16b - v30.16b}, [x5]
1507
1508	mov	x15, #1 // Prepare the Poly1305 state
1509	mov	x8, #0
1510	mov	x9, #0
1511	mov	x10, #0
1512
1513	mov	v31.d[0], x4  // Store the input and aad lengths
1514	mov	v31.d[1], x2
1515
1516	cmp	x2, #128
1517	b.le	Lopen_128 // Optimization for smaller buffers
1518
1519    // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys
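    // The loop below runs the 10 ChaCha20 double rounds on that single block;
    // the first 16 bytes of the clamped output become R (x16:x17) and the next
    // 16 bytes become S (saved in v27).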
1520	mov	v0.16b, v24.16b
1521	mov	v5.16b, v28.16b
1522	mov	v10.16b, v29.16b
1523	mov	v15.16b, v30.16b
1524
1525	mov	x6, #10
1526
1527.align	5
1528Lopen_init_rounds:
1529	add	v0.4s, v0.4s, v5.4s
1530	eor	v15.16b, v15.16b, v0.16b
1531	rev32	v15.8h, v15.8h
1532
1533	add	v10.4s, v10.4s, v15.4s
1534	eor	v5.16b, v5.16b, v10.16b
1535	ushr	v20.4s, v5.4s, #20
1536	sli	v20.4s, v5.4s, #12
1537	add	v0.4s, v0.4s, v20.4s
1538	eor	v15.16b, v15.16b, v0.16b
1539	tbl	v15.16b, {v15.16b}, v26.16b
1540
1541	add	v10.4s, v10.4s, v15.4s
1542	eor	v20.16b, v20.16b, v10.16b
1543	ushr	v5.4s, v20.4s, #25
1544	sli	v5.4s, v20.4s, #7
1545	ext	v5.16b, v5.16b, v5.16b, #4
1546	ext	v10.16b, v10.16b, v10.16b, #8
1547	ext	v15.16b, v15.16b, v15.16b, #12
1548	add	v0.4s, v0.4s, v5.4s
1549	eor	v15.16b, v15.16b, v0.16b
1550	rev32	v15.8h, v15.8h
1551
1552	add	v10.4s, v10.4s, v15.4s
1553	eor	v5.16b, v5.16b, v10.16b
1554	ushr	v20.4s, v5.4s, #20
1555	sli	v20.4s, v5.4s, #12
1556	add	v0.4s, v0.4s, v20.4s
1557	eor	v15.16b, v15.16b, v0.16b
1558	tbl	v15.16b, {v15.16b}, v26.16b
1559
1560	add	v10.4s, v10.4s, v15.4s
1561	eor	v20.16b, v20.16b, v10.16b
1562	ushr	v5.4s, v20.4s, #25
1563	sli	v5.4s, v20.4s, #7
1564	ext	v5.16b, v5.16b, v5.16b, #12
1565	ext	v10.16b, v10.16b, v10.16b, #8
1566	ext	v15.16b, v15.16b, v15.16b, #4
1567	subs	x6, x6, #1
1568	b.hi	Lopen_init_rounds
1569
1570	add	v0.4s, v0.4s, v24.4s
1571	add	v5.4s, v5.4s, v28.4s
1572
1573	and	v0.16b, v0.16b, v27.16b
1574	mov	x16, v0.d[0] // Move the R key to GPRs
1575	mov	x17, v0.d[1]
1576	mov	v27.16b, v5.16b // Store the S key
1577
1578	bl	Lpoly_hash_ad_internal
1579
1580Lopen_ad_done:
1581	mov	x3, x1
1582
// Each iteration of the loop hashes 320 bytes and prepares key stream for 320 bytes
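// Unlike sealing, opening hashes the ciphertext before decrypting it, so x3
// walks the ciphertext buffer (x1) while the matching key stream is generated.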
1584Lopen_main_loop:
1585
1586	cmp	x2, #192
1587	b.lt	Lopen_tail
1588
1589	adrp	x11, Lchacha20_consts@PAGE
1590	add	x11, x11, Lchacha20_consts@PAGEOFF
1591
1592	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
1593	mov	v4.16b, v24.16b
1594
1595	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
1596	mov	v9.16b, v28.16b
1597
1598	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
1599	mov	v14.16b, v29.16b
1600
1601	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
1602	sub	x5, x5, #32
1603	add	v15.4s, v15.4s, v25.4s
1604	mov	v19.16b, v30.16b
1605
1606	eor	v20.16b, v20.16b, v20.16b //zero
1607	not	v21.16b, v20.16b // -1
1608	sub	v21.4s, v25.4s, v21.4s // Add +1
1609	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
1610	add	v19.4s, v19.4s, v20.4s
1611
1612	lsr	x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12
1613	sub	x4, x4, #10
1614
1615	mov	x7, #10
	subs	x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash
1618	csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full
1619
1620	cbz	x7, Lopen_main_loop_rounds_short
1621
1622.align	5
1623Lopen_main_loop_rounds:
1624	ldp	x11, x12, [x3], 16
1625	adds	x8, x8, x11
1626	adcs	x9, x9, x12
1627	adc	x10, x10, x15
1628	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1629	umulh	x12, x8, x16
1630	mul	x13, x9, x16
1631	umulh	x14, x9, x16
1632	adds	x12, x12, x13
1633	mul	x13, x10, x16
1634	adc	x13, x13, x14
1635	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1636	umulh	x8, x8, x17
1637	adds	x12, x12, x14
1638	mul	x14, x9, x17
1639	umulh	x9, x9, x17
1640	adcs	x14, x14, x8
1641	mul	x10, x10, x17
1642	adc	x10, x10, x9
1643	adds	x13, x13, x14
1644	adc	x14, x10, xzr
1645	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1646	and	x8, x13, #-4
1647	extr	x13, x14, x13, #2
1648	adds	x8, x8, x11
1649	lsr	x11, x14, #2
1650	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1651	adds	x8, x8, x13
1652	adcs	x9, x9, x12
1653	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1654Lopen_main_loop_rounds_short:
1655	add	v0.4s, v0.4s, v5.4s
1656	add	v1.4s, v1.4s, v6.4s
1657	add	v2.4s, v2.4s, v7.4s
1658	add	v3.4s, v3.4s, v8.4s
1659	add	v4.4s, v4.4s, v9.4s
1660
1661	eor	v15.16b, v15.16b, v0.16b
1662	eor	v16.16b, v16.16b, v1.16b
1663	eor	v17.16b, v17.16b, v2.16b
1664	eor	v18.16b, v18.16b, v3.16b
1665	eor	v19.16b, v19.16b, v4.16b
1666
1667	rev32	v15.8h, v15.8h
1668	rev32	v16.8h, v16.8h
1669	rev32	v17.8h, v17.8h
1670	rev32	v18.8h, v18.8h
1671	rev32	v19.8h, v19.8h
1672
1673	add	v10.4s, v10.4s, v15.4s
1674	add	v11.4s, v11.4s, v16.4s
1675	add	v12.4s, v12.4s, v17.4s
1676	add	v13.4s, v13.4s, v18.4s
1677	add	v14.4s, v14.4s, v19.4s
1678
1679	eor	v5.16b, v5.16b, v10.16b
1680	eor	v6.16b, v6.16b, v11.16b
1681	eor	v7.16b, v7.16b, v12.16b
1682	eor	v8.16b, v8.16b, v13.16b
1683	eor	v9.16b, v9.16b, v14.16b
1684
1685	ushr	v20.4s, v5.4s, #20
1686	sli	v20.4s, v5.4s, #12
1687	ushr	v5.4s, v6.4s, #20
1688	sli	v5.4s, v6.4s, #12
1689	ushr	v6.4s, v7.4s, #20
1690	sli	v6.4s, v7.4s, #12
1691	ushr	v7.4s, v8.4s, #20
1692	sli	v7.4s, v8.4s, #12
1693	ushr	v8.4s, v9.4s, #20
1694	sli	v8.4s, v9.4s, #12
1695
1696	add	v0.4s, v0.4s, v20.4s
1697	add	v1.4s, v1.4s, v5.4s
1698	add	v2.4s, v2.4s, v6.4s
1699	add	v3.4s, v3.4s, v7.4s
1700	add	v4.4s, v4.4s, v8.4s
1701
1702	eor	v15.16b, v15.16b, v0.16b
1703	eor	v16.16b, v16.16b, v1.16b
1704	eor	v17.16b, v17.16b, v2.16b
1705	eor	v18.16b, v18.16b, v3.16b
1706	eor	v19.16b, v19.16b, v4.16b
1707
1708	tbl	v15.16b, {v15.16b}, v26.16b
1709	tbl	v16.16b, {v16.16b}, v26.16b
1710	tbl	v17.16b, {v17.16b}, v26.16b
1711	tbl	v18.16b, {v18.16b}, v26.16b
1712	tbl	v19.16b, {v19.16b}, v26.16b
1713
1714	add	v10.4s, v10.4s, v15.4s
1715	add	v11.4s, v11.4s, v16.4s
1716	add	v12.4s, v12.4s, v17.4s
1717	add	v13.4s, v13.4s, v18.4s
1718	add	v14.4s, v14.4s, v19.4s
1719
1720	eor	v20.16b, v20.16b, v10.16b
1721	eor	v5.16b, v5.16b, v11.16b
1722	eor	v6.16b, v6.16b, v12.16b
1723	eor	v7.16b, v7.16b, v13.16b
1724	eor	v8.16b, v8.16b, v14.16b
1725
1726	ushr	v9.4s, v8.4s, #25
1727	sli	v9.4s, v8.4s, #7
1728	ushr	v8.4s, v7.4s, #25
1729	sli	v8.4s, v7.4s, #7
1730	ushr	v7.4s, v6.4s, #25
1731	sli	v7.4s, v6.4s, #7
1732	ushr	v6.4s, v5.4s, #25
1733	sli	v6.4s, v5.4s, #7
1734	ushr	v5.4s, v20.4s, #25
1735	sli	v5.4s, v20.4s, #7
1736
1737	ext	v9.16b, v9.16b, v9.16b, #4
1738	ext	v14.16b, v14.16b, v14.16b, #8
1739	ext	v19.16b, v19.16b, v19.16b, #12
1740	ldp	x11, x12, [x3], 16
1741	adds	x8, x8, x11
1742	adcs	x9, x9, x12
1743	adc	x10, x10, x15
1744	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1745	umulh	x12, x8, x16
1746	mul	x13, x9, x16
1747	umulh	x14, x9, x16
1748	adds	x12, x12, x13
1749	mul	x13, x10, x16
1750	adc	x13, x13, x14
1751	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1752	umulh	x8, x8, x17
1753	adds	x12, x12, x14
1754	mul	x14, x9, x17
1755	umulh	x9, x9, x17
1756	adcs	x14, x14, x8
1757	mul	x10, x10, x17
1758	adc	x10, x10, x9
1759	adds	x13, x13, x14
1760	adc	x14, x10, xzr
1761	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1762	and	x8, x13, #-4
1763	extr	x13, x14, x13, #2
1764	adds	x8, x8, x11
1765	lsr	x11, x14, #2
1766	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1767	adds	x8, x8, x13
1768	adcs	x9, x9, x12
1769	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1770	add	v0.4s, v0.4s, v6.4s
1771	add	v1.4s, v1.4s, v7.4s
1772	add	v2.4s, v2.4s, v8.4s
1773	add	v3.4s, v3.4s, v5.4s
1774	add	v4.4s, v4.4s, v9.4s
1775
1776	eor	v18.16b, v18.16b, v0.16b
1777	eor	v15.16b, v15.16b, v1.16b
1778	eor	v16.16b, v16.16b, v2.16b
1779	eor	v17.16b, v17.16b, v3.16b
1780	eor	v19.16b, v19.16b, v4.16b
1781
1782	rev32	v18.8h, v18.8h
1783	rev32	v15.8h, v15.8h
1784	rev32	v16.8h, v16.8h
1785	rev32	v17.8h, v17.8h
1786	rev32	v19.8h, v19.8h
1787
1788	add	v12.4s, v12.4s, v18.4s
1789	add	v13.4s, v13.4s, v15.4s
1790	add	v10.4s, v10.4s, v16.4s
1791	add	v11.4s, v11.4s, v17.4s
1792	add	v14.4s, v14.4s, v19.4s
1793
1794	eor	v6.16b, v6.16b, v12.16b
1795	eor	v7.16b, v7.16b, v13.16b
1796	eor	v8.16b, v8.16b, v10.16b
1797	eor	v5.16b, v5.16b, v11.16b
1798	eor	v9.16b, v9.16b, v14.16b
1799
1800	ushr	v20.4s, v6.4s, #20
1801	sli	v20.4s, v6.4s, #12
1802	ushr	v6.4s, v7.4s, #20
1803	sli	v6.4s, v7.4s, #12
1804	ushr	v7.4s, v8.4s, #20
1805	sli	v7.4s, v8.4s, #12
1806	ushr	v8.4s, v5.4s, #20
1807	sli	v8.4s, v5.4s, #12
1808	ushr	v5.4s, v9.4s, #20
1809	sli	v5.4s, v9.4s, #12
1810
1811	add	v0.4s, v0.4s, v20.4s
1812	add	v1.4s, v1.4s, v6.4s
1813	add	v2.4s, v2.4s, v7.4s
1814	add	v3.4s, v3.4s, v8.4s
1815	add	v4.4s, v4.4s, v5.4s
1816
1817	eor	v18.16b, v18.16b, v0.16b
1818	eor	v15.16b, v15.16b, v1.16b
1819	eor	v16.16b, v16.16b, v2.16b
1820	eor	v17.16b, v17.16b, v3.16b
1821	eor	v19.16b, v19.16b, v4.16b
1822
1823	tbl	v18.16b, {v18.16b}, v26.16b
1824	tbl	v15.16b, {v15.16b}, v26.16b
1825	tbl	v16.16b, {v16.16b}, v26.16b
1826	tbl	v17.16b, {v17.16b}, v26.16b
1827	tbl	v19.16b, {v19.16b}, v26.16b
1828
1829	add	v12.4s, v12.4s, v18.4s
1830	add	v13.4s, v13.4s, v15.4s
1831	add	v10.4s, v10.4s, v16.4s
1832	add	v11.4s, v11.4s, v17.4s
1833	add	v14.4s, v14.4s, v19.4s
1834
1835	eor	v20.16b, v20.16b, v12.16b
1836	eor	v6.16b, v6.16b, v13.16b
1837	eor	v7.16b, v7.16b, v10.16b
1838	eor	v8.16b, v8.16b, v11.16b
1839	eor	v5.16b, v5.16b, v14.16b
1840
1841	ushr	v9.4s, v5.4s, #25
1842	sli	v9.4s, v5.4s, #7
1843	ushr	v5.4s, v8.4s, #25
1844	sli	v5.4s, v8.4s, #7
1845	ushr	v8.4s, v7.4s, #25
1846	sli	v8.4s, v7.4s, #7
1847	ushr	v7.4s, v6.4s, #25
1848	sli	v7.4s, v6.4s, #7
1849	ushr	v6.4s, v20.4s, #25
1850	sli	v6.4s, v20.4s, #7
1851
1852	ext	v9.16b, v9.16b, v9.16b, #12
1853	ext	v14.16b, v14.16b, v14.16b, #8
1854	ext	v19.16b, v19.16b, v19.16b, #4
1855	subs	x7, x7, #1
1856	b.gt	Lopen_main_loop_rounds
1857	subs	x6, x6, #1
1858	b.ge	Lopen_main_loop_rounds_short
1859
1860	eor	v20.16b, v20.16b, v20.16b //zero
1861	not	v21.16b, v20.16b // -1
1862	sub	v21.4s, v25.4s, v21.4s // Add +1
1863	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
1864	add	v19.4s, v19.4s, v20.4s
1865
1866	add	v15.4s, v15.4s, v25.4s
1867	mov	x11, #5
1868	dup	v20.4s, w11
1869	add	v25.4s, v25.4s, v20.4s
1870
1871	zip1	v20.4s, v0.4s, v1.4s
1872	zip2	v21.4s, v0.4s, v1.4s
1873	zip1	v22.4s, v2.4s, v3.4s
1874	zip2	v23.4s, v2.4s, v3.4s
1875
1876	zip1	v0.2d, v20.2d, v22.2d
1877	zip2	v1.2d, v20.2d, v22.2d
1878	zip1	v2.2d, v21.2d, v23.2d
1879	zip2	v3.2d, v21.2d, v23.2d
1880
1881	zip1	v20.4s, v5.4s, v6.4s
1882	zip2	v21.4s, v5.4s, v6.4s
1883	zip1	v22.4s, v7.4s, v8.4s
1884	zip2	v23.4s, v7.4s, v8.4s
1885
1886	zip1	v5.2d, v20.2d, v22.2d
1887	zip2	v6.2d, v20.2d, v22.2d
1888	zip1	v7.2d, v21.2d, v23.2d
1889	zip2	v8.2d, v21.2d, v23.2d
1890
1891	zip1	v20.4s, v10.4s, v11.4s
1892	zip2	v21.4s, v10.4s, v11.4s
1893	zip1	v22.4s, v12.4s, v13.4s
1894	zip2	v23.4s, v12.4s, v13.4s
1895
1896	zip1	v10.2d, v20.2d, v22.2d
1897	zip2	v11.2d, v20.2d, v22.2d
1898	zip1	v12.2d, v21.2d, v23.2d
1899	zip2	v13.2d, v21.2d, v23.2d
1900
1901	zip1	v20.4s, v15.4s, v16.4s
1902	zip2	v21.4s, v15.4s, v16.4s
1903	zip1	v22.4s, v17.4s, v18.4s
1904	zip2	v23.4s, v17.4s, v18.4s
1905
1906	zip1	v15.2d, v20.2d, v22.2d
1907	zip2	v16.2d, v20.2d, v22.2d
1908	zip1	v17.2d, v21.2d, v23.2d
1909	zip2	v18.2d, v21.2d, v23.2d
1910
1911	add	v0.4s, v0.4s, v24.4s
1912	add	v5.4s, v5.4s, v28.4s
1913	add	v10.4s, v10.4s, v29.4s
1914	add	v15.4s, v15.4s, v30.4s
1915
1916	add	v1.4s, v1.4s, v24.4s
1917	add	v6.4s, v6.4s, v28.4s
1918	add	v11.4s, v11.4s, v29.4s
1919	add	v16.4s, v16.4s, v30.4s
1920
1921	add	v2.4s, v2.4s, v24.4s
1922	add	v7.4s, v7.4s, v28.4s
1923	add	v12.4s, v12.4s, v29.4s
1924	add	v17.4s, v17.4s, v30.4s
1925
1926	add	v3.4s, v3.4s, v24.4s
1927	add	v8.4s, v8.4s, v28.4s
1928	add	v13.4s, v13.4s, v29.4s
1929	add	v18.4s, v18.4s, v30.4s
1930
1931	add	v4.4s, v4.4s, v24.4s
1932	add	v9.4s, v9.4s, v28.4s
1933	add	v14.4s, v14.4s, v29.4s
1934	add	v19.4s, v19.4s, v30.4s
1935
1936    // We can always safely store 192 bytes
1937	ld1	{v20.16b - v23.16b}, [x1], #64
1938	eor	v20.16b, v20.16b, v0.16b
1939	eor	v21.16b, v21.16b, v5.16b
1940	eor	v22.16b, v22.16b, v10.16b
1941	eor	v23.16b, v23.16b, v15.16b
1942	st1	{v20.16b - v23.16b}, [x0], #64
1943
1944	ld1	{v20.16b - v23.16b}, [x1], #64
1945	eor	v20.16b, v20.16b, v1.16b
1946	eor	v21.16b, v21.16b, v6.16b
1947	eor	v22.16b, v22.16b, v11.16b
1948	eor	v23.16b, v23.16b, v16.16b
1949	st1	{v20.16b - v23.16b}, [x0], #64
1950
1951	ld1	{v20.16b - v23.16b}, [x1], #64
1952	eor	v20.16b, v20.16b, v2.16b
1953	eor	v21.16b, v21.16b, v7.16b
1954	eor	v22.16b, v22.16b, v12.16b
1955	eor	v23.16b, v23.16b, v17.16b
1956	st1	{v20.16b - v23.16b}, [x0], #64
1957
1958	sub	x2, x2, #192
1959
1960	mov	v0.16b, v3.16b
1961	mov	v5.16b, v8.16b
1962	mov	v10.16b, v13.16b
1963	mov	v15.16b, v18.16b
1964
1965	cmp	x2, #64
1966	b.lt	Lopen_tail_64_store
1967
1968	ld1	{v20.16b - v23.16b}, [x1], #64
1969	eor	v20.16b, v20.16b, v3.16b
1970	eor	v21.16b, v21.16b, v8.16b
1971	eor	v22.16b, v22.16b, v13.16b
1972	eor	v23.16b, v23.16b, v18.16b
1973	st1	{v20.16b - v23.16b}, [x0], #64
1974
1975	sub	x2, x2, #64
1976
1977	mov	v0.16b, v4.16b
1978	mov	v5.16b, v9.16b
1979	mov	v10.16b, v14.16b
1980	mov	v15.16b, v19.16b
1981
1982	cmp	x2, #64
1983	b.lt	Lopen_tail_64_store
1984
1985	ld1	{v20.16b - v23.16b}, [x1], #64
1986	eor	v20.16b, v20.16b, v4.16b
1987	eor	v21.16b, v21.16b, v9.16b
1988	eor	v22.16b, v22.16b, v14.16b
1989	eor	v23.16b, v23.16b, v19.16b
1990	st1	{v20.16b - v23.16b}, [x0], #64
1991
1992	sub	x2, x2, #64
1993	b	Lopen_main_loop
1994
1995Lopen_tail:
1996
1997	cbz	x2, Lopen_finalize
1998
1999	lsr	x4, x2, #4 // How many whole blocks we have to hash
2000
2001	cmp	x2, #64
2002	b.le	Lopen_tail_64
2003	cmp	x2, #128
2004	b.le	Lopen_tail_128
2005
2006Lopen_tail_192:
2007     // We need three more blocks
2008	mov	v0.16b, v24.16b
2009	mov	v1.16b, v24.16b
2010	mov	v2.16b, v24.16b
2011	mov	v5.16b, v28.16b
2012	mov	v6.16b, v28.16b
2013	mov	v7.16b, v28.16b
2014	mov	v10.16b, v29.16b
2015	mov	v11.16b, v29.16b
2016	mov	v12.16b, v29.16b
2017	mov	v15.16b, v30.16b
2018	mov	v16.16b, v30.16b
2019	mov	v17.16b, v30.16b
2020	eor	v23.16b, v23.16b, v23.16b
2021	eor	v21.16b, v21.16b, v21.16b
2022	ins	v23.s[0], v25.s[0]
2023	ins	v21.d[0], x15
2024
2025	add	v22.4s, v23.4s, v21.4s
2026	add	v21.4s, v22.4s, v21.4s
2027
2028	add	v15.4s, v15.4s, v21.4s
2029	add	v16.4s, v16.4s, v23.4s
2030	add	v17.4s, v17.4s, v22.4s
2031
2032	mov	x7, #10
2033	subs	x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash
2034	csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing
2035	sub	x4, x4, x7
2036
2037	cbz	x7, Lopen_tail_192_rounds_no_hash
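    // The first x7 iterations below hash one 16-byte block of ciphertext each; the remaining iterations only run the ChaCha20 double round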
2038
2039Lopen_tail_192_rounds:
2040	ldp	x11, x12, [x3], 16
2041	adds	x8, x8, x11
2042	adcs	x9, x9, x12
2043	adc	x10, x10, x15
2044	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2045	umulh	x12, x8, x16
2046	mul	x13, x9, x16
2047	umulh	x14, x9, x16
2048	adds	x12, x12, x13
2049	mul	x13, x10, x16
2050	adc	x13, x13, x14
2051	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2052	umulh	x8, x8, x17
2053	adds	x12, x12, x14
2054	mul	x14, x9, x17
2055	umulh	x9, x9, x17
2056	adcs	x14, x14, x8
2057	mul	x10, x10, x17
2058	adc	x10, x10, x9
2059	adds	x13, x13, x14
2060	adc	x14, x10, xzr
2061	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2062	and	x8, x13, #-4
2063	extr	x13, x14, x13, #2
2064	adds	x8, x8, x11
2065	lsr	x11, x14, #2
2066	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2067	adds	x8, x8, x13
2068	adcs	x9, x9, x12
2069	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2070Lopen_tail_192_rounds_no_hash:
2071	add	v0.4s, v0.4s, v5.4s
2072	add	v1.4s, v1.4s, v6.4s
2073	add	v2.4s, v2.4s, v7.4s
2074	eor	v15.16b, v15.16b, v0.16b
2075	eor	v16.16b, v16.16b, v1.16b
2076	eor	v17.16b, v17.16b, v2.16b
2077	rev32	v15.8h, v15.8h
2078	rev32	v16.8h, v16.8h
2079	rev32	v17.8h, v17.8h
2080
2081	add	v10.4s, v10.4s, v15.4s
2082	add	v11.4s, v11.4s, v16.4s
2083	add	v12.4s, v12.4s, v17.4s
2084	eor	v5.16b, v5.16b, v10.16b
2085	eor	v6.16b, v6.16b, v11.16b
2086	eor	v7.16b, v7.16b, v12.16b
2087	ushr	v20.4s, v5.4s, #20
2088	sli	v20.4s, v5.4s, #12
2089	ushr	v5.4s, v6.4s, #20
2090	sli	v5.4s, v6.4s, #12
2091	ushr	v6.4s, v7.4s, #20
2092	sli	v6.4s, v7.4s, #12
2093
2094	add	v0.4s, v0.4s, v20.4s
2095	add	v1.4s, v1.4s, v5.4s
2096	add	v2.4s, v2.4s, v6.4s
2097	eor	v15.16b, v15.16b, v0.16b
2098	eor	v16.16b, v16.16b, v1.16b
2099	eor	v17.16b, v17.16b, v2.16b
2100	tbl	v15.16b, {v15.16b}, v26.16b
2101	tbl	v16.16b, {v16.16b}, v26.16b
2102	tbl	v17.16b, {v17.16b}, v26.16b
2103
2104	add	v10.4s, v10.4s, v15.4s
2105	add	v11.4s, v11.4s, v16.4s
2106	add	v12.4s, v12.4s, v17.4s
2107	eor	v20.16b, v20.16b, v10.16b
2108	eor	v5.16b, v5.16b, v11.16b
2109	eor	v6.16b, v6.16b, v12.16b
2110	ushr	v7.4s, v6.4s, #25
2111	sli	v7.4s, v6.4s, #7
2112	ushr	v6.4s, v5.4s, #25
2113	sli	v6.4s, v5.4s, #7
2114	ushr	v5.4s, v20.4s, #25
2115	sli	v5.4s, v20.4s, #7
2116
2117	ext	v5.16b, v5.16b, v5.16b, #4
2118	ext	v6.16b, v6.16b, v6.16b, #4
2119	ext	v7.16b, v7.16b, v7.16b, #4
2120
2121	ext	v10.16b, v10.16b, v10.16b, #8
2122	ext	v11.16b, v11.16b, v11.16b, #8
2123	ext	v12.16b, v12.16b, v12.16b, #8
2124
2125	ext	v15.16b, v15.16b, v15.16b, #12
2126	ext	v16.16b, v16.16b, v16.16b, #12
2127	ext	v17.16b, v17.16b, v17.16b, #12
2128	add	v0.4s, v0.4s, v5.4s
2129	add	v1.4s, v1.4s, v6.4s
2130	add	v2.4s, v2.4s, v7.4s
2131	eor	v15.16b, v15.16b, v0.16b
2132	eor	v16.16b, v16.16b, v1.16b
2133	eor	v17.16b, v17.16b, v2.16b
2134	rev32	v15.8h, v15.8h
2135	rev32	v16.8h, v16.8h
2136	rev32	v17.8h, v17.8h
2137
2138	add	v10.4s, v10.4s, v15.4s
2139	add	v11.4s, v11.4s, v16.4s
2140	add	v12.4s, v12.4s, v17.4s
2141	eor	v5.16b, v5.16b, v10.16b
2142	eor	v6.16b, v6.16b, v11.16b
2143	eor	v7.16b, v7.16b, v12.16b
2144	ushr	v20.4s, v5.4s, #20
2145	sli	v20.4s, v5.4s, #12
2146	ushr	v5.4s, v6.4s, #20
2147	sli	v5.4s, v6.4s, #12
2148	ushr	v6.4s, v7.4s, #20
2149	sli	v6.4s, v7.4s, #12
2150
2151	add	v0.4s, v0.4s, v20.4s
2152	add	v1.4s, v1.4s, v5.4s
2153	add	v2.4s, v2.4s, v6.4s
2154	eor	v15.16b, v15.16b, v0.16b
2155	eor	v16.16b, v16.16b, v1.16b
2156	eor	v17.16b, v17.16b, v2.16b
2157	tbl	v15.16b, {v15.16b}, v26.16b
2158	tbl	v16.16b, {v16.16b}, v26.16b
2159	tbl	v17.16b, {v17.16b}, v26.16b
2160
2161	add	v10.4s, v10.4s, v15.4s
2162	add	v11.4s, v11.4s, v16.4s
2163	add	v12.4s, v12.4s, v17.4s
2164	eor	v20.16b, v20.16b, v10.16b
2165	eor	v5.16b, v5.16b, v11.16b
2166	eor	v6.16b, v6.16b, v12.16b
2167	ushr	v7.4s, v6.4s, #25
2168	sli	v7.4s, v6.4s, #7
2169	ushr	v6.4s, v5.4s, #25
2170	sli	v6.4s, v5.4s, #7
2171	ushr	v5.4s, v20.4s, #25
2172	sli	v5.4s, v20.4s, #7
2173
2174	ext	v5.16b, v5.16b, v5.16b, #12
2175	ext	v6.16b, v6.16b, v6.16b, #12
2176	ext	v7.16b, v7.16b, v7.16b, #12
2177
2178	ext	v10.16b, v10.16b, v10.16b, #8
2179	ext	v11.16b, v11.16b, v11.16b, #8
2180	ext	v12.16b, v12.16b, v12.16b, #8
2181
2182	ext	v15.16b, v15.16b, v15.16b, #4
2183	ext	v16.16b, v16.16b, v16.16b, #4
2184	ext	v17.16b, v17.16b, v17.16b, #4
2185	subs	x7, x7, #1
2186	b.gt	Lopen_tail_192_rounds
2187	subs	x6, x6, #1
2188	b.ge	Lopen_tail_192_rounds_no_hash
2189
2190    // We hashed at most 160 bytes above; up to 32 bytes may still be left to hash
2191Lopen_tail_192_hash:
2192	cbz	x4, Lopen_tail_192_hash_done
2193	ldp	x11, x12, [x3], 16
2194	adds	x8, x8, x11
2195	adcs	x9, x9, x12
2196	adc	x10, x10, x15
2197	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2198	umulh	x12, x8, x16
2199	mul	x13, x9, x16
2200	umulh	x14, x9, x16
2201	adds	x12, x12, x13
2202	mul	x13, x10, x16
2203	adc	x13, x13, x14
2204	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2205	umulh	x8, x8, x17
2206	adds	x12, x12, x14
2207	mul	x14, x9, x17
2208	umulh	x9, x9, x17
2209	adcs	x14, x14, x8
2210	mul	x10, x10, x17
2211	adc	x10, x10, x9
2212	adds	x13, x13, x14
2213	adc	x14, x10, xzr
2214	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2215	and	x8, x13, #-4
2216	extr	x13, x14, x13, #2
2217	adds	x8, x8, x11
2218	lsr	x11, x14, #2
2219	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2220	adds	x8, x8, x13
2221	adcs	x9, x9, x12
2222	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2223	sub	x4, x4, #1
2224	b	Lopen_tail_192_hash
2225
2226Lopen_tail_192_hash_done:
2227
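    // Add the saved state rows and the per-block counter offsets back into the three blocks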
2228	add	v0.4s, v0.4s, v24.4s
2229	add	v1.4s, v1.4s, v24.4s
2230	add	v2.4s, v2.4s, v24.4s
2231	add	v5.4s, v5.4s, v28.4s
2232	add	v6.4s, v6.4s, v28.4s
2233	add	v7.4s, v7.4s, v28.4s
2234	add	v10.4s, v10.4s, v29.4s
2235	add	v11.4s, v11.4s, v29.4s
2236	add	v12.4s, v12.4s, v29.4s
2237	add	v15.4s, v15.4s, v30.4s
2238	add	v16.4s, v16.4s, v30.4s
2239	add	v17.4s, v17.4s, v30.4s
2240
2241	add	v15.4s, v15.4s, v21.4s
2242	add	v16.4s, v16.4s, v23.4s
2243	add	v17.4s, v17.4s, v22.4s
2244
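    // Decrypt 128 bytes with blocks 1 and 2; block 0's keystream covers whatever is left in Lopen_tail_64_store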
2245	ld1	{v20.16b - v23.16b}, [x1], #64
2246
2247	eor	v20.16b, v20.16b, v1.16b
2248	eor	v21.16b, v21.16b, v6.16b
2249	eor	v22.16b, v22.16b, v11.16b
2250	eor	v23.16b, v23.16b, v16.16b
2251
2252	st1	{v20.16b - v23.16b}, [x0], #64
2253
2254	ld1	{v20.16b - v23.16b}, [x1], #64
2255
2256	eor	v20.16b, v20.16b, v2.16b
2257	eor	v21.16b, v21.16b, v7.16b
2258	eor	v22.16b, v22.16b, v12.16b
2259	eor	v23.16b, v23.16b, v17.16b
2260
2261	st1	{v20.16b - v23.16b}, [x0], #64
2262
2263	sub	x2, x2, #128
2264	b	Lopen_tail_64_store
2265
2266Lopen_tail_128:
2267     // We need two more blocks
2268	mov	v0.16b, v24.16b
2269	mov	v1.16b, v24.16b
2270	mov	v5.16b, v28.16b
2271	mov	v6.16b, v28.16b
2272	mov	v10.16b, v29.16b
2273	mov	v11.16b, v29.16b
2274	mov	v15.16b, v30.16b
2275	mov	v16.16b, v30.16b
2276	eor	v23.16b, v23.16b, v23.16b
2277	eor	v22.16b, v22.16b, v22.16b
2278	ins	v23.s[0], v25.s[0]
2279	ins	v22.d[0], x15
2280	add	v22.4s, v22.4s, v23.4s
2281
2282	add	v15.4s, v15.4s, v22.4s
2283	add	v16.4s, v16.4s, v23.4s
2284
2285	mov	x6, #10
2286	sub	x6, x6, x4
2287
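    // Run 10 double rounds in total; the last x4 iterations each also hash one 16-byte block of ciphertext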
2288Lopen_tail_128_rounds:
2289	add	v0.4s, v0.4s, v5.4s
2290	eor	v15.16b, v15.16b, v0.16b
2291	rev32	v15.8h, v15.8h
2292
2293	add	v10.4s, v10.4s, v15.4s
2294	eor	v5.16b, v5.16b, v10.16b
2295	ushr	v20.4s, v5.4s, #20
2296	sli	v20.4s, v5.4s, #12
2297	add	v0.4s, v0.4s, v20.4s
2298	eor	v15.16b, v15.16b, v0.16b
2299	tbl	v15.16b, {v15.16b}, v26.16b
2300
2301	add	v10.4s, v10.4s, v15.4s
2302	eor	v20.16b, v20.16b, v10.16b
2303	ushr	v5.4s, v20.4s, #25
2304	sli	v5.4s, v20.4s, #7
2305	ext	v5.16b, v5.16b, v5.16b, #4
2306	ext	v10.16b, v10.16b, v10.16b, #8
2307	ext	v15.16b, v15.16b, v15.16b, #12
2308	add	v1.4s, v1.4s, v6.4s
2309	eor	v16.16b, v16.16b, v1.16b
2310	rev32	v16.8h, v16.8h
2311
2312	add	v11.4s, v11.4s, v16.4s
2313	eor	v6.16b, v6.16b, v11.16b
2314	ushr	v20.4s, v6.4s, #20
2315	sli	v20.4s, v6.4s, #12
2316	add	v1.4s, v1.4s, v20.4s
2317	eor	v16.16b, v16.16b, v1.16b
2318	tbl	v16.16b, {v16.16b}, v26.16b
2319
2320	add	v11.4s, v11.4s, v16.4s
2321	eor	v20.16b, v20.16b, v11.16b
2322	ushr	v6.4s, v20.4s, #25
2323	sli	v6.4s, v20.4s, #7
2324	ext	v6.16b, v6.16b, v6.16b, #4
2325	ext	v11.16b, v11.16b, v11.16b, #8
2326	ext	v16.16b, v16.16b, v16.16b, #12
2327	add	v0.4s, v0.4s, v5.4s
2328	eor	v15.16b, v15.16b, v0.16b
2329	rev32	v15.8h, v15.8h
2330
2331	add	v10.4s, v10.4s, v15.4s
2332	eor	v5.16b, v5.16b, v10.16b
2333	ushr	v20.4s, v5.4s, #20
2334	sli	v20.4s, v5.4s, #12
2335	add	v0.4s, v0.4s, v20.4s
2336	eor	v15.16b, v15.16b, v0.16b
2337	tbl	v15.16b, {v15.16b}, v26.16b
2338
2339	add	v10.4s, v10.4s, v15.4s
2340	eor	v20.16b, v20.16b, v10.16b
2341	ushr	v5.4s, v20.4s, #25
2342	sli	v5.4s, v20.4s, #7
2343	ext	v5.16b, v5.16b, v5.16b, #12
2344	ext	v10.16b, v10.16b, v10.16b, #8
2345	ext	v15.16b, v15.16b, v15.16b, #4
2346	add	v1.4s, v1.4s, v6.4s
2347	eor	v16.16b, v16.16b, v1.16b
2348	rev32	v16.8h, v16.8h
2349
2350	add	v11.4s, v11.4s, v16.4s
2351	eor	v6.16b, v6.16b, v11.16b
2352	ushr	v20.4s, v6.4s, #20
2353	sli	v20.4s, v6.4s, #12
2354	add	v1.4s, v1.4s, v20.4s
2355	eor	v16.16b, v16.16b, v1.16b
2356	tbl	v16.16b, {v16.16b}, v26.16b
2357
2358	add	v11.4s, v11.4s, v16.4s
2359	eor	v20.16b, v20.16b, v11.16b
2360	ushr	v6.4s, v20.4s, #25
2361	sli	v6.4s, v20.4s, #7
2362	ext	v6.16b, v6.16b, v6.16b, #12
2363	ext	v11.16b, v11.16b, v11.16b, #8
2364	ext	v16.16b, v16.16b, v16.16b, #4
2365	subs	x6, x6, #1
2366	b.gt	Lopen_tail_128_rounds
2367	cbz	x4, Lopen_tail_128_rounds_done
2368	subs	x4, x4, #1
2369	ldp	x11, x12, [x3], 16
2370	adds	x8, x8, x11
2371	adcs	x9, x9, x12
2372	adc	x10, x10, x15
2373	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2374	umulh	x12, x8, x16
2375	mul	x13, x9, x16
2376	umulh	x14, x9, x16
2377	adds	x12, x12, x13
2378	mul	x13, x10, x16
2379	adc	x13, x13, x14
2380	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2381	umulh	x8, x8, x17
2382	adds	x12, x12, x14
2383	mul	x14, x9, x17
2384	umulh	x9, x9, x17
2385	adcs	x14, x14, x8
2386	mul	x10, x10, x17
2387	adc	x10, x10, x9
2388	adds	x13, x13, x14
2389	adc	x14, x10, xzr
2390	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2391	and	x8, x13, #-4
2392	extr	x13, x14, x13, #2
2393	adds	x8, x8, x11
2394	lsr	x11, x14, #2
2395	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2396	adds	x8, x8, x13
2397	adcs	x9, x9, x12
2398	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2399	b	Lopen_tail_128_rounds
2400
2401Lopen_tail_128_rounds_done:
2402	add	v0.4s, v0.4s, v24.4s
2403	add	v1.4s, v1.4s, v24.4s
2404	add	v5.4s, v5.4s, v28.4s
2405	add	v6.4s, v6.4s, v28.4s
2406	add	v10.4s, v10.4s, v29.4s
2407	add	v11.4s, v11.4s, v29.4s
2408	add	v15.4s, v15.4s, v30.4s
2409	add	v16.4s, v16.4s, v30.4s
2410	add	v15.4s, v15.4s, v22.4s
2411	add	v16.4s, v16.4s, v23.4s
2412
2413	ld1	{v20.16b - v23.16b}, [x1], #64
2414
2415	eor	v20.16b, v20.16b, v1.16b
2416	eor	v21.16b, v21.16b, v6.16b
2417	eor	v22.16b, v22.16b, v11.16b
2418	eor	v23.16b, v23.16b, v16.16b
2419
2420	st1	{v20.16b - v23.16b}, [x0], #64
2421	sub	x2, x2, #64
2422
2423	b	Lopen_tail_64_store
2424
2425Lopen_tail_64:
2426    // We just need a single block
2427	mov	v0.16b, v24.16b
2428	mov	v5.16b, v28.16b
2429	mov	v10.16b, v29.16b
2430	mov	v15.16b, v30.16b
2431	eor	v23.16b, v23.16b, v23.16b
2432	ins	v23.s[0], v25.s[0]
2433	add	v15.4s, v15.4s, v23.4s
2434
2435	mov	x6, #10
2436	sub	x6, x6, x4
2437
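    // Same pattern as above: 10 double rounds, hashing one 16-byte block in each of the last x4 iterations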
2438Lopen_tail_64_rounds:
2439	add	v0.4s, v0.4s, v5.4s
2440	eor	v15.16b, v15.16b, v0.16b
2441	rev32	v15.8h, v15.8h
2442
2443	add	v10.4s, v10.4s, v15.4s
2444	eor	v5.16b, v5.16b, v10.16b
2445	ushr	v20.4s, v5.4s, #20
2446	sli	v20.4s, v5.4s, #12
2447	add	v0.4s, v0.4s, v20.4s
2448	eor	v15.16b, v15.16b, v0.16b
2449	tbl	v15.16b, {v15.16b}, v26.16b
2450
2451	add	v10.4s, v10.4s, v15.4s
2452	eor	v20.16b, v20.16b, v10.16b
2453	ushr	v5.4s, v20.4s, #25
2454	sli	v5.4s, v20.4s, #7
2455	ext	v5.16b, v5.16b, v5.16b, #4
2456	ext	v10.16b, v10.16b, v10.16b, #8
2457	ext	v15.16b, v15.16b, v15.16b, #12
2458	add	v0.4s, v0.4s, v5.4s
2459	eor	v15.16b, v15.16b, v0.16b
2460	rev32	v15.8h, v15.8h
2461
2462	add	v10.4s, v10.4s, v15.4s
2463	eor	v5.16b, v5.16b, v10.16b
2464	ushr	v20.4s, v5.4s, #20
2465	sli	v20.4s, v5.4s, #12
2466	add	v0.4s, v0.4s, v20.4s
2467	eor	v15.16b, v15.16b, v0.16b
2468	tbl	v15.16b, {v15.16b}, v26.16b
2469
2470	add	v10.4s, v10.4s, v15.4s
2471	eor	v20.16b, v20.16b, v10.16b
2472	ushr	v5.4s, v20.4s, #25
2473	sli	v5.4s, v20.4s, #7
2474	ext	v5.16b, v5.16b, v5.16b, #12
2475	ext	v10.16b, v10.16b, v10.16b, #8
2476	ext	v15.16b, v15.16b, v15.16b, #4
2477	subs	x6, x6, #1
2478	b.gt	Lopen_tail_64_rounds
2479	cbz	x4, Lopen_tail_64_rounds_done
2480	subs	x4, x4, #1
2481	ldp	x11, x12, [x3], 16
2482	adds	x8, x8, x11
2483	adcs	x9, x9, x12
2484	adc	x10, x10, x15
2485	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2486	umulh	x12, x8, x16
2487	mul	x13, x9, x16
2488	umulh	x14, x9, x16
2489	adds	x12, x12, x13
2490	mul	x13, x10, x16
2491	adc	x13, x13, x14
2492	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2493	umulh	x8, x8, x17
2494	adds	x12, x12, x14
2495	mul	x14, x9, x17
2496	umulh	x9, x9, x17
2497	adcs	x14, x14, x8
2498	mul	x10, x10, x17
2499	adc	x10, x10, x9
2500	adds	x13, x13, x14
2501	adc	x14, x10, xzr
2502	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2503	and	x8, x13, #-4
2504	extr	x13, x14, x13, #2
2505	adds	x8, x8, x11
2506	lsr	x11, x14, #2
2507	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2508	adds	x8, x8, x13
2509	adcs	x9, x9, x12
2510	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2511	b	Lopen_tail_64_rounds
2512
2513Lopen_tail_64_rounds_done:
2514	add	v0.4s, v0.4s, v24.4s
2515	add	v5.4s, v5.4s, v28.4s
2516	add	v10.4s, v10.4s, v29.4s
2517	add	v15.4s, v15.4s, v30.4s
2518	add	v15.4s, v15.4s, v23.4s
2519
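    // Emit the remaining keystream 16 bytes at a time, rotating v5/v10/v15 down into v0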
2520Lopen_tail_64_store:
2521	cmp	x2, #16
2522	b.lt	Lopen_tail_16
2523
2524	ld1	{v20.16b}, [x1], #16
2525	eor	v20.16b, v20.16b, v0.16b
2526	st1	{v20.16b}, [x0], #16
2527	mov	v0.16b, v5.16b
2528	mov	v5.16b, v10.16b
2529	mov	v10.16b, v15.16b
2530	sub	x2, x2, #16
2531	b	Lopen_tail_64_store
2532
2533Lopen_tail_16:
2534    // Here we handle the last [0,16) bytes that require a padded block
2535	cbz	x2, Lopen_finalize
2536
2537	eor	v20.16b, v20.16b, v20.16b // Use v20 (T0) to load the ciphertext
2538	eor	v21.16b, v21.16b, v21.16b // Use v21 (T1) to build an AND mask for the loaded bytes
2539	not	v22.16b, v20.16b
2540
2541	add	x7, x1, x2
2542	mov	x6, x2
2543
2544Lopen_tail_16_compose:
2545	ext	v20.16b, v20.16b, v20.16b, #15
2546	ldrb	w11, [x7, #-1]!
2547	mov	v20.b[0], w11
2548	ext	v21.16b, v22.16b, v21.16b, #15
2549	subs	x2, x2, #1
2550	b.gt	Lopen_tail_16_compose
2551
2552	and	v20.16b, v20.16b, v21.16b
2553    // Hash in the final padded block
2554	mov	x11, v20.d[0]
2555	mov	x12, v20.d[1]
2556	adds	x8, x8, x11
2557	adcs	x9, x9, x12
2558	adc	x10, x10, x15
2559	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2560	umulh	x12, x8, x16
2561	mul	x13, x9, x16
2562	umulh	x14, x9, x16
2563	adds	x12, x12, x13
2564	mul	x13, x10, x16
2565	adc	x13, x13, x14
2566	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2567	umulh	x8, x8, x17
2568	adds	x12, x12, x14
2569	mul	x14, x9, x17
2570	umulh	x9, x9, x17
2571	adcs	x14, x14, x8
2572	mul	x10, x10, x17
2573	adc	x10, x10, x9
2574	adds	x13, x13, x14
2575	adc	x14, x10, xzr
2576	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2577	and	x8, x13, #-4
2578	extr	x13, x14, x13, #2
2579	adds	x8, x8, x11
2580	lsr	x11, x14, #2
2581	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2582	adds	x8, x8, x13
2583	adcs	x9, x9, x12
2584	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2585	eor	v20.16b, v20.16b, v0.16b
2586
2587Lopen_tail_16_store:
2588	umov	w11, v20.b[0]
2589	strb	w11, [x0], #1
2590	ext	v20.16b, v20.16b, v20.16b, #1
2591	subs	x6, x6, #1
2592	b.gt	Lopen_tail_16_store
2593
2594Lopen_finalize:
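    // Hash the final Poly1305 block holding the AAD and ciphertext lengths (kept in v31)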
2595	mov	x11, v31.d[0]
2596	mov	x12, v31.d[1]
2597	adds	x8, x8, x11
2598	adcs	x9, x9, x12
2599	adc	x10, x10, x15
2600	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2601	umulh	x12, x8, x16
2602	mul	x13, x9, x16
2603	umulh	x14, x9, x16
2604	adds	x12, x12, x13
2605	mul	x13, x10, x16
2606	adc	x13, x13, x14
2607	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2608	umulh	x8, x8, x17
2609	adds	x12, x12, x14
2610	mul	x14, x9, x17
2611	umulh	x9, x9, x17
2612	adcs	x14, x14, x8
2613	mul	x10, x10, x17
2614	adc	x10, x10, x9
2615	adds	x13, x13, x14
2616	adc	x14, x10, xzr
2617	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2618	and	x8, x13, #-4
2619	extr	x13, x14, x13, #2
2620	adds	x8, x8, x11
2621	lsr	x11, x14, #2
2622	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2623	adds	x8, x8, x13
2624	adcs	x9, x9, x12
2625	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2626    // Final reduction step
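    // Compute acc - (2^130 - 5); if the subtraction does not borrow (carry set), keep the reduced value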
2627	sub	x12, xzr, x15
2628	orr	x13, xzr, #3
2629	subs	x11, x8, #-5
2630	sbcs	x12, x9, x12
2631	sbcs	x13, x10, x13
2632	csel	x8, x11, x8, cs
2633	csel	x9, x12, x9, cs
2634	csel	x10, x13, x10, cs
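    // Add the second key half s (kept in v27) and store the 16-byte tag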
2635	mov	x11, v27.d[0]
2636	mov	x12, v27.d[1]
2637	adds	x8, x8, x11
2638	adcs	x9, x9, x12
2639	adc	x10, x10, x15
2640
2641	stp	x8, x9, [x5]
2642
2643	ldp	d8, d9, [sp, #16]
2644	ldp	d10, d11, [sp, #32]
2645	ldp	d12, d13, [sp, #48]
2646	ldp	d14, d15, [sp, #64]
2647.cfi_restore	b15
2648.cfi_restore	b14
2649.cfi_restore	b13
2650.cfi_restore	b12
2651.cfi_restore	b11
2652.cfi_restore	b10
2653.cfi_restore	b9
2654.cfi_restore	b8
2655	ldp	x29, x30, [sp], 80
2656.cfi_restore	w29
2657.cfi_restore	w30
2658.cfi_def_cfa_offset	0
2659	AARCH64_VALIDATE_LINK_REGISTER
2660	ret
2661
2662Lopen_128:
2663    // On some architectures preparing 5 blocks for small buffers is wasteful
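    // For buffers of at most 128 bytes three blocks are enough: block 2 supplies the Poly1305 key, blocks 0 and 1 cover the data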
2664	eor	v25.16b, v25.16b, v25.16b
2665	mov	x11, #1
2666	mov	v25.s[0], w11
2667	mov	v0.16b, v24.16b
2668	mov	v1.16b, v24.16b
2669	mov	v2.16b, v24.16b
2670	mov	v5.16b, v28.16b
2671	mov	v6.16b, v28.16b
2672	mov	v7.16b, v28.16b
2673	mov	v10.16b, v29.16b
2674	mov	v11.16b, v29.16b
2675	mov	v12.16b, v29.16b
2676	mov	v17.16b, v30.16b
2677	add	v15.4s, v17.4s, v25.4s
2678	add	v16.4s, v15.4s, v25.4s
2679
2680	mov	x6, #10
2681
2682Lopen_128_rounds:
2683	add	v0.4s, v0.4s, v5.4s
2684	add	v1.4s, v1.4s, v6.4s
2685	add	v2.4s, v2.4s, v7.4s
2686	eor	v15.16b, v15.16b, v0.16b
2687	eor	v16.16b, v16.16b, v1.16b
2688	eor	v17.16b, v17.16b, v2.16b
2689	rev32	v15.8h, v15.8h
2690	rev32	v16.8h, v16.8h
2691	rev32	v17.8h, v17.8h
2692
2693	add	v10.4s, v10.4s, v15.4s
2694	add	v11.4s, v11.4s, v16.4s
2695	add	v12.4s, v12.4s, v17.4s
2696	eor	v5.16b, v5.16b, v10.16b
2697	eor	v6.16b, v6.16b, v11.16b
2698	eor	v7.16b, v7.16b, v12.16b
2699	ushr	v20.4s, v5.4s, #20
2700	sli	v20.4s, v5.4s, #12
2701	ushr	v5.4s, v6.4s, #20
2702	sli	v5.4s, v6.4s, #12
2703	ushr	v6.4s, v7.4s, #20
2704	sli	v6.4s, v7.4s, #12
2705
2706	add	v0.4s, v0.4s, v20.4s
2707	add	v1.4s, v1.4s, v5.4s
2708	add	v2.4s, v2.4s, v6.4s
2709	eor	v15.16b, v15.16b, v0.16b
2710	eor	v16.16b, v16.16b, v1.16b
2711	eor	v17.16b, v17.16b, v2.16b
2712	tbl	v15.16b, {v15.16b}, v26.16b
2713	tbl	v16.16b, {v16.16b}, v26.16b
2714	tbl	v17.16b, {v17.16b}, v26.16b
2715
2716	add	v10.4s, v10.4s, v15.4s
2717	add	v11.4s, v11.4s, v16.4s
2718	add	v12.4s, v12.4s, v17.4s
2719	eor	v20.16b, v20.16b, v10.16b
2720	eor	v5.16b, v5.16b, v11.16b
2721	eor	v6.16b, v6.16b, v12.16b
2722	ushr	v7.4s, v6.4s, #25
2723	sli	v7.4s, v6.4s, #7
2724	ushr	v6.4s, v5.4s, #25
2725	sli	v6.4s, v5.4s, #7
2726	ushr	v5.4s, v20.4s, #25
2727	sli	v5.4s, v20.4s, #7
2728
2729	ext	v5.16b, v5.16b, v5.16b, #4
2730	ext	v6.16b, v6.16b, v6.16b, #4
2731	ext	v7.16b, v7.16b, v7.16b, #4
2732
2733	ext	v10.16b, v10.16b, v10.16b, #8
2734	ext	v11.16b, v11.16b, v11.16b, #8
2735	ext	v12.16b, v12.16b, v12.16b, #8
2736
2737	ext	v15.16b, v15.16b, v15.16b, #12
2738	ext	v16.16b, v16.16b, v16.16b, #12
2739	ext	v17.16b, v17.16b, v17.16b, #12
2740	add	v0.4s, v0.4s, v5.4s
2741	add	v1.4s, v1.4s, v6.4s
2742	add	v2.4s, v2.4s, v7.4s
2743	eor	v15.16b, v15.16b, v0.16b
2744	eor	v16.16b, v16.16b, v1.16b
2745	eor	v17.16b, v17.16b, v2.16b
2746	rev32	v15.8h, v15.8h
2747	rev32	v16.8h, v16.8h
2748	rev32	v17.8h, v17.8h
2749
2750	add	v10.4s, v10.4s, v15.4s
2751	add	v11.4s, v11.4s, v16.4s
2752	add	v12.4s, v12.4s, v17.4s
2753	eor	v5.16b, v5.16b, v10.16b
2754	eor	v6.16b, v6.16b, v11.16b
2755	eor	v7.16b, v7.16b, v12.16b
2756	ushr	v20.4s, v5.4s, #20
2757	sli	v20.4s, v5.4s, #12
2758	ushr	v5.4s, v6.4s, #20
2759	sli	v5.4s, v6.4s, #12
2760	ushr	v6.4s, v7.4s, #20
2761	sli	v6.4s, v7.4s, #12
2762
2763	add	v0.4s, v0.4s, v20.4s
2764	add	v1.4s, v1.4s, v5.4s
2765	add	v2.4s, v2.4s, v6.4s
2766	eor	v15.16b, v15.16b, v0.16b
2767	eor	v16.16b, v16.16b, v1.16b
2768	eor	v17.16b, v17.16b, v2.16b
2769	tbl	v15.16b, {v15.16b}, v26.16b
2770	tbl	v16.16b, {v16.16b}, v26.16b
2771	tbl	v17.16b, {v17.16b}, v26.16b
2772
2773	add	v10.4s, v10.4s, v15.4s
2774	add	v11.4s, v11.4s, v16.4s
2775	add	v12.4s, v12.4s, v17.4s
2776	eor	v20.16b, v20.16b, v10.16b
2777	eor	v5.16b, v5.16b, v11.16b
2778	eor	v6.16b, v6.16b, v12.16b
2779	ushr	v7.4s, v6.4s, #25
2780	sli	v7.4s, v6.4s, #7
2781	ushr	v6.4s, v5.4s, #25
2782	sli	v6.4s, v5.4s, #7
2783	ushr	v5.4s, v20.4s, #25
2784	sli	v5.4s, v20.4s, #7
2785
2786	ext	v5.16b, v5.16b, v5.16b, #12
2787	ext	v6.16b, v6.16b, v6.16b, #12
2788	ext	v7.16b, v7.16b, v7.16b, #12
2789
2790	ext	v10.16b, v10.16b, v10.16b, #8
2791	ext	v11.16b, v11.16b, v11.16b, #8
2792	ext	v12.16b, v12.16b, v12.16b, #8
2793
2794	ext	v15.16b, v15.16b, v15.16b, #4
2795	ext	v16.16b, v16.16b, v16.16b, #4
2796	ext	v17.16b, v17.16b, v17.16b, #4
2797	subs	x6, x6, #1
2798	b.hi	Lopen_128_rounds
2799
2800	add	v0.4s, v0.4s, v24.4s
2801	add	v1.4s, v1.4s, v24.4s
2802	add	v2.4s, v2.4s, v24.4s
2803
2804	add	v5.4s, v5.4s, v28.4s
2805	add	v6.4s, v6.4s, v28.4s
2806	add	v7.4s, v7.4s, v28.4s
2807
2808	add	v10.4s, v10.4s, v29.4s
2809	add	v11.4s, v11.4s, v29.4s
2810
2811	add	v30.4s, v30.4s, v25.4s
2812	add	v15.4s, v15.4s, v30.4s
2813	add	v30.4s, v30.4s, v25.4s
2814	add	v16.4s, v16.4s, v30.4s
2815
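    // Derive the Poly1305 key from block 2: clamp r using the mask in v27, then reuse v27 to hold s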
2816	and	v2.16b, v2.16b, v27.16b
2817	mov	x16, v2.d[0] // Move the R key to GPRs
2818	mov	x17, v2.d[1]
2819	mov	v27.16b, v7.16b // Store the S key
2820
2821	bl	Lpoly_hash_ad_internal
2822
2823Lopen_128_store:
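    // If at least 64 bytes remain, hash four 16-byte blocks of ciphertext and decrypt them with block 0's keystream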
2824	cmp	x2, #64
2825	b.lt	Lopen_128_store_64
2826
2827	ld1	{v20.16b - v23.16b}, [x1], #64
2828
2829	mov	x11, v20.d[0]
2830	mov	x12, v20.d[1]
2831	adds	x8, x8, x11
2832	adcs	x9, x9, x12
2833	adc	x10, x10, x15
2834	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2835	umulh	x12, x8, x16
2836	mul	x13, x9, x16
2837	umulh	x14, x9, x16
2838	adds	x12, x12, x13
2839	mul	x13, x10, x16
2840	adc	x13, x13, x14
2841	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2842	umulh	x8, x8, x17
2843	adds	x12, x12, x14
2844	mul	x14, x9, x17
2845	umulh	x9, x9, x17
2846	adcs	x14, x14, x8
2847	mul	x10, x10, x17
2848	adc	x10, x10, x9
2849	adds	x13, x13, x14
2850	adc	x14, x10, xzr
2851	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2852	and	x8, x13, #-4
2853	extr	x13, x14, x13, #2
2854	adds	x8, x8, x11
2855	lsr	x11, x14, #2
2856	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2857	adds	x8, x8, x13
2858	adcs	x9, x9, x12
2859	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2860	mov	x11, v21.d[0]
2861	mov	x12, v21.d[1]
2862	adds	x8, x8, x11
2863	adcs	x9, x9, x12
2864	adc	x10, x10, x15
2865	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2866	umulh	x12, x8, x16
2867	mul	x13, x9, x16
2868	umulh	x14, x9, x16
2869	adds	x12, x12, x13
2870	mul	x13, x10, x16
2871	adc	x13, x13, x14
2872	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2873	umulh	x8, x8, x17
2874	adds	x12, x12, x14
2875	mul	x14, x9, x17
2876	umulh	x9, x9, x17
2877	adcs	x14, x14, x8
2878	mul	x10, x10, x17
2879	adc	x10, x10, x9
2880	adds	x13, x13, x14
2881	adc	x14, x10, xzr
2882	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2883	and	x8, x13, #-4
2884	extr	x13, x14, x13, #2
2885	adds	x8, x8, x11
2886	lsr	x11, x14, #2
2887	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2888	adds	x8, x8, x13
2889	adcs	x9, x9, x12
2890	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2891	mov	x11, v22.d[0]
2892	mov	x12, v22.d[1]
2893	adds	x8, x8, x11
2894	adcs	x9, x9, x12
2895	adc	x10, x10, x15
2896	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2897	umulh	x12, x8, x16
2898	mul	x13, x9, x16
2899	umulh	x14, x9, x16
2900	adds	x12, x12, x13
2901	mul	x13, x10, x16
2902	adc	x13, x13, x14
2903	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2904	umulh	x8, x8, x17
2905	adds	x12, x12, x14
2906	mul	x14, x9, x17
2907	umulh	x9, x9, x17
2908	adcs	x14, x14, x8
2909	mul	x10, x10, x17
2910	adc	x10, x10, x9
2911	adds	x13, x13, x14
2912	adc	x14, x10, xzr
2913	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2914	and	x8, x13, #-4
2915	extr	x13, x14, x13, #2
2916	adds	x8, x8, x11
2917	lsr	x11, x14, #2
2918	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2919	adds	x8, x8, x13
2920	adcs	x9, x9, x12
2921	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2922	mov	x11, v23.d[0]
2923	mov	x12, v23.d[1]
2924	adds	x8, x8, x11
2925	adcs	x9, x9, x12
2926	adc	x10, x10, x15
2927	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2928	umulh	x12, x8, x16
2929	mul	x13, x9, x16
2930	umulh	x14, x9, x16
2931	adds	x12, x12, x13
2932	mul	x13, x10, x16
2933	adc	x13, x13, x14
2934	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2935	umulh	x8, x8, x17
2936	adds	x12, x12, x14
2937	mul	x14, x9, x17
2938	umulh	x9, x9, x17
2939	adcs	x14, x14, x8
2940	mul	x10, x10, x17
2941	adc	x10, x10, x9
2942	adds	x13, x13, x14
2943	adc	x14, x10, xzr
2944	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2945	and	x8, x13, #-4
2946	extr	x13, x14, x13, #2
2947	adds	x8, x8, x11
2948	lsr	x11, x14, #2
2949	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2950	adds	x8, x8, x13
2951	adcs	x9, x9, x12
2952	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2953
2954	eor	v20.16b, v20.16b, v0.16b
2955	eor	v21.16b, v21.16b, v5.16b
2956	eor	v22.16b, v22.16b, v10.16b
2957	eor	v23.16b, v23.16b, v15.16b
2958
2959	st1	{v20.16b - v23.16b}, [x0], #64
2960
2961	sub	x2, x2, #64
2962
2963	mov	v0.16b, v1.16b
2964	mov	v5.16b, v6.16b
2965	mov	v10.16b, v11.16b
2966	mov	v15.16b, v16.16b
2967
2968Lopen_128_store_64:
2969
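    // Hash any remaining whole 16-byte blocks of ciphertext, then decrypt the leftover bytes via Lopen_tail_64_store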
2970	lsr	x4, x2, #4
2971	mov	x3, x1
2972
2973Lopen_128_hash_64:
2974	cbz	x4, Lopen_tail_64_store
2975	ldp	x11, x12, [x3], 16
2976	adds	x8, x8, x11
2977	adcs	x9, x9, x12
2978	adc	x10, x10, x15
2979	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2980	umulh	x12, x8, x16
2981	mul	x13, x9, x16
2982	umulh	x14, x9, x16
2983	adds	x12, x12, x13
2984	mul	x13, x10, x16
2985	adc	x13, x13, x14
2986	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2987	umulh	x8, x8, x17
2988	adds	x12, x12, x14
2989	mul	x14, x9, x17
2990	umulh	x9, x9, x17
2991	adcs	x14, x14, x8
2992	mul	x10, x10, x17
2993	adc	x10, x10, x9
2994	adds	x13, x13, x14
2995	adc	x14, x10, xzr
2996	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2997	and	x8, x13, #-4
2998	extr	x13, x14, x13, #2
2999	adds	x8, x8, x11
3000	lsr	x11, x14, #2
3001	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
3002	adds	x8, x8, x13
3003	adcs	x9, x9, x12
3004	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
3005	sub	x4, x4, #1
3006	b	Lopen_128_hash_64
3007.cfi_endproc
3008
3009#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__)
3010