1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <ring-core/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
7#include <ring-core/arm_arch.h>
8.section	.rodata
9
10.align	7
11Lchacha20_consts:
12.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
13Linc:
14.long	1,2,3,4
15Lrol8:
16.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
17Lclamp:
18.quad	0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
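// Lchacha20_consts is the ChaCha20 "expand 32-byte k" constant, Linc holds the
// counter increments 1,2,3,4 added to the per-block counters, Lrol8 is a byte-shuffle
// table used with tbl to rotate each 32-bit lane left by 8 bits, and Lclamp is the
// Poly1305 "r" clamping mask.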
19
20.text
21
22.def Lpoly_hash_ad_internal
23   .type 32
24.endef
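// Lpoly_hash_ad_internal absorbs the x4 bytes of AAD at x3 into the Poly1305 state.
// Throughout this file the accumulator lives in x8:x9:x10 (low to high), the clamped
// r key in x16:x17, and x15 holds the constant 1, added as the 2^128 padding bit of
// each 16-byte block.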
25.align	6
26Lpoly_hash_ad_internal:
27.cfi_startproc
28	cbnz	x4, Lpoly_hash_intro
29	ret
30
31Lpoly_hash_intro:
32	cmp	x4, #16
33	b.lt	Lpoly_hash_ad_tail
34	ldp	x11, x12, [x3], 16
35	adds	x8, x8, x11
36	adcs	x9, x9, x12
37	adc	x10, x10, x15
38	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
39	umulh	x12, x8, x16
40	mul	x13, x9, x16
41	umulh	x14, x9, x16
42	adds	x12, x12, x13
43	mul	x13, x10, x16
44	adc	x13, x13, x14
45	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
46	umulh	x8, x8, x17
47	adds	x12, x12, x14
48	mul	x14, x9, x17
49	umulh	x9, x9, x17
50	adcs	x14, x14, x8
51	mul	x10, x10, x17
52	adc	x10, x10, x9
53	adds	x13, x13, x14
54	adc	x14, x10, xzr
55	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
56	and	x8, x13, #-4
57	extr	x13, x14, x13, #2
58	adds	x8, x8, x11
59	lsr	x11, x14, #2
60	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
61	adds	x8, x8, x13
62	adcs	x9, x9, x12
63	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
64	sub	x4, x4, #16
65	b	Lpoly_hash_ad_internal
66
67Lpoly_hash_ad_tail:
68	cbz	x4, Lpoly_hash_ad_ret
69
70	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the AAD
71	sub	x4, x4, #1
72
73Lpoly_hash_tail_16_compose:
74	ext	v20.16b, v20.16b, v20.16b, #15
75	ldrb	w11, [x3, x4]
76	mov	v20.b[0], w11
77	subs	x4, x4, #1
78	b.ge	Lpoly_hash_tail_16_compose
79	mov	x11, v20.d[0]
80	mov	x12, v20.d[1]
81	adds	x8, x8, x11
82	adcs	x9, x9, x12
83	adc	x10, x10, x15
84	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
85	umulh	x12, x8, x16
86	mul	x13, x9, x16
87	umulh	x14, x9, x16
88	adds	x12, x12, x13
89	mul	x13, x10, x16
90	adc	x13, x13, x14
91	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
92	umulh	x8, x8, x17
93	adds	x12, x12, x14
94	mul	x14, x9, x17
95	umulh	x9, x9, x17
96	adcs	x14, x14, x8
97	mul	x10, x10, x17
98	adc	x10, x10, x9
99	adds	x13, x13, x14
100	adc	x14, x10, xzr
101	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
102	and	x8, x13, #-4
103	extr	x13, x14, x13, #2
104	adds	x8, x8, x11
105	lsr	x11, x14, #2
106	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
107	adds	x8, x8, x13
108	adcs	x9, x9, x12
109	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
110
111Lpoly_hash_ad_ret:
112	ret
113.cfi_endproc
114
115
116/////////////////////////////////
117//
118// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data);
119//
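// Register use, as the code below uses the arguments: x0 = destination (ciphertext is
// written here), x1 = source (plaintext is read from here), x2 = len_in, x3 = ad,
// x4 = len_ad, x5 = seal_data (key/nonce/extra_in; the Poly1305 tag is written back
// through this pointer).
//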
120.globl	chacha20_poly1305_seal
121
122.def chacha20_poly1305_seal
123   .type 32
124.endef
125.align	6
126chacha20_poly1305_seal:
127	AARCH64_SIGN_LINK_REGISTER
128.cfi_startproc
129	stp	x29, x30, [sp, #-80]!
130.cfi_def_cfa_offset	80
131.cfi_offset	w30, -72
132.cfi_offset	w29, -80
133	mov	x29, sp
134    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
135    // we don't actually use the frame pointer like that, it's probably not
136    // worth bothering.
137	stp	d8, d9, [sp, #16]
138	stp	d10, d11, [sp, #32]
139	stp	d12, d13, [sp, #48]
140	stp	d14, d15, [sp, #64]
141.cfi_offset	b15, -8
142.cfi_offset	b14, -16
143.cfi_offset	b13, -24
144.cfi_offset	b12, -32
145.cfi_offset	b11, -40
146.cfi_offset	b10, -48
147.cfi_offset	b9, -56
148.cfi_offset	b8, -64
149
150	adrp	x11, Lchacha20_consts
151	add	x11, x11, :lo12:Lchacha20_consts
152
153	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
154	ld1	{v28.16b - v30.16b}, [x5]
155
156	mov	x15, #1 // Prepare the Poly1305 state
157	mov	x8, #0
158	mov	x9, #0
159	mov	x10, #0
160
161	ldr	x12, [x5, #56]   // The total cipher text length includes extra_in_len
162	add	x12, x12, x2
163	mov	v31.d[0], x4  // Store the input and aad lengths
164	mov	v31.d[1], x12
165
166	cmp	x2, #128
167	b.le	Lseal_128 // Optimization for smaller buffers
168
169    // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext,
170    // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically,
171    // the fifth block (A4-D4) horizontally.
172	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
173	mov	v4.16b, v24.16b
174
175	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
176	mov	v9.16b, v28.16b
177
178	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
179	mov	v14.16b, v29.16b
180
181	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
182	add	v15.4s, v15.4s, v25.4s
183	mov	v19.16b, v30.16b
184
185	sub	x5, x5, #32
186
187	mov	x6, #10
188
189.align	5
190Lseal_init_rounds:
191	add	v0.4s, v0.4s, v5.4s
192	add	v1.4s, v1.4s, v6.4s
193	add	v2.4s, v2.4s, v7.4s
194	add	v3.4s, v3.4s, v8.4s
195	add	v4.4s, v4.4s, v9.4s
196
197	eor	v15.16b, v15.16b, v0.16b
198	eor	v16.16b, v16.16b, v1.16b
199	eor	v17.16b, v17.16b, v2.16b
200	eor	v18.16b, v18.16b, v3.16b
201	eor	v19.16b, v19.16b, v4.16b
202
203	rev32	v15.8h, v15.8h
204	rev32	v16.8h, v16.8h
205	rev32	v17.8h, v17.8h
206	rev32	v18.8h, v18.8h
207	rev32	v19.8h, v19.8h
208
209	add	v10.4s, v10.4s, v15.4s
210	add	v11.4s, v11.4s, v16.4s
211	add	v12.4s, v12.4s, v17.4s
212	add	v13.4s, v13.4s, v18.4s
213	add	v14.4s, v14.4s, v19.4s
214
215	eor	v5.16b, v5.16b, v10.16b
216	eor	v6.16b, v6.16b, v11.16b
217	eor	v7.16b, v7.16b, v12.16b
218	eor	v8.16b, v8.16b, v13.16b
219	eor	v9.16b, v9.16b, v14.16b
220
221	ushr	v20.4s, v5.4s, #20
222	sli	v20.4s, v5.4s, #12
223	ushr	v5.4s, v6.4s, #20
224	sli	v5.4s, v6.4s, #12
225	ushr	v6.4s, v7.4s, #20
226	sli	v6.4s, v7.4s, #12
227	ushr	v7.4s, v8.4s, #20
228	sli	v7.4s, v8.4s, #12
229	ushr	v8.4s, v9.4s, #20
230	sli	v8.4s, v9.4s, #12
231
232	add	v0.4s, v0.4s, v20.4s
233	add	v1.4s, v1.4s, v5.4s
234	add	v2.4s, v2.4s, v6.4s
235	add	v3.4s, v3.4s, v7.4s
236	add	v4.4s, v4.4s, v8.4s
237
238	eor	v15.16b, v15.16b, v0.16b
239	eor	v16.16b, v16.16b, v1.16b
240	eor	v17.16b, v17.16b, v2.16b
241	eor	v18.16b, v18.16b, v3.16b
242	eor	v19.16b, v19.16b, v4.16b
243
244	tbl	v15.16b, {v15.16b}, v26.16b
245	tbl	v16.16b, {v16.16b}, v26.16b
246	tbl	v17.16b, {v17.16b}, v26.16b
247	tbl	v18.16b, {v18.16b}, v26.16b
248	tbl	v19.16b, {v19.16b}, v26.16b
249
250	add	v10.4s, v10.4s, v15.4s
251	add	v11.4s, v11.4s, v16.4s
252	add	v12.4s, v12.4s, v17.4s
253	add	v13.4s, v13.4s, v18.4s
254	add	v14.4s, v14.4s, v19.4s
255
256	eor	v20.16b, v20.16b, v10.16b
257	eor	v5.16b, v5.16b, v11.16b
258	eor	v6.16b, v6.16b, v12.16b
259	eor	v7.16b, v7.16b, v13.16b
260	eor	v8.16b, v8.16b, v14.16b
261
262	ushr	v9.4s, v8.4s, #25
263	sli	v9.4s, v8.4s, #7
264	ushr	v8.4s, v7.4s, #25
265	sli	v8.4s, v7.4s, #7
266	ushr	v7.4s, v6.4s, #25
267	sli	v7.4s, v6.4s, #7
268	ushr	v6.4s, v5.4s, #25
269	sli	v6.4s, v5.4s, #7
270	ushr	v5.4s, v20.4s, #25
271	sli	v5.4s, v20.4s, #7
272
273	ext	v9.16b, v9.16b, v9.16b, #4
274	ext	v14.16b, v14.16b, v14.16b, #8
275	ext	v19.16b, v19.16b, v19.16b, #12
276	add	v0.4s, v0.4s, v6.4s
277	add	v1.4s, v1.4s, v7.4s
278	add	v2.4s, v2.4s, v8.4s
279	add	v3.4s, v3.4s, v5.4s
280	add	v4.4s, v4.4s, v9.4s
281
282	eor	v18.16b, v18.16b, v0.16b
283	eor	v15.16b, v15.16b, v1.16b
284	eor	v16.16b, v16.16b, v2.16b
285	eor	v17.16b, v17.16b, v3.16b
286	eor	v19.16b, v19.16b, v4.16b
287
288	rev32	v18.8h, v18.8h
289	rev32	v15.8h, v15.8h
290	rev32	v16.8h, v16.8h
291	rev32	v17.8h, v17.8h
292	rev32	v19.8h, v19.8h
293
294	add	v12.4s, v12.4s, v18.4s
295	add	v13.4s, v13.4s, v15.4s
296	add	v10.4s, v10.4s, v16.4s
297	add	v11.4s, v11.4s, v17.4s
298	add	v14.4s, v14.4s, v19.4s
299
300	eor	v6.16b, v6.16b, v12.16b
301	eor	v7.16b, v7.16b, v13.16b
302	eor	v8.16b, v8.16b, v10.16b
303	eor	v5.16b, v5.16b, v11.16b
304	eor	v9.16b, v9.16b, v14.16b
305
306	ushr	v20.4s, v6.4s, #20
307	sli	v20.4s, v6.4s, #12
308	ushr	v6.4s, v7.4s, #20
309	sli	v6.4s, v7.4s, #12
310	ushr	v7.4s, v8.4s, #20
311	sli	v7.4s, v8.4s, #12
312	ushr	v8.4s, v5.4s, #20
313	sli	v8.4s, v5.4s, #12
314	ushr	v5.4s, v9.4s, #20
315	sli	v5.4s, v9.4s, #12
316
317	add	v0.4s, v0.4s, v20.4s
318	add	v1.4s, v1.4s, v6.4s
319	add	v2.4s, v2.4s, v7.4s
320	add	v3.4s, v3.4s, v8.4s
321	add	v4.4s, v4.4s, v5.4s
322
323	eor	v18.16b, v18.16b, v0.16b
324	eor	v15.16b, v15.16b, v1.16b
325	eor	v16.16b, v16.16b, v2.16b
326	eor	v17.16b, v17.16b, v3.16b
327	eor	v19.16b, v19.16b, v4.16b
328
329	tbl	v18.16b, {v18.16b}, v26.16b
330	tbl	v15.16b, {v15.16b}, v26.16b
331	tbl	v16.16b, {v16.16b}, v26.16b
332	tbl	v17.16b, {v17.16b}, v26.16b
333	tbl	v19.16b, {v19.16b}, v26.16b
334
335	add	v12.4s, v12.4s, v18.4s
336	add	v13.4s, v13.4s, v15.4s
337	add	v10.4s, v10.4s, v16.4s
338	add	v11.4s, v11.4s, v17.4s
339	add	v14.4s, v14.4s, v19.4s
340
341	eor	v20.16b, v20.16b, v12.16b
342	eor	v6.16b, v6.16b, v13.16b
343	eor	v7.16b, v7.16b, v10.16b
344	eor	v8.16b, v8.16b, v11.16b
345	eor	v5.16b, v5.16b, v14.16b
346
347	ushr	v9.4s, v5.4s, #25
348	sli	v9.4s, v5.4s, #7
349	ushr	v5.4s, v8.4s, #25
350	sli	v5.4s, v8.4s, #7
351	ushr	v8.4s, v7.4s, #25
352	sli	v8.4s, v7.4s, #7
353	ushr	v7.4s, v6.4s, #25
354	sli	v7.4s, v6.4s, #7
355	ushr	v6.4s, v20.4s, #25
356	sli	v6.4s, v20.4s, #7
357
358	ext	v9.16b, v9.16b, v9.16b, #12
359	ext	v14.16b, v14.16b, v14.16b, #8
360	ext	v19.16b, v19.16b, v19.16b, #4
361	subs	x6, x6, #1
362	b.hi	Lseal_init_rounds
363
364	add	v15.4s, v15.4s, v25.4s
365	mov	x11, #4
366	dup	v20.4s, w11
367	add	v25.4s, v25.4s, v20.4s
368
369	zip1	v20.4s, v0.4s, v1.4s
370	zip2	v21.4s, v0.4s, v1.4s
371	zip1	v22.4s, v2.4s, v3.4s
372	zip2	v23.4s, v2.4s, v3.4s
373
374	zip1	v0.2d, v20.2d, v22.2d
375	zip2	v1.2d, v20.2d, v22.2d
376	zip1	v2.2d, v21.2d, v23.2d
377	zip2	v3.2d, v21.2d, v23.2d
378
379	zip1	v20.4s, v5.4s, v6.4s
380	zip2	v21.4s, v5.4s, v6.4s
381	zip1	v22.4s, v7.4s, v8.4s
382	zip2	v23.4s, v7.4s, v8.4s
383
384	zip1	v5.2d, v20.2d, v22.2d
385	zip2	v6.2d, v20.2d, v22.2d
386	zip1	v7.2d, v21.2d, v23.2d
387	zip2	v8.2d, v21.2d, v23.2d
388
389	zip1	v20.4s, v10.4s, v11.4s
390	zip2	v21.4s, v10.4s, v11.4s
391	zip1	v22.4s, v12.4s, v13.4s
392	zip2	v23.4s, v12.4s, v13.4s
393
394	zip1	v10.2d, v20.2d, v22.2d
395	zip2	v11.2d, v20.2d, v22.2d
396	zip1	v12.2d, v21.2d, v23.2d
397	zip2	v13.2d, v21.2d, v23.2d
398
399	zip1	v20.4s, v15.4s, v16.4s
400	zip2	v21.4s, v15.4s, v16.4s
401	zip1	v22.4s, v17.4s, v18.4s
402	zip2	v23.4s, v17.4s, v18.4s
403
404	zip1	v15.2d, v20.2d, v22.2d
405	zip2	v16.2d, v20.2d, v22.2d
406	zip1	v17.2d, v21.2d, v23.2d
407	zip2	v18.2d, v21.2d, v23.2d
408
409	add	v4.4s, v4.4s, v24.4s
410	add	v9.4s, v9.4s, v28.4s
411	and	v4.16b, v4.16b, v27.16b
412
413	add	v0.4s, v0.4s, v24.4s
414	add	v5.4s, v5.4s, v28.4s
415	add	v10.4s, v10.4s, v29.4s
416	add	v15.4s, v15.4s, v30.4s
417
418	add	v1.4s, v1.4s, v24.4s
419	add	v6.4s, v6.4s, v28.4s
420	add	v11.4s, v11.4s, v29.4s
421	add	v16.4s, v16.4s, v30.4s
422
423	add	v2.4s, v2.4s, v24.4s
424	add	v7.4s, v7.4s, v28.4s
425	add	v12.4s, v12.4s, v29.4s
426	add	v17.4s, v17.4s, v30.4s
427
428	add	v3.4s, v3.4s, v24.4s
429	add	v8.4s, v8.4s, v28.4s
430	add	v13.4s, v13.4s, v29.4s
431	add	v18.4s, v18.4s, v30.4s
432
433	mov	x16, v4.d[0] // Move the R key to GPRs
434	mov	x17, v4.d[1]
435	mov	v27.16b, v9.16b // Store the S key
436
437	bl	Lpoly_hash_ad_internal
438
439	mov	x3, x0
440	cmp	x2, #256
441	b.le	Lseal_tail
442
443	ld1	{v20.16b - v23.16b}, [x1], #64
444	eor	v20.16b, v20.16b, v0.16b
445	eor	v21.16b, v21.16b, v5.16b
446	eor	v22.16b, v22.16b, v10.16b
447	eor	v23.16b, v23.16b, v15.16b
448	st1	{v20.16b - v23.16b}, [x0], #64
449
450	ld1	{v20.16b - v23.16b}, [x1], #64
451	eor	v20.16b, v20.16b, v1.16b
452	eor	v21.16b, v21.16b, v6.16b
453	eor	v22.16b, v22.16b, v11.16b
454	eor	v23.16b, v23.16b, v16.16b
455	st1	{v20.16b - v23.16b}, [x0], #64
456
457	ld1	{v20.16b - v23.16b}, [x1], #64
458	eor	v20.16b, v20.16b, v2.16b
459	eor	v21.16b, v21.16b, v7.16b
460	eor	v22.16b, v22.16b, v12.16b
461	eor	v23.16b, v23.16b, v17.16b
462	st1	{v20.16b - v23.16b}, [x0], #64
463
464	ld1	{v20.16b - v23.16b}, [x1], #64
465	eor	v20.16b, v20.16b, v3.16b
466	eor	v21.16b, v21.16b, v8.16b
467	eor	v22.16b, v22.16b, v13.16b
468	eor	v23.16b, v23.16b, v18.16b
469	st1	{v20.16b - v23.16b}, [x0], #64
470
471	sub	x2, x2, #256
472
473	mov	x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds
474	mov	x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256
475
476Lseal_main_loop:
477	adrp	x11, Lchacha20_consts
478	add	x11, x11, :lo12:Lchacha20_consts
479
480	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
481	mov	v4.16b, v24.16b
482
483	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
484	mov	v9.16b, v28.16b
485
486	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
487	mov	v14.16b, v29.16b
488
489	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
490	add	v15.4s, v15.4s, v25.4s
491	mov	v19.16b, v30.16b
492
493	eor	v20.16b, v20.16b, v20.16b //zero
494	not	v21.16b, v20.16b // -1
495	sub	v21.4s, v25.4s, v21.4s // Add +1
496	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
497	add	v19.4s, v19.4s, v20.4s
498
499	sub	x5, x5, #32
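    // The ChaCha20 rounds below are interleaved with Poly1305 hashing of ciphertext
    // that has already been written out: x3 trails the output pointer x0 and is
    // consumed 16 bytes per hash step.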
500.align	5
501Lseal_main_loop_rounds:
502	add	v0.4s, v0.4s, v5.4s
503	add	v1.4s, v1.4s, v6.4s
504	add	v2.4s, v2.4s, v7.4s
505	add	v3.4s, v3.4s, v8.4s
506	add	v4.4s, v4.4s, v9.4s
507
508	eor	v15.16b, v15.16b, v0.16b
509	eor	v16.16b, v16.16b, v1.16b
510	eor	v17.16b, v17.16b, v2.16b
511	eor	v18.16b, v18.16b, v3.16b
512	eor	v19.16b, v19.16b, v4.16b
513
514	rev32	v15.8h, v15.8h
515	rev32	v16.8h, v16.8h
516	rev32	v17.8h, v17.8h
517	rev32	v18.8h, v18.8h
518	rev32	v19.8h, v19.8h
519
520	add	v10.4s, v10.4s, v15.4s
521	add	v11.4s, v11.4s, v16.4s
522	add	v12.4s, v12.4s, v17.4s
523	add	v13.4s, v13.4s, v18.4s
524	add	v14.4s, v14.4s, v19.4s
525
526	eor	v5.16b, v5.16b, v10.16b
527	eor	v6.16b, v6.16b, v11.16b
528	eor	v7.16b, v7.16b, v12.16b
529	eor	v8.16b, v8.16b, v13.16b
530	eor	v9.16b, v9.16b, v14.16b
531
532	ushr	v20.4s, v5.4s, #20
533	sli	v20.4s, v5.4s, #12
534	ushr	v5.4s, v6.4s, #20
535	sli	v5.4s, v6.4s, #12
536	ushr	v6.4s, v7.4s, #20
537	sli	v6.4s, v7.4s, #12
538	ushr	v7.4s, v8.4s, #20
539	sli	v7.4s, v8.4s, #12
540	ushr	v8.4s, v9.4s, #20
541	sli	v8.4s, v9.4s, #12
542
543	add	v0.4s, v0.4s, v20.4s
544	add	v1.4s, v1.4s, v5.4s
545	add	v2.4s, v2.4s, v6.4s
546	add	v3.4s, v3.4s, v7.4s
547	add	v4.4s, v4.4s, v8.4s
548
549	eor	v15.16b, v15.16b, v0.16b
550	eor	v16.16b, v16.16b, v1.16b
551	eor	v17.16b, v17.16b, v2.16b
552	eor	v18.16b, v18.16b, v3.16b
553	eor	v19.16b, v19.16b, v4.16b
554
555	tbl	v15.16b, {v15.16b}, v26.16b
556	tbl	v16.16b, {v16.16b}, v26.16b
557	tbl	v17.16b, {v17.16b}, v26.16b
558	tbl	v18.16b, {v18.16b}, v26.16b
559	tbl	v19.16b, {v19.16b}, v26.16b
560
561	add	v10.4s, v10.4s, v15.4s
562	add	v11.4s, v11.4s, v16.4s
563	add	v12.4s, v12.4s, v17.4s
564	add	v13.4s, v13.4s, v18.4s
565	add	v14.4s, v14.4s, v19.4s
566
567	eor	v20.16b, v20.16b, v10.16b
568	eor	v5.16b, v5.16b, v11.16b
569	eor	v6.16b, v6.16b, v12.16b
570	eor	v7.16b, v7.16b, v13.16b
571	eor	v8.16b, v8.16b, v14.16b
572
573	ushr	v9.4s, v8.4s, #25
574	sli	v9.4s, v8.4s, #7
575	ushr	v8.4s, v7.4s, #25
576	sli	v8.4s, v7.4s, #7
577	ushr	v7.4s, v6.4s, #25
578	sli	v7.4s, v6.4s, #7
579	ushr	v6.4s, v5.4s, #25
580	sli	v6.4s, v5.4s, #7
581	ushr	v5.4s, v20.4s, #25
582	sli	v5.4s, v20.4s, #7
583
584	ext	v9.16b, v9.16b, v9.16b, #4
585	ext	v14.16b, v14.16b, v14.16b, #8
586	ext	v19.16b, v19.16b, v19.16b, #12
587	ldp	x11, x12, [x3], 16
588	adds	x8, x8, x11
589	adcs	x9, x9, x12
590	adc	x10, x10, x15
591	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
592	umulh	x12, x8, x16
593	mul	x13, x9, x16
594	umulh	x14, x9, x16
595	adds	x12, x12, x13
596	mul	x13, x10, x16
597	adc	x13, x13, x14
598	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
599	umulh	x8, x8, x17
600	adds	x12, x12, x14
601	mul	x14, x9, x17
602	umulh	x9, x9, x17
603	adcs	x14, x14, x8
604	mul	x10, x10, x17
605	adc	x10, x10, x9
606	adds	x13, x13, x14
607	adc	x14, x10, xzr
608	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
609	and	x8, x13, #-4
610	extr	x13, x14, x13, #2
611	adds	x8, x8, x11
612	lsr	x11, x14, #2
613	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
614	adds	x8, x8, x13
615	adcs	x9, x9, x12
616	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
617	add	v0.4s, v0.4s, v6.4s
618	add	v1.4s, v1.4s, v7.4s
619	add	v2.4s, v2.4s, v8.4s
620	add	v3.4s, v3.4s, v5.4s
621	add	v4.4s, v4.4s, v9.4s
622
623	eor	v18.16b, v18.16b, v0.16b
624	eor	v15.16b, v15.16b, v1.16b
625	eor	v16.16b, v16.16b, v2.16b
626	eor	v17.16b, v17.16b, v3.16b
627	eor	v19.16b, v19.16b, v4.16b
628
629	rev32	v18.8h, v18.8h
630	rev32	v15.8h, v15.8h
631	rev32	v16.8h, v16.8h
632	rev32	v17.8h, v17.8h
633	rev32	v19.8h, v19.8h
634
635	add	v12.4s, v12.4s, v18.4s
636	add	v13.4s, v13.4s, v15.4s
637	add	v10.4s, v10.4s, v16.4s
638	add	v11.4s, v11.4s, v17.4s
639	add	v14.4s, v14.4s, v19.4s
640
641	eor	v6.16b, v6.16b, v12.16b
642	eor	v7.16b, v7.16b, v13.16b
643	eor	v8.16b, v8.16b, v10.16b
644	eor	v5.16b, v5.16b, v11.16b
645	eor	v9.16b, v9.16b, v14.16b
646
647	ushr	v20.4s, v6.4s, #20
648	sli	v20.4s, v6.4s, #12
649	ushr	v6.4s, v7.4s, #20
650	sli	v6.4s, v7.4s, #12
651	ushr	v7.4s, v8.4s, #20
652	sli	v7.4s, v8.4s, #12
653	ushr	v8.4s, v5.4s, #20
654	sli	v8.4s, v5.4s, #12
655	ushr	v5.4s, v9.4s, #20
656	sli	v5.4s, v9.4s, #12
657
658	add	v0.4s, v0.4s, v20.4s
659	add	v1.4s, v1.4s, v6.4s
660	add	v2.4s, v2.4s, v7.4s
661	add	v3.4s, v3.4s, v8.4s
662	add	v4.4s, v4.4s, v5.4s
663
664	eor	v18.16b, v18.16b, v0.16b
665	eor	v15.16b, v15.16b, v1.16b
666	eor	v16.16b, v16.16b, v2.16b
667	eor	v17.16b, v17.16b, v3.16b
668	eor	v19.16b, v19.16b, v4.16b
669
670	tbl	v18.16b, {v18.16b}, v26.16b
671	tbl	v15.16b, {v15.16b}, v26.16b
672	tbl	v16.16b, {v16.16b}, v26.16b
673	tbl	v17.16b, {v17.16b}, v26.16b
674	tbl	v19.16b, {v19.16b}, v26.16b
675
676	add	v12.4s, v12.4s, v18.4s
677	add	v13.4s, v13.4s, v15.4s
678	add	v10.4s, v10.4s, v16.4s
679	add	v11.4s, v11.4s, v17.4s
680	add	v14.4s, v14.4s, v19.4s
681
682	eor	v20.16b, v20.16b, v12.16b
683	eor	v6.16b, v6.16b, v13.16b
684	eor	v7.16b, v7.16b, v10.16b
685	eor	v8.16b, v8.16b, v11.16b
686	eor	v5.16b, v5.16b, v14.16b
687
688	ushr	v9.4s, v5.4s, #25
689	sli	v9.4s, v5.4s, #7
690	ushr	v5.4s, v8.4s, #25
691	sli	v5.4s, v8.4s, #7
692	ushr	v8.4s, v7.4s, #25
693	sli	v8.4s, v7.4s, #7
694	ushr	v7.4s, v6.4s, #25
695	sli	v7.4s, v6.4s, #7
696	ushr	v6.4s, v20.4s, #25
697	sli	v6.4s, v20.4s, #7
698
699	ext	v9.16b, v9.16b, v9.16b, #12
700	ext	v14.16b, v14.16b, v14.16b, #8
701	ext	v19.16b, v19.16b, v19.16b, #4
702	subs	x6, x6, #1
703	b.ge	Lseal_main_loop_rounds
704	ldp	x11, x12, [x3], 16
705	adds	x8, x8, x11
706	adcs	x9, x9, x12
707	adc	x10, x10, x15
708	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
709	umulh	x12, x8, x16
710	mul	x13, x9, x16
711	umulh	x14, x9, x16
712	adds	x12, x12, x13
713	mul	x13, x10, x16
714	adc	x13, x13, x14
715	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
716	umulh	x8, x8, x17
717	adds	x12, x12, x14
718	mul	x14, x9, x17
719	umulh	x9, x9, x17
720	adcs	x14, x14, x8
721	mul	x10, x10, x17
722	adc	x10, x10, x9
723	adds	x13, x13, x14
724	adc	x14, x10, xzr
725	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
726	and	x8, x13, #-4
727	extr	x13, x14, x13, #2
728	adds	x8, x8, x11
729	lsr	x11, x14, #2
730	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
731	adds	x8, x8, x13
732	adcs	x9, x9, x12
733	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
734	subs	x7, x7, #1
735	b.gt	Lseal_main_loop_rounds
736
737	eor	v20.16b, v20.16b, v20.16b //zero
738	not	v21.16b, v20.16b // -1
739	sub	v21.4s, v25.4s, v21.4s // Add +1
740	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
741	add	v19.4s, v19.4s, v20.4s
742
743	add	v15.4s, v15.4s, v25.4s
744	mov	x11, #5
745	dup	v20.4s, w11
746	add	v25.4s, v25.4s, v20.4s
747
748	zip1	v20.4s, v0.4s, v1.4s
749	zip2	v21.4s, v0.4s, v1.4s
750	zip1	v22.4s, v2.4s, v3.4s
751	zip2	v23.4s, v2.4s, v3.4s
752
753	zip1	v0.2d, v20.2d, v22.2d
754	zip2	v1.2d, v20.2d, v22.2d
755	zip1	v2.2d, v21.2d, v23.2d
756	zip2	v3.2d, v21.2d, v23.2d
757
758	zip1	v20.4s, v5.4s, v6.4s
759	zip2	v21.4s, v5.4s, v6.4s
760	zip1	v22.4s, v7.4s, v8.4s
761	zip2	v23.4s, v7.4s, v8.4s
762
763	zip1	v5.2d, v20.2d, v22.2d
764	zip2	v6.2d, v20.2d, v22.2d
765	zip1	v7.2d, v21.2d, v23.2d
766	zip2	v8.2d, v21.2d, v23.2d
767
768	zip1	v20.4s, v10.4s, v11.4s
769	zip2	v21.4s, v10.4s, v11.4s
770	zip1	v22.4s, v12.4s, v13.4s
771	zip2	v23.4s, v12.4s, v13.4s
772
773	zip1	v10.2d, v20.2d, v22.2d
774	zip2	v11.2d, v20.2d, v22.2d
775	zip1	v12.2d, v21.2d, v23.2d
776	zip2	v13.2d, v21.2d, v23.2d
777
778	zip1	v20.4s, v15.4s, v16.4s
779	zip2	v21.4s, v15.4s, v16.4s
780	zip1	v22.4s, v17.4s, v18.4s
781	zip2	v23.4s, v17.4s, v18.4s
782
783	zip1	v15.2d, v20.2d, v22.2d
784	zip2	v16.2d, v20.2d, v22.2d
785	zip1	v17.2d, v21.2d, v23.2d
786	zip2	v18.2d, v21.2d, v23.2d
787
788	add	v0.4s, v0.4s, v24.4s
789	add	v5.4s, v5.4s, v28.4s
790	add	v10.4s, v10.4s, v29.4s
791	add	v15.4s, v15.4s, v30.4s
792
793	add	v1.4s, v1.4s, v24.4s
794	add	v6.4s, v6.4s, v28.4s
795	add	v11.4s, v11.4s, v29.4s
796	add	v16.4s, v16.4s, v30.4s
797
798	add	v2.4s, v2.4s, v24.4s
799	add	v7.4s, v7.4s, v28.4s
800	add	v12.4s, v12.4s, v29.4s
801	add	v17.4s, v17.4s, v30.4s
802
803	add	v3.4s, v3.4s, v24.4s
804	add	v8.4s, v8.4s, v28.4s
805	add	v13.4s, v13.4s, v29.4s
806	add	v18.4s, v18.4s, v30.4s
807
808	add	v4.4s, v4.4s, v24.4s
809	add	v9.4s, v9.4s, v28.4s
810	add	v14.4s, v14.4s, v29.4s
811	add	v19.4s, v19.4s, v30.4s
812
813	cmp	x2, #320
814	b.le	Lseal_tail
815
816	ld1	{v20.16b - v23.16b}, [x1], #64
817	eor	v20.16b, v20.16b, v0.16b
818	eor	v21.16b, v21.16b, v5.16b
819	eor	v22.16b, v22.16b, v10.16b
820	eor	v23.16b, v23.16b, v15.16b
821	st1	{v20.16b - v23.16b}, [x0], #64
822
823	ld1	{v20.16b - v23.16b}, [x1], #64
824	eor	v20.16b, v20.16b, v1.16b
825	eor	v21.16b, v21.16b, v6.16b
826	eor	v22.16b, v22.16b, v11.16b
827	eor	v23.16b, v23.16b, v16.16b
828	st1	{v20.16b - v23.16b}, [x0], #64
829
830	ld1	{v20.16b - v23.16b}, [x1], #64
831	eor	v20.16b, v20.16b, v2.16b
832	eor	v21.16b, v21.16b, v7.16b
833	eor	v22.16b, v22.16b, v12.16b
834	eor	v23.16b, v23.16b, v17.16b
835	st1	{v20.16b - v23.16b}, [x0], #64
836
837	ld1	{v20.16b - v23.16b}, [x1], #64
838	eor	v20.16b, v20.16b, v3.16b
839	eor	v21.16b, v21.16b, v8.16b
840	eor	v22.16b, v22.16b, v13.16b
841	eor	v23.16b, v23.16b, v18.16b
842	st1	{v20.16b - v23.16b}, [x0], #64
843
844	ld1	{v20.16b - v23.16b}, [x1], #64
845	eor	v20.16b, v20.16b, v4.16b
846	eor	v21.16b, v21.16b, v9.16b
847	eor	v22.16b, v22.16b, v14.16b
848	eor	v23.16b, v23.16b, v19.16b
849	st1	{v20.16b - v23.16b}, [x0], #64
850
851	sub	x2, x2, #320
852
853	mov	x6, #0
854	mov	x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration
855
856	b	Lseal_main_loop
857
858Lseal_tail:
859    // This part of the function handles the storage and authentication of the last [0,320) bytes
860    // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data.
861	cmp	x2, #64
862	b.lt	Lseal_tail_64
863
864    // Store and authenticate 64B blocks per iteration
865	ld1	{v20.16b - v23.16b}, [x1], #64
866
867	eor	v20.16b, v20.16b, v0.16b
868	eor	v21.16b, v21.16b, v5.16b
869	eor	v22.16b, v22.16b, v10.16b
870	eor	v23.16b, v23.16b, v15.16b
871	mov	x11, v20.d[0]
872	mov	x12, v20.d[1]
873	adds	x8, x8, x11
874	adcs	x9, x9, x12
875	adc	x10, x10, x15
876	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
877	umulh	x12, x8, x16
878	mul	x13, x9, x16
879	umulh	x14, x9, x16
880	adds	x12, x12, x13
881	mul	x13, x10, x16
882	adc	x13, x13, x14
883	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
884	umulh	x8, x8, x17
885	adds	x12, x12, x14
886	mul	x14, x9, x17
887	umulh	x9, x9, x17
888	adcs	x14, x14, x8
889	mul	x10, x10, x17
890	adc	x10, x10, x9
891	adds	x13, x13, x14
892	adc	x14, x10, xzr
893	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
894	and	x8, x13, #-4
895	extr	x13, x14, x13, #2
896	adds	x8, x8, x11
897	lsr	x11, x14, #2
898	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
899	adds	x8, x8, x13
900	adcs	x9, x9, x12
901	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
902	mov	x11, v21.d[0]
903	mov	x12, v21.d[1]
904	adds	x8, x8, x11
905	adcs	x9, x9, x12
906	adc	x10, x10, x15
907	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
908	umulh	x12, x8, x16
909	mul	x13, x9, x16
910	umulh	x14, x9, x16
911	adds	x12, x12, x13
912	mul	x13, x10, x16
913	adc	x13, x13, x14
914	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
915	umulh	x8, x8, x17
916	adds	x12, x12, x14
917	mul	x14, x9, x17
918	umulh	x9, x9, x17
919	adcs	x14, x14, x8
920	mul	x10, x10, x17
921	adc	x10, x10, x9
922	adds	x13, x13, x14
923	adc	x14, x10, xzr
924	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
925	and	x8, x13, #-4
926	extr	x13, x14, x13, #2
927	adds	x8, x8, x11
928	lsr	x11, x14, #2
929	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
930	adds	x8, x8, x13
931	adcs	x9, x9, x12
932	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
933	mov	x11, v22.d[0]
934	mov	x12, v22.d[1]
935	adds	x8, x8, x11
936	adcs	x9, x9, x12
937	adc	x10, x10, x15
938	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
939	umulh	x12, x8, x16
940	mul	x13, x9, x16
941	umulh	x14, x9, x16
942	adds	x12, x12, x13
943	mul	x13, x10, x16
944	adc	x13, x13, x14
945	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
946	umulh	x8, x8, x17
947	adds	x12, x12, x14
948	mul	x14, x9, x17
949	umulh	x9, x9, x17
950	adcs	x14, x14, x8
951	mul	x10, x10, x17
952	adc	x10, x10, x9
953	adds	x13, x13, x14
954	adc	x14, x10, xzr
955	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
956	and	x8, x13, #-4
957	extr	x13, x14, x13, #2
958	adds	x8, x8, x11
959	lsr	x11, x14, #2
960	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
961	adds	x8, x8, x13
962	adcs	x9, x9, x12
963	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
964	mov	x11, v23.d[0]
965	mov	x12, v23.d[1]
966	adds	x8, x8, x11
967	adcs	x9, x9, x12
968	adc	x10, x10, x15
969	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
970	umulh	x12, x8, x16
971	mul	x13, x9, x16
972	umulh	x14, x9, x16
973	adds	x12, x12, x13
974	mul	x13, x10, x16
975	adc	x13, x13, x14
976	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
977	umulh	x8, x8, x17
978	adds	x12, x12, x14
979	mul	x14, x9, x17
980	umulh	x9, x9, x17
981	adcs	x14, x14, x8
982	mul	x10, x10, x17
983	adc	x10, x10, x9
984	adds	x13, x13, x14
985	adc	x14, x10, xzr
986	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
987	and	x8, x13, #-4
988	extr	x13, x14, x13, #2
989	adds	x8, x8, x11
990	lsr	x11, x14, #2
991	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
992	adds	x8, x8, x13
993	adcs	x9, x9, x12
994	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
995	st1	{v20.16b - v23.16b}, [x0], #64
996	sub	x2, x2, #64
997
998    // Shift the state left by 64 bytes for the next iteration of the loop
999	mov	v0.16b, v1.16b
1000	mov	v5.16b, v6.16b
1001	mov	v10.16b, v11.16b
1002	mov	v15.16b, v16.16b
1003
1004	mov	v1.16b, v2.16b
1005	mov	v6.16b, v7.16b
1006	mov	v11.16b, v12.16b
1007	mov	v16.16b, v17.16b
1008
1009	mov	v2.16b, v3.16b
1010	mov	v7.16b, v8.16b
1011	mov	v12.16b, v13.16b
1012	mov	v17.16b, v18.16b
1013
1014	mov	v3.16b, v4.16b
1015	mov	v8.16b, v9.16b
1016	mov	v13.16b, v14.16b
1017	mov	v18.16b, v19.16b
1018
1019	b	Lseal_tail
1020
1021Lseal_tail_64:
1022	ldp	x3, x4, [x5, #48] // extra_in_len and extra_in_ptr
1023
1024    // Here we handle the last [0,64) bytes of plaintext
1025	cmp	x2, #16
1026	b.lt	Lseal_tail_16
    // Each iteration encrypts and authenticates a 16B block
1028	ld1	{v20.16b}, [x1], #16
1029	eor	v20.16b, v20.16b, v0.16b
1030	mov	x11, v20.d[0]
1031	mov	x12, v20.d[1]
1032	adds	x8, x8, x11
1033	adcs	x9, x9, x12
1034	adc	x10, x10, x15
1035	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1036	umulh	x12, x8, x16
1037	mul	x13, x9, x16
1038	umulh	x14, x9, x16
1039	adds	x12, x12, x13
1040	mul	x13, x10, x16
1041	adc	x13, x13, x14
1042	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1043	umulh	x8, x8, x17
1044	adds	x12, x12, x14
1045	mul	x14, x9, x17
1046	umulh	x9, x9, x17
1047	adcs	x14, x14, x8
1048	mul	x10, x10, x17
1049	adc	x10, x10, x9
1050	adds	x13, x13, x14
1051	adc	x14, x10, xzr
1052	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1053	and	x8, x13, #-4
1054	extr	x13, x14, x13, #2
1055	adds	x8, x8, x11
1056	lsr	x11, x14, #2
1057	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1058	adds	x8, x8, x13
1059	adcs	x9, x9, x12
1060	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1061	st1	{v20.16b}, [x0], #16
1062
1063	sub	x2, x2, #16
1064
1065    // Shift the state left by 16 bytes for the next iteration of the loop
1066	mov	v0.16b, v5.16b
1067	mov	v5.16b, v10.16b
1068	mov	v10.16b, v15.16b
1069
1070	b	Lseal_tail_64
1071
1072Lseal_tail_16:
1073    // Here we handle the last [0,16) bytes of ciphertext that require a padded block
1074	cbz	x2, Lseal_hash_extra
1075
1076	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in
1077	eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes
1078	not	v22.16b, v20.16b
1079
1080	mov	x6, x2
1081	add	x1, x1, x2
1082
1083	cbz	x4, Lseal_tail_16_compose // No extra data to pad with, zero padding
1084
1085	mov	x7, #16          // We need to load some extra_in first for padding
1086	sub	x7, x7, x2
1087	cmp	x4, x7
1088	csel	x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register
1089	mov	x12, x7
1090	add	x3, x3, x7
1091	sub	x4, x4, x7
1092
1093Lseal_tail16_compose_extra_in:
1094	ext	v20.16b, v20.16b, v20.16b, #15
1095	ldrb	w11, [x3, #-1]!
1096	mov	v20.b[0], w11
1097	subs	x7, x7, #1
1098	b.gt	Lseal_tail16_compose_extra_in
1099
1100	add	x3, x3, x12
1101
1102Lseal_tail_16_compose:
1103	ext	v20.16b, v20.16b, v20.16b, #15
1104	ldrb	w11, [x1, #-1]!
1105	mov	v20.b[0], w11
1106	ext	v21.16b, v22.16b, v21.16b, #15
1107	subs	x2, x2, #1
1108	b.gt	Lseal_tail_16_compose
1109
1110	and	v0.16b, v0.16b, v21.16b
1111	eor	v20.16b, v20.16b, v0.16b
1112	mov	v21.16b, v20.16b
1113
1114Lseal_tail_16_store:
1115	umov	w11, v20.b[0]
1116	strb	w11, [x0], #1
1117	ext	v20.16b, v20.16b, v20.16b, #1
1118	subs	x6, x6, #1
1119	b.gt	Lseal_tail_16_store
1120
1121    // Hash in the final ct block concatenated with extra_in
1122	mov	x11, v21.d[0]
1123	mov	x12, v21.d[1]
1124	adds	x8, x8, x11
1125	adcs	x9, x9, x12
1126	adc	x10, x10, x15
1127	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1128	umulh	x12, x8, x16
1129	mul	x13, x9, x16
1130	umulh	x14, x9, x16
1131	adds	x12, x12, x13
1132	mul	x13, x10, x16
1133	adc	x13, x13, x14
1134	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1135	umulh	x8, x8, x17
1136	adds	x12, x12, x14
1137	mul	x14, x9, x17
1138	umulh	x9, x9, x17
1139	adcs	x14, x14, x8
1140	mul	x10, x10, x17
1141	adc	x10, x10, x9
1142	adds	x13, x13, x14
1143	adc	x14, x10, xzr
1144	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1145	and	x8, x13, #-4
1146	extr	x13, x14, x13, #2
1147	adds	x8, x8, x11
1148	lsr	x11, x14, #2
1149	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1150	adds	x8, x8, x13
1151	adcs	x9, x9, x12
1152	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1153
1154Lseal_hash_extra:
1155	cbz	x4, Lseal_finalize
1156
1157Lseal_hash_extra_loop:
1158	cmp	x4, #16
1159	b.lt	Lseal_hash_extra_tail
1160	ld1	{v20.16b}, [x3], #16
1161	mov	x11, v20.d[0]
1162	mov	x12, v20.d[1]
1163	adds	x8, x8, x11
1164	adcs	x9, x9, x12
1165	adc	x10, x10, x15
1166	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1167	umulh	x12, x8, x16
1168	mul	x13, x9, x16
1169	umulh	x14, x9, x16
1170	adds	x12, x12, x13
1171	mul	x13, x10, x16
1172	adc	x13, x13, x14
1173	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1174	umulh	x8, x8, x17
1175	adds	x12, x12, x14
1176	mul	x14, x9, x17
1177	umulh	x9, x9, x17
1178	adcs	x14, x14, x8
1179	mul	x10, x10, x17
1180	adc	x10, x10, x9
1181	adds	x13, x13, x14
1182	adc	x14, x10, xzr
1183	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1184	and	x8, x13, #-4
1185	extr	x13, x14, x13, #2
1186	adds	x8, x8, x11
1187	lsr	x11, x14, #2
1188	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1189	adds	x8, x8, x13
1190	adcs	x9, x9, x12
1191	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1192	sub	x4, x4, #16
1193	b	Lseal_hash_extra_loop
1194
1195Lseal_hash_extra_tail:
1196	cbz	x4, Lseal_finalize
1197	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext
1198	add	x3, x3, x4
1199
1200Lseal_hash_extra_load:
1201	ext	v20.16b, v20.16b, v20.16b, #15
1202	ldrb	w11, [x3, #-1]!
1203	mov	v20.b[0], w11
1204	subs	x4, x4, #1
1205	b.gt	Lseal_hash_extra_load
1206
    // Hash in the final padded extra_in block
1208	mov	x11, v20.d[0]
1209	mov	x12, v20.d[1]
1210	adds	x8, x8, x11
1211	adcs	x9, x9, x12
1212	adc	x10, x10, x15
1213	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1214	umulh	x12, x8, x16
1215	mul	x13, x9, x16
1216	umulh	x14, x9, x16
1217	adds	x12, x12, x13
1218	mul	x13, x10, x16
1219	adc	x13, x13, x14
1220	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1221	umulh	x8, x8, x17
1222	adds	x12, x12, x14
1223	mul	x14, x9, x17
1224	umulh	x9, x9, x17
1225	adcs	x14, x14, x8
1226	mul	x10, x10, x17
1227	adc	x10, x10, x9
1228	adds	x13, x13, x14
1229	adc	x14, x10, xzr
1230	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1231	and	x8, x13, #-4
1232	extr	x13, x14, x13, #2
1233	adds	x8, x8, x11
1234	lsr	x11, x14, #2
1235	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1236	adds	x8, x8, x13
1237	adcs	x9, x9, x12
1238	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1239
1240Lseal_finalize:
1241	mov	x11, v31.d[0]
1242	mov	x12, v31.d[1]
1243	adds	x8, x8, x11
1244	adcs	x9, x9, x12
1245	adc	x10, x10, x15
1246	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1247	umulh	x12, x8, x16
1248	mul	x13, x9, x16
1249	umulh	x14, x9, x16
1250	adds	x12, x12, x13
1251	mul	x13, x10, x16
1252	adc	x13, x13, x14
1253	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1254	umulh	x8, x8, x17
1255	adds	x12, x12, x14
1256	mul	x14, x9, x17
1257	umulh	x9, x9, x17
1258	adcs	x14, x14, x8
1259	mul	x10, x10, x17
1260	adc	x10, x10, x9
1261	adds	x13, x13, x14
1262	adc	x14, x10, xzr
1263	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1264	and	x8, x13, #-4
1265	extr	x13, x14, x13, #2
1266	adds	x8, x8, x11
1267	lsr	x11, x14, #2
1268	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1269	adds	x8, x8, x13
1270	adcs	x9, x9, x12
1271	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1272    // Final reduction step
1273	sub	x12, xzr, x15
1274	orr	x13, xzr, #3
1275	subs	x11, x8, #-5
1276	sbcs	x12, x9, x12
1277	sbcs	x13, x10, x13
1278	csel	x8, x11, x8, cs
1279	csel	x9, x12, x9, cs
1280	csel	x10, x13, x10, cs
1281	mov	x11, v27.d[0]
1282	mov	x12, v27.d[1]
1283	adds	x8, x8, x11
1284	adcs	x9, x9, x12
1285	adc	x10, x10, x15
1286
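    // acc is now fully reduced mod 2^130-5 and the S key (saved in v27) has been
    // added; the low 128 bits form the Poly1305 tag, written back through x5.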
1287	stp	x8, x9, [x5]
1288
1289	ldp	d8, d9, [sp, #16]
1290	ldp	d10, d11, [sp, #32]
1291	ldp	d12, d13, [sp, #48]
1292	ldp	d14, d15, [sp, #64]
1293.cfi_restore	b15
1294.cfi_restore	b14
1295.cfi_restore	b13
1296.cfi_restore	b12
1297.cfi_restore	b11
1298.cfi_restore	b10
1299.cfi_restore	b9
1300.cfi_restore	b8
1301	ldp	x29, x30, [sp], 80
1302.cfi_restore	w29
1303.cfi_restore	w30
1304.cfi_def_cfa_offset	0
1305	AARCH64_VALIDATE_LINK_REGISTER
1306	ret
1307
1308Lseal_128:
1309    // On some architectures preparing 5 blocks for small buffers is wasteful
1310	eor	v25.16b, v25.16b, v25.16b
1311	mov	x11, #1
1312	mov	v25.s[0], w11
1313	mov	v0.16b, v24.16b
1314	mov	v1.16b, v24.16b
1315	mov	v2.16b, v24.16b
1316	mov	v5.16b, v28.16b
1317	mov	v6.16b, v28.16b
1318	mov	v7.16b, v28.16b
1319	mov	v10.16b, v29.16b
1320	mov	v11.16b, v29.16b
1321	mov	v12.16b, v29.16b
1322	mov	v17.16b, v30.16b
1323	add	v15.4s, v17.4s, v25.4s
1324	add	v16.4s, v15.4s, v25.4s
1325
1326	mov	x6, #10
1327
1328Lseal_128_rounds:
1329	add	v0.4s, v0.4s, v5.4s
1330	add	v1.4s, v1.4s, v6.4s
1331	add	v2.4s, v2.4s, v7.4s
1332	eor	v15.16b, v15.16b, v0.16b
1333	eor	v16.16b, v16.16b, v1.16b
1334	eor	v17.16b, v17.16b, v2.16b
1335	rev32	v15.8h, v15.8h
1336	rev32	v16.8h, v16.8h
1337	rev32	v17.8h, v17.8h
1338
1339	add	v10.4s, v10.4s, v15.4s
1340	add	v11.4s, v11.4s, v16.4s
1341	add	v12.4s, v12.4s, v17.4s
1342	eor	v5.16b, v5.16b, v10.16b
1343	eor	v6.16b, v6.16b, v11.16b
1344	eor	v7.16b, v7.16b, v12.16b
1345	ushr	v20.4s, v5.4s, #20
1346	sli	v20.4s, v5.4s, #12
1347	ushr	v5.4s, v6.4s, #20
1348	sli	v5.4s, v6.4s, #12
1349	ushr	v6.4s, v7.4s, #20
1350	sli	v6.4s, v7.4s, #12
1351
1352	add	v0.4s, v0.4s, v20.4s
1353	add	v1.4s, v1.4s, v5.4s
1354	add	v2.4s, v2.4s, v6.4s
1355	eor	v15.16b, v15.16b, v0.16b
1356	eor	v16.16b, v16.16b, v1.16b
1357	eor	v17.16b, v17.16b, v2.16b
1358	tbl	v15.16b, {v15.16b}, v26.16b
1359	tbl	v16.16b, {v16.16b}, v26.16b
1360	tbl	v17.16b, {v17.16b}, v26.16b
1361
1362	add	v10.4s, v10.4s, v15.4s
1363	add	v11.4s, v11.4s, v16.4s
1364	add	v12.4s, v12.4s, v17.4s
1365	eor	v20.16b, v20.16b, v10.16b
1366	eor	v5.16b, v5.16b, v11.16b
1367	eor	v6.16b, v6.16b, v12.16b
1368	ushr	v7.4s, v6.4s, #25
1369	sli	v7.4s, v6.4s, #7
1370	ushr	v6.4s, v5.4s, #25
1371	sli	v6.4s, v5.4s, #7
1372	ushr	v5.4s, v20.4s, #25
1373	sli	v5.4s, v20.4s, #7
1374
1375	ext	v5.16b, v5.16b, v5.16b, #4
1376	ext	v6.16b, v6.16b, v6.16b, #4
1377	ext	v7.16b, v7.16b, v7.16b, #4
1378
1379	ext	v10.16b, v10.16b, v10.16b, #8
1380	ext	v11.16b, v11.16b, v11.16b, #8
1381	ext	v12.16b, v12.16b, v12.16b, #8
1382
1383	ext	v15.16b, v15.16b, v15.16b, #12
1384	ext	v16.16b, v16.16b, v16.16b, #12
1385	ext	v17.16b, v17.16b, v17.16b, #12
1386	add	v0.4s, v0.4s, v5.4s
1387	add	v1.4s, v1.4s, v6.4s
1388	add	v2.4s, v2.4s, v7.4s
1389	eor	v15.16b, v15.16b, v0.16b
1390	eor	v16.16b, v16.16b, v1.16b
1391	eor	v17.16b, v17.16b, v2.16b
1392	rev32	v15.8h, v15.8h
1393	rev32	v16.8h, v16.8h
1394	rev32	v17.8h, v17.8h
1395
1396	add	v10.4s, v10.4s, v15.4s
1397	add	v11.4s, v11.4s, v16.4s
1398	add	v12.4s, v12.4s, v17.4s
1399	eor	v5.16b, v5.16b, v10.16b
1400	eor	v6.16b, v6.16b, v11.16b
1401	eor	v7.16b, v7.16b, v12.16b
1402	ushr	v20.4s, v5.4s, #20
1403	sli	v20.4s, v5.4s, #12
1404	ushr	v5.4s, v6.4s, #20
1405	sli	v5.4s, v6.4s, #12
1406	ushr	v6.4s, v7.4s, #20
1407	sli	v6.4s, v7.4s, #12
1408
1409	add	v0.4s, v0.4s, v20.4s
1410	add	v1.4s, v1.4s, v5.4s
1411	add	v2.4s, v2.4s, v6.4s
1412	eor	v15.16b, v15.16b, v0.16b
1413	eor	v16.16b, v16.16b, v1.16b
1414	eor	v17.16b, v17.16b, v2.16b
1415	tbl	v15.16b, {v15.16b}, v26.16b
1416	tbl	v16.16b, {v16.16b}, v26.16b
1417	tbl	v17.16b, {v17.16b}, v26.16b
1418
1419	add	v10.4s, v10.4s, v15.4s
1420	add	v11.4s, v11.4s, v16.4s
1421	add	v12.4s, v12.4s, v17.4s
1422	eor	v20.16b, v20.16b, v10.16b
1423	eor	v5.16b, v5.16b, v11.16b
1424	eor	v6.16b, v6.16b, v12.16b
1425	ushr	v7.4s, v6.4s, #25
1426	sli	v7.4s, v6.4s, #7
1427	ushr	v6.4s, v5.4s, #25
1428	sli	v6.4s, v5.4s, #7
1429	ushr	v5.4s, v20.4s, #25
1430	sli	v5.4s, v20.4s, #7
1431
1432	ext	v5.16b, v5.16b, v5.16b, #12
1433	ext	v6.16b, v6.16b, v6.16b, #12
1434	ext	v7.16b, v7.16b, v7.16b, #12
1435
1436	ext	v10.16b, v10.16b, v10.16b, #8
1437	ext	v11.16b, v11.16b, v11.16b, #8
1438	ext	v12.16b, v12.16b, v12.16b, #8
1439
1440	ext	v15.16b, v15.16b, v15.16b, #4
1441	ext	v16.16b, v16.16b, v16.16b, #4
1442	ext	v17.16b, v17.16b, v17.16b, #4
1443	subs	x6, x6, #1
1444	b.hi	Lseal_128_rounds
1445
1446	add	v0.4s, v0.4s, v24.4s
1447	add	v1.4s, v1.4s, v24.4s
1448	add	v2.4s, v2.4s, v24.4s
1449
1450	add	v5.4s, v5.4s, v28.4s
1451	add	v6.4s, v6.4s, v28.4s
1452	add	v7.4s, v7.4s, v28.4s
1453
1454    // Only the first 32 bytes of the third block (counter = 0) are needed,
1455    // so skip updating v12 and v17.
1456	add	v10.4s, v10.4s, v29.4s
1457	add	v11.4s, v11.4s, v29.4s
1458
1459	add	v30.4s, v30.4s, v25.4s
1460	add	v15.4s, v15.4s, v30.4s
1461	add	v30.4s, v30.4s, v25.4s
1462	add	v16.4s, v16.4s, v30.4s
1463
1464	and	v2.16b, v2.16b, v27.16b
1465	mov	x16, v2.d[0] // Move the R key to GPRs
1466	mov	x17, v2.d[1]
1467	mov	v27.16b, v7.16b // Store the S key
1468
1469	bl	Lpoly_hash_ad_internal
1470	b	Lseal_tail
1471.cfi_endproc
1472
1473
1474/////////////////////////////////
1475//
1476// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data);
1477//
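// Register use, as the code below uses the arguments: x0 = destination (plaintext is
// written here), x1 = source (ciphertext is read from here), x2 = len_in, x3 = ad,
// x4 = len_ad, x5 = aead_data (key/nonce block).
//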
1478.globl	chacha20_poly1305_open
1479
1480.def chacha20_poly1305_open
1481   .type 32
1482.endef
1483.align	6
1484chacha20_poly1305_open:
1485	AARCH64_SIGN_LINK_REGISTER
1486.cfi_startproc
1487	stp	x29, x30, [sp, #-80]!
1488.cfi_def_cfa_offset	80
1489.cfi_offset	w30, -72
1490.cfi_offset	w29, -80
1491	mov	x29, sp
1492    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
1493    // we don't actually use the frame pointer like that, it's probably not
1494    // worth bothering.
1495	stp	d8, d9, [sp, #16]
1496	stp	d10, d11, [sp, #32]
1497	stp	d12, d13, [sp, #48]
1498	stp	d14, d15, [sp, #64]
1499.cfi_offset	b15, -8
1500.cfi_offset	b14, -16
1501.cfi_offset	b13, -24
1502.cfi_offset	b12, -32
1503.cfi_offset	b11, -40
1504.cfi_offset	b10, -48
1505.cfi_offset	b9, -56
1506.cfi_offset	b8, -64
1507
1508	adrp	x11, Lchacha20_consts
1509	add	x11, x11, :lo12:Lchacha20_consts
1510
1511	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
1512	ld1	{v28.16b - v30.16b}, [x5]
1513
1514	mov	x15, #1 // Prepare the Poly1305 state
1515	mov	x8, #0
1516	mov	x9, #0
1517	mov	x10, #0
1518
1519	mov	v31.d[0], x4  // Store the input and aad lengths
1520	mov	v31.d[1], x2
1521
1522	cmp	x2, #128
1523	b.le	Lopen_128 // Optimization for smaller buffers
1524
1525    // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys
1526	mov	v0.16b, v24.16b
1527	mov	v5.16b, v28.16b
1528	mov	v10.16b, v29.16b
1529	mov	v15.16b, v30.16b
1530
1531	mov	x6, #10
1532
1533.align	5
1534Lopen_init_rounds:
1535	add	v0.4s, v0.4s, v5.4s
1536	eor	v15.16b, v15.16b, v0.16b
1537	rev32	v15.8h, v15.8h
1538
1539	add	v10.4s, v10.4s, v15.4s
1540	eor	v5.16b, v5.16b, v10.16b
1541	ushr	v20.4s, v5.4s, #20
1542	sli	v20.4s, v5.4s, #12
1543	add	v0.4s, v0.4s, v20.4s
1544	eor	v15.16b, v15.16b, v0.16b
1545	tbl	v15.16b, {v15.16b}, v26.16b
1546
1547	add	v10.4s, v10.4s, v15.4s
1548	eor	v20.16b, v20.16b, v10.16b
1549	ushr	v5.4s, v20.4s, #25
1550	sli	v5.4s, v20.4s, #7
1551	ext	v5.16b, v5.16b, v5.16b, #4
1552	ext	v10.16b, v10.16b, v10.16b, #8
1553	ext	v15.16b, v15.16b, v15.16b, #12
1554	add	v0.4s, v0.4s, v5.4s
1555	eor	v15.16b, v15.16b, v0.16b
1556	rev32	v15.8h, v15.8h
1557
1558	add	v10.4s, v10.4s, v15.4s
1559	eor	v5.16b, v5.16b, v10.16b
1560	ushr	v20.4s, v5.4s, #20
1561	sli	v20.4s, v5.4s, #12
1562	add	v0.4s, v0.4s, v20.4s
1563	eor	v15.16b, v15.16b, v0.16b
1564	tbl	v15.16b, {v15.16b}, v26.16b
1565
1566	add	v10.4s, v10.4s, v15.4s
1567	eor	v20.16b, v20.16b, v10.16b
1568	ushr	v5.4s, v20.4s, #25
1569	sli	v5.4s, v20.4s, #7
1570	ext	v5.16b, v5.16b, v5.16b, #12
1571	ext	v10.16b, v10.16b, v10.16b, #8
1572	ext	v15.16b, v15.16b, v15.16b, #4
1573	subs	x6, x6, #1
1574	b.hi	Lopen_init_rounds
1575
1576	add	v0.4s, v0.4s, v24.4s
1577	add	v5.4s, v5.4s, v28.4s
1578
1579	and	v0.16b, v0.16b, v27.16b
1580	mov	x16, v0.d[0] // Move the R key to GPRs
1581	mov	x17, v0.d[1]
1582	mov	v27.16b, v5.16b // Store the S key
1583
1584	bl	Lpoly_hash_ad_internal
1585
1586Lopen_ad_done:
1587	mov	x3, x1
1588
// Each iteration of the loop hashes 320 bytes and prepares the stream for 320 bytes
1590Lopen_main_loop:
1591
1592	cmp	x2, #192
1593	b.lt	Lopen_tail
1594
1595	adrp	x11, Lchacha20_consts
1596	add	x11, x11, :lo12:Lchacha20_consts
1597
1598	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
1599	mov	v4.16b, v24.16b
1600
1601	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
1602	mov	v9.16b, v28.16b
1603
1604	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
1605	mov	v14.16b, v29.16b
1606
1607	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
1608	sub	x5, x5, #32
1609	add	v15.4s, v15.4s, v25.4s
1610	mov	v19.16b, v30.16b
1611
1612	eor	v20.16b, v20.16b, v20.16b //zero
1613	not	v21.16b, v20.16b // -1
1614	sub	v21.4s, v25.4s, v21.4s // Add +1
1615	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
1616	add	v19.4s, v19.4s, v20.4s
1617
1618	lsr	x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12
1619	sub	x4, x4, #10
1620
1621	mov	x7, #10
	subs	x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash
1624	csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full
1625
1626	cbz	x7, Lopen_main_loop_rounds_short
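    // The rounds below interleave Poly1305 hashing of the input ciphertext (x3 trails
    // x1) with the ChaCha20 rounds; Lopen_main_loop_rounds_short is entered when no
    // more whole ciphertext blocks are left to hash for this chunk.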
1627
1628.align	5
1629Lopen_main_loop_rounds:
1630	ldp	x11, x12, [x3], 16
1631	adds	x8, x8, x11
1632	adcs	x9, x9, x12
1633	adc	x10, x10, x15
1634	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1635	umulh	x12, x8, x16
1636	mul	x13, x9, x16
1637	umulh	x14, x9, x16
1638	adds	x12, x12, x13
1639	mul	x13, x10, x16
1640	adc	x13, x13, x14
1641	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1642	umulh	x8, x8, x17
1643	adds	x12, x12, x14
1644	mul	x14, x9, x17
1645	umulh	x9, x9, x17
1646	adcs	x14, x14, x8
1647	mul	x10, x10, x17
1648	adc	x10, x10, x9
1649	adds	x13, x13, x14
1650	adc	x14, x10, xzr
1651	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1652	and	x8, x13, #-4
1653	extr	x13, x14, x13, #2
1654	adds	x8, x8, x11
1655	lsr	x11, x14, #2
1656	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1657	adds	x8, x8, x13
1658	adcs	x9, x9, x12
1659	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1660Lopen_main_loop_rounds_short:
1661	add	v0.4s, v0.4s, v5.4s
1662	add	v1.4s, v1.4s, v6.4s
1663	add	v2.4s, v2.4s, v7.4s
1664	add	v3.4s, v3.4s, v8.4s
1665	add	v4.4s, v4.4s, v9.4s
1666
1667	eor	v15.16b, v15.16b, v0.16b
1668	eor	v16.16b, v16.16b, v1.16b
1669	eor	v17.16b, v17.16b, v2.16b
1670	eor	v18.16b, v18.16b, v3.16b
1671	eor	v19.16b, v19.16b, v4.16b
1672
1673	rev32	v15.8h, v15.8h
1674	rev32	v16.8h, v16.8h
1675	rev32	v17.8h, v17.8h
1676	rev32	v18.8h, v18.8h
1677	rev32	v19.8h, v19.8h
1678
1679	add	v10.4s, v10.4s, v15.4s
1680	add	v11.4s, v11.4s, v16.4s
1681	add	v12.4s, v12.4s, v17.4s
1682	add	v13.4s, v13.4s, v18.4s
1683	add	v14.4s, v14.4s, v19.4s
1684
1685	eor	v5.16b, v5.16b, v10.16b
1686	eor	v6.16b, v6.16b, v11.16b
1687	eor	v7.16b, v7.16b, v12.16b
1688	eor	v8.16b, v8.16b, v13.16b
1689	eor	v9.16b, v9.16b, v14.16b
1690
1691	ushr	v20.4s, v5.4s, #20
1692	sli	v20.4s, v5.4s, #12
1693	ushr	v5.4s, v6.4s, #20
1694	sli	v5.4s, v6.4s, #12
1695	ushr	v6.4s, v7.4s, #20
1696	sli	v6.4s, v7.4s, #12
1697	ushr	v7.4s, v8.4s, #20
1698	sli	v7.4s, v8.4s, #12
1699	ushr	v8.4s, v9.4s, #20
1700	sli	v8.4s, v9.4s, #12
1701
1702	add	v0.4s, v0.4s, v20.4s
1703	add	v1.4s, v1.4s, v5.4s
1704	add	v2.4s, v2.4s, v6.4s
1705	add	v3.4s, v3.4s, v7.4s
1706	add	v4.4s, v4.4s, v8.4s
1707
1708	eor	v15.16b, v15.16b, v0.16b
1709	eor	v16.16b, v16.16b, v1.16b
1710	eor	v17.16b, v17.16b, v2.16b
1711	eor	v18.16b, v18.16b, v3.16b
1712	eor	v19.16b, v19.16b, v4.16b
1713
1714	tbl	v15.16b, {v15.16b}, v26.16b
1715	tbl	v16.16b, {v16.16b}, v26.16b
1716	tbl	v17.16b, {v17.16b}, v26.16b
1717	tbl	v18.16b, {v18.16b}, v26.16b
1718	tbl	v19.16b, {v19.16b}, v26.16b
1719
1720	add	v10.4s, v10.4s, v15.4s
1721	add	v11.4s, v11.4s, v16.4s
1722	add	v12.4s, v12.4s, v17.4s
1723	add	v13.4s, v13.4s, v18.4s
1724	add	v14.4s, v14.4s, v19.4s
1725
1726	eor	v20.16b, v20.16b, v10.16b
1727	eor	v5.16b, v5.16b, v11.16b
1728	eor	v6.16b, v6.16b, v12.16b
1729	eor	v7.16b, v7.16b, v13.16b
1730	eor	v8.16b, v8.16b, v14.16b
1731
1732	ushr	v9.4s, v8.4s, #25
1733	sli	v9.4s, v8.4s, #7
1734	ushr	v8.4s, v7.4s, #25
1735	sli	v8.4s, v7.4s, #7
1736	ushr	v7.4s, v6.4s, #25
1737	sli	v7.4s, v6.4s, #7
1738	ushr	v6.4s, v5.4s, #25
1739	sli	v6.4s, v5.4s, #7
1740	ushr	v5.4s, v20.4s, #25
1741	sli	v5.4s, v20.4s, #7
1742
1743	ext	v9.16b, v9.16b, v9.16b, #4
1744	ext	v14.16b, v14.16b, v14.16b, #8
1745	ext	v19.16b, v19.16b, v19.16b, #12
1746	ldp	x11, x12, [x3], 16
1747	adds	x8, x8, x11
1748	adcs	x9, x9, x12
1749	adc	x10, x10, x15
1750	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1751	umulh	x12, x8, x16
1752	mul	x13, x9, x16
1753	umulh	x14, x9, x16
1754	adds	x12, x12, x13
1755	mul	x13, x10, x16
1756	adc	x13, x13, x14
1757	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1758	umulh	x8, x8, x17
1759	adds	x12, x12, x14
1760	mul	x14, x9, x17
1761	umulh	x9, x9, x17
1762	adcs	x14, x14, x8
1763	mul	x10, x10, x17
1764	adc	x10, x10, x9
1765	adds	x13, x13, x14
1766	adc	x14, x10, xzr
1767	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1768	and	x8, x13, #-4
1769	extr	x13, x14, x13, #2
1770	adds	x8, x8, x11
1771	lsr	x11, x14, #2
1772	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1773	adds	x8, x8, x13
1774	adcs	x9, x9, x12
1775	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1776	add	v0.4s, v0.4s, v6.4s
1777	add	v1.4s, v1.4s, v7.4s
1778	add	v2.4s, v2.4s, v8.4s
1779	add	v3.4s, v3.4s, v5.4s
1780	add	v4.4s, v4.4s, v9.4s
1781
1782	eor	v18.16b, v18.16b, v0.16b
1783	eor	v15.16b, v15.16b, v1.16b
1784	eor	v16.16b, v16.16b, v2.16b
1785	eor	v17.16b, v17.16b, v3.16b
1786	eor	v19.16b, v19.16b, v4.16b
1787
1788	rev32	v18.8h, v18.8h
1789	rev32	v15.8h, v15.8h
1790	rev32	v16.8h, v16.8h
1791	rev32	v17.8h, v17.8h
1792	rev32	v19.8h, v19.8h
1793
1794	add	v12.4s, v12.4s, v18.4s
1795	add	v13.4s, v13.4s, v15.4s
1796	add	v10.4s, v10.4s, v16.4s
1797	add	v11.4s, v11.4s, v17.4s
1798	add	v14.4s, v14.4s, v19.4s
1799
1800	eor	v6.16b, v6.16b, v12.16b
1801	eor	v7.16b, v7.16b, v13.16b
1802	eor	v8.16b, v8.16b, v10.16b
1803	eor	v5.16b, v5.16b, v11.16b
1804	eor	v9.16b, v9.16b, v14.16b
1805
1806	ushr	v20.4s, v6.4s, #20
1807	sli	v20.4s, v6.4s, #12
1808	ushr	v6.4s, v7.4s, #20
1809	sli	v6.4s, v7.4s, #12
1810	ushr	v7.4s, v8.4s, #20
1811	sli	v7.4s, v8.4s, #12
1812	ushr	v8.4s, v5.4s, #20
1813	sli	v8.4s, v5.4s, #12
1814	ushr	v5.4s, v9.4s, #20
1815	sli	v5.4s, v9.4s, #12
1816
1817	add	v0.4s, v0.4s, v20.4s
1818	add	v1.4s, v1.4s, v6.4s
1819	add	v2.4s, v2.4s, v7.4s
1820	add	v3.4s, v3.4s, v8.4s
1821	add	v4.4s, v4.4s, v5.4s
1822
1823	eor	v18.16b, v18.16b, v0.16b
1824	eor	v15.16b, v15.16b, v1.16b
1825	eor	v16.16b, v16.16b, v2.16b
1826	eor	v17.16b, v17.16b, v3.16b
1827	eor	v19.16b, v19.16b, v4.16b
1828
1829	tbl	v18.16b, {v18.16b}, v26.16b
1830	tbl	v15.16b, {v15.16b}, v26.16b
1831	tbl	v16.16b, {v16.16b}, v26.16b
1832	tbl	v17.16b, {v17.16b}, v26.16b
1833	tbl	v19.16b, {v19.16b}, v26.16b
1834
1835	add	v12.4s, v12.4s, v18.4s
1836	add	v13.4s, v13.4s, v15.4s
1837	add	v10.4s, v10.4s, v16.4s
1838	add	v11.4s, v11.4s, v17.4s
1839	add	v14.4s, v14.4s, v19.4s
1840
1841	eor	v20.16b, v20.16b, v12.16b
1842	eor	v6.16b, v6.16b, v13.16b
1843	eor	v7.16b, v7.16b, v10.16b
1844	eor	v8.16b, v8.16b, v11.16b
1845	eor	v5.16b, v5.16b, v14.16b
1846
1847	ushr	v9.4s, v5.4s, #25
1848	sli	v9.4s, v5.4s, #7
1849	ushr	v5.4s, v8.4s, #25
1850	sli	v5.4s, v8.4s, #7
1851	ushr	v8.4s, v7.4s, #25
1852	sli	v8.4s, v7.4s, #7
1853	ushr	v7.4s, v6.4s, #25
1854	sli	v7.4s, v6.4s, #7
1855	ushr	v6.4s, v20.4s, #25
1856	sli	v6.4s, v20.4s, #7
1857
1858	ext	v9.16b, v9.16b, v9.16b, #12
1859	ext	v14.16b, v14.16b, v14.16b, #8
1860	ext	v19.16b, v19.16b, v19.16b, #4
1861	subs	x7, x7, #1
1862	b.gt	Lopen_main_loop_rounds
1863	subs	x6, x6, #1
1864	b.ge	Lopen_main_loop_rounds_short
1865
1866	eor	v20.16b, v20.16b, v20.16b //zero
1867	not	v21.16b, v20.16b // -1
1868	sub	v21.4s, v25.4s, v21.4s // Add +1
1869	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
1870	add	v19.4s, v19.4s, v20.4s
1871
1872	add	v15.4s, v15.4s, v25.4s
1873	mov	x11, #5
1874	dup	v20.4s, w11
1875	add	v25.4s, v25.4s, v20.4s
1876
1877	zip1	v20.4s, v0.4s, v1.4s
1878	zip2	v21.4s, v0.4s, v1.4s
1879	zip1	v22.4s, v2.4s, v3.4s
1880	zip2	v23.4s, v2.4s, v3.4s
1881
1882	zip1	v0.2d, v20.2d, v22.2d
1883	zip2	v1.2d, v20.2d, v22.2d
1884	zip1	v2.2d, v21.2d, v23.2d
1885	zip2	v3.2d, v21.2d, v23.2d
1886
1887	zip1	v20.4s, v5.4s, v6.4s
1888	zip2	v21.4s, v5.4s, v6.4s
1889	zip1	v22.4s, v7.4s, v8.4s
1890	zip2	v23.4s, v7.4s, v8.4s
1891
1892	zip1	v5.2d, v20.2d, v22.2d
1893	zip2	v6.2d, v20.2d, v22.2d
1894	zip1	v7.2d, v21.2d, v23.2d
1895	zip2	v8.2d, v21.2d, v23.2d
1896
1897	zip1	v20.4s, v10.4s, v11.4s
1898	zip2	v21.4s, v10.4s, v11.4s
1899	zip1	v22.4s, v12.4s, v13.4s
1900	zip2	v23.4s, v12.4s, v13.4s
1901
1902	zip1	v10.2d, v20.2d, v22.2d
1903	zip2	v11.2d, v20.2d, v22.2d
1904	zip1	v12.2d, v21.2d, v23.2d
1905	zip2	v13.2d, v21.2d, v23.2d
1906
1907	zip1	v20.4s, v15.4s, v16.4s
1908	zip2	v21.4s, v15.4s, v16.4s
1909	zip1	v22.4s, v17.4s, v18.4s
1910	zip2	v23.4s, v17.4s, v18.4s
1911
1912	zip1	v15.2d, v20.2d, v22.2d
1913	zip2	v16.2d, v20.2d, v22.2d
1914	zip1	v17.2d, v21.2d, v23.2d
1915	zip2	v18.2d, v21.2d, v23.2d
1916
1917	add	v0.4s, v0.4s, v24.4s
1918	add	v5.4s, v5.4s, v28.4s
1919	add	v10.4s, v10.4s, v29.4s
1920	add	v15.4s, v15.4s, v30.4s
1921
1922	add	v1.4s, v1.4s, v24.4s
1923	add	v6.4s, v6.4s, v28.4s
1924	add	v11.4s, v11.4s, v29.4s
1925	add	v16.4s, v16.4s, v30.4s
1926
1927	add	v2.4s, v2.4s, v24.4s
1928	add	v7.4s, v7.4s, v28.4s
1929	add	v12.4s, v12.4s, v29.4s
1930	add	v17.4s, v17.4s, v30.4s
1931
1932	add	v3.4s, v3.4s, v24.4s
1933	add	v8.4s, v8.4s, v28.4s
1934	add	v13.4s, v13.4s, v29.4s
1935	add	v18.4s, v18.4s, v30.4s
1936
1937	add	v4.4s, v4.4s, v24.4s
1938	add	v9.4s, v9.4s, v28.4s
1939	add	v14.4s, v14.4s, v29.4s
1940	add	v19.4s, v19.4s, v30.4s
1941
1942    // We can always safely store 192 bytes
1943	ld1	{v20.16b - v23.16b}, [x1], #64
1944	eor	v20.16b, v20.16b, v0.16b
1945	eor	v21.16b, v21.16b, v5.16b
1946	eor	v22.16b, v22.16b, v10.16b
1947	eor	v23.16b, v23.16b, v15.16b
1948	st1	{v20.16b - v23.16b}, [x0], #64
1949
1950	ld1	{v20.16b - v23.16b}, [x1], #64
1951	eor	v20.16b, v20.16b, v1.16b
1952	eor	v21.16b, v21.16b, v6.16b
1953	eor	v22.16b, v22.16b, v11.16b
1954	eor	v23.16b, v23.16b, v16.16b
1955	st1	{v20.16b - v23.16b}, [x0], #64
1956
1957	ld1	{v20.16b - v23.16b}, [x1], #64
1958	eor	v20.16b, v20.16b, v2.16b
1959	eor	v21.16b, v21.16b, v7.16b
1960	eor	v22.16b, v22.16b, v12.16b
1961	eor	v23.16b, v23.16b, v17.16b
1962	st1	{v20.16b - v23.16b}, [x0], #64
1963
1964	sub	x2, x2, #192
1965
1966	mov	v0.16b, v3.16b
1967	mov	v5.16b, v8.16b
1968	mov	v10.16b, v13.16b
1969	mov	v15.16b, v18.16b
1970
1971	cmp	x2, #64
1972	b.lt	Lopen_tail_64_store
1973
1974	ld1	{v20.16b - v23.16b}, [x1], #64
1975	eor	v20.16b, v20.16b, v3.16b
1976	eor	v21.16b, v21.16b, v8.16b
1977	eor	v22.16b, v22.16b, v13.16b
1978	eor	v23.16b, v23.16b, v18.16b
1979	st1	{v20.16b - v23.16b}, [x0], #64
1980
1981	sub	x2, x2, #64
1982
1983	mov	v0.16b, v4.16b
1984	mov	v5.16b, v9.16b
1985	mov	v10.16b, v14.16b
1986	mov	v15.16b, v19.16b
1987
1988	cmp	x2, #64
1989	b.lt	Lopen_tail_64_store
1990
1991	ld1	{v20.16b - v23.16b}, [x1], #64
1992	eor	v20.16b, v20.16b, v4.16b
1993	eor	v21.16b, v21.16b, v9.16b
1994	eor	v22.16b, v22.16b, v14.16b
1995	eor	v23.16b, v23.16b, v19.16b
1996	st1	{v20.16b - v23.16b}, [x0], #64
1997
1998	sub	x2, x2, #64
1999	b	Lopen_main_loop
2000
2001Lopen_tail:
2002
2003	cbz	x2, Lopen_finalize
2004
2005	lsr	x4, x2, #4 // How many whole blocks we have to hash
2006
2007	cmp	x2, #64
2008	b.le	Lopen_tail_64
2009	cmp	x2, #128
2010	b.le	Lopen_tail_128
2011
2012Lopen_tail_192:
2013     // We need three more blocks
2014	mov	v0.16b, v24.16b
2015	mov	v1.16b, v24.16b
2016	mov	v2.16b, v24.16b
2017	mov	v5.16b, v28.16b
2018	mov	v6.16b, v28.16b
2019	mov	v7.16b, v28.16b
2020	mov	v10.16b, v29.16b
2021	mov	v11.16b, v29.16b
2022	mov	v12.16b, v29.16b
2023	mov	v15.16b, v30.16b
2024	mov	v16.16b, v30.16b
2025	mov	v17.16b, v30.16b
2026	eor	v23.16b, v23.16b, v23.16b
2027	eor	v21.16b, v21.16b, v21.16b
2028	ins	v23.s[0], v25.s[0]
2029	ins	v21.d[0], x15
2030
2031	add	v22.4s, v23.4s, v21.4s
2032	add	v21.4s, v22.4s, v21.4s
2033
2034	add	v15.4s, v15.4s, v21.4s
2035	add	v16.4s, v16.4s, v23.4s
2036	add	v17.4s, v17.4s, v22.4s
2037
2038	mov	x7, #10
2039	subs	x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash
2040	csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing
2041	sub	x4, x4, x7
2042
2043	cbz	x7, Lopen_tail_192_rounds_no_hash
2044
2045Lopen_tail_192_rounds:
2046	ldp	x11, x12, [x3], 16
2047	adds	x8, x8, x11
2048	adcs	x9, x9, x12
2049	adc	x10, x10, x15
2050	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2051	umulh	x12, x8, x16
2052	mul	x13, x9, x16
2053	umulh	x14, x9, x16
2054	adds	x12, x12, x13
2055	mul	x13, x10, x16
2056	adc	x13, x13, x14
2057	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2058	umulh	x8, x8, x17
2059	adds	x12, x12, x14
2060	mul	x14, x9, x17
2061	umulh	x9, x9, x17
2062	adcs	x14, x14, x8
2063	mul	x10, x10, x17
2064	adc	x10, x10, x9
2065	adds	x13, x13, x14
2066	adc	x14, x10, xzr
2067	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2068	and	x8, x13, #-4
2069	extr	x13, x14, x13, #2
2070	adds	x8, x8, x11
2071	lsr	x11, x14, #2
2072	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2073	adds	x8, x8, x13
2074	adcs	x9, x9, x12
2075	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2076Lopen_tail_192_rounds_no_hash:
2077	add	v0.4s, v0.4s, v5.4s
2078	add	v1.4s, v1.4s, v6.4s
2079	add	v2.4s, v2.4s, v7.4s
2080	eor	v15.16b, v15.16b, v0.16b
2081	eor	v16.16b, v16.16b, v1.16b
2082	eor	v17.16b, v17.16b, v2.16b
2083	rev32	v15.8h, v15.8h
2084	rev32	v16.8h, v16.8h
2085	rev32	v17.8h, v17.8h
2086
2087	add	v10.4s, v10.4s, v15.4s
2088	add	v11.4s, v11.4s, v16.4s
2089	add	v12.4s, v12.4s, v17.4s
2090	eor	v5.16b, v5.16b, v10.16b
2091	eor	v6.16b, v6.16b, v11.16b
2092	eor	v7.16b, v7.16b, v12.16b
2093	ushr	v20.4s, v5.4s, #20
2094	sli	v20.4s, v5.4s, #12
2095	ushr	v5.4s, v6.4s, #20
2096	sli	v5.4s, v6.4s, #12
2097	ushr	v6.4s, v7.4s, #20
2098	sli	v6.4s, v7.4s, #12
2099
2100	add	v0.4s, v0.4s, v20.4s
2101	add	v1.4s, v1.4s, v5.4s
2102	add	v2.4s, v2.4s, v6.4s
2103	eor	v15.16b, v15.16b, v0.16b
2104	eor	v16.16b, v16.16b, v1.16b
2105	eor	v17.16b, v17.16b, v2.16b
2106	tbl	v15.16b, {v15.16b}, v26.16b
2107	tbl	v16.16b, {v16.16b}, v26.16b
2108	tbl	v17.16b, {v17.16b}, v26.16b
2109
2110	add	v10.4s, v10.4s, v15.4s
2111	add	v11.4s, v11.4s, v16.4s
2112	add	v12.4s, v12.4s, v17.4s
2113	eor	v20.16b, v20.16b, v10.16b
2114	eor	v5.16b, v5.16b, v11.16b
2115	eor	v6.16b, v6.16b, v12.16b
2116	ushr	v7.4s, v6.4s, #25
2117	sli	v7.4s, v6.4s, #7
2118	ushr	v6.4s, v5.4s, #25
2119	sli	v6.4s, v5.4s, #7
2120	ushr	v5.4s, v20.4s, #25
2121	sli	v5.4s, v20.4s, #7
2122
2123	ext	v5.16b, v5.16b, v5.16b, #4
2124	ext	v6.16b, v6.16b, v6.16b, #4
2125	ext	v7.16b, v7.16b, v7.16b, #4
2126
2127	ext	v10.16b, v10.16b, v10.16b, #8
2128	ext	v11.16b, v11.16b, v11.16b, #8
2129	ext	v12.16b, v12.16b, v12.16b, #8
2130
2131	ext	v15.16b, v15.16b, v15.16b, #12
2132	ext	v16.16b, v16.16b, v16.16b, #12
2133	ext	v17.16b, v17.16b, v17.16b, #12
2134	add	v0.4s, v0.4s, v5.4s
2135	add	v1.4s, v1.4s, v6.4s
2136	add	v2.4s, v2.4s, v7.4s
2137	eor	v15.16b, v15.16b, v0.16b
2138	eor	v16.16b, v16.16b, v1.16b
2139	eor	v17.16b, v17.16b, v2.16b
2140	rev32	v15.8h, v15.8h
2141	rev32	v16.8h, v16.8h
2142	rev32	v17.8h, v17.8h
2143
2144	add	v10.4s, v10.4s, v15.4s
2145	add	v11.4s, v11.4s, v16.4s
2146	add	v12.4s, v12.4s, v17.4s
2147	eor	v5.16b, v5.16b, v10.16b
2148	eor	v6.16b, v6.16b, v11.16b
2149	eor	v7.16b, v7.16b, v12.16b
2150	ushr	v20.4s, v5.4s, #20
2151	sli	v20.4s, v5.4s, #12
2152	ushr	v5.4s, v6.4s, #20
2153	sli	v5.4s, v6.4s, #12
2154	ushr	v6.4s, v7.4s, #20
2155	sli	v6.4s, v7.4s, #12
2156
2157	add	v0.4s, v0.4s, v20.4s
2158	add	v1.4s, v1.4s, v5.4s
2159	add	v2.4s, v2.4s, v6.4s
2160	eor	v15.16b, v15.16b, v0.16b
2161	eor	v16.16b, v16.16b, v1.16b
2162	eor	v17.16b, v17.16b, v2.16b
2163	tbl	v15.16b, {v15.16b}, v26.16b
2164	tbl	v16.16b, {v16.16b}, v26.16b
2165	tbl	v17.16b, {v17.16b}, v26.16b
2166
2167	add	v10.4s, v10.4s, v15.4s
2168	add	v11.4s, v11.4s, v16.4s
2169	add	v12.4s, v12.4s, v17.4s
2170	eor	v20.16b, v20.16b, v10.16b
2171	eor	v5.16b, v5.16b, v11.16b
2172	eor	v6.16b, v6.16b, v12.16b
2173	ushr	v7.4s, v6.4s, #25
2174	sli	v7.4s, v6.4s, #7
2175	ushr	v6.4s, v5.4s, #25
2176	sli	v6.4s, v5.4s, #7
2177	ushr	v5.4s, v20.4s, #25
2178	sli	v5.4s, v20.4s, #7
2179
2180	ext	v5.16b, v5.16b, v5.16b, #12
2181	ext	v6.16b, v6.16b, v6.16b, #12
2182	ext	v7.16b, v7.16b, v7.16b, #12
2183
2184	ext	v10.16b, v10.16b, v10.16b, #8
2185	ext	v11.16b, v11.16b, v11.16b, #8
2186	ext	v12.16b, v12.16b, v12.16b, #8
2187
2188	ext	v15.16b, v15.16b, v15.16b, #4
2189	ext	v16.16b, v16.16b, v16.16b, #4
2190	ext	v17.16b, v17.16b, v17.16b, #4
2191	subs	x7, x7, #1
2192	b.gt	Lopen_tail_192_rounds
2193	subs	x6, x6, #1
2194	b.ge	Lopen_tail_192_rounds_no_hash
2195
2196    // We hashed 160 bytes at most, may still have 32 bytes left
2197Lopen_tail_192_hash:
2198	cbz	x4, Lopen_tail_192_hash_done
2199	ldp	x11, x12, [x3], 16
2200	adds	x8, x8, x11
2201	adcs	x9, x9, x12
2202	adc	x10, x10, x15
2203	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2204	umulh	x12, x8, x16
2205	mul	x13, x9, x16
2206	umulh	x14, x9, x16
2207	adds	x12, x12, x13
2208	mul	x13, x10, x16
2209	adc	x13, x13, x14
2210	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2211	umulh	x8, x8, x17
2212	adds	x12, x12, x14
2213	mul	x14, x9, x17
2214	umulh	x9, x9, x17
2215	adcs	x14, x14, x8
2216	mul	x10, x10, x17
2217	adc	x10, x10, x9
2218	adds	x13, x13, x14
2219	adc	x14, x10, xzr
2220	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2221	and	x8, x13, #-4
2222	extr	x13, x14, x13, #2
2223	adds	x8, x8, x11
2224	lsr	x11, x14, #2
2225	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2226	adds	x8, x8, x13
2227	adcs	x9, x9, x12
2228	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2229	sub	x4, x4, #1
2230	b	Lopen_tail_192_hash
2231
2232Lopen_tail_192_hash_done:
2233
2234	add	v0.4s, v0.4s, v24.4s
2235	add	v1.4s, v1.4s, v24.4s
2236	add	v2.4s, v2.4s, v24.4s
2237	add	v5.4s, v5.4s, v28.4s
2238	add	v6.4s, v6.4s, v28.4s
2239	add	v7.4s, v7.4s, v28.4s
2240	add	v10.4s, v10.4s, v29.4s
2241	add	v11.4s, v11.4s, v29.4s
2242	add	v12.4s, v12.4s, v29.4s
2243	add	v15.4s, v15.4s, v30.4s
2244	add	v16.4s, v16.4s, v30.4s
2245	add	v17.4s, v17.4s, v30.4s
2246
2247	add	v15.4s, v15.4s, v21.4s
2248	add	v16.4s, v16.4s, v23.4s
2249	add	v17.4s, v17.4s, v22.4s
2250
2251	ld1	{v20.16b - v23.16b}, [x1], #64
2252
2253	eor	v20.16b, v20.16b, v1.16b
2254	eor	v21.16b, v21.16b, v6.16b
2255	eor	v22.16b, v22.16b, v11.16b
2256	eor	v23.16b, v23.16b, v16.16b
2257
2258	st1	{v20.16b - v23.16b}, [x0], #64
2259
2260	ld1	{v20.16b - v23.16b}, [x1], #64
2261
2262	eor	v20.16b, v20.16b, v2.16b
2263	eor	v21.16b, v21.16b, v7.16b
2264	eor	v22.16b, v22.16b, v12.16b
2265	eor	v23.16b, v23.16b, v17.16b
2266
2267	st1	{v20.16b - v23.16b}, [x0], #64
2268
2269	sub	x2, x2, #128
2270	b	Lopen_tail_64_store
2271
2272Lopen_tail_128:
2273     // We need two more blocks
2274	mov	v0.16b, v24.16b
2275	mov	v1.16b, v24.16b
2276	mov	v5.16b, v28.16b
2277	mov	v6.16b, v28.16b
2278	mov	v10.16b, v29.16b
2279	mov	v11.16b, v29.16b
2280	mov	v15.16b, v30.16b
2281	mov	v16.16b, v30.16b
2282	eor	v23.16b, v23.16b, v23.16b
2283	eor	v22.16b, v22.16b, v22.16b
2284	ins	v23.s[0], v25.s[0]
2285	ins	v22.d[0], x15
2286	add	v22.4s, v22.4s, v23.4s
2287
2288	add	v15.4s, v15.4s, v22.4s
2289	add	v16.4s, v16.4s, v23.4s
2290
2291	mov	x6, #10
2292	sub	x6, x6, x4
2293
2294Lopen_tail_128_rounds:
2295	add	v0.4s, v0.4s, v5.4s
2296	eor	v15.16b, v15.16b, v0.16b
2297	rev32	v15.8h, v15.8h
2298
2299	add	v10.4s, v10.4s, v15.4s
2300	eor	v5.16b, v5.16b, v10.16b
2301	ushr	v20.4s, v5.4s, #20
2302	sli	v20.4s, v5.4s, #12
2303	add	v0.4s, v0.4s, v20.4s
2304	eor	v15.16b, v15.16b, v0.16b
2305	tbl	v15.16b, {v15.16b}, v26.16b
2306
2307	add	v10.4s, v10.4s, v15.4s
2308	eor	v20.16b, v20.16b, v10.16b
2309	ushr	v5.4s, v20.4s, #25
2310	sli	v5.4s, v20.4s, #7
2311	ext	v5.16b, v5.16b, v5.16b, #4
2312	ext	v10.16b, v10.16b, v10.16b, #8
2313	ext	v15.16b, v15.16b, v15.16b, #12
2314	add	v1.4s, v1.4s, v6.4s
2315	eor	v16.16b, v16.16b, v1.16b
2316	rev32	v16.8h, v16.8h
2317
2318	add	v11.4s, v11.4s, v16.4s
2319	eor	v6.16b, v6.16b, v11.16b
2320	ushr	v20.4s, v6.4s, #20
2321	sli	v20.4s, v6.4s, #12
2322	add	v1.4s, v1.4s, v20.4s
2323	eor	v16.16b, v16.16b, v1.16b
2324	tbl	v16.16b, {v16.16b}, v26.16b
2325
2326	add	v11.4s, v11.4s, v16.4s
2327	eor	v20.16b, v20.16b, v11.16b
2328	ushr	v6.4s, v20.4s, #25
2329	sli	v6.4s, v20.4s, #7
2330	ext	v6.16b, v6.16b, v6.16b, #4
2331	ext	v11.16b, v11.16b, v11.16b, #8
2332	ext	v16.16b, v16.16b, v16.16b, #12
2333	add	v0.4s, v0.4s, v5.4s
2334	eor	v15.16b, v15.16b, v0.16b
2335	rev32	v15.8h, v15.8h
2336
2337	add	v10.4s, v10.4s, v15.4s
2338	eor	v5.16b, v5.16b, v10.16b
2339	ushr	v20.4s, v5.4s, #20
2340	sli	v20.4s, v5.4s, #12
2341	add	v0.4s, v0.4s, v20.4s
2342	eor	v15.16b, v15.16b, v0.16b
2343	tbl	v15.16b, {v15.16b}, v26.16b
2344
2345	add	v10.4s, v10.4s, v15.4s
2346	eor	v20.16b, v20.16b, v10.16b
2347	ushr	v5.4s, v20.4s, #25
2348	sli	v5.4s, v20.4s, #7
2349	ext	v5.16b, v5.16b, v5.16b, #12
2350	ext	v10.16b, v10.16b, v10.16b, #8
2351	ext	v15.16b, v15.16b, v15.16b, #4
2352	add	v1.4s, v1.4s, v6.4s
2353	eor	v16.16b, v16.16b, v1.16b
2354	rev32	v16.8h, v16.8h
2355
2356	add	v11.4s, v11.4s, v16.4s
2357	eor	v6.16b, v6.16b, v11.16b
2358	ushr	v20.4s, v6.4s, #20
2359	sli	v20.4s, v6.4s, #12
2360	add	v1.4s, v1.4s, v20.4s
2361	eor	v16.16b, v16.16b, v1.16b
2362	tbl	v16.16b, {v16.16b}, v26.16b
2363
2364	add	v11.4s, v11.4s, v16.4s
2365	eor	v20.16b, v20.16b, v11.16b
2366	ushr	v6.4s, v20.4s, #25
2367	sli	v6.4s, v20.4s, #7
2368	ext	v6.16b, v6.16b, v6.16b, #12
2369	ext	v11.16b, v11.16b, v11.16b, #8
2370	ext	v16.16b, v16.16b, v16.16b, #4
2371	subs	x6, x6, #1
2372	b.gt	Lopen_tail_128_rounds
2373	cbz	x4, Lopen_tail_128_rounds_done
2374	subs	x4, x4, #1
2375	ldp	x11, x12, [x3], 16
2376	adds	x8, x8, x11
2377	adcs	x9, x9, x12
2378	adc	x10, x10, x15
2379	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2380	umulh	x12, x8, x16
2381	mul	x13, x9, x16
2382	umulh	x14, x9, x16
2383	adds	x12, x12, x13
2384	mul	x13, x10, x16
2385	adc	x13, x13, x14
2386	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2387	umulh	x8, x8, x17
2388	adds	x12, x12, x14
2389	mul	x14, x9, x17
2390	umulh	x9, x9, x17
2391	adcs	x14, x14, x8
2392	mul	x10, x10, x17
2393	adc	x10, x10, x9
2394	adds	x13, x13, x14
2395	adc	x14, x10, xzr
2396	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2397	and	x8, x13, #-4
2398	extr	x13, x14, x13, #2
2399	adds	x8, x8, x11
2400	lsr	x11, x14, #2
2401	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2402	adds	x8, x8, x13
2403	adcs	x9, x9, x12
2404	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2405	b	Lopen_tail_128_rounds
2406
2407Lopen_tail_128_rounds_done:
2408	add	v0.4s, v0.4s, v24.4s
2409	add	v1.4s, v1.4s, v24.4s
2410	add	v5.4s, v5.4s, v28.4s
2411	add	v6.4s, v6.4s, v28.4s
2412	add	v10.4s, v10.4s, v29.4s
2413	add	v11.4s, v11.4s, v29.4s
2414	add	v15.4s, v15.4s, v30.4s
2415	add	v16.4s, v16.4s, v30.4s
2416	add	v15.4s, v15.4s, v22.4s
2417	add	v16.4s, v16.4s, v23.4s
2418
2419	ld1	{v20.16b - v23.16b}, [x1], #64
2420
2421	eor	v20.16b, v20.16b, v1.16b
2422	eor	v21.16b, v21.16b, v6.16b
2423	eor	v22.16b, v22.16b, v11.16b
2424	eor	v23.16b, v23.16b, v16.16b
2425
2426	st1	{v20.16b - v23.16b}, [x0], #64
2427	sub	x2, x2, #64
2428
2429	b	Lopen_tail_64_store
2430
2431Lopen_tail_64:
2432    // We just need a single block
2433	mov	v0.16b, v24.16b
2434	mov	v5.16b, v28.16b
2435	mov	v10.16b, v29.16b
2436	mov	v15.16b, v30.16b
2437	eor	v23.16b, v23.16b, v23.16b
2438	ins	v23.s[0], v25.s[0]
2439	add	v15.4s, v15.4s, v23.4s
2440
2441	mov	x6, #10
2442	sub	x6, x6, x4
2443
2444Lopen_tail_64_rounds:
2445	add	v0.4s, v0.4s, v5.4s
2446	eor	v15.16b, v15.16b, v0.16b
2447	rev32	v15.8h, v15.8h
2448
2449	add	v10.4s, v10.4s, v15.4s
2450	eor	v5.16b, v5.16b, v10.16b
2451	ushr	v20.4s, v5.4s, #20
2452	sli	v20.4s, v5.4s, #12
2453	add	v0.4s, v0.4s, v20.4s
2454	eor	v15.16b, v15.16b, v0.16b
2455	tbl	v15.16b, {v15.16b}, v26.16b
2456
2457	add	v10.4s, v10.4s, v15.4s
2458	eor	v20.16b, v20.16b, v10.16b
2459	ushr	v5.4s, v20.4s, #25
2460	sli	v5.4s, v20.4s, #7
2461	ext	v5.16b, v5.16b, v5.16b, #4
2462	ext	v10.16b, v10.16b, v10.16b, #8
2463	ext	v15.16b, v15.16b, v15.16b, #12
2464	add	v0.4s, v0.4s, v5.4s
2465	eor	v15.16b, v15.16b, v0.16b
2466	rev32	v15.8h, v15.8h
2467
2468	add	v10.4s, v10.4s, v15.4s
2469	eor	v5.16b, v5.16b, v10.16b
2470	ushr	v20.4s, v5.4s, #20
2471	sli	v20.4s, v5.4s, #12
2472	add	v0.4s, v0.4s, v20.4s
2473	eor	v15.16b, v15.16b, v0.16b
2474	tbl	v15.16b, {v15.16b}, v26.16b
2475
2476	add	v10.4s, v10.4s, v15.4s
2477	eor	v20.16b, v20.16b, v10.16b
2478	ushr	v5.4s, v20.4s, #25
2479	sli	v5.4s, v20.4s, #7
2480	ext	v5.16b, v5.16b, v5.16b, #12
2481	ext	v10.16b, v10.16b, v10.16b, #8
2482	ext	v15.16b, v15.16b, v15.16b, #4
2483	subs	x6, x6, #1
2484	b.gt	Lopen_tail_64_rounds
2485	cbz	x4, Lopen_tail_64_rounds_done
2486	subs	x4, x4, #1
2487	ldp	x11, x12, [x3], 16
2488	adds	x8, x8, x11
2489	adcs	x9, x9, x12
2490	adc	x10, x10, x15
2491	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2492	umulh	x12, x8, x16
2493	mul	x13, x9, x16
2494	umulh	x14, x9, x16
2495	adds	x12, x12, x13
2496	mul	x13, x10, x16
2497	adc	x13, x13, x14
2498	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2499	umulh	x8, x8, x17
2500	adds	x12, x12, x14
2501	mul	x14, x9, x17
2502	umulh	x9, x9, x17
2503	adcs	x14, x14, x8
2504	mul	x10, x10, x17
2505	adc	x10, x10, x9
2506	adds	x13, x13, x14
2507	adc	x14, x10, xzr
2508	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2509	and	x8, x13, #-4
2510	extr	x13, x14, x13, #2
2511	adds	x8, x8, x11
2512	lsr	x11, x14, #2
2513	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2514	adds	x8, x8, x13
2515	adcs	x9, x9, x12
2516	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2517	b	Lopen_tail_64_rounds
2518
2519Lopen_tail_64_rounds_done:
2520	add	v0.4s, v0.4s, v24.4s
2521	add	v5.4s, v5.4s, v28.4s
2522	add	v10.4s, v10.4s, v29.4s
2523	add	v15.4s, v15.4s, v30.4s
2524	add	v15.4s, v15.4s, v23.4s
2525
2526Lopen_tail_64_store:
2527	cmp	x2, #16
2528	b.lt	Lopen_tail_16
2529
2530	ld1	{v20.16b}, [x1], #16
2531	eor	v20.16b, v20.16b, v0.16b
2532	st1	{v20.16b}, [x0], #16
2533	mov	v0.16b, v5.16b
2534	mov	v5.16b, v10.16b
2535	mov	v10.16b, v15.16b
2536	sub	x2, x2, #16
2537	b	Lopen_tail_64_store
2538
2539Lopen_tail_16:
2540    // Here we handle the last [0,16) bytes that require a padded block
2541	cbz	x2, Lopen_finalize
2542
2543	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext
2544	eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask
2545	not	v22.16b, v20.16b
2546
2547	add	x7, x1, x2
2548	mov	x6, x2
2549
2550Lopen_tail_16_compose:
2551	ext	v20.16b, v20.16b, v20.16b, #15
2552	ldrb	w11, [x7, #-1]!
2553	mov	v20.b[0], w11
2554	ext	v21.16b, v22.16b, v21.16b, #15
2555	subs	x2, x2, #1
2556	b.gt	Lopen_tail_16_compose
2557
2558	and	v20.16b, v20.16b, v21.16b
2559    // Hash in the final padded block
2560	mov	x11, v20.d[0]
2561	mov	x12, v20.d[1]
2562	adds	x8, x8, x11
2563	adcs	x9, x9, x12
2564	adc	x10, x10, x15
2565	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2566	umulh	x12, x8, x16
2567	mul	x13, x9, x16
2568	umulh	x14, x9, x16
2569	adds	x12, x12, x13
2570	mul	x13, x10, x16
2571	adc	x13, x13, x14
2572	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2573	umulh	x8, x8, x17
2574	adds	x12, x12, x14
2575	mul	x14, x9, x17
2576	umulh	x9, x9, x17
2577	adcs	x14, x14, x8
2578	mul	x10, x10, x17
2579	adc	x10, x10, x9
2580	adds	x13, x13, x14
2581	adc	x14, x10, xzr
2582	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2583	and	x8, x13, #-4
2584	extr	x13, x14, x13, #2
2585	adds	x8, x8, x11
2586	lsr	x11, x14, #2
2587	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2588	adds	x8, x8, x13
2589	adcs	x9, x9, x12
2590	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2591	eor	v20.16b, v20.16b, v0.16b
2592
2593Lopen_tail_16_store:
2594	umov	w11, v20.b[0]
2595	strb	w11, [x0], #1
2596	ext	v20.16b, v20.16b, v20.16b, #1
2597	subs	x6, x6, #1
2598	b.gt	Lopen_tail_16_store
2599
2600Lopen_finalize:
2601	mov	x11, v31.d[0]
2602	mov	x12, v31.d[1]
2603	adds	x8, x8, x11
2604	adcs	x9, x9, x12
2605	adc	x10, x10, x15
2606	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2607	umulh	x12, x8, x16
2608	mul	x13, x9, x16
2609	umulh	x14, x9, x16
2610	adds	x12, x12, x13
2611	mul	x13, x10, x16
2612	adc	x13, x13, x14
2613	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2614	umulh	x8, x8, x17
2615	adds	x12, x12, x14
2616	mul	x14, x9, x17
2617	umulh	x9, x9, x17
2618	adcs	x14, x14, x8
2619	mul	x10, x10, x17
2620	adc	x10, x10, x9
2621	adds	x13, x13, x14
2622	adc	x14, x10, xzr
2623	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2624	and	x8, x13, #-4
2625	extr	x13, x14, x13, #2
2626	adds	x8, x8, x11
2627	lsr	x11, x14, #2
2628	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2629	adds	x8, x8, x13
2630	adcs	x9, x9, x12
2631	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2632    // Final reduction step
2633	sub	x12, xzr, x15
2634	orr	x13, xzr, #3
2635	subs	x11, x8, #-5
2636	sbcs	x12, x9, x12
2637	sbcs	x13, x10, x13
2638	csel	x8, x11, x8, cs
2639	csel	x9, x12, x9, cs
2640	csel	x10, x13, x10, cs
2641	mov	x11, v27.d[0]
2642	mov	x12, v27.d[1]
2643	adds	x8, x8, x11
2644	adcs	x9, x9, x12
2645	adc	x10, x10, x15
2646
2647	stp	x8, x9, [x5]
2648
2649	ldp	d8, d9, [sp, #16]
2650	ldp	d10, d11, [sp, #32]
2651	ldp	d12, d13, [sp, #48]
2652	ldp	d14, d15, [sp, #64]
2653.cfi_restore	b15
2654.cfi_restore	b14
2655.cfi_restore	b13
2656.cfi_restore	b12
2657.cfi_restore	b11
2658.cfi_restore	b10
2659.cfi_restore	b9
2660.cfi_restore	b8
2661	ldp	x29, x30, [sp], 80
2662.cfi_restore	w29
2663.cfi_restore	w30
2664.cfi_def_cfa_offset	0
2665	AARCH64_VALIDATE_LINK_REGISTER
2666	ret
2667
2668Lopen_128:
2669    // On some architectures preparing 5 blocks for small buffers is wasteful
2670	eor	v25.16b, v25.16b, v25.16b
2671	mov	x11, #1
2672	mov	v25.s[0], w11
2673	mov	v0.16b, v24.16b
2674	mov	v1.16b, v24.16b
2675	mov	v2.16b, v24.16b
2676	mov	v5.16b, v28.16b
2677	mov	v6.16b, v28.16b
2678	mov	v7.16b, v28.16b
2679	mov	v10.16b, v29.16b
2680	mov	v11.16b, v29.16b
2681	mov	v12.16b, v29.16b
2682	mov	v17.16b, v30.16b
2683	add	v15.4s, v17.4s, v25.4s
2684	add	v16.4s, v15.4s, v25.4s
2685
2686	mov	x6, #10
2687
2688Lopen_128_rounds:
2689	add	v0.4s, v0.4s, v5.4s
2690	add	v1.4s, v1.4s, v6.4s
2691	add	v2.4s, v2.4s, v7.4s
2692	eor	v15.16b, v15.16b, v0.16b
2693	eor	v16.16b, v16.16b, v1.16b
2694	eor	v17.16b, v17.16b, v2.16b
2695	rev32	v15.8h, v15.8h
2696	rev32	v16.8h, v16.8h
2697	rev32	v17.8h, v17.8h
2698
2699	add	v10.4s, v10.4s, v15.4s
2700	add	v11.4s, v11.4s, v16.4s
2701	add	v12.4s, v12.4s, v17.4s
2702	eor	v5.16b, v5.16b, v10.16b
2703	eor	v6.16b, v6.16b, v11.16b
2704	eor	v7.16b, v7.16b, v12.16b
2705	ushr	v20.4s, v5.4s, #20
2706	sli	v20.4s, v5.4s, #12
2707	ushr	v5.4s, v6.4s, #20
2708	sli	v5.4s, v6.4s, #12
2709	ushr	v6.4s, v7.4s, #20
2710	sli	v6.4s, v7.4s, #12
2711
2712	add	v0.4s, v0.4s, v20.4s
2713	add	v1.4s, v1.4s, v5.4s
2714	add	v2.4s, v2.4s, v6.4s
2715	eor	v15.16b, v15.16b, v0.16b
2716	eor	v16.16b, v16.16b, v1.16b
2717	eor	v17.16b, v17.16b, v2.16b
2718	tbl	v15.16b, {v15.16b}, v26.16b
2719	tbl	v16.16b, {v16.16b}, v26.16b
2720	tbl	v17.16b, {v17.16b}, v26.16b
2721
2722	add	v10.4s, v10.4s, v15.4s
2723	add	v11.4s, v11.4s, v16.4s
2724	add	v12.4s, v12.4s, v17.4s
2725	eor	v20.16b, v20.16b, v10.16b
2726	eor	v5.16b, v5.16b, v11.16b
2727	eor	v6.16b, v6.16b, v12.16b
2728	ushr	v7.4s, v6.4s, #25
2729	sli	v7.4s, v6.4s, #7
2730	ushr	v6.4s, v5.4s, #25
2731	sli	v6.4s, v5.4s, #7
2732	ushr	v5.4s, v20.4s, #25
2733	sli	v5.4s, v20.4s, #7
2734
2735	ext	v5.16b, v5.16b, v5.16b, #4
2736	ext	v6.16b, v6.16b, v6.16b, #4
2737	ext	v7.16b, v7.16b, v7.16b, #4
2738
2739	ext	v10.16b, v10.16b, v10.16b, #8
2740	ext	v11.16b, v11.16b, v11.16b, #8
2741	ext	v12.16b, v12.16b, v12.16b, #8
2742
2743	ext	v15.16b, v15.16b, v15.16b, #12
2744	ext	v16.16b, v16.16b, v16.16b, #12
2745	ext	v17.16b, v17.16b, v17.16b, #12
2746	add	v0.4s, v0.4s, v5.4s
2747	add	v1.4s, v1.4s, v6.4s
2748	add	v2.4s, v2.4s, v7.4s
2749	eor	v15.16b, v15.16b, v0.16b
2750	eor	v16.16b, v16.16b, v1.16b
2751	eor	v17.16b, v17.16b, v2.16b
2752	rev32	v15.8h, v15.8h
2753	rev32	v16.8h, v16.8h
2754	rev32	v17.8h, v17.8h
2755
2756	add	v10.4s, v10.4s, v15.4s
2757	add	v11.4s, v11.4s, v16.4s
2758	add	v12.4s, v12.4s, v17.4s
2759	eor	v5.16b, v5.16b, v10.16b
2760	eor	v6.16b, v6.16b, v11.16b
2761	eor	v7.16b, v7.16b, v12.16b
2762	ushr	v20.4s, v5.4s, #20
2763	sli	v20.4s, v5.4s, #12
2764	ushr	v5.4s, v6.4s, #20
2765	sli	v5.4s, v6.4s, #12
2766	ushr	v6.4s, v7.4s, #20
2767	sli	v6.4s, v7.4s, #12
2768
2769	add	v0.4s, v0.4s, v20.4s
2770	add	v1.4s, v1.4s, v5.4s
2771	add	v2.4s, v2.4s, v6.4s
2772	eor	v15.16b, v15.16b, v0.16b
2773	eor	v16.16b, v16.16b, v1.16b
2774	eor	v17.16b, v17.16b, v2.16b
2775	tbl	v15.16b, {v15.16b}, v26.16b
2776	tbl	v16.16b, {v16.16b}, v26.16b
2777	tbl	v17.16b, {v17.16b}, v26.16b
2778
2779	add	v10.4s, v10.4s, v15.4s
2780	add	v11.4s, v11.4s, v16.4s
2781	add	v12.4s, v12.4s, v17.4s
2782	eor	v20.16b, v20.16b, v10.16b
2783	eor	v5.16b, v5.16b, v11.16b
2784	eor	v6.16b, v6.16b, v12.16b
2785	ushr	v7.4s, v6.4s, #25
2786	sli	v7.4s, v6.4s, #7
2787	ushr	v6.4s, v5.4s, #25
2788	sli	v6.4s, v5.4s, #7
2789	ushr	v5.4s, v20.4s, #25
2790	sli	v5.4s, v20.4s, #7
2791
2792	ext	v5.16b, v5.16b, v5.16b, #12
2793	ext	v6.16b, v6.16b, v6.16b, #12
2794	ext	v7.16b, v7.16b, v7.16b, #12
2795
2796	ext	v10.16b, v10.16b, v10.16b, #8
2797	ext	v11.16b, v11.16b, v11.16b, #8
2798	ext	v12.16b, v12.16b, v12.16b, #8
2799
2800	ext	v15.16b, v15.16b, v15.16b, #4
2801	ext	v16.16b, v16.16b, v16.16b, #4
2802	ext	v17.16b, v17.16b, v17.16b, #4
2803	subs	x6, x6, #1
2804	b.hi	Lopen_128_rounds
2805
2806	add	v0.4s, v0.4s, v24.4s
2807	add	v1.4s, v1.4s, v24.4s
2808	add	v2.4s, v2.4s, v24.4s
2809
2810	add	v5.4s, v5.4s, v28.4s
2811	add	v6.4s, v6.4s, v28.4s
2812	add	v7.4s, v7.4s, v28.4s
2813
2814	add	v10.4s, v10.4s, v29.4s
2815	add	v11.4s, v11.4s, v29.4s
2816
2817	add	v30.4s, v30.4s, v25.4s
2818	add	v15.4s, v15.4s, v30.4s
2819	add	v30.4s, v30.4s, v25.4s
2820	add	v16.4s, v16.4s, v30.4s
2821
2822	and	v2.16b, v2.16b, v27.16b
2823	mov	x16, v2.d[0] // Move the R key to GPRs
2824	mov	x17, v2.d[1]
2825	mov	v27.16b, v7.16b // Store the S key
2826
2827	bl	Lpoly_hash_ad_internal
2828
2829Lopen_128_store:
2830	cmp	x2, #64
2831	b.lt	Lopen_128_store_64
2832
2833	ld1	{v20.16b - v23.16b}, [x1], #64
2834
2835	mov	x11, v20.d[0]
2836	mov	x12, v20.d[1]
2837	adds	x8, x8, x11
2838	adcs	x9, x9, x12
2839	adc	x10, x10, x15
2840	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2841	umulh	x12, x8, x16
2842	mul	x13, x9, x16
2843	umulh	x14, x9, x16
2844	adds	x12, x12, x13
2845	mul	x13, x10, x16
2846	adc	x13, x13, x14
2847	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2848	umulh	x8, x8, x17
2849	adds	x12, x12, x14
2850	mul	x14, x9, x17
2851	umulh	x9, x9, x17
2852	adcs	x14, x14, x8
2853	mul	x10, x10, x17
2854	adc	x10, x10, x9
2855	adds	x13, x13, x14
2856	adc	x14, x10, xzr
2857	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2858	and	x8, x13, #-4
2859	extr	x13, x14, x13, #2
2860	adds	x8, x8, x11
2861	lsr	x11, x14, #2
2862	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2863	adds	x8, x8, x13
2864	adcs	x9, x9, x12
2865	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2866	mov	x11, v21.d[0]
2867	mov	x12, v21.d[1]
2868	adds	x8, x8, x11
2869	adcs	x9, x9, x12
2870	adc	x10, x10, x15
2871	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2872	umulh	x12, x8, x16
2873	mul	x13, x9, x16
2874	umulh	x14, x9, x16
2875	adds	x12, x12, x13
2876	mul	x13, x10, x16
2877	adc	x13, x13, x14
2878	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2879	umulh	x8, x8, x17
2880	adds	x12, x12, x14
2881	mul	x14, x9, x17
2882	umulh	x9, x9, x17
2883	adcs	x14, x14, x8
2884	mul	x10, x10, x17
2885	adc	x10, x10, x9
2886	adds	x13, x13, x14
2887	adc	x14, x10, xzr
2888	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2889	and	x8, x13, #-4
2890	extr	x13, x14, x13, #2
2891	adds	x8, x8, x11
2892	lsr	x11, x14, #2
2893	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2894	adds	x8, x8, x13
2895	adcs	x9, x9, x12
2896	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2897	mov	x11, v22.d[0]
2898	mov	x12, v22.d[1]
2899	adds	x8, x8, x11
2900	adcs	x9, x9, x12
2901	adc	x10, x10, x15
2902	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2903	umulh	x12, x8, x16
2904	mul	x13, x9, x16
2905	umulh	x14, x9, x16
2906	adds	x12, x12, x13
2907	mul	x13, x10, x16
2908	adc	x13, x13, x14
2909	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2910	umulh	x8, x8, x17
2911	adds	x12, x12, x14
2912	mul	x14, x9, x17
2913	umulh	x9, x9, x17
2914	adcs	x14, x14, x8
2915	mul	x10, x10, x17
2916	adc	x10, x10, x9
2917	adds	x13, x13, x14
2918	adc	x14, x10, xzr
2919	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2920	and	x8, x13, #-4
2921	extr	x13, x14, x13, #2
2922	adds	x8, x8, x11
2923	lsr	x11, x14, #2
2924	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2925	adds	x8, x8, x13
2926	adcs	x9, x9, x12
2927	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2928	mov	x11, v23.d[0]
2929	mov	x12, v23.d[1]
2930	adds	x8, x8, x11
2931	adcs	x9, x9, x12
2932	adc	x10, x10, x15
2933	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2934	umulh	x12, x8, x16
2935	mul	x13, x9, x16
2936	umulh	x14, x9, x16
2937	adds	x12, x12, x13
2938	mul	x13, x10, x16
2939	adc	x13, x13, x14
2940	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2941	umulh	x8, x8, x17
2942	adds	x12, x12, x14
2943	mul	x14, x9, x17
2944	umulh	x9, x9, x17
2945	adcs	x14, x14, x8
2946	mul	x10, x10, x17
2947	adc	x10, x10, x9
2948	adds	x13, x13, x14
2949	adc	x14, x10, xzr
2950	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2951	and	x8, x13, #-4
2952	extr	x13, x14, x13, #2
2953	adds	x8, x8, x11
2954	lsr	x11, x14, #2
2955	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2956	adds	x8, x8, x13
2957	adcs	x9, x9, x12
2958	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2959
2960	eor	v20.16b, v20.16b, v0.16b
2961	eor	v21.16b, v21.16b, v5.16b
2962	eor	v22.16b, v22.16b, v10.16b
2963	eor	v23.16b, v23.16b, v15.16b
2964
2965	st1	{v20.16b - v23.16b}, [x0], #64
2966
2967	sub	x2, x2, #64
2968
2969	mov	v0.16b, v1.16b
2970	mov	v5.16b, v6.16b
2971	mov	v10.16b, v11.16b
2972	mov	v15.16b, v16.16b
2973
2974Lopen_128_store_64:
2975
2976	lsr	x4, x2, #4
2977	mov	x3, x1
2978
2979Lopen_128_hash_64:
2980	cbz	x4, Lopen_tail_64_store
2981	ldp	x11, x12, [x3], 16
2982	adds	x8, x8, x11
2983	adcs	x9, x9, x12
2984	adc	x10, x10, x15
2985	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2986	umulh	x12, x8, x16
2987	mul	x13, x9, x16
2988	umulh	x14, x9, x16
2989	adds	x12, x12, x13
2990	mul	x13, x10, x16
2991	adc	x13, x13, x14
2992	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2993	umulh	x8, x8, x17
2994	adds	x12, x12, x14
2995	mul	x14, x9, x17
2996	umulh	x9, x9, x17
2997	adcs	x14, x14, x8
2998	mul	x10, x10, x17
2999	adc	x10, x10, x9
3000	adds	x13, x13, x14
3001	adc	x14, x10, xzr
3002	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
3003	and	x8, x13, #-4
3004	extr	x13, x14, x13, #2
3005	adds	x8, x8, x11
3006	lsr	x11, x14, #2
3007	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
3008	adds	x8, x8, x13
3009	adcs	x9, x9, x12
3010	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
3011	sub	x4, x4, #1
3012	b	Lopen_128_hash_64
3013.cfi_endproc
3014
3015#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
3016