1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <ring-core/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
7#include <ring-core/arm_arch.h>
8.section	.rodata
9
10.align	7
11Lchacha20_consts:
12.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
13Linc:
14.long	1,2,3,4
15Lrol8:
16.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
17Lclamp:
18.quad	0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
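// Lchacha20_consts is the ChaCha20 "expand 32-byte k" constant, Linc holds the
// counter increments 1,2,3,4 added to the per-block counters, Lrol8 is a byte-shuffle
// table used with tbl to rotate each 32-bit lane left by 8 bits, and Lclamp is the
// Poly1305 "r" clamping mask.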
19
20.text
21
22.def Lpoly_hash_ad_internal
23   .type 32
24.endef
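// Lpoly_hash_ad_internal absorbs the x4 bytes of AAD at x3 into the Poly1305 state.
// Throughout this file the accumulator lives in x8:x9:x10 (low to high), the clamped
// r key in x16:x17, and x15 holds the constant 1, added as the 2^128 padding bit of
// each 16-byte block.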
25.align	6
26Lpoly_hash_ad_internal:
27.cfi_startproc
28	cbnz	x4, Lpoly_hash_intro
29	ret
30
31Lpoly_hash_intro:
32	cmp	x4, #16
33	b.lt	Lpoly_hash_ad_tail
34	ldp	x11, x12, [x3], 16
35	adds	x8, x8, x11
36	adcs	x9, x9, x12
37	adc	x10, x10, x15
38	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
39	umulh	x12, x8, x16
40	mul	x13, x9, x16
41	umulh	x14, x9, x16
42	adds	x12, x12, x13
43	mul	x13, x10, x16
44	adc	x13, x13, x14
45	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
46	umulh	x8, x8, x17
47	adds	x12, x12, x14
48	mul	x14, x9, x17
49	umulh	x9, x9, x17
50	adcs	x14, x14, x8
51	mul	x10, x10, x17
52	adc	x10, x10, x9
53	adds	x13, x13, x14
54	adc	x14, x10, xzr
55	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
56	and	x8, x13, #-4
57	extr	x13, x14, x13, #2
58	adds	x8, x8, x11
59	lsr	x11, x14, #2
60	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
61	adds	x8, x8, x13
62	adcs	x9, x9, x12
63	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
64	sub	x4, x4, #16
65	b	Lpoly_hash_ad_internal
66
67Lpoly_hash_ad_tail:
68	cbz	x4, Lpoly_hash_ad_ret
69
70	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the AAD
71	sub	x4, x4, #1
72
73Lpoly_hash_tail_16_compose:
74	ext	v20.16b, v20.16b, v20.16b, #15
75	ldrb	w11, [x3, x4]
76	mov	v20.b[0], w11
77	subs	x4, x4, #1
78	b.ge	Lpoly_hash_tail_16_compose
79	mov	x11, v20.d[0]
80	mov	x12, v20.d[1]
81	adds	x8, x8, x11
82	adcs	x9, x9, x12
83	adc	x10, x10, x15
84	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
85	umulh	x12, x8, x16
86	mul	x13, x9, x16
87	umulh	x14, x9, x16
88	adds	x12, x12, x13
89	mul	x13, x10, x16
90	adc	x13, x13, x14
91	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
92	umulh	x8, x8, x17
93	adds	x12, x12, x14
94	mul	x14, x9, x17
95	umulh	x9, x9, x17
96	adcs	x14, x14, x8
97	mul	x10, x10, x17
98	adc	x10, x10, x9
99	adds	x13, x13, x14
100	adc	x14, x10, xzr
101	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
102	and	x8, x13, #-4
103	extr	x13, x14, x13, #2
104	adds	x8, x8, x11
105	lsr	x11, x14, #2
106	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
107	adds	x8, x8, x13
108	adcs	x9, x9, x12
109	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
110
111Lpoly_hash_ad_ret:
112	ret
113.cfi_endproc
114
115
116/////////////////////////////////
117//
118// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data);
119//
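// Register use, as the code below uses the arguments: x0 = destination (ciphertext is
// written here), x1 = source (plaintext is read from here), x2 = len_in, x3 = ad,
// x4 = len_ad, x5 = seal_data (key/nonce/extra_in; the Poly1305 tag is written back
// through this pointer).
//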
120.globl	chacha20_poly1305_seal
121
122.def chacha20_poly1305_seal
123   .type 32
124.endef
125.align	6
126chacha20_poly1305_seal:
127	AARCH64_SIGN_LINK_REGISTER
128.cfi_startproc
129	stp	x29, x30, [sp, #-80]!
130.cfi_def_cfa_offset	80
131.cfi_offset	w30, -72
132.cfi_offset	w29, -80
133	mov	x29, sp
134    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
135    // we don't actually use the frame pointer like that, it's probably not
136    // worth bothering.
137	stp	d8, d9, [sp, #16]
138	stp	d10, d11, [sp, #32]
139	stp	d12, d13, [sp, #48]
140	stp	d14, d15, [sp, #64]
141.cfi_offset	b15, -8
142.cfi_offset	b14, -16
143.cfi_offset	b13, -24
144.cfi_offset	b12, -32
145.cfi_offset	b11, -40
146.cfi_offset	b10, -48
147.cfi_offset	b9, -56
148.cfi_offset	b8, -64
149
150	adrp	x11, Lchacha20_consts
151	add	x11, x11, :lo12:Lchacha20_consts
152
153	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
154	ld1	{v28.16b - v30.16b}, [x5]
155
156	mov	x15, #1 // Prepare the Poly1305 state
157	mov	x8, #0
158	mov	x9, #0
159	mov	x10, #0
160
161	ldr	x12, [x5, #56]   // The total cipher text length includes extra_in_len
162	add	x12, x12, x2
163	mov	v31.d[0], x4  // Store the input and aad lengths
164	mov	v31.d[1], x12
165
166	cmp	x2, #128
167	b.le	Lseal_128 // Optimization for smaller buffers
168
169    // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext,
170    // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically,
171    // the fifth block (A4-D4) horizontally.
172	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
173	mov	v4.16b, v24.16b
174
175	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
176	mov	v9.16b, v28.16b
177
178	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
179	mov	v14.16b, v29.16b
180
181	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
182	add	v15.4s, v15.4s, v25.4s
183	mov	v19.16b, v30.16b
184
185	sub	x5, x5, #32
186
187	mov	x6, #10
188
189.align	5
190Lseal_init_rounds:
191	add	v0.4s, v0.4s, v5.4s
192	add	v1.4s, v1.4s, v6.4s
193	add	v2.4s, v2.4s, v7.4s
194	add	v3.4s, v3.4s, v8.4s
195	add	v4.4s, v4.4s, v9.4s
196
197	eor	v15.16b, v15.16b, v0.16b
198	eor	v16.16b, v16.16b, v1.16b
199	eor	v17.16b, v17.16b, v2.16b
200	eor	v18.16b, v18.16b, v3.16b
201	eor	v19.16b, v19.16b, v4.16b
202
203	rev32	v15.8h, v15.8h
204	rev32	v16.8h, v16.8h
205	rev32	v17.8h, v17.8h
206	rev32	v18.8h, v18.8h
207	rev32	v19.8h, v19.8h
208
209	add	v10.4s, v10.4s, v15.4s
210	add	v11.4s, v11.4s, v16.4s
211	add	v12.4s, v12.4s, v17.4s
212	add	v13.4s, v13.4s, v18.4s
213	add	v14.4s, v14.4s, v19.4s
214
215	eor	v5.16b, v5.16b, v10.16b
216	eor	v6.16b, v6.16b, v11.16b
217	eor	v7.16b, v7.16b, v12.16b
218	eor	v8.16b, v8.16b, v13.16b
219	eor	v9.16b, v9.16b, v14.16b
220
221	ushr	v20.4s, v5.4s, #20
222	sli	v20.4s, v5.4s, #12
223	ushr	v5.4s, v6.4s, #20
224	sli	v5.4s, v6.4s, #12
225	ushr	v6.4s, v7.4s, #20
226	sli	v6.4s, v7.4s, #12
227	ushr	v7.4s, v8.4s, #20
228	sli	v7.4s, v8.4s, #12
229	ushr	v8.4s, v9.4s, #20
230	sli	v8.4s, v9.4s, #12
231
232	add	v0.4s, v0.4s, v20.4s
233	add	v1.4s, v1.4s, v5.4s
234	add	v2.4s, v2.4s, v6.4s
235	add	v3.4s, v3.4s, v7.4s
236	add	v4.4s, v4.4s, v8.4s
237
238	eor	v15.16b, v15.16b, v0.16b
239	eor	v16.16b, v16.16b, v1.16b
240	eor	v17.16b, v17.16b, v2.16b
241	eor	v18.16b, v18.16b, v3.16b
242	eor	v19.16b, v19.16b, v4.16b
243
244	tbl	v15.16b, {v15.16b}, v26.16b
245	tbl	v16.16b, {v16.16b}, v26.16b
246	tbl	v17.16b, {v17.16b}, v26.16b
247	tbl	v18.16b, {v18.16b}, v26.16b
248	tbl	v19.16b, {v19.16b}, v26.16b
249
250	add	v10.4s, v10.4s, v15.4s
251	add	v11.4s, v11.4s, v16.4s
252	add	v12.4s, v12.4s, v17.4s
253	add	v13.4s, v13.4s, v18.4s
254	add	v14.4s, v14.4s, v19.4s
255
256	eor	v20.16b, v20.16b, v10.16b
257	eor	v5.16b, v5.16b, v11.16b
258	eor	v6.16b, v6.16b, v12.16b
259	eor	v7.16b, v7.16b, v13.16b
260	eor	v8.16b, v8.16b, v14.16b
261
262	ushr	v9.4s, v8.4s, #25
263	sli	v9.4s, v8.4s, #7
264	ushr	v8.4s, v7.4s, #25
265	sli	v8.4s, v7.4s, #7
266	ushr	v7.4s, v6.4s, #25
267	sli	v7.4s, v6.4s, #7
268	ushr	v6.4s, v5.4s, #25
269	sli	v6.4s, v5.4s, #7
270	ushr	v5.4s, v20.4s, #25
271	sli	v5.4s, v20.4s, #7
272
273	ext	v9.16b, v9.16b, v9.16b, #4
274	ext	v14.16b, v14.16b, v14.16b, #8
275	ext	v19.16b, v19.16b, v19.16b, #12
276	add	v0.4s, v0.4s, v6.4s
277	add	v1.4s, v1.4s, v7.4s
278	add	v2.4s, v2.4s, v8.4s
279	add	v3.4s, v3.4s, v5.4s
280	add	v4.4s, v4.4s, v9.4s
281
282	eor	v18.16b, v18.16b, v0.16b
283	eor	v15.16b, v15.16b, v1.16b
284	eor	v16.16b, v16.16b, v2.16b
285	eor	v17.16b, v17.16b, v3.16b
286	eor	v19.16b, v19.16b, v4.16b
287
288	rev32	v18.8h, v18.8h
289	rev32	v15.8h, v15.8h
290	rev32	v16.8h, v16.8h
291	rev32	v17.8h, v17.8h
292	rev32	v19.8h, v19.8h
293
294	add	v12.4s, v12.4s, v18.4s
295	add	v13.4s, v13.4s, v15.4s
296	add	v10.4s, v10.4s, v16.4s
297	add	v11.4s, v11.4s, v17.4s
298	add	v14.4s, v14.4s, v19.4s
299
300	eor	v6.16b, v6.16b, v12.16b
301	eor	v7.16b, v7.16b, v13.16b
302	eor	v8.16b, v8.16b, v10.16b
303	eor	v5.16b, v5.16b, v11.16b
304	eor	v9.16b, v9.16b, v14.16b
305
306	ushr	v20.4s, v6.4s, #20
307	sli	v20.4s, v6.4s, #12
308	ushr	v6.4s, v7.4s, #20
309	sli	v6.4s, v7.4s, #12
310	ushr	v7.4s, v8.4s, #20
311	sli	v7.4s, v8.4s, #12
312	ushr	v8.4s, v5.4s, #20
313	sli	v8.4s, v5.4s, #12
314	ushr	v5.4s, v9.4s, #20
315	sli	v5.4s, v9.4s, #12
316
317	add	v0.4s, v0.4s, v20.4s
318	add	v1.4s, v1.4s, v6.4s
319	add	v2.4s, v2.4s, v7.4s
320	add	v3.4s, v3.4s, v8.4s
321	add	v4.4s, v4.4s, v5.4s
322
323	eor	v18.16b, v18.16b, v0.16b
324	eor	v15.16b, v15.16b, v1.16b
325	eor	v16.16b, v16.16b, v2.16b
326	eor	v17.16b, v17.16b, v3.16b
327	eor	v19.16b, v19.16b, v4.16b
328
329	tbl	v18.16b, {v18.16b}, v26.16b
330	tbl	v15.16b, {v15.16b}, v26.16b
331	tbl	v16.16b, {v16.16b}, v26.16b
332	tbl	v17.16b, {v17.16b}, v26.16b
333	tbl	v19.16b, {v19.16b}, v26.16b
334
335	add	v12.4s, v12.4s, v18.4s
336	add	v13.4s, v13.4s, v15.4s
337	add	v10.4s, v10.4s, v16.4s
338	add	v11.4s, v11.4s, v17.4s
339	add	v14.4s, v14.4s, v19.4s
340
341	eor	v20.16b, v20.16b, v12.16b
342	eor	v6.16b, v6.16b, v13.16b
343	eor	v7.16b, v7.16b, v10.16b
344	eor	v8.16b, v8.16b, v11.16b
345	eor	v5.16b, v5.16b, v14.16b
346
347	ushr	v9.4s, v5.4s, #25
348	sli	v9.4s, v5.4s, #7
349	ushr	v5.4s, v8.4s, #25
350	sli	v5.4s, v8.4s, #7
351	ushr	v8.4s, v7.4s, #25
352	sli	v8.4s, v7.4s, #7
353	ushr	v7.4s, v6.4s, #25
354	sli	v7.4s, v6.4s, #7
355	ushr	v6.4s, v20.4s, #25
356	sli	v6.4s, v20.4s, #7
357
358	ext	v9.16b, v9.16b, v9.16b, #12
359	ext	v14.16b, v14.16b, v14.16b, #8
360	ext	v19.16b, v19.16b, v19.16b, #4
361	subs	x6, x6, #1
362	b.hi	Lseal_init_rounds
363
364	add	v15.4s, v15.4s, v25.4s
365	mov	x11, #4
366	dup	v20.4s, w11
367	add	v25.4s, v25.4s, v20.4s
368
369	zip1	v20.4s, v0.4s, v1.4s
370	zip2	v21.4s, v0.4s, v1.4s
371	zip1	v22.4s, v2.4s, v3.4s
372	zip2	v23.4s, v2.4s, v3.4s
373
374	zip1	v0.2d, v20.2d, v22.2d
375	zip2	v1.2d, v20.2d, v22.2d
376	zip1	v2.2d, v21.2d, v23.2d
377	zip2	v3.2d, v21.2d, v23.2d
378
379	zip1	v20.4s, v5.4s, v6.4s
380	zip2	v21.4s, v5.4s, v6.4s
381	zip1	v22.4s, v7.4s, v8.4s
382	zip2	v23.4s, v7.4s, v8.4s
383
384	zip1	v5.2d, v20.2d, v22.2d
385	zip2	v6.2d, v20.2d, v22.2d
386	zip1	v7.2d, v21.2d, v23.2d
387	zip2	v8.2d, v21.2d, v23.2d
388
389	zip1	v20.4s, v10.4s, v11.4s
390	zip2	v21.4s, v10.4s, v11.4s
391	zip1	v22.4s, v12.4s, v13.4s
392	zip2	v23.4s, v12.4s, v13.4s
393
394	zip1	v10.2d, v20.2d, v22.2d
395	zip2	v11.2d, v20.2d, v22.2d
396	zip1	v12.2d, v21.2d, v23.2d
397	zip2	v13.2d, v21.2d, v23.2d
398
399	zip1	v20.4s, v15.4s, v16.4s
400	zip2	v21.4s, v15.4s, v16.4s
401	zip1	v22.4s, v17.4s, v18.4s
402	zip2	v23.4s, v17.4s, v18.4s
403
404	zip1	v15.2d, v20.2d, v22.2d
405	zip2	v16.2d, v20.2d, v22.2d
406	zip1	v17.2d, v21.2d, v23.2d
407	zip2	v18.2d, v21.2d, v23.2d
408
409	add	v4.4s, v4.4s, v24.4s
410	add	v9.4s, v9.4s, v28.4s
411	and	v4.16b, v4.16b, v27.16b
412
413	add	v0.4s, v0.4s, v24.4s
414	add	v5.4s, v5.4s, v28.4s
415	add	v10.4s, v10.4s, v29.4s
416	add	v15.4s, v15.4s, v30.4s
417
418	add	v1.4s, v1.4s, v24.4s
419	add	v6.4s, v6.4s, v28.4s
420	add	v11.4s, v11.4s, v29.4s
421	add	v16.4s, v16.4s, v30.4s
422
423	add	v2.4s, v2.4s, v24.4s
424	add	v7.4s, v7.4s, v28.4s
425	add	v12.4s, v12.4s, v29.4s
426	add	v17.4s, v17.4s, v30.4s
427
428	add	v3.4s, v3.4s, v24.4s
429	add	v8.4s, v8.4s, v28.4s
430	add	v13.4s, v13.4s, v29.4s
431	add	v18.4s, v18.4s, v30.4s
432
433	mov	x16, v4.d[0] // Move the R key to GPRs
434	mov	x17, v4.d[1]
435	mov	v27.16b, v9.16b // Store the S key
436
437	bl	Lpoly_hash_ad_internal
438
439	mov	x3, x0
440	cmp	x2, #256
441	b.le	Lseal_tail
442
443	ld1	{v20.16b - v23.16b}, [x1], #64
444	eor	v20.16b, v20.16b, v0.16b
445	eor	v21.16b, v21.16b, v5.16b
446	eor	v22.16b, v22.16b, v10.16b
447	eor	v23.16b, v23.16b, v15.16b
448	st1	{v20.16b - v23.16b}, [x0], #64
449
450	ld1	{v20.16b - v23.16b}, [x1], #64
451	eor	v20.16b, v20.16b, v1.16b
452	eor	v21.16b, v21.16b, v6.16b
453	eor	v22.16b, v22.16b, v11.16b
454	eor	v23.16b, v23.16b, v16.16b
455	st1	{v20.16b - v23.16b}, [x0], #64
456
457	ld1	{v20.16b - v23.16b}, [x1], #64
458	eor	v20.16b, v20.16b, v2.16b
459	eor	v21.16b, v21.16b, v7.16b
460	eor	v22.16b, v22.16b, v12.16b
461	eor	v23.16b, v23.16b, v17.16b
462	st1	{v20.16b - v23.16b}, [x0], #64
463
464	ld1	{v20.16b - v23.16b}, [x1], #64
465	eor	v20.16b, v20.16b, v3.16b
466	eor	v21.16b, v21.16b, v8.16b
467	eor	v22.16b, v22.16b, v13.16b
468	eor	v23.16b, v23.16b, v18.16b
469	st1	{v20.16b - v23.16b}, [x0], #64
470
471	sub	x2, x2, #256
472
473	mov	x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds
474	mov	x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256
475
476Lseal_main_loop:
477	adrp	x11, Lchacha20_consts
478	add	x11, x11, :lo12:Lchacha20_consts
479
480	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
481	mov	v4.16b, v24.16b
482
483	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
484	mov	v9.16b, v28.16b
485
486	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
487	mov	v14.16b, v29.16b
488
489	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
490	add	v15.4s, v15.4s, v25.4s
491	mov	v19.16b, v30.16b
492
493	eor	v20.16b, v20.16b, v20.16b //zero
494	not	v21.16b, v20.16b // -1
495	sub	v21.4s, v25.4s, v21.4s // Add +1
496	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
497	add	v19.4s, v19.4s, v20.4s
498
499	sub	x5, x5, #32
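    // The ChaCha20 rounds below are interleaved with Poly1305 hashing of ciphertext
    // that has already been written out: x3 trails the output pointer x0 and is
    // consumed 16 bytes per hash step.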
500.align	5
501Lseal_main_loop_rounds:
502	add	v0.4s, v0.4s, v5.4s
503	add	v1.4s, v1.4s, v6.4s
504	add	v2.4s, v2.4s, v7.4s
505	add	v3.4s, v3.4s, v8.4s
506	add	v4.4s, v4.4s, v9.4s
507
508	eor	v15.16b, v15.16b, v0.16b
509	eor	v16.16b, v16.16b, v1.16b
510	eor	v17.16b, v17.16b, v2.16b
511	eor	v18.16b, v18.16b, v3.16b
512	eor	v19.16b, v19.16b, v4.16b
513
514	rev32	v15.8h, v15.8h
515	rev32	v16.8h, v16.8h
516	rev32	v17.8h, v17.8h
517	rev32	v18.8h, v18.8h
518	rev32	v19.8h, v19.8h
519
520	add	v10.4s, v10.4s, v15.4s
521	add	v11.4s, v11.4s, v16.4s
522	add	v12.4s, v12.4s, v17.4s
523	add	v13.4s, v13.4s, v18.4s
524	add	v14.4s, v14.4s, v19.4s
525
526	eor	v5.16b, v5.16b, v10.16b
527	eor	v6.16b, v6.16b, v11.16b
528	eor	v7.16b, v7.16b, v12.16b
529	eor	v8.16b, v8.16b, v13.16b
530	eor	v9.16b, v9.16b, v14.16b
531
532	ushr	v20.4s, v5.4s, #20
533	sli	v20.4s, v5.4s, #12
534	ushr	v5.4s, v6.4s, #20
535	sli	v5.4s, v6.4s, #12
536	ushr	v6.4s, v7.4s, #20
537	sli	v6.4s, v7.4s, #12
538	ushr	v7.4s, v8.4s, #20
539	sli	v7.4s, v8.4s, #12
540	ushr	v8.4s, v9.4s, #20
541	sli	v8.4s, v9.4s, #12
542
543	add	v0.4s, v0.4s, v20.4s
544	add	v1.4s, v1.4s, v5.4s
545	add	v2.4s, v2.4s, v6.4s
546	add	v3.4s, v3.4s, v7.4s
547	add	v4.4s, v4.4s, v8.4s
548
549	eor	v15.16b, v15.16b, v0.16b
550	eor	v16.16b, v16.16b, v1.16b
551	eor	v17.16b, v17.16b, v2.16b
552	eor	v18.16b, v18.16b, v3.16b
553	eor	v19.16b, v19.16b, v4.16b
554
555	tbl	v15.16b, {v15.16b}, v26.16b
556	tbl	v16.16b, {v16.16b}, v26.16b
557	tbl	v17.16b, {v17.16b}, v26.16b
558	tbl	v18.16b, {v18.16b}, v26.16b
559	tbl	v19.16b, {v19.16b}, v26.16b
560
561	add	v10.4s, v10.4s, v15.4s
562	add	v11.4s, v11.4s, v16.4s
563	add	v12.4s, v12.4s, v17.4s
564	add	v13.4s, v13.4s, v18.4s
565	add	v14.4s, v14.4s, v19.4s
566
567	eor	v20.16b, v20.16b, v10.16b
568	eor	v5.16b, v5.16b, v11.16b
569	eor	v6.16b, v6.16b, v12.16b
570	eor	v7.16b, v7.16b, v13.16b
571	eor	v8.16b, v8.16b, v14.16b
572
573	ushr	v9.4s, v8.4s, #25
574	sli	v9.4s, v8.4s, #7
575	ushr	v8.4s, v7.4s, #25
576	sli	v8.4s, v7.4s, #7
577	ushr	v7.4s, v6.4s, #25
578	sli	v7.4s, v6.4s, #7
579	ushr	v6.4s, v5.4s, #25
580	sli	v6.4s, v5.4s, #7
581	ushr	v5.4s, v20.4s, #25
582	sli	v5.4s, v20.4s, #7
583
584	ext	v9.16b, v9.16b, v9.16b, #4
585	ext	v14.16b, v14.16b, v14.16b, #8
586	ext	v19.16b, v19.16b, v19.16b, #12
587	ldp	x11, x12, [x3], 16
588	adds	x8, x8, x11
589	adcs	x9, x9, x12
590	adc	x10, x10, x15
591	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
592	umulh	x12, x8, x16
593	mul	x13, x9, x16
594	umulh	x14, x9, x16
595	adds	x12, x12, x13
596	mul	x13, x10, x16
597	adc	x13, x13, x14
598	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
599	umulh	x8, x8, x17
600	adds	x12, x12, x14
601	mul	x14, x9, x17
602	umulh	x9, x9, x17
603	adcs	x14, x14, x8
604	mul	x10, x10, x17
605	adc	x10, x10, x9
606	adds	x13, x13, x14
607	adc	x14, x10, xzr
608	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
609	and	x8, x13, #-4
610	extr	x13, x14, x13, #2
611	adds	x8, x8, x11
612	lsr	x11, x14, #2
613	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
614	adds	x8, x8, x13
615	adcs	x9, x9, x12
616	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
617	add	v0.4s, v0.4s, v6.4s
618	add	v1.4s, v1.4s, v7.4s
619	add	v2.4s, v2.4s, v8.4s
620	add	v3.4s, v3.4s, v5.4s
621	add	v4.4s, v4.4s, v9.4s
622
623	eor	v18.16b, v18.16b, v0.16b
624	eor	v15.16b, v15.16b, v1.16b
625	eor	v16.16b, v16.16b, v2.16b
626	eor	v17.16b, v17.16b, v3.16b
627	eor	v19.16b, v19.16b, v4.16b
628
629	rev32	v18.8h, v18.8h
630	rev32	v15.8h, v15.8h
631	rev32	v16.8h, v16.8h
632	rev32	v17.8h, v17.8h
633	rev32	v19.8h, v19.8h
634
635	add	v12.4s, v12.4s, v18.4s
636	add	v13.4s, v13.4s, v15.4s
637	add	v10.4s, v10.4s, v16.4s
638	add	v11.4s, v11.4s, v17.4s
639	add	v14.4s, v14.4s, v19.4s
640
641	eor	v6.16b, v6.16b, v12.16b
642	eor	v7.16b, v7.16b, v13.16b
643	eor	v8.16b, v8.16b, v10.16b
644	eor	v5.16b, v5.16b, v11.16b
645	eor	v9.16b, v9.16b, v14.16b
646
647	ushr	v20.4s, v6.4s, #20
648	sli	v20.4s, v6.4s, #12
649	ushr	v6.4s, v7.4s, #20
650	sli	v6.4s, v7.4s, #12
651	ushr	v7.4s, v8.4s, #20
652	sli	v7.4s, v8.4s, #12
653	ushr	v8.4s, v5.4s, #20
654	sli	v8.4s, v5.4s, #12
655	ushr	v5.4s, v9.4s, #20
656	sli	v5.4s, v9.4s, #12
657
658	add	v0.4s, v0.4s, v20.4s
659	add	v1.4s, v1.4s, v6.4s
660	add	v2.4s, v2.4s, v7.4s
661	add	v3.4s, v3.4s, v8.4s
662	add	v4.4s, v4.4s, v5.4s
663
664	eor	v18.16b, v18.16b, v0.16b
665	eor	v15.16b, v15.16b, v1.16b
666	eor	v16.16b, v16.16b, v2.16b
667	eor	v17.16b, v17.16b, v3.16b
668	eor	v19.16b, v19.16b, v4.16b
669
670	tbl	v18.16b, {v18.16b}, v26.16b
671	tbl	v15.16b, {v15.16b}, v26.16b
672	tbl	v16.16b, {v16.16b}, v26.16b
673	tbl	v17.16b, {v17.16b}, v26.16b
674	tbl	v19.16b, {v19.16b}, v26.16b
675
676	add	v12.4s, v12.4s, v18.4s
677	add	v13.4s, v13.4s, v15.4s
678	add	v10.4s, v10.4s, v16.4s
679	add	v11.4s, v11.4s, v17.4s
680	add	v14.4s, v14.4s, v19.4s
681
682	eor	v20.16b, v20.16b, v12.16b
683	eor	v6.16b, v6.16b, v13.16b
684	eor	v7.16b, v7.16b, v10.16b
685	eor	v8.16b, v8.16b, v11.16b
686	eor	v5.16b, v5.16b, v14.16b
687
688	ushr	v9.4s, v5.4s, #25
689	sli	v9.4s, v5.4s, #7
690	ushr	v5.4s, v8.4s, #25
691	sli	v5.4s, v8.4s, #7
692	ushr	v8.4s, v7.4s, #25
693	sli	v8.4s, v7.4s, #7
694	ushr	v7.4s, v6.4s, #25
695	sli	v7.4s, v6.4s, #7
696	ushr	v6.4s, v20.4s, #25
697	sli	v6.4s, v20.4s, #7
698
699	ext	v9.16b, v9.16b, v9.16b, #12
700	ext	v14.16b, v14.16b, v14.16b, #8
701	ext	v19.16b, v19.16b, v19.16b, #4
702	subs	x6, x6, #1
703	b.ge	Lseal_main_loop_rounds
704	ldp	x11, x12, [x3], 16
705	adds	x8, x8, x11
706	adcs	x9, x9, x12
707	adc	x10, x10, x15
708	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
709	umulh	x12, x8, x16
710	mul	x13, x9, x16
711	umulh	x14, x9, x16
712	adds	x12, x12, x13
713	mul	x13, x10, x16
714	adc	x13, x13, x14
715	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
716	umulh	x8, x8, x17
717	adds	x12, x12, x14
718	mul	x14, x9, x17
719	umulh	x9, x9, x17
720	adcs	x14, x14, x8
721	mul	x10, x10, x17
722	adc	x10, x10, x9
723	adds	x13, x13, x14
724	adc	x14, x10, xzr
725	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
726	and	x8, x13, #-4
727	extr	x13, x14, x13, #2
728	adds	x8, x8, x11
729	lsr	x11, x14, #2
730	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
731	adds	x8, x8, x13
732	adcs	x9, x9, x12
733	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
734	subs	x7, x7, #1
735	b.gt	Lseal_main_loop_rounds
736
737	eor	v20.16b, v20.16b, v20.16b //zero
738	not	v21.16b, v20.16b // -1
739	sub	v21.4s, v25.4s, v21.4s // Add +1
740	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
741	add	v19.4s, v19.4s, v20.4s
742
743	add	v15.4s, v15.4s, v25.4s
744	mov	x11, #5
745	dup	v20.4s, w11
746	add	v25.4s, v25.4s, v20.4s
747
748	zip1	v20.4s, v0.4s, v1.4s
749	zip2	v21.4s, v0.4s, v1.4s
750	zip1	v22.4s, v2.4s, v3.4s
751	zip2	v23.4s, v2.4s, v3.4s
752
753	zip1	v0.2d, v20.2d, v22.2d
754	zip2	v1.2d, v20.2d, v22.2d
755	zip1	v2.2d, v21.2d, v23.2d
756	zip2	v3.2d, v21.2d, v23.2d
757
758	zip1	v20.4s, v5.4s, v6.4s
759	zip2	v21.4s, v5.4s, v6.4s
760	zip1	v22.4s, v7.4s, v8.4s
761	zip2	v23.4s, v7.4s, v8.4s
762
763	zip1	v5.2d, v20.2d, v22.2d
764	zip2	v6.2d, v20.2d, v22.2d
765	zip1	v7.2d, v21.2d, v23.2d
766	zip2	v8.2d, v21.2d, v23.2d
767
768	zip1	v20.4s, v10.4s, v11.4s
769	zip2	v21.4s, v10.4s, v11.4s
770	zip1	v22.4s, v12.4s, v13.4s
771	zip2	v23.4s, v12.4s, v13.4s
772
773	zip1	v10.2d, v20.2d, v22.2d
774	zip2	v11.2d, v20.2d, v22.2d
775	zip1	v12.2d, v21.2d, v23.2d
776	zip2	v13.2d, v21.2d, v23.2d
777
778	zip1	v20.4s, v15.4s, v16.4s
779	zip2	v21.4s, v15.4s, v16.4s
780	zip1	v22.4s, v17.4s, v18.4s
781	zip2	v23.4s, v17.4s, v18.4s
782
783	zip1	v15.2d, v20.2d, v22.2d
784	zip2	v16.2d, v20.2d, v22.2d
785	zip1	v17.2d, v21.2d, v23.2d
786	zip2	v18.2d, v21.2d, v23.2d
787
788	add	v0.4s, v0.4s, v24.4s
789	add	v5.4s, v5.4s, v28.4s
790	add	v10.4s, v10.4s, v29.4s
791	add	v15.4s, v15.4s, v30.4s
792
793	add	v1.4s, v1.4s, v24.4s
794	add	v6.4s, v6.4s, v28.4s
795	add	v11.4s, v11.4s, v29.4s
796	add	v16.4s, v16.4s, v30.4s
797
798	add	v2.4s, v2.4s, v24.4s
799	add	v7.4s, v7.4s, v28.4s
800	add	v12.4s, v12.4s, v29.4s
801	add	v17.4s, v17.4s, v30.4s
802
803	add	v3.4s, v3.4s, v24.4s
804	add	v8.4s, v8.4s, v28.4s
805	add	v13.4s, v13.4s, v29.4s
806	add	v18.4s, v18.4s, v30.4s
807
808	add	v4.4s, v4.4s, v24.4s
809	add	v9.4s, v9.4s, v28.4s
810	add	v14.4s, v14.4s, v29.4s
811	add	v19.4s, v19.4s, v30.4s
812
813	cmp	x2, #320
814	b.le	Lseal_tail
815
816	ld1	{v20.16b - v23.16b}, [x1], #64
817	eor	v20.16b, v20.16b, v0.16b
818	eor	v21.16b, v21.16b, v5.16b
819	eor	v22.16b, v22.16b, v10.16b
820	eor	v23.16b, v23.16b, v15.16b
821	st1	{v20.16b - v23.16b}, [x0], #64
822
823	ld1	{v20.16b - v23.16b}, [x1], #64
824	eor	v20.16b, v20.16b, v1.16b
825	eor	v21.16b, v21.16b, v6.16b
826	eor	v22.16b, v22.16b, v11.16b
827	eor	v23.16b, v23.16b, v16.16b
828	st1	{v20.16b - v23.16b}, [x0], #64
829
830	ld1	{v20.16b - v23.16b}, [x1], #64
831	eor	v20.16b, v20.16b, v2.16b
832	eor	v21.16b, v21.16b, v7.16b
833	eor	v22.16b, v22.16b, v12.16b
834	eor	v23.16b, v23.16b, v17.16b
835	st1	{v20.16b - v23.16b}, [x0], #64
836
837	ld1	{v20.16b - v23.16b}, [x1], #64
838	eor	v20.16b, v20.16b, v3.16b
839	eor	v21.16b, v21.16b, v8.16b
840	eor	v22.16b, v22.16b, v13.16b
841	eor	v23.16b, v23.16b, v18.16b
842	st1	{v20.16b - v23.16b}, [x0], #64
843
844	ld1	{v20.16b - v23.16b}, [x1], #64
845	eor	v20.16b, v20.16b, v4.16b
846	eor	v21.16b, v21.16b, v9.16b
847	eor	v22.16b, v22.16b, v14.16b
848	eor	v23.16b, v23.16b, v19.16b
849	st1	{v20.16b - v23.16b}, [x0], #64
850
851	sub	x2, x2, #320
852
853	mov	x6, #0
854	mov	x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration
855
856	b	Lseal_main_loop
857
858Lseal_tail:
859    // This part of the function handles the storage and authentication of the last [0,320) bytes
860    // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data.
861	cmp	x2, #64
862	b.lt	Lseal_tail_64
863
864    // Store and authenticate 64B blocks per iteration
865	ld1	{v20.16b - v23.16b}, [x1], #64
866
867	eor	v20.16b, v20.16b, v0.16b
868	eor	v21.16b, v21.16b, v5.16b
869	eor	v22.16b, v22.16b, v10.16b
870	eor	v23.16b, v23.16b, v15.16b
871	mov	x11, v20.d[0]
872	mov	x12, v20.d[1]
873	adds	x8, x8, x11
874	adcs	x9, x9, x12
875	adc	x10, x10, x15
876	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
877	umulh	x12, x8, x16
878	mul	x13, x9, x16
879	umulh	x14, x9, x16
880	adds	x12, x12, x13
881	mul	x13, x10, x16
882	adc	x13, x13, x14
883	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
884	umulh	x8, x8, x17
885	adds	x12, x12, x14
886	mul	x14, x9, x17
887	umulh	x9, x9, x17
888	adcs	x14, x14, x8
889	mul	x10, x10, x17
890	adc	x10, x10, x9
891	adds	x13, x13, x14
892	adc	x14, x10, xzr
893	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
894	and	x8, x13, #-4
895	extr	x13, x14, x13, #2
896	adds	x8, x8, x11
897	lsr	x11, x14, #2
898	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
899	adds	x8, x8, x13
900	adcs	x9, x9, x12
901	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
902	mov	x11, v21.d[0]
903	mov	x12, v21.d[1]
904	adds	x8, x8, x11
905	adcs	x9, x9, x12
906	adc	x10, x10, x15
907	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
908	umulh	x12, x8, x16
909	mul	x13, x9, x16
910	umulh	x14, x9, x16
911	adds	x12, x12, x13
912	mul	x13, x10, x16
913	adc	x13, x13, x14
914	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
915	umulh	x8, x8, x17
916	adds	x12, x12, x14
917	mul	x14, x9, x17
918	umulh	x9, x9, x17
919	adcs	x14, x14, x8
920	mul	x10, x10, x17
921	adc	x10, x10, x9
922	adds	x13, x13, x14
923	adc	x14, x10, xzr
924	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
925	and	x8, x13, #-4
926	extr	x13, x14, x13, #2
927	adds	x8, x8, x11
928	lsr	x11, x14, #2
929	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
930	adds	x8, x8, x13
931	adcs	x9, x9, x12
932	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
933	mov	x11, v22.d[0]
934	mov	x12, v22.d[1]
935	adds	x8, x8, x11
936	adcs	x9, x9, x12
937	adc	x10, x10, x15
938	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
939	umulh	x12, x8, x16
940	mul	x13, x9, x16
941	umulh	x14, x9, x16
942	adds	x12, x12, x13
943	mul	x13, x10, x16
944	adc	x13, x13, x14
945	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
946	umulh	x8, x8, x17
947	adds	x12, x12, x14
948	mul	x14, x9, x17
949	umulh	x9, x9, x17
950	adcs	x14, x14, x8
951	mul	x10, x10, x17
952	adc	x10, x10, x9
953	adds	x13, x13, x14
954	adc	x14, x10, xzr
955	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
956	and	x8, x13, #-4
957	extr	x13, x14, x13, #2
958	adds	x8, x8, x11
959	lsr	x11, x14, #2
960	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
961	adds	x8, x8, x13
962	adcs	x9, x9, x12
963	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
964	mov	x11, v23.d[0]
965	mov	x12, v23.d[1]
966	adds	x8, x8, x11
967	adcs	x9, x9, x12
968	adc	x10, x10, x15
969	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
970	umulh	x12, x8, x16
971	mul	x13, x9, x16
972	umulh	x14, x9, x16
973	adds	x12, x12, x13
974	mul	x13, x10, x16
975	adc	x13, x13, x14
976	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
977	umulh	x8, x8, x17
978	adds	x12, x12, x14
979	mul	x14, x9, x17
980	umulh	x9, x9, x17
981	adcs	x14, x14, x8
982	mul	x10, x10, x17
983	adc	x10, x10, x9
984	adds	x13, x13, x14
985	adc	x14, x10, xzr
986	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
987	and	x8, x13, #-4
988	extr	x13, x14, x13, #2
989	adds	x8, x8, x11
990	lsr	x11, x14, #2
991	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
992	adds	x8, x8, x13
993	adcs	x9, x9, x12
994	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
995	st1	{v20.16b - v23.16b}, [x0], #64
996	sub	x2, x2, #64
997
998    // Shift the state left by 64 bytes for the next iteration of the loop
999	mov	v0.16b, v1.16b
1000	mov	v5.16b, v6.16b
1001	mov	v10.16b, v11.16b
1002	mov	v15.16b, v16.16b
1003
1004	mov	v1.16b, v2.16b
1005	mov	v6.16b, v7.16b
1006	mov	v11.16b, v12.16b
1007	mov	v16.16b, v17.16b
1008
1009	mov	v2.16b, v3.16b
1010	mov	v7.16b, v8.16b
1011	mov	v12.16b, v13.16b
1012	mov	v17.16b, v18.16b
1013
1014	mov	v3.16b, v4.16b
1015	mov	v8.16b, v9.16b
1016	mov	v13.16b, v14.16b
1017	mov	v18.16b, v19.16b
1018
1019	b	Lseal_tail
1020
1021Lseal_tail_64:
1022	ldp	x3, x4, [x5, #48] // extra_in_len and extra_in_ptr
1023
1024    // Here we handle the last [0,64) bytes of plaintext
1025	cmp	x2, #16
1026	b.lt	Lseal_tail_16
    // Each iteration encrypts and authenticates a 16B block
1028	ld1	{v20.16b}, [x1], #16
1029	eor	v20.16b, v20.16b, v0.16b
1030	mov	x11, v20.d[0]
1031	mov	x12, v20.d[1]
1032	adds	x8, x8, x11
1033	adcs	x9, x9, x12
1034	adc	x10, x10, x15
1035	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1036	umulh	x12, x8, x16
1037	mul	x13, x9, x16
1038	umulh	x14, x9, x16
1039	adds	x12, x12, x13
1040	mul	x13, x10, x16
1041	adc	x13, x13, x14
1042	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1043	umulh	x8, x8, x17
1044	adds	x12, x12, x14
1045	mul	x14, x9, x17
1046	umulh	x9, x9, x17
1047	adcs	x14, x14, x8
1048	mul	x10, x10, x17
1049	adc	x10, x10, x9
1050	adds	x13, x13, x14
1051	adc	x14, x10, xzr
1052	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1053	and	x8, x13, #-4
1054	extr	x13, x14, x13, #2
1055	adds	x8, x8, x11
1056	lsr	x11, x14, #2
1057	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1058	adds	x8, x8, x13
1059	adcs	x9, x9, x12
1060	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1061	st1	{v20.16b}, [x0], #16
1062
1063	sub	x2, x2, #16
1064
1065    // Shift the state left by 16 bytes for the next iteration of the loop
1066	mov	v0.16b, v5.16b
1067	mov	v5.16b, v10.16b
1068	mov	v10.16b, v15.16b
1069
1070	b	Lseal_tail_64
1071
1072Lseal_tail_16:
1073    // Here we handle the last [0,16) bytes of ciphertext that require a padded block
1074	cbz	x2, Lseal_hash_extra
1075
1076	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in
1077	eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes
1078	not	v22.16b, v20.16b
1079
1080	mov	x6, x2
1081	add	x1, x1, x2
1082
1083	cbz	x4, Lseal_tail_16_compose // No extra data to pad with, zero padding
1084
1085	mov	x7, #16          // We need to load some extra_in first for padding
1086	sub	x7, x7, x2
1087	cmp	x4, x7
1088	csel	x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register
1089	mov	x12, x7
1090	add	x3, x3, x7
1091	sub	x4, x4, x7
1092
1093Lseal_tail16_compose_extra_in:
1094	ext	v20.16b, v20.16b, v20.16b, #15
1095	ldrb	w11, [x3, #-1]!
1096	mov	v20.b[0], w11
1097	subs	x7, x7, #1
1098	b.gt	Lseal_tail16_compose_extra_in
1099
1100	add	x3, x3, x12
1101
1102Lseal_tail_16_compose:
1103	ext	v20.16b, v20.16b, v20.16b, #15
1104	ldrb	w11, [x1, #-1]!
1105	mov	v20.b[0], w11
1106	ext	v21.16b, v22.16b, v21.16b, #15
1107	subs	x2, x2, #1
1108	b.gt	Lseal_tail_16_compose
1109
1110	and	v0.16b, v0.16b, v21.16b
1111	eor	v20.16b, v20.16b, v0.16b
1112	mov	v21.16b, v20.16b
1113
1114Lseal_tail_16_store:
1115	umov	w11, v20.b[0]
1116	strb	w11, [x0], #1
1117	ext	v20.16b, v20.16b, v20.16b, #1
1118	subs	x6, x6, #1
1119	b.gt	Lseal_tail_16_store
1120
1121    // Hash in the final ct block concatenated with extra_in
1122	mov	x11, v21.d[0]
1123	mov	x12, v21.d[1]
1124	adds	x8, x8, x11
1125	adcs	x9, x9, x12
1126	adc	x10, x10, x15
1127	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1128	umulh	x12, x8, x16
1129	mul	x13, x9, x16
1130	umulh	x14, x9, x16
1131	adds	x12, x12, x13
1132	mul	x13, x10, x16
1133	adc	x13, x13, x14
1134	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1135	umulh	x8, x8, x17
1136	adds	x12, x12, x14
1137	mul	x14, x9, x17
1138	umulh	x9, x9, x17
1139	adcs	x14, x14, x8
1140	mul	x10, x10, x17
1141	adc	x10, x10, x9
1142	adds	x13, x13, x14
1143	adc	x14, x10, xzr
1144	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1145	and	x8, x13, #-4
1146	extr	x13, x14, x13, #2
1147	adds	x8, x8, x11
1148	lsr	x11, x14, #2
1149	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1150	adds	x8, x8, x13
1151	adcs	x9, x9, x12
1152	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1153
1154Lseal_hash_extra:
1155	cbz	x4, Lseal_finalize
1156
1157Lseal_hash_extra_loop:
1158	cmp	x4, #16
1159	b.lt	Lseal_hash_extra_tail
1160	ld1	{v20.16b}, [x3], #16
1161	mov	x11, v20.d[0]
1162	mov	x12, v20.d[1]
1163	adds	x8, x8, x11
1164	adcs	x9, x9, x12
1165	adc	x10, x10, x15
1166	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1167	umulh	x12, x8, x16
1168	mul	x13, x9, x16
1169	umulh	x14, x9, x16
1170	adds	x12, x12, x13
1171	mul	x13, x10, x16
1172	adc	x13, x13, x14
1173	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1174	umulh	x8, x8, x17
1175	adds	x12, x12, x14
1176	mul	x14, x9, x17
1177	umulh	x9, x9, x17
1178	adcs	x14, x14, x8
1179	mul	x10, x10, x17
1180	adc	x10, x10, x9
1181	adds	x13, x13, x14
1182	adc	x14, x10, xzr
1183	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1184	and	x8, x13, #-4
1185	extr	x13, x14, x13, #2
1186	adds	x8, x8, x11
1187	lsr	x11, x14, #2
1188	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1189	adds	x8, x8, x13
1190	adcs	x9, x9, x12
1191	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1192	sub	x4, x4, #16
1193	b	Lseal_hash_extra_loop
1194
1195Lseal_hash_extra_tail:
1196	cbz	x4, Lseal_finalize
1197	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext
1198	add	x3, x3, x4
1199
1200Lseal_hash_extra_load:
1201	ext	v20.16b, v20.16b, v20.16b, #15
1202	ldrb	w11, [x3, #-1]!
1203	mov	v20.b[0], w11
1204	subs	x4, x4, #1
1205	b.gt	Lseal_hash_extra_load
1206
    // Hash in the final padded extra_in block
1208	mov	x11, v20.d[0]
1209	mov	x12, v20.d[1]
1210	adds	x8, x8, x11
1211	adcs	x9, x9, x12
1212	adc	x10, x10, x15
1213	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1214	umulh	x12, x8, x16
1215	mul	x13, x9, x16
1216	umulh	x14, x9, x16
1217	adds	x12, x12, x13
1218	mul	x13, x10, x16
1219	adc	x13, x13, x14
1220	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1221	umulh	x8, x8, x17
1222	adds	x12, x12, x14
1223	mul	x14, x9, x17
1224	umulh	x9, x9, x17
1225	adcs	x14, x14, x8
1226	mul	x10, x10, x17
1227	adc	x10, x10, x9
1228	adds	x13, x13, x14
1229	adc	x14, x10, xzr
1230	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1231	and	x8, x13, #-4
1232	extr	x13, x14, x13, #2
1233	adds	x8, x8, x11
1234	lsr	x11, x14, #2
1235	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1236	adds	x8, x8, x13
1237	adcs	x9, x9, x12
1238	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1239
1240Lseal_finalize:
1241	mov	x11, v31.d[0]
1242	mov	x12, v31.d[1]
1243	adds	x8, x8, x11
1244	adcs	x9, x9, x12
1245	adc	x10, x10, x15
1246	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1247	umulh	x12, x8, x16
1248	mul	x13, x9, x16
1249	umulh	x14, x9, x16
1250	adds	x12, x12, x13
1251	mul	x13, x10, x16
1252	adc	x13, x13, x14
1253	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1254	umulh	x8, x8, x17
1255	adds	x12, x12, x14
1256	mul	x14, x9, x17
1257	umulh	x9, x9, x17
1258	adcs	x14, x14, x8
1259	mul	x10, x10, x17
1260	adc	x10, x10, x9
1261	adds	x13, x13, x14
1262	adc	x14, x10, xzr
1263	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1264	and	x8, x13, #-4
1265	extr	x13, x14, x13, #2
1266	adds	x8, x8, x11
1267	lsr	x11, x14, #2
1268	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1269	adds	x8, x8, x13
1270	adcs	x9, x9, x12
1271	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1272    // Final reduction step
1273	sub	x12, xzr, x15
1274	orr	x13, xzr, #3
1275	subs	x11, x8, #-5
1276	sbcs	x12, x9, x12
1277	sbcs	x13, x10, x13
1278	csel	x8, x11, x8, cs
1279	csel	x9, x12, x9, cs
1280	csel	x10, x13, x10, cs
1281	mov	x11, v27.d[0]
1282	mov	x12, v27.d[1]
1283	adds	x8, x8, x11
1284	adcs	x9, x9, x12
1285	adc	x10, x10, x15
1286
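    // acc is now fully reduced mod 2^130-5 and the S key (saved in v27) has been
    // added; the low 128 bits form the Poly1305 tag, written back through x5.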
1287	stp	x8, x9, [x5]
1288
1289	ldp	d8, d9, [sp, #16]
1290	ldp	d10, d11, [sp, #32]
1291	ldp	d12, d13, [sp, #48]
1292	ldp	d14, d15, [sp, #64]
1293.cfi_restore	b15
1294.cfi_restore	b14
1295.cfi_restore	b13
1296.cfi_restore	b12
1297.cfi_restore	b11
1298.cfi_restore	b10
1299.cfi_restore	b9
1300.cfi_restore	b8
1301	ldp	x29, x30, [sp], 80
1302.cfi_restore	w29
1303.cfi_restore	w30
1304.cfi_def_cfa_offset	0
1305	AARCH64_VALIDATE_LINK_REGISTER
1306	ret
1307
1308Lseal_128:
1309    // On some architectures preparing 5 blocks for small buffers is wasteful
1310	eor	v25.16b, v25.16b, v25.16b
1311	mov	x11, #1
1312	mov	v25.s[0], w11
1313	mov	v0.16b, v24.16b
1314	mov	v1.16b, v24.16b
1315	mov	v2.16b, v24.16b
1316	mov	v5.16b, v28.16b
1317	mov	v6.16b, v28.16b
1318	mov	v7.16b, v28.16b
1319	mov	v10.16b, v29.16b
1320	mov	v11.16b, v29.16b
1321	mov	v12.16b, v29.16b
1322	mov	v17.16b, v30.16b
1323	add	v15.4s, v17.4s, v25.4s
1324	add	v16.4s, v15.4s, v25.4s
1325
1326	mov	x6, #10
1327
1328Lseal_128_rounds:
1329	add	v0.4s, v0.4s, v5.4s
1330	add	v1.4s, v1.4s, v6.4s
1331	add	v2.4s, v2.4s, v7.4s
1332	eor	v15.16b, v15.16b, v0.16b
1333	eor	v16.16b, v16.16b, v1.16b
1334	eor	v17.16b, v17.16b, v2.16b
1335	rev32	v15.8h, v15.8h
1336	rev32	v16.8h, v16.8h
1337	rev32	v17.8h, v17.8h
1338
1339	add	v10.4s, v10.4s, v15.4s
1340	add	v11.4s, v11.4s, v16.4s
1341	add	v12.4s, v12.4s, v17.4s
1342	eor	v5.16b, v5.16b, v10.16b
1343	eor	v6.16b, v6.16b, v11.16b
1344	eor	v7.16b, v7.16b, v12.16b
1345	ushr	v20.4s, v5.4s, #20
1346	sli	v20.4s, v5.4s, #12
1347	ushr	v5.4s, v6.4s, #20
1348	sli	v5.4s, v6.4s, #12
1349	ushr	v6.4s, v7.4s, #20
1350	sli	v6.4s, v7.4s, #12
1351
1352	add	v0.4s, v0.4s, v20.4s
1353	add	v1.4s, v1.4s, v5.4s
1354	add	v2.4s, v2.4s, v6.4s
1355	eor	v15.16b, v15.16b, v0.16b
1356	eor	v16.16b, v16.16b, v1.16b
1357	eor	v17.16b, v17.16b, v2.16b
1358	tbl	v15.16b, {v15.16b}, v26.16b
1359	tbl	v16.16b, {v16.16b}, v26.16b
1360	tbl	v17.16b, {v17.16b}, v26.16b
1361
1362	add	v10.4s, v10.4s, v15.4s
1363	add	v11.4s, v11.4s, v16.4s
1364	add	v12.4s, v12.4s, v17.4s
1365	eor	v20.16b, v20.16b, v10.16b
1366	eor	v5.16b, v5.16b, v11.16b
1367	eor	v6.16b, v6.16b, v12.16b
1368	ushr	v7.4s, v6.4s, #25
1369	sli	v7.4s, v6.4s, #7
1370	ushr	v6.4s, v5.4s, #25
1371	sli	v6.4s, v5.4s, #7
1372	ushr	v5.4s, v20.4s, #25
1373	sli	v5.4s, v20.4s, #7
1374
1375	ext	v5.16b, v5.16b, v5.16b, #4
1376	ext	v6.16b, v6.16b, v6.16b, #4
1377	ext	v7.16b, v7.16b, v7.16b, #4
1378
1379	ext	v10.16b, v10.16b, v10.16b, #8
1380	ext	v11.16b, v11.16b, v11.16b, #8
1381	ext	v12.16b, v12.16b, v12.16b, #8
1382
1383	ext	v15.16b, v15.16b, v15.16b, #12
1384	ext	v16.16b, v16.16b, v16.16b, #12
1385	ext	v17.16b, v17.16b, v17.16b, #12
1386	add	v0.4s, v0.4s, v5.4s
1387	add	v1.4s, v1.4s, v6.4s
1388	add	v2.4s, v2.4s, v7.4s
1389	eor	v15.16b, v15.16b, v0.16b
1390	eor	v16.16b, v16.16b, v1.16b
1391	eor	v17.16b, v17.16b, v2.16b
1392	rev32	v15.8h, v15.8h
1393	rev32	v16.8h, v16.8h
1394	rev32	v17.8h, v17.8h
1395
1396	add	v10.4s, v10.4s, v15.4s
1397	add	v11.4s, v11.4s, v16.4s
1398	add	v12.4s, v12.4s, v17.4s
1399	eor	v5.16b, v5.16b, v10.16b
1400	eor	v6.16b, v6.16b, v11.16b
1401	eor	v7.16b, v7.16b, v12.16b
1402	ushr	v20.4s, v5.4s, #20
1403	sli	v20.4s, v5.4s, #12
1404	ushr	v5.4s, v6.4s, #20
1405	sli	v5.4s, v6.4s, #12
1406	ushr	v6.4s, v7.4s, #20
1407	sli	v6.4s, v7.4s, #12
1408
1409	add	v0.4s, v0.4s, v20.4s
1410	add	v1.4s, v1.4s, v5.4s
1411	add	v2.4s, v2.4s, v6.4s
1412	eor	v15.16b, v15.16b, v0.16b
1413	eor	v16.16b, v16.16b, v1.16b
1414	eor	v17.16b, v17.16b, v2.16b
1415	tbl	v15.16b, {v15.16b}, v26.16b
1416	tbl	v16.16b, {v16.16b}, v26.16b
1417	tbl	v17.16b, {v17.16b}, v26.16b
1418
1419	add	v10.4s, v10.4s, v15.4s
1420	add	v11.4s, v11.4s, v16.4s
1421	add	v12.4s, v12.4s, v17.4s
1422	eor	v20.16b, v20.16b, v10.16b
1423	eor	v5.16b, v5.16b, v11.16b
1424	eor	v6.16b, v6.16b, v12.16b
1425	ushr	v7.4s, v6.4s, #25
1426	sli	v7.4s, v6.4s, #7
1427	ushr	v6.4s, v5.4s, #25
1428	sli	v6.4s, v5.4s, #7
1429	ushr	v5.4s, v20.4s, #25
1430	sli	v5.4s, v20.4s, #7
1431
1432	ext	v5.16b, v5.16b, v5.16b, #12
1433	ext	v6.16b, v6.16b, v6.16b, #12
1434	ext	v7.16b, v7.16b, v7.16b, #12
1435
1436	ext	v10.16b, v10.16b, v10.16b, #8
1437	ext	v11.16b, v11.16b, v11.16b, #8
1438	ext	v12.16b, v12.16b, v12.16b, #8
1439
1440	ext	v15.16b, v15.16b, v15.16b, #4
1441	ext	v16.16b, v16.16b, v16.16b, #4
1442	ext	v17.16b, v17.16b, v17.16b, #4
1443	subs	x6, x6, #1
1444	b.hi	Lseal_128_rounds
1445
1446	add	v0.4s, v0.4s, v24.4s
1447	add	v1.4s, v1.4s, v24.4s
1448	add	v2.4s, v2.4s, v24.4s
1449
1450	add	v5.4s, v5.4s, v28.4s
1451	add	v6.4s, v6.4s, v28.4s
1452	add	v7.4s, v7.4s, v28.4s
1453
1454    // Only the first 32 bytes of the third block (counter = 0) are needed,
1455    // so skip updating v12 and v17.
1456	add	v10.4s, v10.4s, v29.4s
1457	add	v11.4s, v11.4s, v29.4s
1458
1459	add	v30.4s, v30.4s, v25.4s
1460	add	v15.4s, v15.4s, v30.4s
1461	add	v30.4s, v30.4s, v25.4s
1462	add	v16.4s, v16.4s, v30.4s
1463
1464	and	v2.16b, v2.16b, v27.16b
1465	mov	x16, v2.d[0] // Move the R key to GPRs
1466	mov	x17, v2.d[1]
1467	mov	v27.16b, v7.16b // Store the S key
1468
1469	bl	Lpoly_hash_ad_internal
1470	b	Lseal_tail
1471.cfi_endproc
1472
1473
1474/////////////////////////////////
1475//
1476// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data);
1477//
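// Register use, as the code below uses the arguments: x0 = destination (plaintext is
// written here), x1 = source (ciphertext is read from here), x2 = len_in, x3 = ad,
// x4 = len_ad, x5 = aead_data (key/nonce block).
//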
1478.globl	chacha20_poly1305_open
1479
1480.def chacha20_poly1305_open
1481   .type 32
1482.endef
1483.align	6
1484chacha20_poly1305_open:
1485	AARCH64_SIGN_LINK_REGISTER
1486.cfi_startproc
1487	stp	x29, x30, [sp, #-80]!
1488.cfi_def_cfa_offset	80
1489.cfi_offset	w30, -72
1490.cfi_offset	w29, -80
1491	mov	x29, sp
1492    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
1493    // we don't actually use the frame pointer like that, it's probably not
1494    // worth bothering.
1495	stp	d8, d9, [sp, #16]
1496	stp	d10, d11, [sp, #32]
1497	stp	d12, d13, [sp, #48]
1498	stp	d14, d15, [sp, #64]
1499.cfi_offset	b15, -8
1500.cfi_offset	b14, -16
1501.cfi_offset	b13, -24
1502.cfi_offset	b12, -32
1503.cfi_offset	b11, -40
1504.cfi_offset	b10, -48
1505.cfi_offset	b9, -56
1506.cfi_offset	b8, -64
1507
1508	adrp	x11, Lchacha20_consts
1509	add	x11, x11, :lo12:Lchacha20_consts
1510
1511	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
1512	ld1	{v28.16b - v30.16b}, [x5]
1513
1514	mov	x15, #1 // Prepare the Poly1305 state
1515	mov	x8, #0
1516	mov	x9, #0
1517	mov	x10, #0
1518
1519	mov	v31.d[0], x4  // Store the input and aad lengths
1520	mov	v31.d[1], x2
1521
1522	cmp	x2, #128
1523	b.le	Lopen_128 // Optimization for smaller buffers
1524
1525    // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys
1526	mov	v0.16b, v24.16b
1527	mov	v5.16b, v28.16b
1528	mov	v10.16b, v29.16b
1529	mov	v15.16b, v30.16b
1530
1531	mov	x6, #10
1532
1533.align	5
1534Lopen_init_rounds:
1535	add	v0.4s, v0.4s, v5.4s
1536	eor	v15.16b, v15.16b, v0.16b
1537	rev32	v15.8h, v15.8h
1538
1539	add	v10.4s, v10.4s, v15.4s
1540	eor	v5.16b, v5.16b, v10.16b
1541	ushr	v20.4s, v5.4s, #20
1542	sli	v20.4s, v5.4s, #12
1543	add	v0.4s, v0.4s, v20.4s
1544	eor	v15.16b, v15.16b, v0.16b
1545	tbl	v15.16b, {v15.16b}, v26.16b
1546
1547	add	v10.4s, v10.4s, v15.4s
1548	eor	v20.16b, v20.16b, v10.16b
1549	ushr	v5.4s, v20.4s, #25
1550	sli	v5.4s, v20.4s, #7
1551	ext	v5.16b, v5.16b, v5.16b, #4
1552	ext	v10.16b, v10.16b, v10.16b, #8
1553	ext	v15.16b, v15.16b, v15.16b, #12
1554	add	v0.4s, v0.4s, v5.4s
1555	eor	v15.16b, v15.16b, v0.16b
1556	rev32	v15.8h, v15.8h
1557
1558	add	v10.4s, v10.4s, v15.4s
1559	eor	v5.16b, v5.16b, v10.16b
1560	ushr	v20.4s, v5.4s, #20
1561	sli	v20.4s, v5.4s, #12
1562	add	v0.4s, v0.4s, v20.4s
1563	eor	v15.16b, v15.16b, v0.16b
1564	tbl	v15.16b, {v15.16b}, v26.16b
1565
1566	add	v10.4s, v10.4s, v15.4s
1567	eor	v20.16b, v20.16b, v10.16b
1568	ushr	v5.4s, v20.4s, #25
1569	sli	v5.4s, v20.4s, #7
1570	ext	v5.16b, v5.16b, v5.16b, #12
1571	ext	v10.16b, v10.16b, v10.16b, #8
1572	ext	v15.16b, v15.16b, v15.16b, #4
1573	subs	x6, x6, #1
1574	b.hi	Lopen_init_rounds
1575
1576	add	v0.4s, v0.4s, v24.4s
1577	add	v5.4s, v5.4s, v28.4s
1578
1579	and	v0.16b, v0.16b, v27.16b
1580	mov	x16, v0.d[0] // Move the R key to GPRs
1581	mov	x17, v0.d[1]
1582	mov	v27.16b, v5.16b // Store the S key
1583
1584	bl	Lpoly_hash_ad_internal
1585
1586Lopen_ad_done:
1587	mov	x3, x1
1588
// Each iteration of the loop hashes 320 bytes and prepares the stream for 320 bytes
1590Lopen_main_loop:
1591
1592	cmp	x2, #192
1593	b.lt	Lopen_tail
1594
1595	adrp	x11, Lchacha20_consts
1596	add	x11, x11, :lo12:Lchacha20_consts
1597
1598	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
1599	mov	v4.16b, v24.16b
1600
1601	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
1602	mov	v9.16b, v28.16b
1603
1604	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
1605	mov	v14.16b, v29.16b
1606
1607	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
1608	sub	x5, x5, #32
1609	add	v15.4s, v15.4s, v25.4s
1610	mov	v19.16b, v30.16b
1611
1612	eor	v20.16b, v20.16b, v20.16b //zero
1613	not	v21.16b, v20.16b // -1
1614	sub	v21.4s, v25.4s, v21.4s // Add +1
1615	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
1616	add	v19.4s, v19.4s, v20.4s
1617
1618	lsr	x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12
1619	sub	x4, x4, #10
1620
1621	mov	x7, #10
	subs	x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash
1624	csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full
1625
1626	cbz	x7, Lopen_main_loop_rounds_short
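    // The rounds below interleave Poly1305 hashing of the input ciphertext (x3 trails
    // x1) with the ChaCha20 rounds; Lopen_main_loop_rounds_short is entered when no
    // more whole ciphertext blocks are left to hash for this chunk.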
1627
1628.align	5
1629Lopen_main_loop_rounds:
1630	ldp	x11, x12, [x3], 16
1631	adds	x8, x8, x11
1632	adcs	x9, x9, x12
1633	adc	x10, x10, x15
1634	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1635	umulh	x12, x8, x16
1636	mul	x13, x9, x16
1637	umulh	x14, x9, x16
1638	adds	x12, x12, x13
1639	mul	x13, x10, x16
1640	adc	x13, x13, x14
1641	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1642	umulh	x8, x8, x17
1643	adds	x12, x12, x14
1644	mul	x14, x9, x17
1645	umulh	x9, x9, x17
1646	adcs	x14, x14, x8
1647	mul	x10, x10, x17
1648	adc	x10, x10, x9
1649	adds	x13, x13, x14
1650	adc	x14, x10, xzr
1651	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1652	and	x8, x13, #-4
1653	extr	x13, x14, x13, #2
1654	adds	x8, x8, x11
1655	lsr	x11, x14, #2
1656	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1657	adds	x8, x8, x13
1658	adcs	x9, x9, x12
1659	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1660Lopen_main_loop_rounds_short:
1661	add	v0.4s, v0.4s, v5.4s
1662	add	v1.4s, v1.4s, v6.4s
1663	add	v2.4s, v2.4s, v7.4s
1664	add	v3.4s, v3.4s, v8.4s
1665	add	v4.4s, v4.4s, v9.4s
1666
1667	eor	v15.16b, v15.16b, v0.16b
1668	eor	v16.16b, v16.16b, v1.16b
1669	eor	v17.16b, v17.16b, v2.16b
1670	eor	v18.16b, v18.16b, v3.16b
1671	eor	v19.16b, v19.16b, v4.16b
1672
1673	rev32	v15.8h, v15.8h
1674	rev32	v16.8h, v16.8h
1675	rev32	v17.8h, v17.8h
1676	rev32	v18.8h, v18.8h
1677	rev32	v19.8h, v19.8h
1678
1679	add	v10.4s, v10.4s, v15.4s
1680	add	v11.4s, v11.4s, v16.4s
1681	add	v12.4s, v12.4s, v17.4s
1682	add	v13.4s, v13.4s, v18.4s
1683	add	v14.4s, v14.4s, v19.4s
1684
1685	eor	v5.16b, v5.16b, v10.16b
1686	eor	v6.16b, v6.16b, v11.16b
1687	eor	v7.16b, v7.16b, v12.16b
1688	eor	v8.16b, v8.16b, v13.16b
1689	eor	v9.16b, v9.16b, v14.16b
1690
1691	ushr	v20.4s, v5.4s, #20
1692	sli	v20.4s, v5.4s, #12
1693	ushr	v5.4s, v6.4s, #20
1694	sli	v5.4s, v6.4s, #12
1695	ushr	v6.4s, v7.4s, #20
1696	sli	v6.4s, v7.4s, #12
1697	ushr	v7.4s, v8.4s, #20
1698	sli	v7.4s, v8.4s, #12
1699	ushr	v8.4s, v9.4s, #20
1700	sli	v8.4s, v9.4s, #12
1701
1702	add	v0.4s, v0.4s, v20.4s
1703	add	v1.4s, v1.4s, v5.4s
1704	add	v2.4s, v2.4s, v6.4s
1705	add	v3.4s, v3.4s, v7.4s
1706	add	v4.4s, v4.4s, v8.4s
1707
1708	eor	v15.16b, v15.16b, v0.16b
1709	eor	v16.16b, v16.16b, v1.16b
1710	eor	v17.16b, v17.16b, v2.16b
1711	eor	v18.16b, v18.16b, v3.16b
1712	eor	v19.16b, v19.16b, v4.16b
1713
1714	tbl	v15.16b, {v15.16b}, v26.16b
1715	tbl	v16.16b, {v16.16b}, v26.16b
1716	tbl	v17.16b, {v17.16b}, v26.16b
1717	tbl	v18.16b, {v18.16b}, v26.16b
1718	tbl	v19.16b, {v19.16b}, v26.16b
1719
1720	add	v10.4s, v10.4s, v15.4s
1721	add	v11.4s, v11.4s, v16.4s
1722	add	v12.4s, v12.4s, v17.4s
1723	add	v13.4s, v13.4s, v18.4s
1724	add	v14.4s, v14.4s, v19.4s
1725
1726	eor	v20.16b, v20.16b, v10.16b
1727	eor	v5.16b, v5.16b, v11.16b
1728	eor	v6.16b, v6.16b, v12.16b
1729	eor	v7.16b, v7.16b, v13.16b
1730	eor	v8.16b, v8.16b, v14.16b
1731
1732	ushr	v9.4s, v8.4s, #25
1733	sli	v9.4s, v8.4s, #7
1734	ushr	v8.4s, v7.4s, #25
1735	sli	v8.4s, v7.4s, #7
1736	ushr	v7.4s, v6.4s, #25
1737	sli	v7.4s, v6.4s, #7
1738	ushr	v6.4s, v5.4s, #25
1739	sli	v6.4s, v5.4s, #7
1740	ushr	v5.4s, v20.4s, #25
1741	sli	v5.4s, v20.4s, #7
1742
1743	ext	v9.16b, v9.16b, v9.16b, #4
1744	ext	v14.16b, v14.16b, v14.16b, #8
1745	ext	v19.16b, v19.16b, v19.16b, #12
1746	ldp	x11, x12, [x3], 16
1747	adds	x8, x8, x11
1748	adcs	x9, x9, x12
1749	adc	x10, x10, x15
1750	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1751	umulh	x12, x8, x16
1752	mul	x13, x9, x16
1753	umulh	x14, x9, x16
1754	adds	x12, x12, x13
1755	mul	x13, x10, x16
1756	adc	x13, x13, x14
1757	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1758	umulh	x8, x8, x17
1759	adds	x12, x12, x14
1760	mul	x14, x9, x17
1761	umulh	x9, x9, x17
1762	adcs	x14, x14, x8
1763	mul	x10, x10, x17
1764	adc	x10, x10, x9
1765	adds	x13, x13, x14
1766	adc	x14, x10, xzr
1767	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1768	and	x8, x13, #-4
1769	extr	x13, x14, x13, #2
1770	adds	x8, x8, x11
1771	lsr	x11, x14, #2
1772	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1773	adds	x8, x8, x13
1774	adcs	x9, x9, x12
1775	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1776	add	v0.4s, v0.4s, v6.4s
1777	add	v1.4s, v1.4s, v7.4s
1778	add	v2.4s, v2.4s, v8.4s
1779	add	v3.4s, v3.4s, v5.4s
1780	add	v4.4s, v4.4s, v9.4s
1781
1782	eor	v18.16b, v18.16b, v0.16b
1783	eor	v15.16b, v15.16b, v1.16b
1784	eor	v16.16b, v16.16b, v2.16b
1785	eor	v17.16b, v17.16b, v3.16b
1786	eor	v19.16b, v19.16b, v4.16b
1787
1788	rev32	v18.8h, v18.8h
1789	rev32	v15.8h, v15.8h
1790	rev32	v16.8h, v16.8h
1791	rev32	v17.8h, v17.8h
1792	rev32	v19.8h, v19.8h
1793
1794	add	v12.4s, v12.4s, v18.4s
1795	add	v13.4s, v13.4s, v15.4s
1796	add	v10.4s, v10.4s, v16.4s
1797	add	v11.4s, v11.4s, v17.4s
1798	add	v14.4s, v14.4s, v19.4s
1799
1800	eor	v6.16b, v6.16b, v12.16b
1801	eor	v7.16b, v7.16b, v13.16b
1802	eor	v8.16b, v8.16b, v10.16b
1803	eor	v5.16b, v5.16b, v11.16b
1804	eor	v9.16b, v9.16b, v14.16b
1805
1806	ushr	v20.4s, v6.4s, #20
1807	sli	v20.4s, v6.4s, #12
1808	ushr	v6.4s, v7.4s, #20
1809	sli	v6.4s, v7.4s, #12
1810	ushr	v7.4s, v8.4s, #20
1811	sli	v7.4s, v8.4s, #12
1812	ushr	v8.4s, v5.4s, #20
1813	sli	v8.4s, v5.4s, #12
1814	ushr	v5.4s, v9.4s, #20
1815	sli	v5.4s, v9.4s, #12
1816
1817	add	v0.4s, v0.4s, v20.4s
1818	add	v1.4s, v1.4s, v6.4s
1819	add	v2.4s, v2.4s, v7.4s
1820	add	v3.4s, v3.4s, v8.4s
1821	add	v4.4s, v4.4s, v5.4s
1822
1823	eor	v18.16b, v18.16b, v0.16b
1824	eor	v15.16b, v15.16b, v1.16b
1825	eor	v16.16b, v16.16b, v2.16b
1826	eor	v17.16b, v17.16b, v3.16b
1827	eor	v19.16b, v19.16b, v4.16b
1828
1829	tbl	v18.16b, {v18.16b}, v26.16b
1830	tbl	v15.16b, {v15.16b}, v26.16b
1831	tbl	v16.16b, {v16.16b}, v26.16b
1832	tbl	v17.16b, {v17.16b}, v26.16b
1833	tbl	v19.16b, {v19.16b}, v26.16b
1834
1835	add	v12.4s, v12.4s, v18.4s
1836	add	v13.4s, v13.4s, v15.4s
1837	add	v10.4s, v10.4s, v16.4s
1838	add	v11.4s, v11.4s, v17.4s
1839	add	v14.4s, v14.4s, v19.4s
1840
1841	eor	v20.16b, v20.16b, v12.16b
1842	eor	v6.16b, v6.16b, v13.16b
1843	eor	v7.16b, v7.16b, v10.16b
1844	eor	v8.16b, v8.16b, v11.16b
1845	eor	v5.16b, v5.16b, v14.16b
1846
1847	ushr	v9.4s, v5.4s, #25
1848	sli	v9.4s, v5.4s, #7
1849	ushr	v5.4s, v8.4s, #25
1850	sli	v5.4s, v8.4s, #7
1851	ushr	v8.4s, v7.4s, #25
1852	sli	v8.4s, v7.4s, #7
1853	ushr	v7.4s, v6.4s, #25
1854	sli	v7.4s, v6.4s, #7
1855	ushr	v6.4s, v20.4s, #25
1856	sli	v6.4s, v20.4s, #7
1857
1858	ext	v9.16b, v9.16b, v9.16b, #12
1859	ext	v14.16b, v14.16b, v14.16b, #8
1860	ext	v19.16b, v19.16b, v19.16b, #4
1861	subs	x7, x7, #1
1862	b.gt	Lopen_main_loop_rounds
1863	subs	x6, x6, #1
1864	b.ge	Lopen_main_loop_rounds_short
1865
1866	eor	v20.16b, v20.16b, v20.16b //zero
1867	not	v21.16b, v20.16b // -1
1868	sub	v21.4s, v25.4s, v21.4s // Add +1
1869	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
1870	add	v19.4s, v19.4s, v20.4s
1871
1872	add	v15.4s, v15.4s, v25.4s
1873	mov	x11, #5
1874	dup	v20.4s, w11
1875	add	v25.4s, v25.4s, v20.4s
1876
1877	zip1	v20.4s, v0.4s, v1.4s
1878	zip2	v21.4s, v0.4s, v1.4s
1879	zip1	v22.4s, v2.4s, v3.4s
1880	zip2	v23.4s, v2.4s, v3.4s
1881
1882	zip1	v0.2d, v20.2d, v22.2d
1883	zip2	v1.2d, v20.2d, v22.2d
1884	zip1	v2.2d, v21.2d, v23.2d
1885	zip2	v3.2d, v21.2d, v23.2d
1886
1887	zip1	v20.4s, v5.4s, v6.4s
1888	zip2	v21.4s, v5.4s, v6.4s
1889	zip1	v22.4s, v7.4s, v8.4s
1890	zip2	v23.4s, v7.4s, v8.4s
1891
1892	zip1	v5.2d, v20.2d, v22.2d
1893	zip2	v6.2d, v20.2d, v22.2d
1894	zip1	v7.2d, v21.2d, v23.2d
1895	zip2	v8.2d, v21.2d, v23.2d
1896
1897	zip1	v20.4s, v10.4s, v11.4s
1898	zip2	v21.4s, v10.4s, v11.4s
1899	zip1	v22.4s, v12.4s, v13.4s
1900	zip2	v23.4s, v12.4s, v13.4s
1901
1902	zip1	v10.2d, v20.2d, v22.2d
1903	zip2	v11.2d, v20.2d, v22.2d
1904	zip1	v12.2d, v21.2d, v23.2d
1905	zip2	v13.2d, v21.2d, v23.2d
1906
1907	zip1	v20.4s, v15.4s, v16.4s
1908	zip2	v21.4s, v15.4s, v16.4s
1909	zip1	v22.4s, v17.4s, v18.4s
1910	zip2	v23.4s, v17.4s, v18.4s
1911
1912	zip1	v15.2d, v20.2d, v22.2d
1913	zip2	v16.2d, v20.2d, v22.2d
1914	zip1	v17.2d, v21.2d, v23.2d
1915	zip2	v18.2d, v21.2d, v23.2d
1916
1917	add	v0.4s, v0.4s, v24.4s
1918	add	v5.4s, v5.4s, v28.4s
1919	add	v10.4s, v10.4s, v29.4s
1920	add	v15.4s, v15.4s, v30.4s
1921
1922	add	v1.4s, v1.4s, v24.4s
1923	add	v6.4s, v6.4s, v28.4s
1924	add	v11.4s, v11.4s, v29.4s
1925	add	v16.4s, v16.4s, v30.4s
1926
1927	add	v2.4s, v2.4s, v24.4s
1928	add	v7.4s, v7.4s, v28.4s
1929	add	v12.4s, v12.4s, v29.4s
1930	add	v17.4s, v17.4s, v30.4s
1931
1932	add	v3.4s, v3.4s, v24.4s
1933	add	v8.4s, v8.4s, v28.4s
1934	add	v13.4s, v13.4s, v29.4s
1935	add	v18.4s, v18.4s, v30.4s
1936
1937	add	v4.4s, v4.4s, v24.4s
1938	add	v9.4s, v9.4s, v28.4s
1939	add	v14.4s, v14.4s, v29.4s
1940	add	v19.4s, v19.4s, v30.4s
1941
1942    // We can always safely store 192 bytes
1943	ld1	{v20.16b - v23.16b}, [x1], #64
1944	eor	v20.16b, v20.16b, v0.16b
1945	eor	v21.16b, v21.16b, v5.16b
1946	eor	v22.16b, v22.16b, v10.16b
1947	eor	v23.16b, v23.16b, v15.16b
1948	st1	{v20.16b - v23.16b}, [x0], #64
1949
1950	ld1	{v20.16b - v23.16b}, [x1], #64
1951	eor	v20.16b, v20.16b, v1.16b
1952	eor	v21.16b, v21.16b, v6.16b
1953	eor	v22.16b, v22.16b, v11.16b
1954	eor	v23.16b, v23.16b, v16.16b
1955	st1	{v20.16b - v23.16b}, [x0], #64
1956
1957	ld1	{v20.16b - v23.16b}, [x1], #64
1958	eor	v20.16b, v20.16b, v2.16b
1959	eor	v21.16b, v21.16b, v7.16b
1960	eor	v22.16b, v22.16b, v12.16b
1961	eor	v23.16b, v23.16b, v17.16b
1962	st1	{v20.16b - v23.16b}, [x0], #64
1963
1964	sub	x2, x2, #192
1965
1966	mov	v0.16b, v3.16b
1967	mov	v5.16b, v8.16b
1968	mov	v10.16b, v13.16b
1969	mov	v15.16b, v18.16b
1970
1971	cmp	x2, #64
1972	b.lt	Lopen_tail_64_store
1973
1974	ld1	{v20.16b - v23.16b}, [x1], #64
1975	eor	v20.16b, v20.16b, v3.16b
1976	eor	v21.16b, v21.16b, v8.16b
1977	eor	v22.16b, v22.16b, v13.16b
1978	eor	v23.16b, v23.16b, v18.16b
1979	st1	{v20.16b - v23.16b}, [x0], #64
1980
1981	sub	x2, x2, #64
1982
1983	mov	v0.16b, v4.16b
1984	mov	v5.16b, v9.16b
1985	mov	v10.16b, v14.16b
1986	mov	v15.16b, v19.16b
1987
1988	cmp	x2, #64
1989	b.lt	Lopen_tail_64_store
1990
1991	ld1	{v20.16b - v23.16b}, [x1], #64
1992	eor	v20.16b, v20.16b, v4.16b
1993	eor	v21.16b, v21.16b, v9.16b
1994	eor	v22.16b, v22.16b, v14.16b
1995	eor	v23.16b, v23.16b, v19.16b
1996	st1	{v20.16b - v23.16b}, [x0], #64
1997
1998	sub	x2, x2, #64
1999	b	Lopen_main_loop
2000
2001Lopen_tail:
2002
2003	cbz	x2, Lopen_finalize
2004
2005	lsr	x4, x2, #4 // How many whole blocks we have to hash
2006
2007	cmp	x2, #64
2008	b.le	Lopen_tail_64
2009	cmp	x2, #128
2010	b.le	Lopen_tail_128
2011
2012Lopen_tail_192:
2013     // We need three more blocks
2014	mov	v0.16b, v24.16b
2015	mov	v1.16b, v24.16b
2016	mov	v2.16b, v24.16b
2017	mov	v5.16b, v28.16b
2018	mov	v6.16b, v28.16b
2019	mov	v7.16b, v28.16b
2020	mov	v10.16b, v29.16b
2021	mov	v11.16b, v29.16b
2022	mov	v12.16b, v29.16b
2023	mov	v15.16b, v30.16b
2024	mov	v16.16b, v30.16b
2025	mov	v17.16b, v30.16b
2026	eor	v23.16b, v23.16b, v23.16b
2027	eor	v21.16b, v21.16b, v21.16b
2028	ins	v23.s[0], v25.s[0]
2029	ins	v21.d[0], x15
2030
2031	add	v22.4s, v23.4s, v21.4s
2032	add	v21.4s, v22.4s, v21.4s
2033
2034	add	v15.4s, v15.4s, v21.4s
2035	add	v16.4s, v16.4s, v23.4s
2036	add	v17.4s, v17.4s, v22.4s
2037
2038	mov	x7, #10
2039	subs	x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash
2040	csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing
2041	sub	x4, x4, x7
2042
2043	cbz	x7, Lopen_tail_192_rounds_no_hash
2044
2045Lopen_tail_192_rounds:
2046	ldp	x11, x12, [x3], 16
2047	adds	x8, x8, x11
2048	adcs	x9, x9, x12
2049	adc	x10, x10, x15
2050	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2051	umulh	x12, x8, x16
2052	mul	x13, x9, x16
2053	umulh	x14, x9, x16
2054	adds	x12, x12, x13
2055	mul	x13, x10, x16
2056	adc	x13, x13, x14
2057	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2058	umulh	x8, x8, x17
2059	adds	x12, x12, x14
2060	mul	x14, x9, x17
2061	umulh	x9, x9, x17
2062	adcs	x14, x14, x8
2063	mul	x10, x10, x17
2064	adc	x10, x10, x9
2065	adds	x13, x13, x14
2066	adc	x14, x10, xzr
2067	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2068	and	x8, x13, #-4
2069	extr	x13, x14, x13, #2
2070	adds	x8, x8, x11
2071	lsr	x11, x14, #2
2072	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2073	adds	x8, x8, x13
2074	adcs	x9, x9, x12
2075	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2076Lopen_tail_192_rounds_no_hash:
2077	add	v0.4s, v0.4s, v5.4s
2078	add	v1.4s, v1.4s, v6.4s
2079	add	v2.4s, v2.4s, v7.4s
2080	eor	v15.16b, v15.16b, v0.16b
2081	eor	v16.16b, v16.16b, v1.16b
2082	eor	v17.16b, v17.16b, v2.16b
2083	rev32	v15.8h, v15.8h
2084	rev32	v16.8h, v16.8h
2085	rev32	v17.8h, v17.8h
2086
2087	add	v10.4s, v10.4s, v15.4s
2088	add	v11.4s, v11.4s, v16.4s
2089	add	v12.4s, v12.4s, v17.4s
2090	eor	v5.16b, v5.16b, v10.16b
2091	eor	v6.16b, v6.16b, v11.16b
2092	eor	v7.16b, v7.16b, v12.16b
2093	ushr	v20.4s, v5.4s, #20
2094	sli	v20.4s, v5.4s, #12
2095	ushr	v5.4s, v6.4s, #20
2096	sli	v5.4s, v6.4s, #12
2097	ushr	v6.4s, v7.4s, #20
2098	sli	v6.4s, v7.4s, #12
2099
2100	add	v0.4s, v0.4s, v20.4s
2101	add	v1.4s, v1.4s, v5.4s
2102	add	v2.4s, v2.4s, v6.4s
2103	eor	v15.16b, v15.16b, v0.16b
2104	eor	v16.16b, v16.16b, v1.16b
2105	eor	v17.16b, v17.16b, v2.16b
2106	tbl	v15.16b, {v15.16b}, v26.16b
2107	tbl	v16.16b, {v16.16b}, v26.16b
2108	tbl	v17.16b, {v17.16b}, v26.16b
2109
2110	add	v10.4s, v10.4s, v15.4s
2111	add	v11.4s, v11.4s, v16.4s
2112	add	v12.4s, v12.4s, v17.4s
2113	eor	v20.16b, v20.16b, v10.16b
2114	eor	v5.16b, v5.16b, v11.16b
2115	eor	v6.16b, v6.16b, v12.16b
2116	ushr	v7.4s, v6.4s, #25
2117	sli	v7.4s, v6.4s, #7
2118	ushr	v6.4s, v5.4s, #25
2119	sli	v6.4s, v5.4s, #7
2120	ushr	v5.4s, v20.4s, #25
2121	sli	v5.4s, v20.4s, #7
2122
2123	ext	v5.16b, v5.16b, v5.16b, #4
2124	ext	v6.16b, v6.16b, v6.16b, #4
2125	ext	v7.16b, v7.16b, v7.16b, #4
2126
2127	ext	v10.16b, v10.16b, v10.16b, #8
2128	ext	v11.16b, v11.16b, v11.16b, #8
2129	ext	v12.16b, v12.16b, v12.16b, #8
2130
2131	ext	v15.16b, v15.16b, v15.16b, #12
2132	ext	v16.16b, v16.16b, v16.16b, #12
2133	ext	v17.16b, v17.16b, v17.16b, #12
2134	add	v0.4s, v0.4s, v5.4s
2135	add	v1.4s, v1.4s, v6.4s
2136	add	v2.4s, v2.4s, v7.4s
2137	eor	v15.16b, v15.16b, v0.16b
2138	eor	v16.16b, v16.16b, v1.16b
2139	eor	v17.16b, v17.16b, v2.16b
2140	rev32	v15.8h, v15.8h
2141	rev32	v16.8h, v16.8h
2142	rev32	v17.8h, v17.8h
2143
2144	add	v10.4s, v10.4s, v15.4s
2145	add	v11.4s, v11.4s, v16.4s
2146	add	v12.4s, v12.4s, v17.4s
2147	eor	v5.16b, v5.16b, v10.16b
2148	eor	v6.16b, v6.16b, v11.16b
2149	eor	v7.16b, v7.16b, v12.16b
2150	ushr	v20.4s, v5.4s, #20
2151	sli	v20.4s, v5.4s, #12
2152	ushr	v5.4s, v6.4s, #20
2153	sli	v5.4s, v6.4s, #12
2154	ushr	v6.4s, v7.4s, #20
2155	sli	v6.4s, v7.4s, #12
2156
2157	add	v0.4s, v0.4s, v20.4s
2158	add	v1.4s, v1.4s, v5.4s
2159	add	v2.4s, v2.4s, v6.4s
2160	eor	v15.16b, v15.16b, v0.16b
2161	eor	v16.16b, v16.16b, v1.16b
2162	eor	v17.16b, v17.16b, v2.16b
2163	tbl	v15.16b, {v15.16b}, v26.16b
2164	tbl	v16.16b, {v16.16b}, v26.16b
2165	tbl	v17.16b, {v17.16b}, v26.16b
2166
2167	add	v10.4s, v10.4s, v15.4s
2168	add	v11.4s, v11.4s, v16.4s
2169	add	v12.4s, v12.4s, v17.4s
2170	eor	v20.16b, v20.16b, v10.16b
2171	eor	v5.16b, v5.16b, v11.16b
2172	eor	v6.16b, v6.16b, v12.16b
2173	ushr	v7.4s, v6.4s, #25
2174	sli	v7.4s, v6.4s, #7
2175	ushr	v6.4s, v5.4s, #25
2176	sli	v6.4s, v5.4s, #7
2177	ushr	v5.4s, v20.4s, #25
2178	sli	v5.4s, v20.4s, #7
2179
2180	ext	v5.16b, v5.16b, v5.16b, #12
2181	ext	v6.16b, v6.16b, v6.16b, #12
2182	ext	v7.16b, v7.16b, v7.16b, #12
2183
2184	ext	v10.16b, v10.16b, v10.16b, #8
2185	ext	v11.16b, v11.16b, v11.16b, #8
2186	ext	v12.16b, v12.16b, v12.16b, #8
2187
2188	ext	v15.16b, v15.16b, v15.16b, #4
2189	ext	v16.16b, v16.16b, v16.16b, #4
2190	ext	v17.16b, v17.16b, v17.16b, #4
2191	subs	x7, x7, #1
2192	b.gt	Lopen_tail_192_rounds
2193	subs	x6, x6, #1
2194	b.ge	Lopen_tail_192_rounds_no_hash
2195
2196    // We hashed 160 bytes at most, may still have 32 bytes left
2197Lopen_tail_192_hash:
2198	cbz	x4, Lopen_tail_192_hash_done
2199	ldp	x11, x12, [x3], 16
2200	adds	x8, x8, x11
2201	adcs	x9, x9, x12
2202	adc	x10, x10, x15
2203	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2204	umulh	x12, x8, x16
2205	mul	x13, x9, x16
2206	umulh	x14, x9, x16
2207	adds	x12, x12, x13
2208	mul	x13, x10, x16
2209	adc	x13, x13, x14
2210	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2211	umulh	x8, x8, x17
2212	adds	x12, x12, x14
2213	mul	x14, x9, x17
2214	umulh	x9, x9, x17
2215	adcs	x14, x14, x8
2216	mul	x10, x10, x17
2217	adc	x10, x10, x9
2218	adds	x13, x13, x14
2219	adc	x14, x10, xzr
2220	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2221	and	x8, x13, #-4
2222	extr	x13, x14, x13, #2
2223	adds	x8, x8, x11
2224	lsr	x11, x14, #2
2225	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2226	adds	x8, x8, x13
2227	adcs	x9, x9, x12
2228	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2229	sub	x4, x4, #1
2230	b	Lopen_tail_192_hash
2231
2232Lopen_tail_192_hash_done:
2233
2234	add	v0.4s, v0.4s, v24.4s
2235	add	v1.4s, v1.4s, v24.4s
2236	add	v2.4s, v2.4s, v24.4s
2237	add	v5.4s, v5.4s, v28.4s
2238	add	v6.4s, v6.4s, v28.4s
2239	add	v7.4s, v7.4s, v28.4s
2240	add	v10.4s, v10.4s, v29.4s
2241	add	v11.4s, v11.4s, v29.4s
2242	add	v12.4s, v12.4s, v29.4s
2243	add	v15.4s, v15.4s, v30.4s
2244	add	v16.4s, v16.4s, v30.4s
2245	add	v17.4s, v17.4s, v30.4s
2246
2247	add	v15.4s, v15.4s, v21.4s
2248	add	v16.4s, v16.4s, v23.4s
2249	add	v17.4s, v17.4s, v22.4s
2250
2251	ld1	{v20.16b - v23.16b}, [x1], #64
2252
2253	eor	v20.16b, v20.16b, v1.16b
2254	eor	v21.16b, v21.16b, v6.16b
2255	eor	v22.16b, v22.16b, v11.16b
2256	eor	v23.16b, v23.16b, v16.16b
2257
2258	st1	{v20.16b - v23.16b}, [x0], #64
2259
2260	ld1	{v20.16b - v23.16b}, [x1], #64
2261
2262	eor	v20.16b, v20.16b, v2.16b
2263	eor	v21.16b, v21.16b, v7.16b
2264	eor	v22.16b, v22.16b, v12.16b
2265	eor	v23.16b, v23.16b, v17.16b
2266
2267	st1	{v20.16b - v23.16b}, [x0], #64
2268
2269	sub	x2, x2, #128
2270	b	Lopen_tail_64_store
2271
2272Lopen_tail_128:
2273     // We need two more blocks
2274	mov	v0.16b, v24.16b
2275	mov	v1.16b, v24.16b
2276	mov	v5.16b, v28.16b
2277	mov	v6.16b, v28.16b
2278	mov	v10.16b, v29.16b
2279	mov	v11.16b, v29.16b
2280	mov	v15.16b, v30.16b
2281	mov	v16.16b, v30.16b
2282	eor	v23.16b, v23.16b, v23.16b
2283	eor	v22.16b, v22.16b, v22.16b
2284	ins	v23.s[0], v25.s[0]
2285	ins	v22.d[0], x15
2286	add	v22.4s, v22.4s, v23.4s
2287
2288	add	v15.4s, v15.4s, v22.4s
2289	add	v16.4s, v16.4s, v23.4s
2290
2291	mov	x6, #10
2292	sub	x6, x6, x4
2293
2294Lopen_tail_128_rounds:
2295	add	v0.4s, v0.4s, v5.4s
2296	eor	v15.16b, v15.16b, v0.16b
2297	rev32	v15.8h, v15.8h
2298
2299	add	v10.4s, v10.4s, v15.4s
2300	eor	v5.16b, v5.16b, v10.16b
2301	ushr	v20.4s, v5.4s, #20
2302	sli	v20.4s, v5.4s, #12
2303	add	v0.4s, v0.4s, v20.4s
2304	eor	v15.16b, v15.16b, v0.16b
2305	tbl	v15.16b, {v15.16b}, v26.16b
2306
2307	add	v10.4s, v10.4s, v15.4s
2308	eor	v20.16b, v20.16b, v10.16b
2309	ushr	v5.4s, v20.4s, #25
2310	sli	v5.4s, v20.4s, #7
2311	ext	v5.16b, v5.16b, v5.16b, #4
2312	ext	v10.16b, v10.16b, v10.16b, #8
2313	ext	v15.16b, v15.16b, v15.16b, #12
2314	add	v1.4s, v1.4s, v6.4s
2315	eor	v16.16b, v16.16b, v1.16b
2316	rev32	v16.8h, v16.8h
2317
2318	add	v11.4s, v11.4s, v16.4s
2319	eor	v6.16b, v6.16b, v11.16b
2320	ushr	v20.4s, v6.4s, #20
2321	sli	v20.4s, v6.4s, #12
2322	add	v1.4s, v1.4s, v20.4s
2323	eor	v16.16b, v16.16b, v1.16b
2324	tbl	v16.16b, {v16.16b}, v26.16b
2325
2326	add	v11.4s, v11.4s, v16.4s
2327	eor	v20.16b, v20.16b, v11.16b
2328	ushr	v6.4s, v20.4s, #25
2329	sli	v6.4s, v20.4s, #7
2330	ext	v6.16b, v6.16b, v6.16b, #4
2331	ext	v11.16b, v11.16b, v11.16b, #8
2332	ext	v16.16b, v16.16b, v16.16b, #12
2333	add	v0.4s, v0.4s, v5.4s
2334	eor	v15.16b, v15.16b, v0.16b
2335	rev32	v15.8h, v15.8h
2336
2337	add	v10.4s, v10.4s, v15.4s
2338	eor	v5.16b, v5.16b, v10.16b
2339	ushr	v20.4s, v5.4s, #20
2340	sli	v20.4s, v5.4s, #12
2341	add	v0.4s, v0.4s, v20.4s
2342	eor	v15.16b, v15.16b, v0.16b
2343	tbl	v15.16b, {v15.16b}, v26.16b
2344
2345	add	v10.4s, v10.4s, v15.4s
2346	eor	v20.16b, v20.16b, v10.16b
2347	ushr	v5.4s, v20.4s, #25
2348	sli	v5.4s, v20.4s, #7
2349	ext	v5.16b, v5.16b, v5.16b, #12
2350	ext	v10.16b, v10.16b, v10.16b, #8
2351	ext	v15.16b, v15.16b, v15.16b, #4
2352	add	v1.4s, v1.4s, v6.4s
2353	eor	v16.16b, v16.16b, v1.16b
2354	rev32	v16.8h, v16.8h
2355
2356	add	v11.4s, v11.4s, v16.4s
2357	eor	v6.16b, v6.16b, v11.16b
2358	ushr	v20.4s, v6.4s, #20
2359	sli	v20.4s, v6.4s, #12
2360	add	v1.4s, v1.4s, v20.4s
2361	eor	v16.16b, v16.16b, v1.16b
2362	tbl	v16.16b, {v16.16b}, v26.16b
2363
2364	add	v11.4s, v11.4s, v16.4s
2365	eor	v20.16b, v20.16b, v11.16b
2366	ushr	v6.4s, v20.4s, #25
2367	sli	v6.4s, v20.4s, #7
2368	ext	v6.16b, v6.16b, v6.16b, #12
2369	ext	v11.16b, v11.16b, v11.16b, #8
2370	ext	v16.16b, v16.16b, v16.16b, #4
2371	subs	x6, x6, #1
2372	b.gt	Lopen_tail_128_rounds
2373	cbz	x4, Lopen_tail_128_rounds_done
2374	subs	x4, x4, #1
2375	ldp	x11, x12, [x3], 16
2376	adds	x8, x8, x11
2377	adcs	x9, x9, x12
2378	adc	x10, x10, x15
2379	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2380	umulh	x12, x8, x16
2381	mul	x13, x9, x16
2382	umulh	x14, x9, x16
2383	adds	x12, x12, x13
2384	mul	x13, x10, x16
2385	adc	x13, x13, x14
2386	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2387	umulh	x8, x8, x17
2388	adds	x12, x12, x14
2389	mul	x14, x9, x17
2390	umulh	x9, x9, x17
2391	adcs	x14, x14, x8
2392	mul	x10, x10, x17
2393	adc	x10, x10, x9
2394	adds	x13, x13, x14
2395	adc	x14, x10, xzr
2396	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2397	and	x8, x13, #-4
2398	extr	x13, x14, x13, #2
2399	adds	x8, x8, x11
2400	lsr	x11, x14, #2
2401	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2402	adds	x8, x8, x13
2403	adcs	x9, x9, x12
2404	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2405	b	Lopen_tail_128_rounds
2406
2407Lopen_tail_128_rounds_done:
2408	add	v0.4s, v0.4s, v24.4s
2409	add	v1.4s, v1.4s, v24.4s
2410	add	v5.4s, v5.4s, v28.4s
2411	add	v6.4s, v6.4s, v28.4s
2412	add	v10.4s, v10.4s, v29.4s
2413	add	v11.4s, v11.4s, v29.4s
2414	add	v15.4s, v15.4s, v30.4s
2415	add	v16.4s, v16.4s, v30.4s
2416	add	v15.4s, v15.4s, v22.4s
2417	add	v16.4s, v16.4s, v23.4s
2418
2419	ld1	{v20.16b - v23.16b}, [x1], #64
2420
2421	eor	v20.16b, v20.16b, v1.16b
2422	eor	v21.16b, v21.16b, v6.16b
2423	eor	v22.16b, v22.16b, v11.16b
2424	eor	v23.16b, v23.16b, v16.16b
2425
2426	st1	{v20.16b - v23.16b}, [x0], #64
2427	sub	x2, x2, #64
2428
2429	b	Lopen_tail_64_store
2430
2431Lopen_tail_64:
2432    // We just need a single block
2433	mov	v0.16b, v24.16b
2434	mov	v5.16b, v28.16b
2435	mov	v10.16b, v29.16b
2436	mov	v15.16b, v30.16b
2437	eor	v23.16b, v23.16b, v23.16b
2438	ins	v23.s[0], v25.s[0]
2439	add	v15.4s, v15.4s, v23.4s
2440
2441	mov	x6, #10
2442	sub	x6, x6, x4
2443
2444Lopen_tail_64_rounds:
2445	add	v0.4s, v0.4s, v5.4s
2446	eor	v15.16b, v15.16b, v0.16b
2447	rev32	v15.8h, v15.8h
2448
2449	add	v10.4s, v10.4s, v15.4s
2450	eor	v5.16b, v5.16b, v10.16b
2451	ushr	v20.4s, v5.4s, #20
2452	sli	v20.4s, v5.4s, #12
2453	add	v0.4s, v0.4s, v20.4s
2454	eor	v15.16b, v15.16b, v0.16b
2455	tbl	v15.16b, {v15.16b}, v26.16b
2456
2457	add	v10.4s, v10.4s, v15.4s
2458	eor	v20.16b, v20.16b, v10.16b
2459	ushr	v5.4s, v20.4s, #25
2460	sli	v5.4s, v20.4s, #7
2461	ext	v5.16b, v5.16b, v5.16b, #4
2462	ext	v10.16b, v10.16b, v10.16b, #8
2463	ext	v15.16b, v15.16b, v15.16b, #12
2464	add	v0.4s, v0.4s, v5.4s
2465	eor	v15.16b, v15.16b, v0.16b
2466	rev32	v15.8h, v15.8h
2467
2468	add	v10.4s, v10.4s, v15.4s
2469	eor	v5.16b, v5.16b, v10.16b
2470	ushr	v20.4s, v5.4s, #20
2471	sli	v20.4s, v5.4s, #12
2472	add	v0.4s, v0.4s, v20.4s
2473	eor	v15.16b, v15.16b, v0.16b
2474	tbl	v15.16b, {v15.16b}, v26.16b
2475
2476	add	v10.4s, v10.4s, v15.4s
2477	eor	v20.16b, v20.16b, v10.16b
2478	ushr	v5.4s, v20.4s, #25
2479	sli	v5.4s, v20.4s, #7
2480	ext	v5.16b, v5.16b, v5.16b, #12
2481	ext	v10.16b, v10.16b, v10.16b, #8
2482	ext	v15.16b, v15.16b, v15.16b, #4
2483	subs	x6, x6, #1
2484	b.gt	Lopen_tail_64_rounds
2485	cbz	x4, Lopen_tail_64_rounds_done
2486	subs	x4, x4, #1
2487	ldp	x11, x12, [x3], 16
2488	adds	x8, x8, x11
2489	adcs	x9, x9, x12
2490	adc	x10, x10, x15
2491	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2492	umulh	x12, x8, x16
2493	mul	x13, x9, x16
2494	umulh	x14, x9, x16
2495	adds	x12, x12, x13
2496	mul	x13, x10, x16
2497	adc	x13, x13, x14
2498	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2499	umulh	x8, x8, x17
2500	adds	x12, x12, x14
2501	mul	x14, x9, x17
2502	umulh	x9, x9, x17
2503	adcs	x14, x14, x8
2504	mul	x10, x10, x17
2505	adc	x10, x10, x9
2506	adds	x13, x13, x14
2507	adc	x14, x10, xzr
2508	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2509	and	x8, x13, #-4
2510	extr	x13, x14, x13, #2
2511	adds	x8, x8, x11
2512	lsr	x11, x14, #2
2513	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2514	adds	x8, x8, x13
2515	adcs	x9, x9, x12
2516	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2517	b	Lopen_tail_64_rounds
2518
2519Lopen_tail_64_rounds_done:
2520	add	v0.4s, v0.4s, v24.4s
2521	add	v5.4s, v5.4s, v28.4s
2522	add	v10.4s, v10.4s, v29.4s
2523	add	v15.4s, v15.4s, v30.4s
2524	add	v15.4s, v15.4s, v23.4s
2525
2526Lopen_tail_64_store:
2527	cmp	x2, #16
2528	b.lt	Lopen_tail_16
2529
2530	ld1	{v20.16b}, [x1], #16
2531	eor	v20.16b, v20.16b, v0.16b
2532	st1	{v20.16b}, [x0], #16
2533	mov	v0.16b, v5.16b
2534	mov	v5.16b, v10.16b
2535	mov	v10.16b, v15.16b
2536	sub	x2, x2, #16
2537	b	Lopen_tail_64_store
2538
2539Lopen_tail_16:
2540    // Here we handle the last [0,16) bytes that require a padded block
2541	cbz	x2, Lopen_finalize
2542
2543	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext
2544	eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask
2545	not	v22.16b, v20.16b
2546
2547	add	x7, x1, x2
2548	mov	x6, x2
2549
2550Lopen_tail_16_compose:
2551	ext	v20.16b, v20.16b, v20.16b, #15
2552	ldrb	w11, [x7, #-1]!
2553	mov	v20.b[0], w11
2554	ext	v21.16b, v22.16b, v21.16b, #15
2555	subs	x2, x2, #1
2556	b.gt	Lopen_tail_16_compose
2557
2558	and	v20.16b, v20.16b, v21.16b
2559    // Hash in the final padded block
2560	mov	x11, v20.d[0]
2561	mov	x12, v20.d[1]
2562	adds	x8, x8, x11
2563	adcs	x9, x9, x12
2564	adc	x10, x10, x15
2565	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2566	umulh	x12, x8, x16
2567	mul	x13, x9, x16
2568	umulh	x14, x9, x16
2569	adds	x12, x12, x13
2570	mul	x13, x10, x16
2571	adc	x13, x13, x14
2572	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2573	umulh	x8, x8, x17
2574	adds	x12, x12, x14
2575	mul	x14, x9, x17
2576	umulh	x9, x9, x17
2577	adcs	x14, x14, x8
2578	mul	x10, x10, x17
2579	adc	x10, x10, x9
2580	adds	x13, x13, x14
2581	adc	x14, x10, xzr
2582	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2583	and	x8, x13, #-4
2584	extr	x13, x14, x13, #2
2585	adds	x8, x8, x11
2586	lsr	x11, x14, #2
2587	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2588	adds	x8, x8, x13
2589	adcs	x9, x9, x12
2590	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2591	eor	v20.16b, v20.16b, v0.16b
2592
2593Lopen_tail_16_store:
2594	umov	w11, v20.b[0]
2595	strb	w11, [x0], #1
2596	ext	v20.16b, v20.16b, v20.16b, #1
2597	subs	x6, x6, #1
2598	b.gt	Lopen_tail_16_store
2599
2600Lopen_finalize:
2601	mov	x11, v31.d[0]
2602	mov	x12, v31.d[1]
2603	adds	x8, x8, x11
2604	adcs	x9, x9, x12
2605	adc	x10, x10, x15
2606	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2607	umulh	x12, x8, x16
2608	mul	x13, x9, x16
2609	umulh	x14, x9, x16
2610	adds	x12, x12, x13
2611	mul	x13, x10, x16
2612	adc	x13, x13, x14
2613	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2614	umulh	x8, x8, x17
2615	adds	x12, x12, x14
2616	mul	x14, x9, x17
2617	umulh	x9, x9, x17
2618	adcs	x14, x14, x8
2619	mul	x10, x10, x17
2620	adc	x10, x10, x9
2621	adds	x13, x13, x14
2622	adc	x14, x10, xzr
2623	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2624	and	x8, x13, #-4
2625	extr	x13, x14, x13, #2
2626	adds	x8, x8, x11
2627	lsr	x11, x14, #2
2628	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2629	adds	x8, x8, x13
2630	adcs	x9, x9, x12
2631	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2632    // Final reduction step
2633	sub	x12, xzr, x15
2634	orr	x13, xzr, #3
2635	subs	x11, x8, #-5
2636	sbcs	x12, x9, x12
2637	sbcs	x13, x10, x13
2638	csel	x8, x11, x8, cs
2639	csel	x9, x12, x9, cs
2640	csel	x10, x13, x10, cs
2641	mov	x11, v27.d[0]
2642	mov	x12, v27.d[1]
2643	adds	x8, x8, x11
2644	adcs	x9, x9, x12
2645	adc	x10, x10, x15
2646
2647	stp	x8, x9, [x5]
2648
2649	ldp	d8, d9, [sp, #16]
2650	ldp	d10, d11, [sp, #32]
2651	ldp	d12, d13, [sp, #48]
2652	ldp	d14, d15, [sp, #64]
2653.cfi_restore	b15
2654.cfi_restore	b14
2655.cfi_restore	b13
2656.cfi_restore	b12
2657.cfi_restore	b11
2658.cfi_restore	b10
2659.cfi_restore	b9
2660.cfi_restore	b8
2661	ldp	x29, x30, [sp], 80
2662.cfi_restore	w29
2663.cfi_restore	w30
2664.cfi_def_cfa_offset	0
2665	AARCH64_VALIDATE_LINK_REGISTER
2666	ret
2667
2668Lopen_128:
2669    // On some architectures preparing 5 blocks for small buffers is wasteful
2670	eor	v25.16b, v25.16b, v25.16b
2671	mov	x11, #1
2672	mov	v25.s[0], w11
2673	mov	v0.16b, v24.16b
2674	mov	v1.16b, v24.16b
2675	mov	v2.16b, v24.16b
2676	mov	v5.16b, v28.16b
2677	mov	v6.16b, v28.16b
2678	mov	v7.16b, v28.16b
2679	mov	v10.16b, v29.16b
2680	mov	v11.16b, v29.16b
2681	mov	v12.16b, v29.16b
2682	mov	v17.16b, v30.16b
2683	add	v15.4s, v17.4s, v25.4s
2684	add	v16.4s, v15.4s, v25.4s
2685
2686	mov	x6, #10
2687
2688Lopen_128_rounds:
2689	add	v0.4s, v0.4s, v5.4s
2690	add	v1.4s, v1.4s, v6.4s
2691	add	v2.4s, v2.4s, v7.4s
2692	eor	v15.16b, v15.16b, v0.16b
2693	eor	v16.16b, v16.16b, v1.16b
2694	eor	v17.16b, v17.16b, v2.16b
2695	rev32	v15.8h, v15.8h
2696	rev32	v16.8h, v16.8h
2697	rev32	v17.8h, v17.8h
2698
2699	add	v10.4s, v10.4s, v15.4s
2700	add	v11.4s, v11.4s, v16.4s
2701	add	v12.4s, v12.4s, v17.4s
2702	eor	v5.16b, v5.16b, v10.16b
2703	eor	v6.16b, v6.16b, v11.16b
2704	eor	v7.16b, v7.16b, v12.16b
2705	ushr	v20.4s, v5.4s, #20
2706	sli	v20.4s, v5.4s, #12
2707	ushr	v5.4s, v6.4s, #20
2708	sli	v5.4s, v6.4s, #12
2709	ushr	v6.4s, v7.4s, #20
2710	sli	v6.4s, v7.4s, #12
2711
2712	add	v0.4s, v0.4s, v20.4s
2713	add	v1.4s, v1.4s, v5.4s
2714	add	v2.4s, v2.4s, v6.4s
2715	eor	v15.16b, v15.16b, v0.16b
2716	eor	v16.16b, v16.16b, v1.16b
2717	eor	v17.16b, v17.16b, v2.16b
2718	tbl	v15.16b, {v15.16b}, v26.16b
2719	tbl	v16.16b, {v16.16b}, v26.16b
2720	tbl	v17.16b, {v17.16b}, v26.16b
2721
2722	add	v10.4s, v10.4s, v15.4s
2723	add	v11.4s, v11.4s, v16.4s
2724	add	v12.4s, v12.4s, v17.4s
2725	eor	v20.16b, v20.16b, v10.16b
2726	eor	v5.16b, v5.16b, v11.16b
2727	eor	v6.16b, v6.16b, v12.16b
2728	ushr	v7.4s, v6.4s, #25
2729	sli	v7.4s, v6.4s, #7
2730	ushr	v6.4s, v5.4s, #25
2731	sli	v6.4s, v5.4s, #7
2732	ushr	v5.4s, v20.4s, #25
2733	sli	v5.4s, v20.4s, #7
2734
2735	ext	v5.16b, v5.16b, v5.16b, #4
2736	ext	v6.16b, v6.16b, v6.16b, #4
2737	ext	v7.16b, v7.16b, v7.16b, #4
2738
2739	ext	v10.16b, v10.16b, v10.16b, #8
2740	ext	v11.16b, v11.16b, v11.16b, #8
2741	ext	v12.16b, v12.16b, v12.16b, #8
2742
2743	ext	v15.16b, v15.16b, v15.16b, #12
2744	ext	v16.16b, v16.16b, v16.16b, #12
2745	ext	v17.16b, v17.16b, v17.16b, #12
2746	add	v0.4s, v0.4s, v5.4s
2747	add	v1.4s, v1.4s, v6.4s
2748	add	v2.4s, v2.4s, v7.4s
2749	eor	v15.16b, v15.16b, v0.16b
2750	eor	v16.16b, v16.16b, v1.16b
2751	eor	v17.16b, v17.16b, v2.16b
2752	rev32	v15.8h, v15.8h
2753	rev32	v16.8h, v16.8h
2754	rev32	v17.8h, v17.8h
2755
2756	add	v10.4s, v10.4s, v15.4s
2757	add	v11.4s, v11.4s, v16.4s
2758	add	v12.4s, v12.4s, v17.4s
2759	eor	v5.16b, v5.16b, v10.16b
2760	eor	v6.16b, v6.16b, v11.16b
2761	eor	v7.16b, v7.16b, v12.16b
2762	ushr	v20.4s, v5.4s, #20
2763	sli	v20.4s, v5.4s, #12
2764	ushr	v5.4s, v6.4s, #20
2765	sli	v5.4s, v6.4s, #12
2766	ushr	v6.4s, v7.4s, #20
2767	sli	v6.4s, v7.4s, #12
2768
2769	add	v0.4s, v0.4s, v20.4s
2770	add	v1.4s, v1.4s, v5.4s
2771	add	v2.4s, v2.4s, v6.4s
2772	eor	v15.16b, v15.16b, v0.16b
2773	eor	v16.16b, v16.16b, v1.16b
2774	eor	v17.16b, v17.16b, v2.16b
2775	tbl	v15.16b, {v15.16b}, v26.16b
2776	tbl	v16.16b, {v16.16b}, v26.16b
2777	tbl	v17.16b, {v17.16b}, v26.16b
2778
2779	add	v10.4s, v10.4s, v15.4s
2780	add	v11.4s, v11.4s, v16.4s
2781	add	v12.4s, v12.4s, v17.4s
2782	eor	v20.16b, v20.16b, v10.16b
2783	eor	v5.16b, v5.16b, v11.16b
2784	eor	v6.16b, v6.16b, v12.16b
2785	ushr	v7.4s, v6.4s, #25
2786	sli	v7.4s, v6.4s, #7
2787	ushr	v6.4s, v5.4s, #25
2788	sli	v6.4s, v5.4s, #7
2789	ushr	v5.4s, v20.4s, #25
2790	sli	v5.4s, v20.4s, #7
2791
2792	ext	v5.16b, v5.16b, v5.16b, #12
2793	ext	v6.16b, v6.16b, v6.16b, #12
2794	ext	v7.16b, v7.16b, v7.16b, #12
2795
2796	ext	v10.16b, v10.16b, v10.16b, #8
2797	ext	v11.16b, v11.16b, v11.16b, #8
2798	ext	v12.16b, v12.16b, v12.16b, #8
2799
2800	ext	v15.16b, v15.16b, v15.16b, #4
2801	ext	v16.16b, v16.16b, v16.16b, #4
2802	ext	v17.16b, v17.16b, v17.16b, #4
2803	subs	x6, x6, #1
2804	b.hi	Lopen_128_rounds
2805
2806	add	v0.4s, v0.4s, v24.4s
2807	add	v1.4s, v1.4s, v24.4s
2808	add	v2.4s, v2.4s, v24.4s
2809
2810	add	v5.4s, v5.4s, v28.4s
2811	add	v6.4s, v6.4s, v28.4s
2812	add	v7.4s, v7.4s, v28.4s
2813
2814	add	v10.4s, v10.4s, v29.4s
2815	add	v11.4s, v11.4s, v29.4s
2816
2817	add	v30.4s, v30.4s, v25.4s
2818	add	v15.4s, v15.4s, v30.4s
2819	add	v30.4s, v30.4s, v25.4s
2820	add	v16.4s, v16.4s, v30.4s
2821
2822	and	v2.16b, v2.16b, v27.16b
2823	mov	x16, v2.d[0] // Move the R key to GPRs
2824	mov	x17, v2.d[1]
2825	mov	v27.16b, v7.16b // Store the S key
2826
2827	bl	Lpoly_hash_ad_internal
2828
2829Lopen_128_store:
2830	cmp	x2, #64
2831	b.lt	Lopen_128_store_64
2832
2833	ld1	{v20.16b - v23.16b}, [x1], #64
2834
2835	mov	x11, v20.d[0]
2836	mov	x12, v20.d[1]
2837	adds	x8, x8, x11
2838	adcs	x9, x9, x12
2839	adc	x10, x10, x15
2840	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2841	umulh	x12, x8, x16
2842	mul	x13, x9, x16
2843	umulh	x14, x9, x16
2844	adds	x12, x12, x13
2845	mul	x13, x10, x16
2846	adc	x13, x13, x14
2847	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2848	umulh	x8, x8, x17
2849	adds	x12, x12, x14
2850	mul	x14, x9, x17
2851	umulh	x9, x9, x17
2852	adcs	x14, x14, x8
2853	mul	x10, x10, x17
2854	adc	x10, x10, x9
2855	adds	x13, x13, x14
2856	adc	x14, x10, xzr
2857	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2858	and	x8, x13, #-4
2859	extr	x13, x14, x13, #2
2860	adds	x8, x8, x11
2861	lsr	x11, x14, #2
2862	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2863	adds	x8, x8, x13
2864	adcs	x9, x9, x12
2865	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2866	mov	x11, v21.d[0]
2867	mov	x12, v21.d[1]
2868	adds	x8, x8, x11
2869	adcs	x9, x9, x12
2870	adc	x10, x10, x15
2871	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2872	umulh	x12, x8, x16
2873	mul	x13, x9, x16
2874	umulh	x14, x9, x16
2875	adds	x12, x12, x13
2876	mul	x13, x10, x16
2877	adc	x13, x13, x14
2878	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2879	umulh	x8, x8, x17
2880	adds	x12, x12, x14
2881	mul	x14, x9, x17
2882	umulh	x9, x9, x17
2883	adcs	x14, x14, x8
2884	mul	x10, x10, x17
2885	adc	x10, x10, x9
2886	adds	x13, x13, x14
2887	adc	x14, x10, xzr
2888	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2889	and	x8, x13, #-4
2890	extr	x13, x14, x13, #2
2891	adds	x8, x8, x11
2892	lsr	x11, x14, #2
2893	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2894	adds	x8, x8, x13
2895	adcs	x9, x9, x12
2896	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2897	mov	x11, v22.d[0]
2898	mov	x12, v22.d[1]
2899	adds	x8, x8, x11
2900	adcs	x9, x9, x12
2901	adc	x10, x10, x15
2902	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2903	umulh	x12, x8, x16
2904	mul	x13, x9, x16
2905	umulh	x14, x9, x16
2906	adds	x12, x12, x13
2907	mul	x13, x10, x16
2908	adc	x13, x13, x14
2909	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2910	umulh	x8, x8, x17
2911	adds	x12, x12, x14
2912	mul	x14, x9, x17
2913	umulh	x9, x9, x17
2914	adcs	x14, x14, x8
2915	mul	x10, x10, x17
2916	adc	x10, x10, x9
2917	adds	x13, x13, x14
2918	adc	x14, x10, xzr
2919	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2920	and	x8, x13, #-4
2921	extr	x13, x14, x13, #2
2922	adds	x8, x8, x11
2923	lsr	x11, x14, #2
2924	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2925	adds	x8, x8, x13
2926	adcs	x9, x9, x12
2927	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2928	mov	x11, v23.d[0]
2929	mov	x12, v23.d[1]
2930	adds	x8, x8, x11
2931	adcs	x9, x9, x12
2932	adc	x10, x10, x15
2933	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2934	umulh	x12, x8, x16
2935	mul	x13, x9, x16
2936	umulh	x14, x9, x16
2937	adds	x12, x12, x13
2938	mul	x13, x10, x16
2939	adc	x13, x13, x14
2940	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2941	umulh	x8, x8, x17
2942	adds	x12, x12, x14
2943	mul	x14, x9, x17
2944	umulh	x9, x9, x17
2945	adcs	x14, x14, x8
2946	mul	x10, x10, x17
2947	adc	x10, x10, x9
2948	adds	x13, x13, x14
2949	adc	x14, x10, xzr
2950	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2951	and	x8, x13, #-4
2952	extr	x13, x14, x13, #2
2953	adds	x8, x8, x11
2954	lsr	x11, x14, #2
2955	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2956	adds	x8, x8, x13
2957	adcs	x9, x9, x12
2958	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2959
2960	eor	v20.16b, v20.16b, v0.16b
2961	eor	v21.16b, v21.16b, v5.16b
2962	eor	v22.16b, v22.16b, v10.16b
2963	eor	v23.16b, v23.16b, v15.16b
2964
2965	st1	{v20.16b - v23.16b}, [x0], #64
2966
2967	sub	x2, x2, #64
2968
2969	mov	v0.16b, v1.16b
2970	mov	v5.16b, v6.16b
2971	mov	v10.16b, v11.16b
2972	mov	v15.16b, v16.16b
2973
2974Lopen_128_store_64:
2975
2976	lsr	x4, x2, #4
2977	mov	x3, x1
2978
2979Lopen_128_hash_64:
2980	cbz	x4, Lopen_tail_64_store
2981	ldp	x11, x12, [x3], 16
2982	adds	x8, x8, x11
2983	adcs	x9, x9, x12
2984	adc	x10, x10, x15
2985	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2986	umulh	x12, x8, x16
2987	mul	x13, x9, x16
2988	umulh	x14, x9, x16
2989	adds	x12, x12, x13
2990	mul	x13, x10, x16
2991	adc	x13, x13, x14
2992	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2993	umulh	x8, x8, x17
2994	adds	x12, x12, x14
2995	mul	x14, x9, x17
2996	umulh	x9, x9, x17
2997	adcs	x14, x14, x8
2998	mul	x10, x10, x17
2999	adc	x10, x10, x9
3000	adds	x13, x13, x14
3001	adc	x14, x10, xzr
3002	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
3003	and	x8, x13, #-4
3004	extr	x13, x14, x13, #2
3005	adds	x8, x8, x11
3006	lsr	x11, x14, #2
3007	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
3008	adds	x8, x8, x13
3009	adcs	x9, x9, x12
3010	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
3011	sub	x4, x4, #1
3012	b	Lopen_128_hash_64
3013.cfi_endproc
3014
3015#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
3016