// xref: /aosp_15_r20/external/boringssl/src/gen/bcm/ghash-neon-armv8-linux.S (revision 8fb009dc861624b67b6cdb62ea21f0f22d0c584b)
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

// Common assembly prelude; must come before the feature test below.
#include <openssl/asm_base.h>

// Everything from here to the matching #endif at the end of the file is
// assembled only when assembly is enabled and the target is AArch64 ELF.
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include <openssl/arm_arch.h>

.text

// ---------------------------------------------------------------------------
// gcm_init_neon
//
// Derives the "twisted H" form of the hash key and stores it as the first
// 16-byte table entry.
//
// In:    x0 = destination table; exactly 16 bytes are written at [x0]
//        x1 = hash key H; exactly 16 bytes are read from [x1]
// Out:   [x0] = twisted H
// Clobb: v3, v5, v16-v19. No stack use, no general-purpose register is
//        modified, no callee-saved vector register (v8-v15) is touched.
// NOTE(review): the C prototype is not visible in this file; presumably
//        void gcm_init_neon(u128 Htable[16], const uint64_t H[2]) - confirm
//        against the declaring header.
// ---------------------------------------------------------------------------
.globl	gcm_init_neon
.hidden	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	AARCH64_VALID_CALL_TARGET
	// This function is adapted from gcm_init_v8. xC2 is t3.
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0 in each 64-bit lane
	ext	v3.16b, v17.16b, v17.16b, #8	// v3 = H with its 64-bit halves swapped
	ushr	v18.2d, v19.2d, #63		// top bit of 0xc2.0, i.e. 1 in each lane
	dup	v17.4s, v17.s[1]		// replicate 32-bit lane 1 of H
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63		// bit shifted out of each half by the <<1 below
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8	// move each shifted-out bit to the other half
	and	v16.16b, v16.16b, v17.16b	// keep t0 only when the carry mask is all ones
	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
	eor	v5.16b, v3.16b, v16.16b	// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret
.size	gcm_init_neon,.-gcm_init_neon

// ---------------------------------------------------------------------------
// gcm_gmult_neon
//
// Multiplies the 16-byte value at x0 by the twisted hash key, in place.
// This routine only performs the setup; it then branches to .Lgmult_neon
// inside gcm_ghash_neon, which does the multiplication and reduction, writes
// the result back to [x0] and returns to the caller of this function.
//
// In:    x0 = Xi; 16 bytes are read here, and the shared tail stores 16
//             bytes back to [x0]
//        x1 = table; two 8-byte halves of the twisted H are read from
//             [x1] and [x1 + 8] (presumably the entry written by
//             gcm_init_neon - confirm against the caller)
// Clobb: x1 (advanced by 8), x3, x9, v3, v5-v7, v24, v25, plus everything
//        the shared body in gcm_ghash_neon clobbers.
// ---------------------------------------------------------------------------
.globl	gcm_gmult_neon
.hidden	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, .Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{v24.2d, v25.2d}, [x9]	// v24 = {k48, k32}, v25 = {k16, k0}
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

	// The shared body ends with "subs x3, x3, #16; bne .Loop_neon", so a
	// count of 16 makes it run exactly once and fall through to the store.
	mov	x3, #16
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

// ---------------------------------------------------------------------------
// gcm_ghash_neon
//
// Folds a run of 16-byte blocks into the running hash value Xi. For each
// block: Xi = (Xi xor block) * H, using the twisted H loaded from the table.
//
// In:    x0 = Xi; 16 bytes are read at entry and 16 bytes written at exit
//        x1 = table; two 8-byte halves of the twisted H are read from
//             [x1] and [x1 + 8]
//        x2 = input; read 16 bytes at a time with post-increment
//        x3 = byte count. The loop subtracts 16 per block and stops only
//             when the count reaches exactly zero, so the count must be a
//             multiple of 16. A count of zero returns immediately and
//             leaves Xi untouched.
// Clobb: x1 (advanced by 8), x2 (advanced past the input), x3, x9, NZCV,
//        v0-v7, v16-v25. No stack use; no callee-saved vector register
//        (v8-v15) is touched.
//
// Register roles inside the loop:
//        v0  = Xi between iterations; Xl (low product) inside one
//        v1  = Xm (middle Karatsuba product)
//        v2  = Xh (high product)
//        v3  = current input xor Xi, byteswapped; v4 = its upper half
//        v5, v6 = low and high 64 bits of the twisted H; v7 = v5 xor v6
//        v24 = {k48, k32}, v25 = {k16, k0}
//
// gcm_gmult_neon enters this function at .Lgmult_neon with x3 = 16 and
// v3, v5-v7, v24, v25 already set up.
// NOTE(review): this file is generated; a change made here should also be
//        made in the Perl generator.
// ---------------------------------------------------------------------------
.globl	gcm_ghash_neon
.hidden	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	AARCH64_VALID_CALL_TARGET
	// The loop below loads and multiplies a block before it tests the
	// counter, so an empty input has to be filtered out up front.
	cbz	x3, .Lghash_neon_done
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, .Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{v24.2d, v25.2d}, [x9]	// v24 = {k48, k32}, v25 = {k16, k0}
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

.Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi

.Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	v4.d[0], v3.d[1]

	// ---- Product 1 of 3: Xl (v0) = v5 * v3, i.e. H.lo times input.lo ----
	// pmull on .8b operands multiplies byte lanes only, so the 64x64-bit
	// product is assembled from byte-rotated copies of A (v5) and B (v3).
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d		// v20 = {t0.lo, t1.lo}
	zip1	v22.2d, v18.2d, v19.2d		// v22 = {t2.lo, t3.lo}
	zip2	v21.2d, v16.2d, v17.2d		// v21 = {t0.hi, t1.hi}
	zip2	v23.2d, v18.2d, v19.2d		// v23 = {t2.hi, t3.hi}
	eor	v20.16b, v20.16b, v21.16b	// lo ^= hi
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b	// hi &= {k48, k32}
	and	v23.16b, v23.16b, v25.16b	// hi &= {k16, k0}
	eor	v20.16b, v20.16b, v21.16b	// lo ^= masked hi
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d		// regroup: v16 = t0
	zip1	v18.2d, v22.2d, v23.2d		//          v18 = t2
	zip2	v17.2d, v20.2d, v21.2d		//          v17 = t1
	zip2	v19.2d, v22.2d, v23.2d		//          v19 = t3

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b

	// ---- Product 2 of 3: Xm (v1) = v7 * (v3 ^ v4) ----
	// Same instruction sequence as product 1, with A = v7 (H.lo ^ H.hi),
	// B = input.lo ^ input.hi, and v1 taking the role v0 had above.
	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Lane folding with k48/k32/k16/k0, exactly as described for
	// product 1.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	zip1	v20.2d, v16.2d, v17.2d		// v20 = {t0.lo, t1.lo}
	zip1	v22.2d, v18.2d, v19.2d		// v22 = {t2.lo, t3.lo}
	zip2	v21.2d, v16.2d, v17.2d		// v21 = {t0.hi, t1.hi}
	zip2	v23.2d, v18.2d, v19.2d		// v23 = {t2.hi, t3.hi}
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b

	// ---- Product 3 of 3: Xh (v2) = v6 * v4, i.e. H.hi times input.hi ----
	// Same instruction sequence again, with A = v6, B = v4 and v2 taking
	// the role v0 had in product 1.
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Lane folding with k48/k32/k16/k0, exactly as described for
	// product 1.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	zip1	v20.2d, v16.2d, v17.2d		// v20 = {t0.lo, t1.lo}
	zip1	v22.2d, v18.2d, v19.2d		// v22 = {t2.lo, t3.lo}
	zip2	v21.2d, v16.2d, v17.2d		// v21 = {t0.hi, t1.hi}
	zip2	v23.2d, v18.2d, v19.2d		// v23 = {t2.hi, t3.hi}
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b

	// ---- Combine the three products into the 256-bit value Xh|Xl ----
	ext	v16.16b, v0.16b, v2.16b, #8	// v16 = {Xl.d[1], Xh.d[0]}
	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

	// ---- Reduce the 256-bit product back to 128 bits ----
	// equivalent of reduction_avx from ghash-x86_64.pl
	shl	v17.2d, v0.2d, #57		// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b	//
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1		// 2nd phase
	eor	v2.16b, v2.16b,v0.16b
	eor	v0.16b, v0.16b,v18.16b	//
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1		//
	eor	v0.16b, v0.16b, v2.16b	//
	eor	v0.16b, v0.16b, v18.16b	// v0 = new Xi

	subs	x3, x3, #16		// one block consumed
	bne	.Loop_neon		// loop until the count is exactly zero

	rev64	v0.16b, v0.16b		// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

.Lghash_neon_done:
	ret
.size	gcm_ghash_neon,.-gcm_ghash_neon

// Read-only constants used by gcm_gmult_neon and gcm_ghash_neon.
.section	.rodata
.align	4
// Four 64-bit masks, each with its bottom N bits set. Both routines load
// them with one "ld1 {v24.2d, v25.2d}", giving v24 = {k48, k32} and
// v25 = {k16, k0}.
.Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
// NUL-terminated ASCII identification string:
// GHASH for ARMv8, derived from ARMv4 version by <appro@openssl.org>
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)