xref: /aosp_15_r20/external/boringssl/src/gen/bcm/ghash-neon-armv8-win.S (revision 8fb009dc861624b67b6cdb62ea21f0f22d0c584b)
1*8fb009dcSAndroid Build Coastguard Worker// This file is generated from a similarly-named Perl script in the BoringSSL
2*8fb009dcSAndroid Build Coastguard Worker// source tree. Do not edit by hand.
3*8fb009dcSAndroid Build Coastguard Worker
4*8fb009dcSAndroid Build Coastguard Worker#include <openssl/asm_base.h>
5*8fb009dcSAndroid Build Coastguard Worker
6*8fb009dcSAndroid Build Coastguard Worker#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
7*8fb009dcSAndroid Build Coastguard Worker#include <openssl/arm_arch.h>
8*8fb009dcSAndroid Build Coastguard Worker
9*8fb009dcSAndroid Build Coastguard Worker.text
10*8fb009dcSAndroid Build Coastguard Worker
// gcm_init_neon: derive the "twisted" form of the GHASH key H and store it
// as one 16-byte table entry.
//
//   In:   x0 = destination, 16 bytes are written (Htable[0])
//         x1 = source, 16 bytes are read (H)
//   Out:  nothing is returned in a register
//   Writes: v3, v5, v16-v19. No general-purpose register is modified and
//           the stack is not used.
11*8fb009dcSAndroid Build Coastguard Worker.globl	gcm_init_neon
12*8fb009dcSAndroid Build Coastguard Worker
13*8fb009dcSAndroid Build Coastguard Worker.def gcm_init_neon
14*8fb009dcSAndroid Build Coastguard Worker   .type 32
15*8fb009dcSAndroid Build Coastguard Worker.endef
16*8fb009dcSAndroid Build Coastguard Worker.align	4
17*8fb009dcSAndroid Build Coastguard Workergcm_init_neon:
	// NOTE(review): AARCH64_VALID_CALL_TARGET is a macro from the included
	// headers; presumably it emits an indirect-branch landing pad. Confirm
	// against its definition.
18*8fb009dcSAndroid Build Coastguard Worker	AARCH64_VALID_CALL_TARGET
19*8fb009dcSAndroid Build Coastguard Worker	// This function is adapted from gcm_init_v8. xC2 is t3.
20*8fb009dcSAndroid Build Coastguard Worker	ld1	{v17.2d}, [x1]			// load H
	// Build 0xc2 << 56 in both 64-bit lanes: every byte is 0xe1, and the
	// shift by 57 keeps only the low 7 bits of each lane at the top.
21*8fb009dcSAndroid Build Coastguard Worker	movi	v19.16b, #0xe1
22*8fb009dcSAndroid Build Coastguard Worker	shl	v19.2d, v19.2d, #57		// 0xc2.0
	// v3 = H with its two 64-bit halves swapped.
23*8fb009dcSAndroid Build Coastguard Worker	ext	v3.16b, v17.16b, v17.16b, #8
	// v18 = 1 in each 64-bit lane (top bit of the constant moved to bit 0).
24*8fb009dcSAndroid Build Coastguard Worker	ushr	v18.2d, v19.2d, #63
	// Replicate 32-bit lane 1 of the loaded H; its sign bit is turned into
	// an all-ones/all-zeros mask by the sshr below.
25*8fb009dcSAndroid Build Coastguard Worker	dup	v17.4s, v17.s[1]
26*8fb009dcSAndroid Build Coastguard Worker	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	// v18 = top bit of each 64-bit lane of v3, i.e. the bits shifted out by
	// the shl #1 below.
27*8fb009dcSAndroid Build Coastguard Worker	ushr	v18.2d, v3.2d, #63
28*8fb009dcSAndroid Build Coastguard Worker	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	// Masking with t0 keeps only the low-lane carry (the high lane of t0 has
	// bit 0 clear); the ext #8 below moves it into the high lane so the orr
	// inserts it at bit 0 of the high half.
29*8fb009dcSAndroid Build Coastguard Worker	and	v18.16b, v18.16b, v16.16b
30*8fb009dcSAndroid Build Coastguard Worker	shl	v3.2d, v3.2d, #1
31*8fb009dcSAndroid Build Coastguard Worker	ext	v18.16b, v18.16b, v18.16b, #8
	// Keep the constant t0 only when the broadcast bit was set.
32*8fb009dcSAndroid Build Coastguard Worker	and	v16.16b, v16.16b, v17.16b
33*8fb009dcSAndroid Build Coastguard Worker	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
34*8fb009dcSAndroid Build Coastguard Worker	eor	v5.16b, v3.16b, v16.16b	// twisted H
35*8fb009dcSAndroid Build Coastguard Worker	st1	{v5.2d}, [x0]			// store Htable[0]
36*8fb009dcSAndroid Build Coastguard Worker	ret
37*8fb009dcSAndroid Build Coastguard Worker
38*8fb009dcSAndroid Build Coastguard Worker
// gcm_gmult_neon: multiply the 16-byte value at x0 by the twisted key at x1
// and write the result back to x0.
//
// This entry point only sets up registers. It then branches to Lgmult_neon
// inside gcm_ghash_neon, which performs the multiplication, the reduction,
// the store to [x0] and the ret.
//
//   In:   x0 = Xi, 16 bytes (read here, written by the shared tail)
//         x1 = twisted H, read as two 8-byte halves
//   Writes here: x1 (advanced by 8), x3, x9, v3, v5, v6, v7, v24, v25.
//   The shared tail writes further vector registers and the flags.
39*8fb009dcSAndroid Build Coastguard Worker.globl	gcm_gmult_neon
40*8fb009dcSAndroid Build Coastguard Worker
41*8fb009dcSAndroid Build Coastguard Worker.def gcm_gmult_neon
42*8fb009dcSAndroid Build Coastguard Worker   .type 32
43*8fb009dcSAndroid Build Coastguard Worker.endef
44*8fb009dcSAndroid Build Coastguard Worker.align	4
45*8fb009dcSAndroid Build Coastguard Workergcm_gmult_neon:
46*8fb009dcSAndroid Build Coastguard Worker	AARCH64_VALID_CALL_TARGET
	// Xi goes straight into v3, the multiplicand register, because the
	// "inp ^= Xi" step at the top of the shared loop is skipped on this path.
47*8fb009dcSAndroid Build Coastguard Worker	ld1	{v3.16b}, [x0]		// load Xi
	// v5 = first 8 bytes of the key, v6 = second 8 bytes.
48*8fb009dcSAndroid Build Coastguard Worker	ld1	{v5.1d}, [x1], #8		// load twisted H
49*8fb009dcSAndroid Build Coastguard Worker	ld1	{v6.1d}, [x1]
50*8fb009dcSAndroid Build Coastguard Worker	adrp	x9, Lmasks		// load constants
51*8fb009dcSAndroid Build Coastguard Worker	add	x9, x9, :lo12:Lmasks
	// v24 = {k48, k32}, v25 = {k16, k0} (order of the .quad entries at Lmasks).
52*8fb009dcSAndroid Build Coastguard Worker	ld1	{v24.2d, v25.2d}, [x9]
	// rev64 reverses the bytes inside each 64-bit lane and ext #8 swaps the
	// lanes, so together they reverse all 16 bytes.
53*8fb009dcSAndroid Build Coastguard Worker	rev64	v3.16b, v3.16b		// byteswap Xi
54*8fb009dcSAndroid Build Coastguard Worker	ext	v3.16b, v3.16b, v3.16b, #8
55*8fb009dcSAndroid Build Coastguard Worker	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
56*8fb009dcSAndroid Build Coastguard Worker
	// x3 = 16 makes the shared "subs x3, x3, #16" produce zero, so the bne at
	// the end of the loop body is not taken and exactly one pass is executed.
57*8fb009dcSAndroid Build Coastguard Worker	mov	x3, #16
58*8fb009dcSAndroid Build Coastguard Worker	b	Lgmult_neon
59*8fb009dcSAndroid Build Coastguard Worker
60*8fb009dcSAndroid Build Coastguard Worker
// gcm_ghash_neon: fold a run of 16-byte input blocks into the 16-byte state
// Xi. Per block the code computes Xi = (Xi ^ block) * H, using three
// multiplications built from 8-bit pmull (Karatsuba) followed by a reduction.
//
//   In:   x0 = Xi, 16 bytes, read at entry and written before ret
//         x1 = twisted H, read as two 8-byte halves (x1 is advanced by 8)
//         x2 = input pointer, advanced by 16 per block
//         x3 = byte count, decremented by 16 per block
//   Writes: x1, x2, x3, x9, flags, v0-v7, v16-v25. The stack is not used.
//
// NOTE(review): x3 is not tested before the first block is processed, and the
// loop exits only when x3 reaches exactly zero after a subtraction.
// Presumably callers guarantee a non-zero multiple of 16; confirm against the
// C prototype.
//
// Register roles inside the loop body:
//   v5, v6   first and second 8-byte halves of the key; v7 = v5 ^ v6
//   v3, v4   low and high 8-byte halves of the current multiplicand
//   v0/v1/v2 results of the three multiplications (Xl / Xm / Xh)
//   v16-v19  t0-t3 partial products, v20-v23 lane-paired temporaries
//   v24, v25 masks {k48, k32} and {k16, k0}
//
// Lgmult_neon is also entered from gcm_gmult_neon with v3 = byte-swapped Xi
// and x3 = 16.
61*8fb009dcSAndroid Build Coastguard Worker.globl	gcm_ghash_neon
62*8fb009dcSAndroid Build Coastguard Worker
63*8fb009dcSAndroid Build Coastguard Worker.def gcm_ghash_neon
64*8fb009dcSAndroid Build Coastguard Worker   .type 32
65*8fb009dcSAndroid Build Coastguard Worker.endef
66*8fb009dcSAndroid Build Coastguard Worker.align	4
67*8fb009dcSAndroid Build Coastguard Workergcm_ghash_neon:
68*8fb009dcSAndroid Build Coastguard Worker	AARCH64_VALID_CALL_TARGET
69*8fb009dcSAndroid Build Coastguard Worker	ld1	{v0.16b}, [x0]		// load Xi
70*8fb009dcSAndroid Build Coastguard Worker	ld1	{v5.1d}, [x1], #8		// load twisted H
71*8fb009dcSAndroid Build Coastguard Worker	ld1	{v6.1d}, [x1]
72*8fb009dcSAndroid Build Coastguard Worker	adrp	x9, Lmasks		// load constants
73*8fb009dcSAndroid Build Coastguard Worker	add	x9, x9, :lo12:Lmasks
74*8fb009dcSAndroid Build Coastguard Worker	ld1	{v24.2d, v25.2d}, [x9]
	// rev64 plus ext #8 reverses all 16 bytes.
75*8fb009dcSAndroid Build Coastguard Worker	rev64	v0.16b, v0.16b		// byteswap Xi
76*8fb009dcSAndroid Build Coastguard Worker	ext	v0.16b, v0.16b, v0.16b, #8
77*8fb009dcSAndroid Build Coastguard Worker	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
78*8fb009dcSAndroid Build Coastguard Worker
	// Loop invariant: v0 holds the byte-swapped running Xi.
79*8fb009dcSAndroid Build Coastguard WorkerLoop_neon:
80*8fb009dcSAndroid Build Coastguard Worker	ld1	{v3.16b}, [x2], #16	// load inp
81*8fb009dcSAndroid Build Coastguard Worker	rev64	v3.16b, v3.16b		// byteswap inp
82*8fb009dcSAndroid Build Coastguard Worker	ext	v3.16b, v3.16b, v3.16b, #8
83*8fb009dcSAndroid Build Coastguard Worker	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi
84*8fb009dcSAndroid Build Coastguard Worker
85*8fb009dcSAndroid Build Coastguard WorkerLgmult_neon:
86*8fb009dcSAndroid Build Coastguard Worker	// Split the input into v3 and v4. (The upper halves are unused,
87*8fb009dcSAndroid Build Coastguard Worker	// so it is okay to leave them alone.)
88*8fb009dcSAndroid Build Coastguard Worker	ins	v4.d[0], v3.d[1]
	// ---- Multiplication 1 of 3: v0 = v5 * v3 (A = v5, B = v3) ----
	// v0 is overwritten here before it is read again, so the previous Xi
	// must already have been folded into v3.
89*8fb009dcSAndroid Build Coastguard Worker	ext	v16.8b, v5.8b, v5.8b, #1	// A1
90*8fb009dcSAndroid Build Coastguard Worker	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
91*8fb009dcSAndroid Build Coastguard Worker	ext	v0.8b, v3.8b, v3.8b, #1		// B1
92*8fb009dcSAndroid Build Coastguard Worker	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
93*8fb009dcSAndroid Build Coastguard Worker	ext	v17.8b, v5.8b, v5.8b, #2	// A2
94*8fb009dcSAndroid Build Coastguard Worker	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
95*8fb009dcSAndroid Build Coastguard Worker	ext	v19.8b, v3.8b, v3.8b, #2	// B2
96*8fb009dcSAndroid Build Coastguard Worker	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
97*8fb009dcSAndroid Build Coastguard Worker	ext	v18.8b, v5.8b, v5.8b, #3	// A3
98*8fb009dcSAndroid Build Coastguard Worker	eor	v16.16b, v16.16b, v0.16b	// L = E + F
99*8fb009dcSAndroid Build Coastguard Worker	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
100*8fb009dcSAndroid Build Coastguard Worker	ext	v0.8b, v3.8b, v3.8b, #3		// B3
101*8fb009dcSAndroid Build Coastguard Worker	eor	v17.16b, v17.16b, v19.16b	// M = G + H
102*8fb009dcSAndroid Build Coastguard Worker	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3
103*8fb009dcSAndroid Build Coastguard Worker
104*8fb009dcSAndroid Build Coastguard Worker	// Here we diverge from the 32-bit version. It computes the following
105*8fb009dcSAndroid Build Coastguard Worker	// (instructions reordered for clarity):
106*8fb009dcSAndroid Build Coastguard Worker	//
107*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
108*8fb009dcSAndroid Build Coastguard Worker	//     vand	$t0#hi, $t0#hi, $k48
109*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t0#lo, $t0#lo, $t0#hi
110*8fb009dcSAndroid Build Coastguard Worker	//
111*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
112*8fb009dcSAndroid Build Coastguard Worker	//     vand	$t1#hi, $t1#hi, $k32
113*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t1#lo, $t1#lo, $t1#hi
114*8fb009dcSAndroid Build Coastguard Worker	//
115*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
116*8fb009dcSAndroid Build Coastguard Worker	//     vand	$t2#hi, $t2#hi, $k16
117*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t2#lo, $t2#lo, $t2#hi
118*8fb009dcSAndroid Build Coastguard Worker	//
119*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
120*8fb009dcSAndroid Build Coastguard Worker	//     vmov.i64	$t3#hi, #0
121*8fb009dcSAndroid Build Coastguard Worker	//
122*8fb009dcSAndroid Build Coastguard Worker	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
123*8fb009dcSAndroid Build Coastguard Worker	// upper halves of SIMD registers, so we must split each half into
124*8fb009dcSAndroid Build Coastguard Worker	// separate registers. To compensate, we pair computations up and
125*8fb009dcSAndroid Build Coastguard Worker	// parallelize.
126*8fb009dcSAndroid Build Coastguard Worker
127*8fb009dcSAndroid Build Coastguard Worker	ext	v19.8b, v3.8b, v3.8b, #4	// B4
128*8fb009dcSAndroid Build Coastguard Worker	eor	v18.16b, v18.16b, v0.16b	// N = I + J
129*8fb009dcSAndroid Build Coastguard Worker	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4
130*8fb009dcSAndroid Build Coastguard Worker
131*8fb009dcSAndroid Build Coastguard Worker	// This can probably be scheduled more efficiently. For now, we just
132*8fb009dcSAndroid Build Coastguard Worker	// pair up independent instructions.
	// Lane pairing: v20 = {t0.lo, t1.lo}, v21 = {t0.hi, t1.hi},
	// v22 = {t2.lo, t3.lo}, v23 = {t2.hi, t3.hi}. The and with v24 applies
	// k48 to t0.hi and k32 to t1.hi; the and with v25 applies k16 to t2.hi
	// and k0 (all zero) to t3.hi. The trailing zip1/zip2 restore t0-t3.
133*8fb009dcSAndroid Build Coastguard Worker	zip1	v20.2d, v16.2d, v17.2d
134*8fb009dcSAndroid Build Coastguard Worker	zip1	v22.2d, v18.2d, v19.2d
135*8fb009dcSAndroid Build Coastguard Worker	zip2	v21.2d, v16.2d, v17.2d
136*8fb009dcSAndroid Build Coastguard Worker	zip2	v23.2d, v18.2d, v19.2d
137*8fb009dcSAndroid Build Coastguard Worker	eor	v20.16b, v20.16b, v21.16b
138*8fb009dcSAndroid Build Coastguard Worker	eor	v22.16b, v22.16b, v23.16b
139*8fb009dcSAndroid Build Coastguard Worker	and	v21.16b, v21.16b, v24.16b
140*8fb009dcSAndroid Build Coastguard Worker	and	v23.16b, v23.16b, v25.16b
141*8fb009dcSAndroid Build Coastguard Worker	eor	v20.16b, v20.16b, v21.16b
142*8fb009dcSAndroid Build Coastguard Worker	eor	v22.16b, v22.16b, v23.16b
143*8fb009dcSAndroid Build Coastguard Worker	zip1	v16.2d, v20.2d, v21.2d
144*8fb009dcSAndroid Build Coastguard Worker	zip1	v18.2d, v22.2d, v23.2d
145*8fb009dcSAndroid Build Coastguard Worker	zip2	v17.2d, v20.2d, v21.2d
146*8fb009dcSAndroid Build Coastguard Worker	zip2	v19.2d, v22.2d, v23.2d
147*8fb009dcSAndroid Build Coastguard Worker
	// Each ext below uses the same register for both sources, so it rotates
	// the 16 bytes; the comments describe the effect as a left shift.
148*8fb009dcSAndroid Build Coastguard Worker	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
149*8fb009dcSAndroid Build Coastguard Worker	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
150*8fb009dcSAndroid Build Coastguard Worker	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
151*8fb009dcSAndroid Build Coastguard Worker	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
152*8fb009dcSAndroid Build Coastguard Worker	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
153*8fb009dcSAndroid Build Coastguard Worker	eor	v16.16b, v16.16b, v17.16b
154*8fb009dcSAndroid Build Coastguard Worker	eor	v18.16b, v18.16b, v19.16b
155*8fb009dcSAndroid Build Coastguard Worker	eor	v0.16b, v0.16b, v16.16b
156*8fb009dcSAndroid Build Coastguard Worker	eor	v0.16b, v0.16b, v18.16b
	// ---- Multiplication 2 of 3: v1 = v7 * (v3 ^ v4) ----
	// v3.8b is overwritten with the xor of the two input halves; v4 still
	// holds the high half for the third multiplication.
157*8fb009dcSAndroid Build Coastguard Worker	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
158*8fb009dcSAndroid Build Coastguard Worker	ext	v16.8b, v7.8b, v7.8b, #1	// A1
159*8fb009dcSAndroid Build Coastguard Worker	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
160*8fb009dcSAndroid Build Coastguard Worker	ext	v1.8b, v3.8b, v3.8b, #1		// B1
161*8fb009dcSAndroid Build Coastguard Worker	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
162*8fb009dcSAndroid Build Coastguard Worker	ext	v17.8b, v7.8b, v7.8b, #2	// A2
163*8fb009dcSAndroid Build Coastguard Worker	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
164*8fb009dcSAndroid Build Coastguard Worker	ext	v19.8b, v3.8b, v3.8b, #2	// B2
165*8fb009dcSAndroid Build Coastguard Worker	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
166*8fb009dcSAndroid Build Coastguard Worker	ext	v18.8b, v7.8b, v7.8b, #3	// A3
167*8fb009dcSAndroid Build Coastguard Worker	eor	v16.16b, v16.16b, v1.16b	// L = E + F
168*8fb009dcSAndroid Build Coastguard Worker	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
169*8fb009dcSAndroid Build Coastguard Worker	ext	v1.8b, v3.8b, v3.8b, #3		// B3
170*8fb009dcSAndroid Build Coastguard Worker	eor	v17.16b, v17.16b, v19.16b	// M = G + H
171*8fb009dcSAndroid Build Coastguard Worker	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3
172*8fb009dcSAndroid Build Coastguard Worker
173*8fb009dcSAndroid Build Coastguard Worker	// Here we diverge from the 32-bit version. It computes the following
174*8fb009dcSAndroid Build Coastguard Worker	// (instructions reordered for clarity):
175*8fb009dcSAndroid Build Coastguard Worker	//
176*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
177*8fb009dcSAndroid Build Coastguard Worker	//     vand	$t0#hi, $t0#hi, $k48
178*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t0#lo, $t0#lo, $t0#hi
179*8fb009dcSAndroid Build Coastguard Worker	//
180*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
181*8fb009dcSAndroid Build Coastguard Worker	//     vand	$t1#hi, $t1#hi, $k32
182*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t1#lo, $t1#lo, $t1#hi
183*8fb009dcSAndroid Build Coastguard Worker	//
184*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
185*8fb009dcSAndroid Build Coastguard Worker	//     vand	$t2#hi, $t2#hi, $k16
186*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t2#lo, $t2#lo, $t2#hi
187*8fb009dcSAndroid Build Coastguard Worker	//
188*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
189*8fb009dcSAndroid Build Coastguard Worker	//     vmov.i64	$t3#hi, #0
190*8fb009dcSAndroid Build Coastguard Worker	//
191*8fb009dcSAndroid Build Coastguard Worker	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
192*8fb009dcSAndroid Build Coastguard Worker	// upper halves of SIMD registers, so we must split each half into
193*8fb009dcSAndroid Build Coastguard Worker	// separate registers. To compensate, we pair computations up and
194*8fb009dcSAndroid Build Coastguard Worker	// parallelize.
195*8fb009dcSAndroid Build Coastguard Worker
196*8fb009dcSAndroid Build Coastguard Worker	ext	v19.8b, v3.8b, v3.8b, #4	// B4
197*8fb009dcSAndroid Build Coastguard Worker	eor	v18.16b, v18.16b, v1.16b	// N = I + J
198*8fb009dcSAndroid Build Coastguard Worker	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4
199*8fb009dcSAndroid Build Coastguard Worker
200*8fb009dcSAndroid Build Coastguard Worker	// This can probably be scheduled more efficiently. For now, we just
201*8fb009dcSAndroid Build Coastguard Worker	// pair up independent instructions.
	// Same lane pairing and masking sequence as in the first multiplication.
202*8fb009dcSAndroid Build Coastguard Worker	zip1	v20.2d, v16.2d, v17.2d
203*8fb009dcSAndroid Build Coastguard Worker	zip1	v22.2d, v18.2d, v19.2d
204*8fb009dcSAndroid Build Coastguard Worker	zip2	v21.2d, v16.2d, v17.2d
205*8fb009dcSAndroid Build Coastguard Worker	zip2	v23.2d, v18.2d, v19.2d
206*8fb009dcSAndroid Build Coastguard Worker	eor	v20.16b, v20.16b, v21.16b
207*8fb009dcSAndroid Build Coastguard Worker	eor	v22.16b, v22.16b, v23.16b
208*8fb009dcSAndroid Build Coastguard Worker	and	v21.16b, v21.16b, v24.16b
209*8fb009dcSAndroid Build Coastguard Worker	and	v23.16b, v23.16b, v25.16b
210*8fb009dcSAndroid Build Coastguard Worker	eor	v20.16b, v20.16b, v21.16b
211*8fb009dcSAndroid Build Coastguard Worker	eor	v22.16b, v22.16b, v23.16b
212*8fb009dcSAndroid Build Coastguard Worker	zip1	v16.2d, v20.2d, v21.2d
213*8fb009dcSAndroid Build Coastguard Worker	zip1	v18.2d, v22.2d, v23.2d
214*8fb009dcSAndroid Build Coastguard Worker	zip2	v17.2d, v20.2d, v21.2d
215*8fb009dcSAndroid Build Coastguard Worker	zip2	v19.2d, v22.2d, v23.2d
216*8fb009dcSAndroid Build Coastguard Worker
217*8fb009dcSAndroid Build Coastguard Worker	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
218*8fb009dcSAndroid Build Coastguard Worker	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
219*8fb009dcSAndroid Build Coastguard Worker	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
220*8fb009dcSAndroid Build Coastguard Worker	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
221*8fb009dcSAndroid Build Coastguard Worker	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
222*8fb009dcSAndroid Build Coastguard Worker	eor	v16.16b, v16.16b, v17.16b
223*8fb009dcSAndroid Build Coastguard Worker	eor	v18.16b, v18.16b, v19.16b
224*8fb009dcSAndroid Build Coastguard Worker	eor	v1.16b, v1.16b, v16.16b
225*8fb009dcSAndroid Build Coastguard Worker	eor	v1.16b, v1.16b, v18.16b
	// ---- Multiplication 3 of 3: v2 = v6 * v4 (A = v6, B = v4) ----
226*8fb009dcSAndroid Build Coastguard Worker	ext	v16.8b, v6.8b, v6.8b, #1	// A1
227*8fb009dcSAndroid Build Coastguard Worker	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
228*8fb009dcSAndroid Build Coastguard Worker	ext	v2.8b, v4.8b, v4.8b, #1		// B1
229*8fb009dcSAndroid Build Coastguard Worker	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
230*8fb009dcSAndroid Build Coastguard Worker	ext	v17.8b, v6.8b, v6.8b, #2	// A2
231*8fb009dcSAndroid Build Coastguard Worker	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
232*8fb009dcSAndroid Build Coastguard Worker	ext	v19.8b, v4.8b, v4.8b, #2	// B2
233*8fb009dcSAndroid Build Coastguard Worker	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
234*8fb009dcSAndroid Build Coastguard Worker	ext	v18.8b, v6.8b, v6.8b, #3	// A3
235*8fb009dcSAndroid Build Coastguard Worker	eor	v16.16b, v16.16b, v2.16b	// L = E + F
236*8fb009dcSAndroid Build Coastguard Worker	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
237*8fb009dcSAndroid Build Coastguard Worker	ext	v2.8b, v4.8b, v4.8b, #3		// B3
238*8fb009dcSAndroid Build Coastguard Worker	eor	v17.16b, v17.16b, v19.16b	// M = G + H
239*8fb009dcSAndroid Build Coastguard Worker	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3
240*8fb009dcSAndroid Build Coastguard Worker
241*8fb009dcSAndroid Build Coastguard Worker	// Here we diverge from the 32-bit version. It computes the following
242*8fb009dcSAndroid Build Coastguard Worker	// (instructions reordered for clarity):
243*8fb009dcSAndroid Build Coastguard Worker	//
244*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
245*8fb009dcSAndroid Build Coastguard Worker	//     vand	$t0#hi, $t0#hi, $k48
246*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t0#lo, $t0#lo, $t0#hi
247*8fb009dcSAndroid Build Coastguard Worker	//
248*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
249*8fb009dcSAndroid Build Coastguard Worker	//     vand	$t1#hi, $t1#hi, $k32
250*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t1#lo, $t1#lo, $t1#hi
251*8fb009dcSAndroid Build Coastguard Worker	//
252*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
253*8fb009dcSAndroid Build Coastguard Worker	//     vand	$t2#hi, $t2#hi, $k16
254*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t2#lo, $t2#lo, $t2#hi
255*8fb009dcSAndroid Build Coastguard Worker	//
256*8fb009dcSAndroid Build Coastguard Worker	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
257*8fb009dcSAndroid Build Coastguard Worker	//     vmov.i64	$t3#hi, #0
258*8fb009dcSAndroid Build Coastguard Worker	//
259*8fb009dcSAndroid Build Coastguard Worker	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
260*8fb009dcSAndroid Build Coastguard Worker	// upper halves of SIMD registers, so we must split each half into
261*8fb009dcSAndroid Build Coastguard Worker	// separate registers. To compensate, we pair computations up and
262*8fb009dcSAndroid Build Coastguard Worker	// parallelize.
263*8fb009dcSAndroid Build Coastguard Worker
264*8fb009dcSAndroid Build Coastguard Worker	ext	v19.8b, v4.8b, v4.8b, #4	// B4
265*8fb009dcSAndroid Build Coastguard Worker	eor	v18.16b, v18.16b, v2.16b	// N = I + J
266*8fb009dcSAndroid Build Coastguard Worker	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4
267*8fb009dcSAndroid Build Coastguard Worker
268*8fb009dcSAndroid Build Coastguard Worker	// This can probably be scheduled more efficiently. For now, we just
269*8fb009dcSAndroid Build Coastguard Worker	// pair up independent instructions.
	// Same lane pairing and masking sequence as in the first multiplication.
270*8fb009dcSAndroid Build Coastguard Worker	zip1	v20.2d, v16.2d, v17.2d
271*8fb009dcSAndroid Build Coastguard Worker	zip1	v22.2d, v18.2d, v19.2d
272*8fb009dcSAndroid Build Coastguard Worker	zip2	v21.2d, v16.2d, v17.2d
273*8fb009dcSAndroid Build Coastguard Worker	zip2	v23.2d, v18.2d, v19.2d
274*8fb009dcSAndroid Build Coastguard Worker	eor	v20.16b, v20.16b, v21.16b
275*8fb009dcSAndroid Build Coastguard Worker	eor	v22.16b, v22.16b, v23.16b
276*8fb009dcSAndroid Build Coastguard Worker	and	v21.16b, v21.16b, v24.16b
277*8fb009dcSAndroid Build Coastguard Worker	and	v23.16b, v23.16b, v25.16b
278*8fb009dcSAndroid Build Coastguard Worker	eor	v20.16b, v20.16b, v21.16b
279*8fb009dcSAndroid Build Coastguard Worker	eor	v22.16b, v22.16b, v23.16b
280*8fb009dcSAndroid Build Coastguard Worker	zip1	v16.2d, v20.2d, v21.2d
281*8fb009dcSAndroid Build Coastguard Worker	zip1	v18.2d, v22.2d, v23.2d
282*8fb009dcSAndroid Build Coastguard Worker	zip2	v17.2d, v20.2d, v21.2d
283*8fb009dcSAndroid Build Coastguard Worker	zip2	v19.2d, v22.2d, v23.2d
284*8fb009dcSAndroid Build Coastguard Worker
285*8fb009dcSAndroid Build Coastguard Worker	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
286*8fb009dcSAndroid Build Coastguard Worker	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
287*8fb009dcSAndroid Build Coastguard Worker	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
288*8fb009dcSAndroid Build Coastguard Worker	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
289*8fb009dcSAndroid Build Coastguard Worker	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
290*8fb009dcSAndroid Build Coastguard Worker	eor	v16.16b, v16.16b, v17.16b
291*8fb009dcSAndroid Build Coastguard Worker	eor	v18.16b, v18.16b, v19.16b
292*8fb009dcSAndroid Build Coastguard Worker	eor	v2.16b, v2.16b, v16.16b
293*8fb009dcSAndroid Build Coastguard Worker	eor	v2.16b, v2.16b, v18.16b
	// ---- Combine the three products: v0 = Xl, v1 = Xm, v2 = Xh ----
	// v16 = {Xl.d[1], Xh.d[0]}, the 128 bits that the middle term overlaps.
294*8fb009dcSAndroid Build Coastguard Worker	ext	v16.16b, v0.16b, v2.16b, #8
295*8fb009dcSAndroid Build Coastguard Worker	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
296*8fb009dcSAndroid Build Coastguard Worker	eor	v1.16b, v1.16b, v2.16b
297*8fb009dcSAndroid Build Coastguard Worker	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
298*8fb009dcSAndroid Build Coastguard Worker	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
299*8fb009dcSAndroid Build Coastguard Worker	// This is a no-op due to the ins instruction below.
300*8fb009dcSAndroid Build Coastguard Worker	// ins	v2.d[0], v1.d[1]
301*8fb009dcSAndroid Build Coastguard Worker
	// ---- Reduce the 256-bit value in v2:v0 to 128 bits in v0 ----
302*8fb009dcSAndroid Build Coastguard Worker	// equivalent of reduction_avx from ghash-x86_64.pl
303*8fb009dcSAndroid Build Coastguard Worker	shl	v17.2d, v0.2d, #57		// 1st phase
304*8fb009dcSAndroid Build Coastguard Worker	shl	v18.2d, v0.2d, #62
305*8fb009dcSAndroid Build Coastguard Worker	eor	v18.16b, v18.16b, v17.16b	//
306*8fb009dcSAndroid Build Coastguard Worker	shl	v17.2d, v0.2d, #63
307*8fb009dcSAndroid Build Coastguard Worker	eor	v18.16b, v18.16b, v17.16b	//
308*8fb009dcSAndroid Build Coastguard Worker	// Note Xm contains {Xl.d[1], Xh.d[0]}.
309*8fb009dcSAndroid Build Coastguard Worker	eor	v18.16b, v18.16b, v1.16b
310*8fb009dcSAndroid Build Coastguard Worker	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
311*8fb009dcSAndroid Build Coastguard Worker	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]
312*8fb009dcSAndroid Build Coastguard Worker
313*8fb009dcSAndroid Build Coastguard Worker	ushr	v18.2d, v0.2d, #1		// 2nd phase
314*8fb009dcSAndroid Build Coastguard Worker	eor	v2.16b, v2.16b,v0.16b
315*8fb009dcSAndroid Build Coastguard Worker	eor	v0.16b, v0.16b,v18.16b	//
316*8fb009dcSAndroid Build Coastguard Worker	ushr	v18.2d, v18.2d, #6
317*8fb009dcSAndroid Build Coastguard Worker	ushr	v0.2d, v0.2d, #1		//
318*8fb009dcSAndroid Build Coastguard Worker	eor	v0.16b, v0.16b, v2.16b	//
319*8fb009dcSAndroid Build Coastguard Worker	eor	v0.16b, v0.16b, v18.16b	//
320*8fb009dcSAndroid Build Coastguard Worker
	// v0 now holds the new Xi. Consume 16 bytes of the count and continue
	// while the remaining count is non-zero.
321*8fb009dcSAndroid Build Coastguard Worker	subs	x3, x3, #16
322*8fb009dcSAndroid Build Coastguard Worker	bne	Loop_neon
323*8fb009dcSAndroid Build Coastguard Worker
324*8fb009dcSAndroid Build Coastguard Worker	rev64	v0.16b, v0.16b		// byteswap Xi and write
325*8fb009dcSAndroid Build Coastguard Worker	ext	v0.16b, v0.16b, v0.16b, #8
326*8fb009dcSAndroid Build Coastguard Worker	st1	{v0.16b}, [x0]
327*8fb009dcSAndroid Build Coastguard Worker
328*8fb009dcSAndroid Build Coastguard Worker	ret
329*8fb009dcSAndroid Build Coastguard Worker
330*8fb009dcSAndroid Build Coastguard Worker
// Read-only constants for gcm_gmult_neon and gcm_ghash_neon. Both functions
// load the 32 bytes at Lmasks with a single two-register ld1, so the first
// two quads land in v24 as {k48, k32} and the last two in v25 as {k16, k0}.
// kN has the bottom N bits set; k0 is all zero.
331*8fb009dcSAndroid Build Coastguard Worker.section	.rodata
// NOTE(review): presumably .align takes a power-of-two exponent on this
// target, giving 16-byte alignment here; confirm for the assembler in use.
332*8fb009dcSAndroid Build Coastguard Worker.align	4
333*8fb009dcSAndroid Build Coastguard WorkerLmasks:
334*8fb009dcSAndroid Build Coastguard Worker.quad	0x0000ffffffffffff	// k48
335*8fb009dcSAndroid Build Coastguard Worker.quad	0x00000000ffffffff	// k32
336*8fb009dcSAndroid Build Coastguard Worker.quad	0x000000000000ffff	// k16
337*8fb009dcSAndroid Build Coastguard Worker.quad	0x0000000000000000	// k0
// NUL-terminated ASCII identification string:
// "GHASH for ARMv8, derived from ARMv4 version by <appro@openssl.org>"
// No instruction in this file references it.
338*8fb009dcSAndroid Build Coastguard Worker.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// The directive is emitted twice by the generator; the second one is redundant.
339*8fb009dcSAndroid Build Coastguard Worker.align	2
340*8fb009dcSAndroid Build Coastguard Worker.align	2
341*8fb009dcSAndroid Build Coastguard Worker#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)
342