// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <ring-core/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include <ring-core/arm_arch.h>

.text

.globl	gcm_init_neon
.hidden	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	AARCH64_VALID_CALL_TARGET
	// This function is adapted from gcm_init_v8. xC2 is t3.
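	// Htable[0] receives the "twisted" H used by gcm_gmult_neon and
	// gcm_ghash_neon below: H shifted left by one bit and conditionally
	// XORed with the 0xc2....01 constant (v16), depending on the carry
	// bit, mirroring gcm_init_v8.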
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0
	ext	v3.16b, v17.16b, v17.16b, #8
	ushr	v18.2d, v19.2d, #63
	dup	v17.4s, v17.s[1]
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8
	and	v16.16b, v16.16b, v17.16b
	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
	eor	v5.16b, v3.16b, v16.16b	// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret
.size	gcm_init_neon,.-gcm_init_neon

.globl	gcm_gmult_neon
.hidden	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, .Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
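	// v24 = {k48, k32} and v25 = {k16, k0}; see .Lmasks below.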
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
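	// v5 and v6 now hold the two 64-bit halves of the twisted H, and
	// v7 = v5 ^ v6 is the operand for the middle Karatsuba product.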
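	// Process a single 16-byte block by running the shared loop body in
	// gcm_ghash_neon exactly once.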
	mov	x3, #16
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.globl	gcm_ghash_neon
.hidden	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	AARCH64_VALID_CALL_TARGET
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8		// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, .Lmasks		// load constants
	add	x9, x9, :lo12:.Lmasks
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
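	// x2 points at the input and x3 holds its length in bytes; the loop
	// below consumes one 16-byte block per iteration, so x3 must be a
	// multiple of 16.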

.Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi

.Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	v4.d[0], v3.d[1]
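	// The 128-bit product is assembled Karatsuba-style from three
	// 64x64-bit carry-less multiplies: v5*v3 (into v0), v6*v4 (into v2)
	// and v7*(v3^v4) (into v1), with v7 = v5^v6 computed above.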
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3
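	// Each pmull above performs eight 8x8-bit polynomial multiplies, so
	// the 64x64-bit product is built from D = A*B plus partial products
	// of byte-rotated operands (L, M, N, K). The masking and shifting
	// below discard the bytes that wrapped around and slide each partial
	// product into place before folding it into D.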

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d
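	// The first pair of zips gathers the low and high 64-bit halves of
	// t0..t3 so the fold/mask/fold sequence above handles two t values at
	// a time (v24 = {k48,k32}, v25 = {k16,k0}); the second pair restores
	// the original {lo,hi} layout in v16..v19.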

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b
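	// v0 now holds the full 128-bit carry-less product of v5 and v3 (Xl).
	// Repeat the same multiply for the middle Karatsuba term, with v7 as
	// A and v3^v4 as B.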
	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b
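	// v1 now holds the middle Karatsuba product (Xm). The third multiply,
	// v6*v4 into v2 (Xh), follows the same pattern.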
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	//     vand	$t0#hi, $t0#hi, $k48
	//     veor	$t0#lo, $t0#lo, $t0#hi
	//
	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	//     vand	$t1#hi, $t1#hi, $k32
	//     veor	$t1#lo, $t1#lo, $t1#hi
	//
	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	//     vand	$t2#hi, $t2#hi, $k16
	//     veor	$t2#lo, $t2#lo, $t2#hi
	//
	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	//     vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b
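	// v2 now holds the third product (Xh). The Karatsuba post-processing
	// below folds Xl and Xh into the middle term, leaving the 256-bit
	// product spread across v0 (low) and v2 (high) with the overlapping
	// middle words carried in v1 (Xm).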
	ext	v16.16b, v0.16b, v2.16b, #8
	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
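	// The 256-bit product is reduced modulo the GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1. In this bit-reflected representation the
	// x^7, x^2 and x terms appear as left shifts by 57, 62 and 63 in the
	// first phase and as right shifts by 7, 2 and 1 in the second.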
	shl	v17.2d, v0.2d, #57		// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b	//
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1		// 2nd phase
	eor	v2.16b, v2.16b, v0.16b
	eor	v0.16b, v0.16b, v18.16b	//
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1		//
	eor	v0.16b, v0.16b, v2.16b	//
	eor	v0.16b, v0.16b, v18.16b	//
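	// v0 now holds the reduced 128-bit Xi in the byte-reversed internal
	// representation used by the loop.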

	subs	x3, x3, #16
	bne	.Loop_neon

	rev64	v0.16b, v0.16b		// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret
.size	gcm_ghash_neon,.-gcm_ghash_neon

.section	.rodata
.align	4
.Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
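// k48/k32/k16 keep only the bits of each partial product that did not wrap
// around during the byte-rotated multiplies; k0 zeroes t3's high half,
// standing in for the 32-bit code's vmov.i64 #0.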
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)