1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#include <ring-core/asm_base.h>
5
6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
7#include <ring-core/arm_arch.h>
8
9#if __ARM_MAX_ARCH__>=7
10.text
11.arch	armv8-a+crypto
//----------------------------------------------------------------------
// void gcm_init_clmul(u128 Htable[], const uint64_t H[2])
//
// Precompute the GHASH key schedule from the hash key H using the
// ARMv8 PMULL/PMULL2 carry-less multiply instructions.
//   In:   x1 = pointer to the 128-bit hash key H
//         x0 = Htable output; six 16-byte entries are written (per the
//              store comments below):
//              [0] twisted H          [1] packed Karatsuba halves
//              [2] twisted H^2        [3] twisted H^3
//              [4] packed Karatsuba halves for H^3/H^4
//              [5] twisted H^4
//   Clobbers: x0 (post-incremented), v0-v7, v16-v22 — all caller-saved
//         under AAPCS64; no stack and no callee-saved registers used.
// "Twisted" here means H<<<1 with the 0xc2..01 reduction constant
// conditionally XORed in (computed in the first block below), which
// lets the multiply routines defer the modular reduction.
//----------------------------------------------------------------------
.globl	gcm_init_clmul
.hidden	gcm_init_clmul
.type	gcm_init_clmul,%function
.align	4
gcm_init_clmul:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2 (square of the twisted key, same Karatsuba +
	//two-phase reduction pattern used by the multiply routines)
	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b		//v22 = twisted H^2

	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
	//calculate H^3 and H^4 (H*H^2 and H^2*H^2, interleaved)
	pmull	v0.1q,v20.1d, v22.1d
	pmull	v5.1q,v22.1d,v22.1d
	pmull2	v2.1q,v20.2d, v22.2d
	pmull2	v7.1q,v22.2d,v22.2d
	pmull	v1.1q,v16.1d,v17.1d
	pmull	v6.1q,v17.1d,v17.1d

	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	ext	v17.16b,v5.16b,v7.16b,#8
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v16.16b
	eor	v4.16b,v5.16b,v7.16b
	eor	v6.16b,v6.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase
	eor	v6.16b,v6.16b,v4.16b
	pmull	v4.1q,v5.1d,v19.1d

	ins	v2.d[0],v1.d[1]
	ins	v7.d[0],v6.d[1]
	ins	v1.d[1],v0.d[0]
	ins	v6.d[1],v5.d[0]
	eor	v0.16b,v1.16b,v18.16b
	eor	v5.16b,v6.16b,v4.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	ext	v4.16b,v5.16b,v5.16b,#8
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v5.1q,v5.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v4.16b,v4.16b,v7.16b
	eor	v20.16b, v0.16b,v18.16b		//H^3
	eor	v22.16b,v5.16b,v4.16b		//H^4

	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
	ext	v17.16b,v22.16b,v22.16b,#8
	eor	v16.16b,v16.16b,v20.16b
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
	ret
.size	gcm_init_clmul,.-gcm_init_clmul
//----------------------------------------------------------------------
// void gcm_gmult_clmul(uint64_t Xi[2], const u128 Htable[])
//
// One GHASH multiplication: Xi = Xi * H in GF(2^128), updated in place.
//   In:   x0 = Xi (16 bytes, read at entry and written back at exit)
//         x1 = Htable; only the first two entries are loaded
//              (twisted H in v20, packed Karatsuba halves in v21)
//   Clobbers: v0-v3, v17-v21 (caller-saved); no GPR other than the
//         arguments is read, no flags are set, no stack is used.
// On little-endian (__AARCH64EB__ undefined) Xi is byte-swapped with
// rev64 on the way in and out.
//----------------------------------------------------------------------
.globl	gcm_gmult_clmul
.hidden	gcm_gmult_clmul
.type	gcm_gmult_clmul,%function
.align	4
gcm_gmult_clmul:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 reduction constant
#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
//----------------------------------------------------------------------
// void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[],
//                      const uint8_t *inp, size_t len)
//
// Fold len bytes of input into the GHASH accumulator Xi.
//   In:   x0 = Xi (read at entry, written back at exit)
//         x1 = Htable (twisted H, packed Karatsuba halves, H^2 loaded)
//         x2 = inp, x3 = len in bytes
//         (the code only tests 32/64-byte boundaries, so len is
//          assumed a multiple of 16 — TODO confirm caller guarantees)
//   Clobbers: x3, x12, v0-v7, v16-v22 (plus v23-v31 via the 4x path);
//         all caller-saved, no stack used.
// len >= 64 is handed off to gcm_ghash_v8_4x (4 blocks/iteration);
// otherwise a modulo-scheduled 2-blocks/iteration loop runs here, with
// a single-block odd tail.
//----------------------------------------------------------------------
.globl	gcm_ghash_clmul
.hidden	gcm_ghash_clmul
.type	gcm_ghash_clmul,%function
.align	4
gcm_ghash_clmul:
	AARCH64_VALID_CALL_TARGET
	cmp	x3,#64
	b.hs	.Lgcm_ghash_v8_4x		//4x-aggregated path for len>=64
	ld1	{v0.2d},[x0]		//load [rotated] Xi
						//"[rotated]" means that
						//loaded value would have
						//to be rotated in order to
						//make it appear as in
						//algorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16		//x12 is used as post-
						//increment for input pointer;
						//as loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//last block[s] are actually
						//loaded twice, but last
						//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __AARCH64EB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
	b.lo	.Lodd_tail_v8		//x3 was less than 32
	//prologue of the modulo-scheduled loop: start H·Ii+1 early
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	//processes 2 blocks/iteration: (Xi+I[i])·H^2 + I[i+1]·H,
	//with the next iteration's H·Ii+1 multiply interleaved
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __AARCH64EB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes

	//epilogue: undo the early accumulation done for the next round
	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	.Ldone_v8		//is x3 zero?
.Lodd_tail_v8:
	//one final 16-byte block: Xi = (Xi+I[last])·H
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
//----------------------------------------------------------------------
// gcm_ghash_v8_4x — internal (no .globl) 4x-aggregated GHASH loop.
//
// Entered by branch (b.hs) from gcm_ghash_clmul when len >= 64, hence
// no AARCH64_VALID_CALL_TARGET landing pad here. Same register
// arguments as the caller: x0 = Xi, x1 = Htable (all six entries are
// loaded: twisted H, H^2, H^3, H^4 and the two packed-Karatsuba
// entries), x2 = inp, x3 = len.
// Processes 64 bytes (4 blocks) per .Loop4x iteration by multiplying
// against H^4..H, so only one modular reduction is needed per 4
// blocks; .Lthree/.Ltwo/.Lone handle the 1-3 remaining blocks and
// .Ldone4x performs the final reduction and writes Xi back.
//   Clobbers: x3, v0-v7, v16-v31 (caller-saved); no stack used.
//----------------------------------------------------------------------
.type	gcm_ghash_v8_4x,%function
.align	4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
	ld1	{v0.2d},[x0]		//load [rotated] Xi
	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant

	//prologue: multiply blocks 1..3 by H^3..H while the main loop
	//will fold block 0 (+Xi) in with H^4
	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif
	ext	v25.16b,v7.16b,v7.16b,#8
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8

	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	pmull2	v31.1q,v20.2d,v25.2d
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b		//accumulate lo/hi/mid in v29/v31/v30
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#128		//128 = this 64B batch + 64B lookahead
	b.lo	.Ltail4x

	b	.Loop4x

.align	4
.Loop4x:
	//(Xi+Ii)·H^4 combined with the three partial products from the
	//previous round; next round's H..H^3 multiplies are interleaved
	eor	v16.16b,v4.16b,v0.16b
	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
	ext	v3.16b,v16.16b,v16.16b,#8
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	ext	v25.16b,v7.16b,v7.16b,#8
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	ext	v24.16b,v6.16b,v6.16b,#8
	eor	v1.16b,v1.16b,v30.16b
	ext	v23.16b,v5.16b,v5.16b,#8

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	eor	v1.16b,v1.16b,v17.16b
	pmull2	v31.1q,v20.2d,v25.2d
	eor	v1.16b,v1.16b,v18.16b
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	eor	v0.16b,v1.16b,v18.16b
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v0.16b,v0.16b,v18.16b
	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#64
	b.hs	.Loop4x

.Ltail4x:
	//finish the batch already multiplied in v29/v30/v31
	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b

	adds	x3,x3,#64		//re-construct x3 (remaining bytes)
	b.eq	.Ldone4x

	cmp	x3,#32
	b.lo	.Lone
	b.eq	.Ltwo
.Lthree:
	//reduce current Xi, then fold 3 remaining blocks with H^3..H
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d,v6.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef	__AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
	eor	v6.16b,v6.16b,v24.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v31.1q,v20.2d,v24.2d
	pmull	v30.1q,v21.1d,v6.1d
	eor	v0.16b,v0.16b,v18.16b
	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull2	v23.1q,v22.2d,v23.2d
	eor	v16.16b,v4.16b,v0.16b
	pmull2	v5.1q,v21.2d,v5.2d
	ext	v3.16b,v16.16b,v16.16b,#8

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v26.2d,v3.2d
	pmull	v1.1q,v27.1d,v16.1d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	.Ldone4x

.align	4
.Ltwo:
	//reduce current Xi, then fold 2 remaining blocks with H^2, H
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef	__AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
	eor	v5.16b,v5.16b,v23.16b

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull2	v31.1q,v20.2d,v23.2d
	pmull	v30.1q,v21.1d,v5.1d

	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v22.2d,v3.2d
	pmull2	v1.1q,v21.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	.Ldone4x

.align	4
.Lone:
	//reduce current Xi, then fold the single remaining block with H
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef	__AARCH64EB__
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v20.2d,v3.2d
	pmull	v1.1q,v21.1d,v16.1d

.Ldone4x:
	//final Karatsuba fix-up + two-phase reduction, then store Xi
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
561.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
562.align	2
563.align	2
564#endif
565#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
566