xref: /aosp_15_r20/external/boringssl/src/gen/crypto/aes128gcmsiv-x86_64-win.asm (revision 8fb009dc861624b67b6cdb62ea21f0f22d0c584b)
1; This file is generated from a similarly-named Perl script in the BoringSSL
2; source tree. Do not edit by hand.
3
; Win64/NASM build of the BoringSSL AES-128-GCM-SIV AVX implementation.
; The generated body uses SysV register names; every exported function
; re-derives them from the Windows x64 argument registers in its prologue.
4%ifidn __OUTPUT_FORMAT__, win64
5default	rel
; NASM has no XMMWORD/YMMWORD/ZMMWORD size keywords; define them to
; nothing so the MASM-style operand sizes in the generated code assemble.
6%define XMMWORD
7%define YMMWORD
8%define ZMMWORD
9%define _CET_ENDBR
10
11%ifdef BORINGSSL_PREFIX
12%include "boringssl_prefix_symbols_nasm.inc"
13%endif
14section	.rdata rdata align=8
15
16ALIGN	16
; 128-bit little-endian counter increments 1..8 (low quadword only),
; used to step the CTR-mode counter blocks via vpaddd.
17one:
18	DQ	1,0
19two:
20	DQ	2,0
21three:
22	DQ	3,0
23four:
24	DQ	4,0
25five:
26	DQ	5,0
27six:
28	DQ	6,0
29seven:
30	DQ	7,0
31eight:
32	DQ	8,0
33
; vpor'ed into the initial counter block: sets the top bit of the last
; 32-bit lane (see enc_msg_x4/enc_msg_x8).
34OR_MASK:
35	DD	0x00000000,0x00000000,0x00000000,0x80000000
; Reduction constant for the POLYVAL carryless-multiply reduction steps
; in GFMUL and htable_polyval (vpclmulqdq ...,[poly],0x10).
36poly:
37	DQ	0x1,0xc200000000000000
; Byte shuffle applied before vaesenclast in the AES key schedule
; (RotWord/SubWord step of key expansion).
38mask:
39	DD	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
; AES key-schedule round constants: con1 = initial rcon (doubled each
; round with vpslld), con2 = rcon 0x1b used for the final rounds.
40con1:
41	DD	1,1,1,1
42con2:
43	DD	0x1b,0x1b,0x1b,0x1b
; Shuffle used with vpshufb to propagate key-schedule words; the -1
; bytes zero the destination lanes.
44con3:
45	DB	-1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
; vpand mask clearing the first 32-bit lane of the shuffled nonce block
; in aes128gcmsiv_kdf.
46and_mask:
47	DD	0,0xffffffff,0xffffffff,0xffffffff
48section	.text code align=64
49
50
; GFMUL — POLYVAL field multiplication (file-local helper, custom ABI).
;   In:       xmm0 = a, xmm1 = b
;   Out:      xmm0 = a * b reduced via the constant at [poly]
;   Clobbers: xmm2-xmm5
; Called repeatedly by the htable-init and polyval-horner routines.
51ALIGN	16
52GFMUL:
53
; 256-bit carryless product: xmm2 = low 128 bits, xmm5 = high 128 bits,
; xmm3/xmm4 = cross terms folded into both halves.
54	vpclmulqdq	xmm2,xmm0,xmm1,0x00
55	vpclmulqdq	xmm5,xmm0,xmm1,0x11
56	vpclmulqdq	xmm3,xmm0,xmm1,0x10
57	vpclmulqdq	xmm4,xmm0,xmm1,0x01
58	vpxor	xmm3,xmm3,xmm4
59	vpslldq	xmm4,xmm3,8
60	vpsrldq	xmm3,xmm3,8
61	vpxor	xmm2,xmm2,xmm4
62	vpxor	xmm5,xmm5,xmm3
63
; Two reduction steps, each folding 64 bits of the low half using [poly]
; (vpshufd ...,78 swaps the two quadwords).
64	vpclmulqdq	xmm3,xmm2,XMMWORD[poly],0x10
65	vpshufd	xmm4,xmm2,78
66	vpxor	xmm2,xmm3,xmm4
67
68	vpclmulqdq	xmm3,xmm2,XMMWORD[poly],0x10
69	vpshufd	xmm4,xmm2,78
70	vpxor	xmm2,xmm3,xmm4
71
; Combine reduced low half with the high half of the product.
72	vpxor	xmm0,xmm2,xmm5
73	ret
74
75
; aesgcmsiv_htable_init(htable, h)
; After the Win64->SysV remap below: rdi = 8*16-byte table out, rsi = H.
; Fills the table with successive POLYVAL powers of H: entry i (offset
; 16*i, i = 0..7) receives H^(i+1), computed by repeated GFMUL calls
; with xmm1 = H held fixed and xmm0 accumulating the current power.
76global	aesgcmsiv_htable_init
77
78ALIGN	16
79aesgcmsiv_htable_init:
80	mov	QWORD[8+rsp],rdi	;WIN64 prologue
81	mov	QWORD[16+rsp],rsi
82	mov	rax,rsp
83$L$SEH_begin_aesgcmsiv_htable_init:
; Remap Win64 argument registers onto the SysV names the body uses.
84	mov	rdi,rcx
85	mov	rsi,rdx
86
87
88
89_CET_ENDBR
; xmm1 = H (constant multiplier); xmm0 = H^k running power.
90	vmovdqa	xmm0,XMMWORD[rsi]
91	vmovdqa	xmm1,xmm0
92	vmovdqa	XMMWORD[rdi],xmm0
93	call	GFMUL
94	vmovdqa	XMMWORD[16+rdi],xmm0
95	call	GFMUL
96	vmovdqa	XMMWORD[32+rdi],xmm0
97	call	GFMUL
98	vmovdqa	XMMWORD[48+rdi],xmm0
99	call	GFMUL
100	vmovdqa	XMMWORD[64+rdi],xmm0
101	call	GFMUL
102	vmovdqa	XMMWORD[80+rdi],xmm0
103	call	GFMUL
104	vmovdqa	XMMWORD[96+rdi],xmm0
105	call	GFMUL
106	vmovdqa	XMMWORD[112+rdi],xmm0
107	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
108	mov	rsi,QWORD[16+rsp]
109	ret
110
111$L$SEH_end_aesgcmsiv_htable_init:
; aesgcmsiv_htable6_init(htable, h)
; Six-entry variant of aesgcmsiv_htable_init: rdi = 6*16-byte table out,
; rsi = H.  Entry i (offset 16*i, i = 0..5) receives H^(i+1).
112global	aesgcmsiv_htable6_init
113
114ALIGN	16
115aesgcmsiv_htable6_init:
116	mov	QWORD[8+rsp],rdi	;WIN64 prologue
117	mov	QWORD[16+rsp],rsi
118	mov	rax,rsp
119$L$SEH_begin_aesgcmsiv_htable6_init:
; Remap Win64 argument registers onto the SysV names the body uses.
120	mov	rdi,rcx
121	mov	rsi,rdx
122
123
124
125_CET_ENDBR
; xmm1 = H (constant multiplier); xmm0 = H^k running power.
126	vmovdqa	xmm0,XMMWORD[rsi]
127	vmovdqa	xmm1,xmm0
128	vmovdqa	XMMWORD[rdi],xmm0
129	call	GFMUL
130	vmovdqa	XMMWORD[16+rdi],xmm0
131	call	GFMUL
132	vmovdqa	XMMWORD[32+rdi],xmm0
133	call	GFMUL
134	vmovdqa	XMMWORD[48+rdi],xmm0
135	call	GFMUL
136	vmovdqa	XMMWORD[64+rdi],xmm0
137	call	GFMUL
138	vmovdqa	XMMWORD[80+rdi],xmm0
139	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
140	mov	rsi,QWORD[16+rsp]
141	ret
142
143$L$SEH_end_aesgcmsiv_htable6_init:
; aesgcmsiv_htable_polyval(htable, in, in_len, in_out_poly)
; After the Win64->SysV remap: rdi = precomputed H-power table (entries
; at offsets 0..112), rsi = input, rdx = byte length, rcx = 16-byte
; POLYVAL accumulator (read and written back).
; Strategy: multiply up to 8 input blocks by distinct table entries and
; accumulate the partial products (xmm3 = low, xmm4 = high, xmm5 =
; middle), deferring the modular reduction; the main loop interleaves
; the [poly] reduction of the previous iteration with the multiplies of
; the current one.  Returns immediately when in_len == 0.
144global	aesgcmsiv_htable_polyval
145
146ALIGN	16
147aesgcmsiv_htable_polyval:
148	mov	QWORD[8+rsp],rdi	;WIN64 prologue
149	mov	QWORD[16+rsp],rsi
150	mov	rax,rsp
151$L$SEH_begin_aesgcmsiv_htable_polyval:
; Remap Win64 argument registers onto the SysV names the body uses.
152	mov	rdi,rcx
153	mov	rsi,rdx
154	mov	rdx,r8
155	mov	rcx,r9
156
157
158
159_CET_ENDBR
160	test	rdx,rdx
161	jnz	NEAR $L$htable_polyval_start
162	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
163	mov	rsi,QWORD[16+rsp]
164	ret
165
166$L$htable_polyval_start:
167	vzeroall
168
169
; r11 = in_len mod 128: the "prefix" of blocks processed before the
; 128-byte-per-iteration main loop takes over.
170
171	mov	r11,rdx
172	and	r11,127
173
174	jz	NEAR $L$htable_polyval_no_prefix
175
176	vpxor	xmm9,xmm9,xmm9
; xmm1 = current accumulator T; folded into the first prefix block.
177	vmovdqa	xmm1,XMMWORD[rcx]
178	sub	rdx,r11
179
180	sub	r11,16
181
182
; First prefix block: (in[0] ^ T) multiplied by htable[r11/16].
183	vmovdqu	xmm0,XMMWORD[rsi]
184	vpxor	xmm0,xmm0,xmm1
185
186	vpclmulqdq	xmm5,xmm0,XMMWORD[r11*1+rdi],0x01
187	vpclmulqdq	xmm3,xmm0,XMMWORD[r11*1+rdi],0x00
188	vpclmulqdq	xmm4,xmm0,XMMWORD[r11*1+rdi],0x11
189	vpclmulqdq	xmm6,xmm0,XMMWORD[r11*1+rdi],0x10
190	vpxor	xmm5,xmm5,xmm6
191
192	lea	rsi,[16+rsi]
193	test	r11,r11
194	jnz	NEAR $L$htable_polyval_prefix_loop
195	jmp	NEAR $L$htable_polyval_prefix_complete
196

; Remaining prefix blocks: each multiplied by a descending table entry
; and accumulated into xmm3/xmm4/xmm5 without reducing.
197
198ALIGN	64
199$L$htable_polyval_prefix_loop:
200	sub	r11,16
201
202	vmovdqu	xmm0,XMMWORD[rsi]
203
204	vpclmulqdq	xmm6,xmm0,XMMWORD[r11*1+rdi],0x00
205	vpxor	xmm3,xmm3,xmm6
206	vpclmulqdq	xmm6,xmm0,XMMWORD[r11*1+rdi],0x11
207	vpxor	xmm4,xmm4,xmm6
208	vpclmulqdq	xmm6,xmm0,XMMWORD[r11*1+rdi],0x01
209	vpxor	xmm5,xmm5,xmm6
210	vpclmulqdq	xmm6,xmm0,XMMWORD[r11*1+rdi],0x10
211	vpxor	xmm5,xmm5,xmm6
212
213	test	r11,r11
214
215	lea	rsi,[16+rsi]
216
217	jnz	NEAR $L$htable_polyval_prefix_loop
218
; Split the middle term and fold it: xmm1 = unreduced low half,
; xmm9 = unreduced high half, carried into the main loop.
219$L$htable_polyval_prefix_complete:
220	vpsrldq	xmm6,xmm5,8
221	vpslldq	xmm5,xmm5,8
222
223	vpxor	xmm9,xmm4,xmm6
224	vpxor	xmm1,xmm3,xmm5
225
226	jmp	NEAR $L$htable_polyval_main_loop
227
; No prefix: start the main loop with low half zero and the high half
; seeded from the accumulator T.
228$L$htable_polyval_no_prefix:
229
230
231
232
233	vpxor	xmm1,xmm1,xmm1
234	vmovdqa	xmm9,XMMWORD[rcx]
235
; Main loop: 128 bytes (8 blocks) per iteration.  Blocks are processed
; highest-offset first against htable[0..7]; the two-step [poly]
; reduction of the previous iteration's low half (xmm1) is interleaved
; with the multiplies, and xmm9 (previous high half) is folded in
; before the final block, which also absorbs the running accumulator.
236ALIGN	64
237$L$htable_polyval_main_loop:
238	sub	rdx,0x80
239	jb	NEAR $L$htable_polyval_out
240
241	vmovdqu	xmm0,XMMWORD[112+rsi]
242
243	vpclmulqdq	xmm5,xmm0,XMMWORD[rdi],0x01
244	vpclmulqdq	xmm3,xmm0,XMMWORD[rdi],0x00
245	vpclmulqdq	xmm4,xmm0,XMMWORD[rdi],0x11
246	vpclmulqdq	xmm6,xmm0,XMMWORD[rdi],0x10
247	vpxor	xmm5,xmm5,xmm6
248
249
250	vmovdqu	xmm0,XMMWORD[96+rsi]
251	vpclmulqdq	xmm6,xmm0,XMMWORD[16+rdi],0x01
252	vpxor	xmm5,xmm5,xmm6
253	vpclmulqdq	xmm6,xmm0,XMMWORD[16+rdi],0x00
254	vpxor	xmm3,xmm3,xmm6
255	vpclmulqdq	xmm6,xmm0,XMMWORD[16+rdi],0x11
256	vpxor	xmm4,xmm4,xmm6
257	vpclmulqdq	xmm6,xmm0,XMMWORD[16+rdi],0x10
258	vpxor	xmm5,xmm5,xmm6
259
260
261
262	vmovdqu	xmm0,XMMWORD[80+rsi]
263
; First reduction step of the previous iteration's low half.
264	vpclmulqdq	xmm7,xmm1,XMMWORD[poly],0x10
265	vpalignr	xmm1,xmm1,xmm1,8
266
267	vpclmulqdq	xmm6,xmm0,XMMWORD[32+rdi],0x01
268	vpxor	xmm5,xmm5,xmm6
269	vpclmulqdq	xmm6,xmm0,XMMWORD[32+rdi],0x00
270	vpxor	xmm3,xmm3,xmm6
271	vpclmulqdq	xmm6,xmm0,XMMWORD[32+rdi],0x11
272	vpxor	xmm4,xmm4,xmm6
273	vpclmulqdq	xmm6,xmm0,XMMWORD[32+rdi],0x10
274	vpxor	xmm5,xmm5,xmm6
275
276
277	vpxor	xmm1,xmm1,xmm7
278
279	vmovdqu	xmm0,XMMWORD[64+rsi]
280
281	vpclmulqdq	xmm6,xmm0,XMMWORD[48+rdi],0x01
282	vpxor	xmm5,xmm5,xmm6
283	vpclmulqdq	xmm6,xmm0,XMMWORD[48+rdi],0x00
284	vpxor	xmm3,xmm3,xmm6
285	vpclmulqdq	xmm6,xmm0,XMMWORD[48+rdi],0x11
286	vpxor	xmm4,xmm4,xmm6
287	vpclmulqdq	xmm6,xmm0,XMMWORD[48+rdi],0x10
288	vpxor	xmm5,xmm5,xmm6
289
290
291	vmovdqu	xmm0,XMMWORD[48+rsi]
292
; Second reduction step of the previous iteration's low half.
293	vpclmulqdq	xmm7,xmm1,XMMWORD[poly],0x10
294	vpalignr	xmm1,xmm1,xmm1,8
295
296	vpclmulqdq	xmm6,xmm0,XMMWORD[64+rdi],0x01
297	vpxor	xmm5,xmm5,xmm6
298	vpclmulqdq	xmm6,xmm0,XMMWORD[64+rdi],0x00
299	vpxor	xmm3,xmm3,xmm6
300	vpclmulqdq	xmm6,xmm0,XMMWORD[64+rdi],0x11
301	vpxor	xmm4,xmm4,xmm6
302	vpclmulqdq	xmm6,xmm0,XMMWORD[64+rdi],0x10
303	vpxor	xmm5,xmm5,xmm6
304
305
306	vpxor	xmm1,xmm1,xmm7
307
308	vmovdqu	xmm0,XMMWORD[32+rsi]
309
310	vpclmulqdq	xmm6,xmm0,XMMWORD[80+rdi],0x01
311	vpxor	xmm5,xmm5,xmm6
312	vpclmulqdq	xmm6,xmm0,XMMWORD[80+rdi],0x00
313	vpxor	xmm3,xmm3,xmm6
314	vpclmulqdq	xmm6,xmm0,XMMWORD[80+rdi],0x11
315	vpxor	xmm4,xmm4,xmm6
316	vpclmulqdq	xmm6,xmm0,XMMWORD[80+rdi],0x10
317	vpxor	xmm5,xmm5,xmm6
318
; Fold in the previous iteration's high half; xmm1 is now the fully
; reduced running accumulator.
319
320	vpxor	xmm1,xmm1,xmm9
321
322	vmovdqu	xmm0,XMMWORD[16+rsi]
323
324	vpclmulqdq	xmm6,xmm0,XMMWORD[96+rdi],0x01
325	vpxor	xmm5,xmm5,xmm6
326	vpclmulqdq	xmm6,xmm0,XMMWORD[96+rdi],0x00
327	vpxor	xmm3,xmm3,xmm6
328	vpclmulqdq	xmm6,xmm0,XMMWORD[96+rdi],0x11
329	vpxor	xmm4,xmm4,xmm6
330	vpclmulqdq	xmm6,xmm0,XMMWORD[96+rdi],0x10
331	vpxor	xmm5,xmm5,xmm6
332
333
; Final block of the chunk absorbs the accumulator before multiplying.
334	vmovdqu	xmm0,XMMWORD[rsi]
335	vpxor	xmm0,xmm0,xmm1
336
337	vpclmulqdq	xmm6,xmm0,XMMWORD[112+rdi],0x01
338	vpxor	xmm5,xmm5,xmm6
339	vpclmulqdq	xmm6,xmm0,XMMWORD[112+rdi],0x00
340	vpxor	xmm3,xmm3,xmm6
341	vpclmulqdq	xmm6,xmm0,XMMWORD[112+rdi],0x11
342	vpxor	xmm4,xmm4,xmm6
343	vpclmulqdq	xmm6,xmm0,XMMWORD[112+rdi],0x10
344	vpxor	xmm5,xmm5,xmm6
345
346
; Re-split middle term for the next iteration's deferred reduction.
347	vpsrldq	xmm6,xmm5,8
348	vpslldq	xmm5,xmm5,8
349
350	vpxor	xmm9,xmm4,xmm6
351	vpxor	xmm1,xmm3,xmm5
352
353	lea	rsi,[128+rsi]
354	jmp	NEAR $L$htable_polyval_main_loop
355
356
; Epilogue: finish the last pending reduction (two [poly] steps), fold
; in the high half, and store the final POLYVAL value back to [rcx].
357
358$L$htable_polyval_out:
359	vpclmulqdq	xmm6,xmm1,XMMWORD[poly],0x10
360	vpalignr	xmm1,xmm1,xmm1,8
361	vpxor	xmm1,xmm1,xmm6
362
363	vpclmulqdq	xmm6,xmm1,XMMWORD[poly],0x10
364	vpalignr	xmm1,xmm1,xmm1,8
365	vpxor	xmm1,xmm1,xmm6
366	vpxor	xmm1,xmm1,xmm9
367
368	vmovdqu	XMMWORD[rcx],xmm1
369	vzeroupper
370	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
371	mov	rsi,QWORD[16+rsp]
372	ret
373
374$L$SEH_end_aesgcmsiv_htable_polyval:
; aesgcmsiv_polyval_horner(in_out_poly, h, in, blocks)
; After the Win64->SysV remap: rdi = 16-byte accumulator T (in/out),
; rsi = H, rdx = input, rcx = number of 16-byte blocks.
; Horner evaluation: for each block m_i, T = (T ^ m_i) * H via GFMUL.
; Returns immediately when the block count is zero.
375global	aesgcmsiv_polyval_horner
376
377ALIGN	16
378aesgcmsiv_polyval_horner:
379	mov	QWORD[8+rsp],rdi	;WIN64 prologue
380	mov	QWORD[16+rsp],rsi
381	mov	rax,rsp
382$L$SEH_begin_aesgcmsiv_polyval_horner:
; Remap Win64 argument registers onto the SysV names the body uses.
383	mov	rdi,rcx
384	mov	rsi,rdx
385	mov	rdx,r8
386	mov	rcx,r9
387
388
389
390_CET_ENDBR
391	test	rcx,rcx
392	jnz	NEAR $L$polyval_horner_start
393	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
394	mov	rsi,QWORD[16+rsp]
395	ret
396
397$L$polyval_horner_start:
398
399
; r10 = byte offset into the input; rcx becomes the byte length
; (blocks * 16) used as the loop bound.
400
401	xor	r10,r10
402	shl	rcx,4
403
; GFMUL ABI: xmm1 = H (fixed), xmm0 = accumulator (in/out).
404	vmovdqa	xmm1,XMMWORD[rsi]
405	vmovdqa	xmm0,XMMWORD[rdi]
406
407$L$polyval_horner_loop:
408	vpxor	xmm0,xmm0,XMMWORD[r10*1+rdx]
409	call	GFMUL
410
411	add	r10,16
412	cmp	rcx,r10
413	jne	NEAR $L$polyval_horner_loop
414
; Write the updated accumulator back in place.
415
416	vmovdqa	XMMWORD[rdi],xmm0
417	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
418	mov	rsi,QWORD[16+rsp]
419	ret
420
421$L$SEH_end_aesgcmsiv_polyval_horner:
; aes128gcmsiv_aes_ks(key, out)
; After the Win64->SysV remap: rdi = 16-byte AES-128 key, rsi = expanded
; key schedule out (11 round keys written at offsets 0..160).
; Each round: vpshufb+vaesenclast perform the RotWord/SubWord/rcon step
; (xmm0 = rcon, doubled per round with vpslld); the vpslldq/vpxor chain
; propagates the previous round key's words.  The last two rounds use
; the 0x1b rcon from [con2].
422global	aes128gcmsiv_aes_ks
423
424ALIGN	16
425aes128gcmsiv_aes_ks:
426	mov	QWORD[8+rsp],rdi	;WIN64 prologue
427	mov	QWORD[16+rsp],rsi
428	mov	rax,rsp
429$L$SEH_begin_aes128gcmsiv_aes_ks:
; Remap Win64 argument registers onto the SysV names the body uses.
430	mov	rdi,rcx
431	mov	rsi,rdx
432
433
434
435_CET_ENDBR
; Round key 0 is the raw key itself.
436	vmovdqu	xmm1,XMMWORD[rdi]
437	vmovdqa	XMMWORD[rsi],xmm1
438
439	vmovdqa	xmm0,XMMWORD[con1]
440	vmovdqa	xmm15,XMMWORD[mask]
441
442	mov	rax,8
443
; Rounds 1..8 with rcon 1,2,4,...,0x80.
444$L$ks128_loop:
445	add	rsi,16
446	sub	rax,1
447	vpshufb	xmm2,xmm1,xmm15
448	vaesenclast	xmm2,xmm2,xmm0
449	vpslld	xmm0,xmm0,1
450	vpslldq	xmm3,xmm1,4
451	vpxor	xmm1,xmm1,xmm3
452	vpslldq	xmm3,xmm3,4
453	vpxor	xmm1,xmm1,xmm3
454	vpslldq	xmm3,xmm3,4
455	vpxor	xmm1,xmm1,xmm3
456	vpxor	xmm1,xmm1,xmm2
457	vmovdqa	XMMWORD[rsi],xmm1
458	jne	NEAR $L$ks128_loop
459
; Rounds 9 and 10 with rcon 0x1b and 0x36 (0x1b doubled).
460	vmovdqa	xmm0,XMMWORD[con2]
461	vpshufb	xmm2,xmm1,xmm15
462	vaesenclast	xmm2,xmm2,xmm0
463	vpslld	xmm0,xmm0,1
464	vpslldq	xmm3,xmm1,4
465	vpxor	xmm1,xmm1,xmm3
466	vpslldq	xmm3,xmm3,4
467	vpxor	xmm1,xmm1,xmm3
468	vpslldq	xmm3,xmm3,4
469	vpxor	xmm1,xmm1,xmm3
470	vpxor	xmm1,xmm1,xmm2
471	vmovdqa	XMMWORD[16+rsi],xmm1
472
473	vpshufb	xmm2,xmm1,xmm15
474	vaesenclast	xmm2,xmm2,xmm0
475	vpslldq	xmm3,xmm1,4
476	vpxor	xmm1,xmm1,xmm3
477	vpslldq	xmm3,xmm3,4
478	vpxor	xmm1,xmm1,xmm3
479	vpslldq	xmm3,xmm3,4
480	vpxor	xmm1,xmm1,xmm3
481	vpxor	xmm1,xmm1,xmm2
482	vmovdqa	XMMWORD[32+rsi],xmm1
483	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
484	mov	rsi,QWORD[16+rsp]
485	ret
486
487$L$SEH_end_aes128gcmsiv_aes_ks:
; aes256gcmsiv_aes_ks(key, out)
; After the Win64->SysV remap: rdi = 32-byte AES-256 key, rsi = expanded
; key schedule out (15 round keys, offsets 0..224).
; xmm1/xmm3 hold the two halves of the schedule state.  Each loop
; iteration produces two round keys: the first via the rcon step
; (vpshufb [mask] + vaesenclast with xmm0 = rcon, doubled each pass),
; the second via the rcon-free SubWord step (vpshufd 0xff +
; vaesenclast with xmm14 = 0).  Word propagation uses vpsllq and the
; [con3] shuffle.  A final half-round emits round key 14.
488global	aes256gcmsiv_aes_ks
489
490ALIGN	16
491aes256gcmsiv_aes_ks:
492	mov	QWORD[8+rsp],rdi	;WIN64 prologue
493	mov	QWORD[16+rsp],rsi
494	mov	rax,rsp
495$L$SEH_begin_aes256gcmsiv_aes_ks:
; Remap Win64 argument registers onto the SysV names the body uses.
496	mov	rdi,rcx
497	mov	rsi,rdx
498
499
500
501_CET_ENDBR
; Round keys 0 and 1 are the raw 32-byte key.
502	vmovdqu	xmm1,XMMWORD[rdi]
503	vmovdqu	xmm3,XMMWORD[16+rdi]
504	vmovdqa	XMMWORD[rsi],xmm1
505	vmovdqa	XMMWORD[16+rsi],xmm3
506	vmovdqa	xmm0,XMMWORD[con1]
507	vmovdqa	xmm15,XMMWORD[mask]
508	vpxor	xmm14,xmm14,xmm14
509	mov	rax,6
510
; Six iterations, two round keys each (keys 2..13).
511$L$ks256_loop:
512	add	rsi,32
513	sub	rax,1
514	vpshufb	xmm2,xmm3,xmm15
515	vaesenclast	xmm2,xmm2,xmm0
516	vpslld	xmm0,xmm0,1
517	vpsllq	xmm4,xmm1,32
518	vpxor	xmm1,xmm1,xmm4
519	vpshufb	xmm4,xmm1,XMMWORD[con3]
520	vpxor	xmm1,xmm1,xmm4
521	vpxor	xmm1,xmm1,xmm2
522	vmovdqa	XMMWORD[rsi],xmm1
523	vpshufd	xmm2,xmm1,0xff
524	vaesenclast	xmm2,xmm2,xmm14
525	vpsllq	xmm4,xmm3,32
526	vpxor	xmm3,xmm3,xmm4
527	vpshufb	xmm4,xmm3,XMMWORD[con3]
528	vpxor	xmm3,xmm3,xmm4
529	vpxor	xmm3,xmm3,xmm2
530	vmovdqa	XMMWORD[16+rsi],xmm3
531	jne	NEAR $L$ks256_loop
532
; Final half-round: round key 14.
533	vpshufb	xmm2,xmm3,xmm15
534	vaesenclast	xmm2,xmm2,xmm0
535	vpsllq	xmm4,xmm1,32
536	vpxor	xmm1,xmm1,xmm4
537	vpshufb	xmm4,xmm1,XMMWORD[con3]
538	vpxor	xmm1,xmm1,xmm4
539	vpxor	xmm1,xmm1,xmm2
540	vmovdqa	XMMWORD[32+rsi],xmm1
541	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
542	mov	rsi,QWORD[16+rsp]
543	ret
544
; aes128gcmsiv_aes_ks_enc_x1(pt, ct, round_keys_out, key)
; After the Win64->SysV remap: rdi = 16-byte plaintext block, rsi =
; 16-byte ciphertext out, rdx = expanded key out (11 round keys at
; offsets 0..160), rcx = raw 16-byte key.
; Fused key-schedule + single-block encryption: xmm1 carries the current
; round key (also stored to [rdx + 16*i]) while xmm4 is the block being
; encrypted with vaesenc after each new round key is derived.  Key
; expansion per round: vpshufb [mask] + vaesenclast with xmm0 = rcon
; (doubled via vpslld), word propagation via vpsllq + [con3] shuffle;
; the last two rounds switch to the 0x1b rcon in [con2].
545global	aes128gcmsiv_aes_ks_enc_x1
546
547ALIGN	16
548aes128gcmsiv_aes_ks_enc_x1:
549	mov	QWORD[8+rsp],rdi	;WIN64 prologue
550	mov	QWORD[16+rsp],rsi
551	mov	rax,rsp
552$L$SEH_begin_aes128gcmsiv_aes_ks_enc_x1:
; Remap Win64 argument registers onto the SysV names the body uses.
553	mov	rdi,rcx
554	mov	rsi,rdx
555	mov	rdx,r8
556	mov	rcx,r9
557
558
559
560_CET_ENDBR
; Round 0: store the raw key, whiten the plaintext block.
561	vmovdqa	xmm1,XMMWORD[rcx]
562	vmovdqa	xmm4,XMMWORD[rdi]
563
564	vmovdqa	XMMWORD[rdx],xmm1
565	vpxor	xmm4,xmm4,xmm1
566
567	vmovdqa	xmm0,XMMWORD[con1]
568	vmovdqa	xmm15,XMMWORD[mask]
569
; Round 1.
570	vpshufb	xmm2,xmm1,xmm15
571	vaesenclast	xmm2,xmm2,xmm0
572	vpslld	xmm0,xmm0,1
573	vpsllq	xmm3,xmm1,32
574	vpxor	xmm1,xmm1,xmm3
575	vpshufb	xmm3,xmm1,XMMWORD[con3]
576	vpxor	xmm1,xmm1,xmm3
577	vpxor	xmm1,xmm1,xmm2
578
579	vaesenc	xmm4,xmm4,xmm1
580	vmovdqa	XMMWORD[16+rdx],xmm1
581
; Round 2.
582	vpshufb	xmm2,xmm1,xmm15
583	vaesenclast	xmm2,xmm2,xmm0
584	vpslld	xmm0,xmm0,1
585	vpsllq	xmm3,xmm1,32
586	vpxor	xmm1,xmm1,xmm3
587	vpshufb	xmm3,xmm1,XMMWORD[con3]
588	vpxor	xmm1,xmm1,xmm3
589	vpxor	xmm1,xmm1,xmm2
590
591	vaesenc	xmm4,xmm4,xmm1
592	vmovdqa	XMMWORD[32+rdx],xmm1
593
; Round 3.
594	vpshufb	xmm2,xmm1,xmm15
595	vaesenclast	xmm2,xmm2,xmm0
596	vpslld	xmm0,xmm0,1
597	vpsllq	xmm3,xmm1,32
598	vpxor	xmm1,xmm1,xmm3
599	vpshufb	xmm3,xmm1,XMMWORD[con3]
600	vpxor	xmm1,xmm1,xmm3
601	vpxor	xmm1,xmm1,xmm2
602
603	vaesenc	xmm4,xmm4,xmm1
604	vmovdqa	XMMWORD[48+rdx],xmm1
605
; Round 4.
606	vpshufb	xmm2,xmm1,xmm15
607	vaesenclast	xmm2,xmm2,xmm0
608	vpslld	xmm0,xmm0,1
609	vpsllq	xmm3,xmm1,32
610	vpxor	xmm1,xmm1,xmm3
611	vpshufb	xmm3,xmm1,XMMWORD[con3]
612	vpxor	xmm1,xmm1,xmm3
613	vpxor	xmm1,xmm1,xmm2
614
615	vaesenc	xmm4,xmm4,xmm1
616	vmovdqa	XMMWORD[64+rdx],xmm1
617
; Round 5.
618	vpshufb	xmm2,xmm1,xmm15
619	vaesenclast	xmm2,xmm2,xmm0
620	vpslld	xmm0,xmm0,1
621	vpsllq	xmm3,xmm1,32
622	vpxor	xmm1,xmm1,xmm3
623	vpshufb	xmm3,xmm1,XMMWORD[con3]
624	vpxor	xmm1,xmm1,xmm3
625	vpxor	xmm1,xmm1,xmm2
626
627	vaesenc	xmm4,xmm4,xmm1
628	vmovdqa	XMMWORD[80+rdx],xmm1
629
; Round 6.
630	vpshufb	xmm2,xmm1,xmm15
631	vaesenclast	xmm2,xmm2,xmm0
632	vpslld	xmm0,xmm0,1
633	vpsllq	xmm3,xmm1,32
634	vpxor	xmm1,xmm1,xmm3
635	vpshufb	xmm3,xmm1,XMMWORD[con3]
636	vpxor	xmm1,xmm1,xmm3
637	vpxor	xmm1,xmm1,xmm2
638
639	vaesenc	xmm4,xmm4,xmm1
640	vmovdqa	XMMWORD[96+rdx],xmm1
641
; Round 7.
642	vpshufb	xmm2,xmm1,xmm15
643	vaesenclast	xmm2,xmm2,xmm0
644	vpslld	xmm0,xmm0,1
645	vpsllq	xmm3,xmm1,32
646	vpxor	xmm1,xmm1,xmm3
647	vpshufb	xmm3,xmm1,XMMWORD[con3]
648	vpxor	xmm1,xmm1,xmm3
649	vpxor	xmm1,xmm1,xmm2
650
651	vaesenc	xmm4,xmm4,xmm1
652	vmovdqa	XMMWORD[112+rdx],xmm1
653
; Round 8.
654	vpshufb	xmm2,xmm1,xmm15
655	vaesenclast	xmm2,xmm2,xmm0
656	vpslld	xmm0,xmm0,1
657	vpsllq	xmm3,xmm1,32
658	vpxor	xmm1,xmm1,xmm3
659	vpshufb	xmm3,xmm1,XMMWORD[con3]
660	vpxor	xmm1,xmm1,xmm3
661	vpxor	xmm1,xmm1,xmm2
662
663	vaesenc	xmm4,xmm4,xmm1
664	vmovdqa	XMMWORD[128+rdx],xmm1
665
; Rounds 9 and 10 use the 0x1b rcon family.
666
667	vmovdqa	xmm0,XMMWORD[con2]
668
669	vpshufb	xmm2,xmm1,xmm15
670	vaesenclast	xmm2,xmm2,xmm0
671	vpslld	xmm0,xmm0,1
672	vpsllq	xmm3,xmm1,32
673	vpxor	xmm1,xmm1,xmm3
674	vpshufb	xmm3,xmm1,XMMWORD[con3]
675	vpxor	xmm1,xmm1,xmm3
676	vpxor	xmm1,xmm1,xmm2
677
678	vaesenc	xmm4,xmm4,xmm1
679	vmovdqa	XMMWORD[144+rdx],xmm1
680
; Final round key and final vaesenclast on the block.
681	vpshufb	xmm2,xmm1,xmm15
682	vaesenclast	xmm2,xmm2,xmm0
683	vpsllq	xmm3,xmm1,32
684	vpxor	xmm1,xmm1,xmm3
685	vpshufb	xmm3,xmm1,XMMWORD[con3]
686	vpxor	xmm1,xmm1,xmm3
687	vpxor	xmm1,xmm1,xmm2
688
689	vaesenclast	xmm4,xmm4,xmm1
690	vmovdqa	XMMWORD[160+rdx],xmm1
691
692
; Store the encrypted block.
693	vmovdqa	XMMWORD[rsi],xmm4
694	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
695	mov	rsi,QWORD[16+rsp]
696	ret
697
698$L$SEH_end_aes128gcmsiv_aes_ks_enc_x1:
; aes128gcmsiv_kdf(nonce, out, round_keys)
; After the Win64->SysV remap: rdi = 16-byte nonce block, rsi = 64-byte
; output (4 blocks), rdx = AES-128 key schedule (rounds read at offsets
; 0..160).
; Builds four consecutive counter blocks from the nonce — vpshufd 0x90
; rearranges the lanes, [and_mask] clears the first 32-bit lane, then
; three +1 increments via [one] — and encrypts all four 4-wide through
; the 10 AES rounds.
699global	aes128gcmsiv_kdf
700
701ALIGN	16
702aes128gcmsiv_kdf:
703	mov	QWORD[8+rsp],rdi	;WIN64 prologue
704	mov	QWORD[16+rsp],rsi
705	mov	rax,rsp
706$L$SEH_begin_aes128gcmsiv_kdf:
; Remap Win64 argument registers onto the SysV names the body uses.
707	mov	rdi,rcx
708	mov	rsi,rdx
709	mov	rdx,r8
710
711
712
713_CET_ENDBR
714
715
716
; Derive counter blocks xmm9..xmm12 from the nonce.
717
718	vmovdqa	xmm1,XMMWORD[rdx]
719	vmovdqa	xmm9,XMMWORD[rdi]
720	vmovdqa	xmm12,XMMWORD[and_mask]
721	vmovdqa	xmm13,XMMWORD[one]
722	vpshufd	xmm9,xmm9,0x90
723	vpand	xmm9,xmm9,xmm12
724	vpaddd	xmm10,xmm9,xmm13
725	vpaddd	xmm11,xmm10,xmm13
726	vpaddd	xmm12,xmm11,xmm13
727
; Whitening with round key 0, then rounds 1..9 and the final round.
728	vpxor	xmm9,xmm9,xmm1
729	vpxor	xmm10,xmm10,xmm1
730	vpxor	xmm11,xmm11,xmm1
731	vpxor	xmm12,xmm12,xmm1
732
733	vmovdqa	xmm1,XMMWORD[16+rdx]
734	vaesenc	xmm9,xmm9,xmm1
735	vaesenc	xmm10,xmm10,xmm1
736	vaesenc	xmm11,xmm11,xmm1
737	vaesenc	xmm12,xmm12,xmm1
738
739	vmovdqa	xmm2,XMMWORD[32+rdx]
740	vaesenc	xmm9,xmm9,xmm2
741	vaesenc	xmm10,xmm10,xmm2
742	vaesenc	xmm11,xmm11,xmm2
743	vaesenc	xmm12,xmm12,xmm2
744
745	vmovdqa	xmm1,XMMWORD[48+rdx]
746	vaesenc	xmm9,xmm9,xmm1
747	vaesenc	xmm10,xmm10,xmm1
748	vaesenc	xmm11,xmm11,xmm1
749	vaesenc	xmm12,xmm12,xmm1
750
751	vmovdqa	xmm2,XMMWORD[64+rdx]
752	vaesenc	xmm9,xmm9,xmm2
753	vaesenc	xmm10,xmm10,xmm2
754	vaesenc	xmm11,xmm11,xmm2
755	vaesenc	xmm12,xmm12,xmm2
756
757	vmovdqa	xmm1,XMMWORD[80+rdx]
758	vaesenc	xmm9,xmm9,xmm1
759	vaesenc	xmm10,xmm10,xmm1
760	vaesenc	xmm11,xmm11,xmm1
761	vaesenc	xmm12,xmm12,xmm1
762
763	vmovdqa	xmm2,XMMWORD[96+rdx]
764	vaesenc	xmm9,xmm9,xmm2
765	vaesenc	xmm10,xmm10,xmm2
766	vaesenc	xmm11,xmm11,xmm2
767	vaesenc	xmm12,xmm12,xmm2
768
769	vmovdqa	xmm1,XMMWORD[112+rdx]
770	vaesenc	xmm9,xmm9,xmm1
771	vaesenc	xmm10,xmm10,xmm1
772	vaesenc	xmm11,xmm11,xmm1
773	vaesenc	xmm12,xmm12,xmm1
774
775	vmovdqa	xmm2,XMMWORD[128+rdx]
776	vaesenc	xmm9,xmm9,xmm2
777	vaesenc	xmm10,xmm10,xmm2
778	vaesenc	xmm11,xmm11,xmm2
779	vaesenc	xmm12,xmm12,xmm2
780
781	vmovdqa	xmm1,XMMWORD[144+rdx]
782	vaesenc	xmm9,xmm9,xmm1
783	vaesenc	xmm10,xmm10,xmm1
784	vaesenc	xmm11,xmm11,xmm1
785	vaesenc	xmm12,xmm12,xmm1
786
787	vmovdqa	xmm2,XMMWORD[160+rdx]
788	vaesenclast	xmm9,xmm9,xmm2
789	vaesenclast	xmm10,xmm10,xmm2
790	vaesenclast	xmm11,xmm11,xmm2
791	vaesenclast	xmm12,xmm12,xmm2
792
; Store the four keystream blocks.
793
794	vmovdqa	XMMWORD[rsi],xmm9
795	vmovdqa	XMMWORD[16+rsi],xmm10
796	vmovdqa	XMMWORD[32+rsi],xmm11
797	vmovdqa	XMMWORD[48+rsi],xmm12
798	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
799	mov	rsi,QWORD[16+rsp]
800	ret
801
802$L$SEH_end_aes128gcmsiv_kdf:
; aes128gcmsiv_enc_msg_x4(in, out, ivec, key_schedule, len)
; After the Win64->SysV remap: rdi = input, rsi = output, rdx = initial
; counter block, rcx = AES-128 key schedule (rounds at offsets 0..160),
; r8 = byte length (5th argument, fetched from the stack; processed in
; whole 16-byte blocks).
; CTR-mode encryption, 4 blocks per main-loop iteration.  The top bit of
; the counter's last lane is forced on via [OR_MASK]; counters advance
; by [four] per iteration.  r10 = block_count % 4 leftover blocks,
; handled one at a time in loop2.  Returns immediately when len == 0.
803global	aes128gcmsiv_enc_msg_x4
804
805ALIGN	16
806aes128gcmsiv_enc_msg_x4:
807	mov	QWORD[8+rsp],rdi	;WIN64 prologue
808	mov	QWORD[16+rsp],rsi
809	mov	rax,rsp
810$L$SEH_begin_aes128gcmsiv_enc_msg_x4:
; Remap Win64 argument registers onto the SysV names the body uses.
811	mov	rdi,rcx
812	mov	rsi,rdx
813	mov	rdx,r8
814	mov	rcx,r9
815	mov	r8,QWORD[40+rsp]
816
817
818
819_CET_ENDBR
820	test	r8,r8
821	jnz	NEAR $L$128_enc_msg_x4_start
822	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
823	mov	rsi,QWORD[16+rsp]
824	ret
825
826$L$128_enc_msg_x4_start:
827	push	r12
828
829	push	r13
830
; r8 = block count; r10 = r8 % 4 (leftover blocks for loop2).
831
832	shr	r8,4
833	mov	r10,r8
834	shl	r10,62
835	shr	r10,62
836
; Seed counters xmm0..xmm3 = ctr, ctr+1, ctr+2, ctr+3.
837
838	vmovdqa	xmm15,XMMWORD[rdx]
839	vpor	xmm15,xmm15,XMMWORD[OR_MASK]
840
841	vmovdqu	xmm4,XMMWORD[four]
842	vmovdqa	xmm0,xmm15
843	vpaddd	xmm1,xmm15,XMMWORD[one]
844	vpaddd	xmm2,xmm15,XMMWORD[two]
845	vpaddd	xmm3,xmm15,XMMWORD[three]
846
; r8 = number of 4-block groups.
847	shr	r8,2
848	je	NEAR $L$128_enc_msg_x4_check_remainder
849
; Bias pointers so the loop can pre-increment.
850	sub	rsi,64
851	sub	rdi,64
852
853$L$128_enc_msg_x4_loop1:
854	add	rsi,64
855	add	rdi,64
856
857	vmovdqa	xmm5,xmm0
858	vmovdqa	xmm6,xmm1
859	vmovdqa	xmm7,xmm2
860	vmovdqa	xmm8,xmm3
861
; AES rounds on all 4 counter blocks; counter increments by 4 are
; interleaved with the early rounds.
862	vpxor	xmm5,xmm5,XMMWORD[rcx]
863	vpxor	xmm6,xmm6,XMMWORD[rcx]
864	vpxor	xmm7,xmm7,XMMWORD[rcx]
865	vpxor	xmm8,xmm8,XMMWORD[rcx]
866
867	vmovdqu	xmm12,XMMWORD[16+rcx]
868	vaesenc	xmm5,xmm5,xmm12
869	vaesenc	xmm6,xmm6,xmm12
870	vaesenc	xmm7,xmm7,xmm12
871	vaesenc	xmm8,xmm8,xmm12
872
873	vpaddd	xmm0,xmm0,xmm4
874	vmovdqu	xmm12,XMMWORD[32+rcx]
875	vaesenc	xmm5,xmm5,xmm12
876	vaesenc	xmm6,xmm6,xmm12
877	vaesenc	xmm7,xmm7,xmm12
878	vaesenc	xmm8,xmm8,xmm12
879
880	vpaddd	xmm1,xmm1,xmm4
881	vmovdqu	xmm12,XMMWORD[48+rcx]
882	vaesenc	xmm5,xmm5,xmm12
883	vaesenc	xmm6,xmm6,xmm12
884	vaesenc	xmm7,xmm7,xmm12
885	vaesenc	xmm8,xmm8,xmm12
886
887	vpaddd	xmm2,xmm2,xmm4
888	vmovdqu	xmm12,XMMWORD[64+rcx]
889	vaesenc	xmm5,xmm5,xmm12
890	vaesenc	xmm6,xmm6,xmm12
891	vaesenc	xmm7,xmm7,xmm12
892	vaesenc	xmm8,xmm8,xmm12
893
894	vpaddd	xmm3,xmm3,xmm4
895
896	vmovdqu	xmm12,XMMWORD[80+rcx]
897	vaesenc	xmm5,xmm5,xmm12
898	vaesenc	xmm6,xmm6,xmm12
899	vaesenc	xmm7,xmm7,xmm12
900	vaesenc	xmm8,xmm8,xmm12
901
902	vmovdqu	xmm12,XMMWORD[96+rcx]
903	vaesenc	xmm5,xmm5,xmm12
904	vaesenc	xmm6,xmm6,xmm12
905	vaesenc	xmm7,xmm7,xmm12
906	vaesenc	xmm8,xmm8,xmm12
907
908	vmovdqu	xmm12,XMMWORD[112+rcx]
909	vaesenc	xmm5,xmm5,xmm12
910	vaesenc	xmm6,xmm6,xmm12
911	vaesenc	xmm7,xmm7,xmm12
912	vaesenc	xmm8,xmm8,xmm12
913
914	vmovdqu	xmm12,XMMWORD[128+rcx]
915	vaesenc	xmm5,xmm5,xmm12
916	vaesenc	xmm6,xmm6,xmm12
917	vaesenc	xmm7,xmm7,xmm12
918	vaesenc	xmm8,xmm8,xmm12
919
920	vmovdqu	xmm12,XMMWORD[144+rcx]
921	vaesenc	xmm5,xmm5,xmm12
922	vaesenc	xmm6,xmm6,xmm12
923	vaesenc	xmm7,xmm7,xmm12
924	vaesenc	xmm8,xmm8,xmm12
925
926	vmovdqu	xmm12,XMMWORD[160+rcx]
927	vaesenclast	xmm5,xmm5,xmm12
928	vaesenclast	xmm6,xmm6,xmm12
929	vaesenclast	xmm7,xmm7,xmm12
930	vaesenclast	xmm8,xmm8,xmm12
931
; XOR keystream with plaintext and store ciphertext.
932
933
934	vpxor	xmm5,xmm5,XMMWORD[rdi]
935	vpxor	xmm6,xmm6,XMMWORD[16+rdi]
936	vpxor	xmm7,xmm7,XMMWORD[32+rdi]
937	vpxor	xmm8,xmm8,XMMWORD[48+rdi]
938
939	sub	r8,1
940
941	vmovdqu	XMMWORD[rsi],xmm5
942	vmovdqu	XMMWORD[16+rsi],xmm6
943	vmovdqu	XMMWORD[32+rsi],xmm7
944	vmovdqu	XMMWORD[48+rsi],xmm8
945
946	jne	NEAR $L$128_enc_msg_x4_loop1
947
948	add	rsi,64
949	add	rdi,64
950
951$L$128_enc_msg_x4_check_remainder:
952	cmp	r10,0
953	je	NEAR $L$128_enc_msg_x4_out
954
; Tail: encrypt the remaining (block_count % 4) blocks one at a time.
955$L$128_enc_msg_x4_loop2:
956
957
958	vmovdqa	xmm5,xmm0
959	vpaddd	xmm0,xmm0,XMMWORD[one]
960
961	vpxor	xmm5,xmm5,XMMWORD[rcx]
962	vaesenc	xmm5,xmm5,XMMWORD[16+rcx]
963	vaesenc	xmm5,xmm5,XMMWORD[32+rcx]
964	vaesenc	xmm5,xmm5,XMMWORD[48+rcx]
965	vaesenc	xmm5,xmm5,XMMWORD[64+rcx]
966	vaesenc	xmm5,xmm5,XMMWORD[80+rcx]
967	vaesenc	xmm5,xmm5,XMMWORD[96+rcx]
968	vaesenc	xmm5,xmm5,XMMWORD[112+rcx]
969	vaesenc	xmm5,xmm5,XMMWORD[128+rcx]
970	vaesenc	xmm5,xmm5,XMMWORD[144+rcx]
971	vaesenclast	xmm5,xmm5,XMMWORD[160+rcx]
972
973
974	vpxor	xmm5,xmm5,XMMWORD[rdi]
975	vmovdqu	XMMWORD[rsi],xmm5
976
977	add	rdi,16
978	add	rsi,16
979
980	sub	r10,1
981	jne	NEAR $L$128_enc_msg_x4_loop2
982
983$L$128_enc_msg_x4_out:
984	pop	r13
985
986	pop	r12
987
988	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
989	mov	rsi,QWORD[16+rsp]
990	ret
991
992$L$SEH_end_aes128gcmsiv_enc_msg_x4:
; aes128gcmsiv_enc_msg_x8(in, out, ivec, key_schedule, len)
; After the Win64->SysV remap: rdi = input, rsi = output, rdx = initial
; counter block, rcx = AES-128 key schedule (rounds at offsets 0..160),
; r8 = byte length (5th argument, fetched from the stack; processed in
; whole 16-byte blocks).
; CTR-mode encryption, 8 blocks per main-loop iteration.  Seven of the
; eight counters live in xmm0/xmm9..xmm14; the eighth (ctr+7) is kept
; in a 64-byte-aligned stack slot at [rsp] because the round keys
; occupy xmm15.  rbp saves the caller's rsp around the sub/and
; alignment.  r10 = block_count % 8 leftover blocks for loop2.
; Returns immediately when len == 0.
993global	aes128gcmsiv_enc_msg_x8
994
995ALIGN	16
996aes128gcmsiv_enc_msg_x8:
997	mov	QWORD[8+rsp],rdi	;WIN64 prologue
998	mov	QWORD[16+rsp],rsi
999	mov	rax,rsp
1000$L$SEH_begin_aes128gcmsiv_enc_msg_x8:
; Remap Win64 argument registers onto the SysV names the body uses.
1001	mov	rdi,rcx
1002	mov	rsi,rdx
1003	mov	rdx,r8
1004	mov	rcx,r9
1005	mov	r8,QWORD[40+rsp]
1006
1007
1008
1009_CET_ENDBR
1010	test	r8,r8
1011	jnz	NEAR $L$128_enc_msg_x8_start
1012	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1013	mov	rsi,QWORD[16+rsp]
1014	ret
1015
1016$L$128_enc_msg_x8_start:
1017	push	r12
1018
1019	push	r13
1020
1021	push	rbp
1022
; rbp preserves rsp across the 64-byte stack alignment below.
1023	mov	rbp,rsp
1024
1025
1026
1027	sub	rsp,128
1028	and	rsp,-64
1029
; r8 = block count; r10 = r8 % 8 (leftover blocks for loop2).
1030	shr	r8,4
1031	mov	r10,r8
1032	shl	r10,61
1033	shr	r10,61
1034
; Seed counters: xmm0 = ctr, xmm9..xmm14 = ctr+1..ctr+6,
; [rsp] = ctr+7.
1035
1036	vmovdqu	xmm1,XMMWORD[rdx]
1037	vpor	xmm1,xmm1,XMMWORD[OR_MASK]
1038
1039
1040	vpaddd	xmm0,xmm1,XMMWORD[seven]
1041	vmovdqu	XMMWORD[rsp],xmm0
1042	vpaddd	xmm9,xmm1,XMMWORD[one]
1043	vpaddd	xmm10,xmm1,XMMWORD[two]
1044	vpaddd	xmm11,xmm1,XMMWORD[three]
1045	vpaddd	xmm12,xmm1,XMMWORD[four]
1046	vpaddd	xmm13,xmm1,XMMWORD[five]
1047	vpaddd	xmm14,xmm1,XMMWORD[six]
1048	vmovdqa	xmm0,xmm1
1049
; r8 = number of 8-block groups.
1050	shr	r8,3
1051	je	NEAR $L$128_enc_msg_x8_check_remainder
1052
; Bias pointers so the loop can pre-increment.
1053	sub	rsi,128
1054	sub	rdi,128
1055
1056$L$128_enc_msg_x8_loop1:
1057	add	rsi,128
1058	add	rdi,128
1059
1060	vmovdqa	xmm1,xmm0
1061	vmovdqa	xmm2,xmm9
1062	vmovdqa	xmm3,xmm10
1063	vmovdqa	xmm4,xmm11
1064	vmovdqa	xmm5,xmm12
1065	vmovdqa	xmm6,xmm13
1066	vmovdqa	xmm7,xmm14
1067
; Eighth counter block comes from the stack slot.
1068	vmovdqu	xmm8,XMMWORD[rsp]
1069
1070	vpxor	xmm1,xmm1,XMMWORD[rcx]
1071	vpxor	xmm2,xmm2,XMMWORD[rcx]
1072	vpxor	xmm3,xmm3,XMMWORD[rcx]
1073	vpxor	xmm4,xmm4,XMMWORD[rcx]
1074	vpxor	xmm5,xmm5,XMMWORD[rcx]
1075	vpxor	xmm6,xmm6,XMMWORD[rcx]
1076	vpxor	xmm7,xmm7,XMMWORD[rcx]
1077	vpxor	xmm8,xmm8,XMMWORD[rcx]
1078
1079	vmovdqu	xmm15,XMMWORD[16+rcx]
1080	vaesenc	xmm1,xmm1,xmm15
1081	vaesenc	xmm2,xmm2,xmm15
1082	vaesenc	xmm3,xmm3,xmm15
1083	vaesenc	xmm4,xmm4,xmm15
1084	vaesenc	xmm5,xmm5,xmm15
1085	vaesenc	xmm6,xmm6,xmm15
1086	vaesenc	xmm7,xmm7,xmm15
1087	vaesenc	xmm8,xmm8,xmm15
1088
; Advance the stack counter by 8; counter-register increments by
; [eight] are interleaved with the remaining rounds (xmm14 is
; temporarily reused as scratch and recomputed via vpsubd below).
1089	vmovdqu	xmm14,XMMWORD[rsp]
1090	vpaddd	xmm14,xmm14,XMMWORD[eight]
1091	vmovdqu	XMMWORD[rsp],xmm14
1092	vmovdqu	xmm15,XMMWORD[32+rcx]
1093	vaesenc	xmm1,xmm1,xmm15
1094	vaesenc	xmm2,xmm2,xmm15
1095	vaesenc	xmm3,xmm3,xmm15
1096	vaesenc	xmm4,xmm4,xmm15
1097	vaesenc	xmm5,xmm5,xmm15
1098	vaesenc	xmm6,xmm6,xmm15
1099	vaesenc	xmm7,xmm7,xmm15
1100	vaesenc	xmm8,xmm8,xmm15
1101
; xmm14 = new ctr+7 - 1 = next iteration's ctr+6.
1102	vpsubd	xmm14,xmm14,XMMWORD[one]
1103	vmovdqu	xmm15,XMMWORD[48+rcx]
1104	vaesenc	xmm1,xmm1,xmm15
1105	vaesenc	xmm2,xmm2,xmm15
1106	vaesenc	xmm3,xmm3,xmm15
1107	vaesenc	xmm4,xmm4,xmm15
1108	vaesenc	xmm5,xmm5,xmm15
1109	vaesenc	xmm6,xmm6,xmm15
1110	vaesenc	xmm7,xmm7,xmm15
1111	vaesenc	xmm8,xmm8,xmm15
1112
1113	vpaddd	xmm0,xmm0,XMMWORD[eight]
1114	vmovdqu	xmm15,XMMWORD[64+rcx]
1115	vaesenc	xmm1,xmm1,xmm15
1116	vaesenc	xmm2,xmm2,xmm15
1117	vaesenc	xmm3,xmm3,xmm15
1118	vaesenc	xmm4,xmm4,xmm15
1119	vaesenc	xmm5,xmm5,xmm15
1120	vaesenc	xmm6,xmm6,xmm15
1121	vaesenc	xmm7,xmm7,xmm15
1122	vaesenc	xmm8,xmm8,xmm15
1123
1124	vpaddd	xmm9,xmm9,XMMWORD[eight]
1125	vmovdqu	xmm15,XMMWORD[80+rcx]
1126	vaesenc	xmm1,xmm1,xmm15
1127	vaesenc	xmm2,xmm2,xmm15
1128	vaesenc	xmm3,xmm3,xmm15
1129	vaesenc	xmm4,xmm4,xmm15
1130	vaesenc	xmm5,xmm5,xmm15
1131	vaesenc	xmm6,xmm6,xmm15
1132	vaesenc	xmm7,xmm7,xmm15
1133	vaesenc	xmm8,xmm8,xmm15
1134
1135	vpaddd	xmm10,xmm10,XMMWORD[eight]
1136	vmovdqu	xmm15,XMMWORD[96+rcx]
1137	vaesenc	xmm1,xmm1,xmm15
1138	vaesenc	xmm2,xmm2,xmm15
1139	vaesenc	xmm3,xmm3,xmm15
1140	vaesenc	xmm4,xmm4,xmm15
1141	vaesenc	xmm5,xmm5,xmm15
1142	vaesenc	xmm6,xmm6,xmm15
1143	vaesenc	xmm7,xmm7,xmm15
1144	vaesenc	xmm8,xmm8,xmm15
1145
1146	vpaddd	xmm11,xmm11,XMMWORD[eight]
1147	vmovdqu	xmm15,XMMWORD[112+rcx]
1148	vaesenc	xmm1,xmm1,xmm15
1149	vaesenc	xmm2,xmm2,xmm15
1150	vaesenc	xmm3,xmm3,xmm15
1151	vaesenc	xmm4,xmm4,xmm15
1152	vaesenc	xmm5,xmm5,xmm15
1153	vaesenc	xmm6,xmm6,xmm15
1154	vaesenc	xmm7,xmm7,xmm15
1155	vaesenc	xmm8,xmm8,xmm15
1156
1157	vpaddd	xmm12,xmm12,XMMWORD[eight]
1158	vmovdqu	xmm15,XMMWORD[128+rcx]
1159	vaesenc	xmm1,xmm1,xmm15
1160	vaesenc	xmm2,xmm2,xmm15
1161	vaesenc	xmm3,xmm3,xmm15
1162	vaesenc	xmm4,xmm4,xmm15
1163	vaesenc	xmm5,xmm5,xmm15
1164	vaesenc	xmm6,xmm6,xmm15
1165	vaesenc	xmm7,xmm7,xmm15
1166	vaesenc	xmm8,xmm8,xmm15
1167
1168	vpaddd	xmm13,xmm13,XMMWORD[eight]
1169	vmovdqu	xmm15,XMMWORD[144+rcx]
1170	vaesenc	xmm1,xmm1,xmm15
1171	vaesenc	xmm2,xmm2,xmm15
1172	vaesenc	xmm3,xmm3,xmm15
1173	vaesenc	xmm4,xmm4,xmm15
1174	vaesenc	xmm5,xmm5,xmm15
1175	vaesenc	xmm6,xmm6,xmm15
1176	vaesenc	xmm7,xmm7,xmm15
1177	vaesenc	xmm8,xmm8,xmm15
1178
1179	vmovdqu	xmm15,XMMWORD[160+rcx]
1180	vaesenclast	xmm1,xmm1,xmm15
1181	vaesenclast	xmm2,xmm2,xmm15
1182	vaesenclast	xmm3,xmm3,xmm15
1183	vaesenclast	xmm4,xmm4,xmm15
1184	vaesenclast	xmm5,xmm5,xmm15
1185	vaesenclast	xmm6,xmm6,xmm15
1186	vaesenclast	xmm7,xmm7,xmm15
1187	vaesenclast	xmm8,xmm8,xmm15
1188
; XOR keystream with plaintext and store ciphertext.
1189
1190
1191	vpxor	xmm1,xmm1,XMMWORD[rdi]
1192	vpxor	xmm2,xmm2,XMMWORD[16+rdi]
1193	vpxor	xmm3,xmm3,XMMWORD[32+rdi]
1194	vpxor	xmm4,xmm4,XMMWORD[48+rdi]
1195	vpxor	xmm5,xmm5,XMMWORD[64+rdi]
1196	vpxor	xmm6,xmm6,XMMWORD[80+rdi]
1197	vpxor	xmm7,xmm7,XMMWORD[96+rdi]
1198	vpxor	xmm8,xmm8,XMMWORD[112+rdi]
1199
1200	dec	r8
1201
1202	vmovdqu	XMMWORD[rsi],xmm1
1203	vmovdqu	XMMWORD[16+rsi],xmm2
1204	vmovdqu	XMMWORD[32+rsi],xmm3
1205	vmovdqu	XMMWORD[48+rsi],xmm4
1206	vmovdqu	XMMWORD[64+rsi],xmm5
1207	vmovdqu	XMMWORD[80+rsi],xmm6
1208	vmovdqu	XMMWORD[96+rsi],xmm7
1209	vmovdqu	XMMWORD[112+rsi],xmm8
1210
1211	jne	NEAR $L$128_enc_msg_x8_loop1
1212
1213	add	rsi,128
1214	add	rdi,128
1215
1216$L$128_enc_msg_x8_check_remainder:
1217	cmp	r10,0
1218	je	NEAR $L$128_enc_msg_x8_out
1219
; Tail: encrypt the remaining (block_count % 8) blocks one at a time.
1220$L$128_enc_msg_x8_loop2:
1221
1222
1223	vmovdqa	xmm1,xmm0
1224	vpaddd	xmm0,xmm0,XMMWORD[one]
1225
1226	vpxor	xmm1,xmm1,XMMWORD[rcx]
1227	vaesenc	xmm1,xmm1,XMMWORD[16+rcx]
1228	vaesenc	xmm1,xmm1,XMMWORD[32+rcx]
1229	vaesenc	xmm1,xmm1,XMMWORD[48+rcx]
1230	vaesenc	xmm1,xmm1,XMMWORD[64+rcx]
1231	vaesenc	xmm1,xmm1,XMMWORD[80+rcx]
1232	vaesenc	xmm1,xmm1,XMMWORD[96+rcx]
1233	vaesenc	xmm1,xmm1,XMMWORD[112+rcx]
1234	vaesenc	xmm1,xmm1,XMMWORD[128+rcx]
1235	vaesenc	xmm1,xmm1,XMMWORD[144+rcx]
1236	vaesenclast	xmm1,xmm1,XMMWORD[160+rcx]
1237
1238
1239	vpxor	xmm1,xmm1,XMMWORD[rdi]
1240
1241	vmovdqu	XMMWORD[rsi],xmm1
1242
1243	add	rdi,16
1244	add	rsi,16
1245
1246	dec	r10
1247	jne	NEAR $L$128_enc_msg_x8_loop2
1248
; Restore the caller's stack pointer and callee-saved registers.
1249$L$128_enc_msg_x8_out:
1250	mov	rsp,rbp
1251
1252	pop	rbp
1253
1254	pop	r13
1255
1256	pop	r12
1257
1258	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1259	mov	rsi,QWORD[16+rsp]
1260	ret
1261
1262$L$SEH_end_aes128gcmsiv_enc_msg_x8:
1263global	aes128gcmsiv_dec
1264
1265ALIGN	16
1266aes128gcmsiv_dec:
1267	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1268	mov	QWORD[16+rsp],rsi
1269	mov	rax,rsp
1270$L$SEH_begin_aes128gcmsiv_dec:
1271	mov	rdi,rcx
1272	mov	rsi,rdx
1273	mov	rdx,r8
1274	mov	rcx,r9
1275	mov	r8,QWORD[40+rsp]
1276	mov	r9,QWORD[48+rsp]
1277
1278
1279
1280_CET_ENDBR
1281	test	r9,~15
1282	jnz	NEAR $L$128_dec_start
1283	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1284	mov	rsi,QWORD[16+rsp]
1285	ret
1286
1287$L$128_dec_start:
1288	vzeroupper
1289	vmovdqa	xmm0,XMMWORD[rdx]
1290
1291
1292	vmovdqu	xmm15,XMMWORD[16+rdx]
1293	vpor	xmm15,xmm15,XMMWORD[OR_MASK]
1294	mov	rax,rdx
1295
1296	lea	rax,[32+rax]
1297	lea	rcx,[32+rcx]
1298
1299	and	r9,~15
1300
1301
1302	cmp	r9,96
1303	jb	NEAR $L$128_dec_loop2
1304
1305
1306	sub	r9,96
1307	vmovdqa	xmm7,xmm15
1308	vpaddd	xmm8,xmm7,XMMWORD[one]
1309	vpaddd	xmm9,xmm7,XMMWORD[two]
1310	vpaddd	xmm10,xmm9,XMMWORD[one]
1311	vpaddd	xmm11,xmm9,XMMWORD[two]
1312	vpaddd	xmm12,xmm11,XMMWORD[one]
1313	vpaddd	xmm15,xmm11,XMMWORD[two]
1314
1315	vpxor	xmm7,xmm7,XMMWORD[r8]
1316	vpxor	xmm8,xmm8,XMMWORD[r8]
1317	vpxor	xmm9,xmm9,XMMWORD[r8]
1318	vpxor	xmm10,xmm10,XMMWORD[r8]
1319	vpxor	xmm11,xmm11,XMMWORD[r8]
1320	vpxor	xmm12,xmm12,XMMWORD[r8]
1321
1322	vmovdqu	xmm4,XMMWORD[16+r8]
1323	vaesenc	xmm7,xmm7,xmm4
1324	vaesenc	xmm8,xmm8,xmm4
1325	vaesenc	xmm9,xmm9,xmm4
1326	vaesenc	xmm10,xmm10,xmm4
1327	vaesenc	xmm11,xmm11,xmm4
1328	vaesenc	xmm12,xmm12,xmm4
1329
1330	vmovdqu	xmm4,XMMWORD[32+r8]
1331	vaesenc	xmm7,xmm7,xmm4
1332	vaesenc	xmm8,xmm8,xmm4
1333	vaesenc	xmm9,xmm9,xmm4
1334	vaesenc	xmm10,xmm10,xmm4
1335	vaesenc	xmm11,xmm11,xmm4
1336	vaesenc	xmm12,xmm12,xmm4
1337
1338	vmovdqu	xmm4,XMMWORD[48+r8]
1339	vaesenc	xmm7,xmm7,xmm4
1340	vaesenc	xmm8,xmm8,xmm4
1341	vaesenc	xmm9,xmm9,xmm4
1342	vaesenc	xmm10,xmm10,xmm4
1343	vaesenc	xmm11,xmm11,xmm4
1344	vaesenc	xmm12,xmm12,xmm4
1345
1346	vmovdqu	xmm4,XMMWORD[64+r8]
1347	vaesenc	xmm7,xmm7,xmm4
1348	vaesenc	xmm8,xmm8,xmm4
1349	vaesenc	xmm9,xmm9,xmm4
1350	vaesenc	xmm10,xmm10,xmm4
1351	vaesenc	xmm11,xmm11,xmm4
1352	vaesenc	xmm12,xmm12,xmm4
1353
1354	vmovdqu	xmm4,XMMWORD[80+r8]
1355	vaesenc	xmm7,xmm7,xmm4
1356	vaesenc	xmm8,xmm8,xmm4
1357	vaesenc	xmm9,xmm9,xmm4
1358	vaesenc	xmm10,xmm10,xmm4
1359	vaesenc	xmm11,xmm11,xmm4
1360	vaesenc	xmm12,xmm12,xmm4
1361
1362	vmovdqu	xmm4,XMMWORD[96+r8]
1363	vaesenc	xmm7,xmm7,xmm4
1364	vaesenc	xmm8,xmm8,xmm4
1365	vaesenc	xmm9,xmm9,xmm4
1366	vaesenc	xmm10,xmm10,xmm4
1367	vaesenc	xmm11,xmm11,xmm4
1368	vaesenc	xmm12,xmm12,xmm4
1369
1370	vmovdqu	xmm4,XMMWORD[112+r8]
1371	vaesenc	xmm7,xmm7,xmm4
1372	vaesenc	xmm8,xmm8,xmm4
1373	vaesenc	xmm9,xmm9,xmm4
1374	vaesenc	xmm10,xmm10,xmm4
1375	vaesenc	xmm11,xmm11,xmm4
1376	vaesenc	xmm12,xmm12,xmm4
1377
1378	vmovdqu	xmm4,XMMWORD[128+r8]
1379	vaesenc	xmm7,xmm7,xmm4
1380	vaesenc	xmm8,xmm8,xmm4
1381	vaesenc	xmm9,xmm9,xmm4
1382	vaesenc	xmm10,xmm10,xmm4
1383	vaesenc	xmm11,xmm11,xmm4
1384	vaesenc	xmm12,xmm12,xmm4
1385
1386	vmovdqu	xmm4,XMMWORD[144+r8]
1387	vaesenc	xmm7,xmm7,xmm4
1388	vaesenc	xmm8,xmm8,xmm4
1389	vaesenc	xmm9,xmm9,xmm4
1390	vaesenc	xmm10,xmm10,xmm4
1391	vaesenc	xmm11,xmm11,xmm4
1392	vaesenc	xmm12,xmm12,xmm4
1393
1394	vmovdqu	xmm4,XMMWORD[160+r8]
1395	vaesenclast	xmm7,xmm7,xmm4
1396	vaesenclast	xmm8,xmm8,xmm4
1397	vaesenclast	xmm9,xmm9,xmm4
1398	vaesenclast	xmm10,xmm10,xmm4
1399	vaesenclast	xmm11,xmm11,xmm4
1400	vaesenclast	xmm12,xmm12,xmm4
1401
1402
1403	vpxor	xmm7,xmm7,XMMWORD[rdi]
1404	vpxor	xmm8,xmm8,XMMWORD[16+rdi]
1405	vpxor	xmm9,xmm9,XMMWORD[32+rdi]
1406	vpxor	xmm10,xmm10,XMMWORD[48+rdi]
1407	vpxor	xmm11,xmm11,XMMWORD[64+rdi]
1408	vpxor	xmm12,xmm12,XMMWORD[80+rdi]
1409
1410	vmovdqu	XMMWORD[rsi],xmm7
1411	vmovdqu	XMMWORD[16+rsi],xmm8
1412	vmovdqu	XMMWORD[32+rsi],xmm9
1413	vmovdqu	XMMWORD[48+rsi],xmm10
1414	vmovdqu	XMMWORD[64+rsi],xmm11
1415	vmovdqu	XMMWORD[80+rsi],xmm12
1416
1417	add	rdi,96
1418	add	rsi,96
1419	jmp	NEAR $L$128_dec_loop1
1420
1421
1422ALIGN	64
1423$L$128_dec_loop1:
1424	cmp	r9,96
1425	jb	NEAR $L$128_dec_finish_96
1426	sub	r9,96
1427
1428	vmovdqa	xmm6,xmm12
1429	vmovdqa	XMMWORD[(16-32)+rax],xmm11
1430	vmovdqa	XMMWORD[(32-32)+rax],xmm10
1431	vmovdqa	XMMWORD[(48-32)+rax],xmm9
1432	vmovdqa	XMMWORD[(64-32)+rax],xmm8
1433	vmovdqa	XMMWORD[(80-32)+rax],xmm7
1434
1435	vmovdqa	xmm7,xmm15
1436	vpaddd	xmm8,xmm7,XMMWORD[one]
1437	vpaddd	xmm9,xmm7,XMMWORD[two]
1438	vpaddd	xmm10,xmm9,XMMWORD[one]
1439	vpaddd	xmm11,xmm9,XMMWORD[two]
1440	vpaddd	xmm12,xmm11,XMMWORD[one]
1441	vpaddd	xmm15,xmm11,XMMWORD[two]
1442
1443	vmovdqa	xmm4,XMMWORD[r8]
1444	vpxor	xmm7,xmm7,xmm4
1445	vpxor	xmm8,xmm8,xmm4
1446	vpxor	xmm9,xmm9,xmm4
1447	vpxor	xmm10,xmm10,xmm4
1448	vpxor	xmm11,xmm11,xmm4
1449	vpxor	xmm12,xmm12,xmm4
1450
1451	vmovdqu	xmm4,XMMWORD[((0-32))+rcx]
1452	vpclmulqdq	xmm2,xmm6,xmm4,0x11
1453	vpclmulqdq	xmm3,xmm6,xmm4,0x00
1454	vpclmulqdq	xmm1,xmm6,xmm4,0x01
1455	vpclmulqdq	xmm4,xmm6,xmm4,0x10
1456	vpxor	xmm1,xmm1,xmm4
1457
1458	vmovdqu	xmm4,XMMWORD[16+r8]
1459	vaesenc	xmm7,xmm7,xmm4
1460	vaesenc	xmm8,xmm8,xmm4
1461	vaesenc	xmm9,xmm9,xmm4
1462	vaesenc	xmm10,xmm10,xmm4
1463	vaesenc	xmm11,xmm11,xmm4
1464	vaesenc	xmm12,xmm12,xmm4
1465
1466	vmovdqu	xmm6,XMMWORD[((-16))+rax]
1467	vmovdqu	xmm13,XMMWORD[((-16))+rcx]
1468
1469	vpclmulqdq	xmm4,xmm6,xmm13,0x10
1470	vpxor	xmm1,xmm1,xmm4
1471	vpclmulqdq	xmm4,xmm6,xmm13,0x11
1472	vpxor	xmm2,xmm2,xmm4
1473	vpclmulqdq	xmm4,xmm6,xmm13,0x00
1474	vpxor	xmm3,xmm3,xmm4
1475	vpclmulqdq	xmm4,xmm6,xmm13,0x01
1476	vpxor	xmm1,xmm1,xmm4
1477
1478
1479	vmovdqu	xmm4,XMMWORD[32+r8]
1480	vaesenc	xmm7,xmm7,xmm4
1481	vaesenc	xmm8,xmm8,xmm4
1482	vaesenc	xmm9,xmm9,xmm4
1483	vaesenc	xmm10,xmm10,xmm4
1484	vaesenc	xmm11,xmm11,xmm4
1485	vaesenc	xmm12,xmm12,xmm4
1486
1487	vmovdqu	xmm6,XMMWORD[rax]
1488	vmovdqu	xmm13,XMMWORD[rcx]
1489
1490	vpclmulqdq	xmm4,xmm6,xmm13,0x10
1491	vpxor	xmm1,xmm1,xmm4
1492	vpclmulqdq	xmm4,xmm6,xmm13,0x11
1493	vpxor	xmm2,xmm2,xmm4
1494	vpclmulqdq	xmm4,xmm6,xmm13,0x00
1495	vpxor	xmm3,xmm3,xmm4
1496	vpclmulqdq	xmm4,xmm6,xmm13,0x01
1497	vpxor	xmm1,xmm1,xmm4
1498
1499
1500	vmovdqu	xmm4,XMMWORD[48+r8]
1501	vaesenc	xmm7,xmm7,xmm4
1502	vaesenc	xmm8,xmm8,xmm4
1503	vaesenc	xmm9,xmm9,xmm4
1504	vaesenc	xmm10,xmm10,xmm4
1505	vaesenc	xmm11,xmm11,xmm4
1506	vaesenc	xmm12,xmm12,xmm4
1507
1508	vmovdqu	xmm6,XMMWORD[16+rax]
1509	vmovdqu	xmm13,XMMWORD[16+rcx]
1510
1511	vpclmulqdq	xmm4,xmm6,xmm13,0x10
1512	vpxor	xmm1,xmm1,xmm4
1513	vpclmulqdq	xmm4,xmm6,xmm13,0x11
1514	vpxor	xmm2,xmm2,xmm4
1515	vpclmulqdq	xmm4,xmm6,xmm13,0x00
1516	vpxor	xmm3,xmm3,xmm4
1517	vpclmulqdq	xmm4,xmm6,xmm13,0x01
1518	vpxor	xmm1,xmm1,xmm4
1519
1520
1521	vmovdqu	xmm4,XMMWORD[64+r8]
1522	vaesenc	xmm7,xmm7,xmm4
1523	vaesenc	xmm8,xmm8,xmm4
1524	vaesenc	xmm9,xmm9,xmm4
1525	vaesenc	xmm10,xmm10,xmm4
1526	vaesenc	xmm11,xmm11,xmm4
1527	vaesenc	xmm12,xmm12,xmm4
1528
1529	vmovdqu	xmm6,XMMWORD[32+rax]
1530	vmovdqu	xmm13,XMMWORD[32+rcx]
1531
1532	vpclmulqdq	xmm4,xmm6,xmm13,0x10
1533	vpxor	xmm1,xmm1,xmm4
1534	vpclmulqdq	xmm4,xmm6,xmm13,0x11
1535	vpxor	xmm2,xmm2,xmm4
1536	vpclmulqdq	xmm4,xmm6,xmm13,0x00
1537	vpxor	xmm3,xmm3,xmm4
1538	vpclmulqdq	xmm4,xmm6,xmm13,0x01
1539	vpxor	xmm1,xmm1,xmm4
1540
1541
1542	vmovdqu	xmm4,XMMWORD[80+r8]
1543	vaesenc	xmm7,xmm7,xmm4
1544	vaesenc	xmm8,xmm8,xmm4
1545	vaesenc	xmm9,xmm9,xmm4
1546	vaesenc	xmm10,xmm10,xmm4
1547	vaesenc	xmm11,xmm11,xmm4
1548	vaesenc	xmm12,xmm12,xmm4
1549
1550	vmovdqu	xmm4,XMMWORD[96+r8]
1551	vaesenc	xmm7,xmm7,xmm4
1552	vaesenc	xmm8,xmm8,xmm4
1553	vaesenc	xmm9,xmm9,xmm4
1554	vaesenc	xmm10,xmm10,xmm4
1555	vaesenc	xmm11,xmm11,xmm4
1556	vaesenc	xmm12,xmm12,xmm4
1557
1558	vmovdqu	xmm4,XMMWORD[112+r8]
1559	vaesenc	xmm7,xmm7,xmm4
1560	vaesenc	xmm8,xmm8,xmm4
1561	vaesenc	xmm9,xmm9,xmm4
1562	vaesenc	xmm10,xmm10,xmm4
1563	vaesenc	xmm11,xmm11,xmm4
1564	vaesenc	xmm12,xmm12,xmm4
1565
1566
1567	vmovdqa	xmm6,XMMWORD[((80-32))+rax]
1568	vpxor	xmm6,xmm6,xmm0
1569	vmovdqu	xmm5,XMMWORD[((80-32))+rcx]
1570
1571	vpclmulqdq	xmm4,xmm6,xmm5,0x01
1572	vpxor	xmm1,xmm1,xmm4
1573	vpclmulqdq	xmm4,xmm6,xmm5,0x11
1574	vpxor	xmm2,xmm2,xmm4
1575	vpclmulqdq	xmm4,xmm6,xmm5,0x00
1576	vpxor	xmm3,xmm3,xmm4
1577	vpclmulqdq	xmm4,xmm6,xmm5,0x10
1578	vpxor	xmm1,xmm1,xmm4
1579
1580	vmovdqu	xmm4,XMMWORD[128+r8]
1581	vaesenc	xmm7,xmm7,xmm4
1582	vaesenc	xmm8,xmm8,xmm4
1583	vaesenc	xmm9,xmm9,xmm4
1584	vaesenc	xmm10,xmm10,xmm4
1585	vaesenc	xmm11,xmm11,xmm4
1586	vaesenc	xmm12,xmm12,xmm4
1587
1588
1589	vpsrldq	xmm4,xmm1,8
1590	vpxor	xmm5,xmm2,xmm4
1591	vpslldq	xmm4,xmm1,8
1592	vpxor	xmm0,xmm3,xmm4
1593
1594	vmovdqa	xmm3,XMMWORD[poly]
1595
1596	vmovdqu	xmm4,XMMWORD[144+r8]
1597	vaesenc	xmm7,xmm7,xmm4
1598	vaesenc	xmm8,xmm8,xmm4
1599	vaesenc	xmm9,xmm9,xmm4
1600	vaesenc	xmm10,xmm10,xmm4
1601	vaesenc	xmm11,xmm11,xmm4
1602	vaesenc	xmm12,xmm12,xmm4
1603
1604	vmovdqu	xmm6,XMMWORD[160+r8]
1605	vpalignr	xmm2,xmm0,xmm0,8
1606	vpclmulqdq	xmm0,xmm0,xmm3,0x10
1607	vpxor	xmm0,xmm2,xmm0
1608
1609	vpxor	xmm4,xmm6,XMMWORD[rdi]
1610	vaesenclast	xmm7,xmm7,xmm4
1611	vpxor	xmm4,xmm6,XMMWORD[16+rdi]
1612	vaesenclast	xmm8,xmm8,xmm4
1613	vpxor	xmm4,xmm6,XMMWORD[32+rdi]
1614	vaesenclast	xmm9,xmm9,xmm4
1615	vpxor	xmm4,xmm6,XMMWORD[48+rdi]
1616	vaesenclast	xmm10,xmm10,xmm4
1617	vpxor	xmm4,xmm6,XMMWORD[64+rdi]
1618	vaesenclast	xmm11,xmm11,xmm4
1619	vpxor	xmm4,xmm6,XMMWORD[80+rdi]
1620	vaesenclast	xmm12,xmm12,xmm4
1621
1622	vpalignr	xmm2,xmm0,xmm0,8
1623	vpclmulqdq	xmm0,xmm0,xmm3,0x10
1624	vpxor	xmm0,xmm2,xmm0
1625
1626	vmovdqu	XMMWORD[rsi],xmm7
1627	vmovdqu	XMMWORD[16+rsi],xmm8
1628	vmovdqu	XMMWORD[32+rsi],xmm9
1629	vmovdqu	XMMWORD[48+rsi],xmm10
1630	vmovdqu	XMMWORD[64+rsi],xmm11
1631	vmovdqu	XMMWORD[80+rsi],xmm12
1632
1633	vpxor	xmm0,xmm0,xmm5
1634
1635	lea	rdi,[96+rdi]
1636	lea	rsi,[96+rsi]
1637	jmp	NEAR $L$128_dec_loop1
1638
1639$L$128_dec_finish_96:
1640	vmovdqa	xmm6,xmm12
1641	vmovdqa	XMMWORD[(16-32)+rax],xmm11
1642	vmovdqa	XMMWORD[(32-32)+rax],xmm10
1643	vmovdqa	XMMWORD[(48-32)+rax],xmm9
1644	vmovdqa	XMMWORD[(64-32)+rax],xmm8
1645	vmovdqa	XMMWORD[(80-32)+rax],xmm7
1646
1647	vmovdqu	xmm4,XMMWORD[((0-32))+rcx]
1648	vpclmulqdq	xmm1,xmm6,xmm4,0x10
1649	vpclmulqdq	xmm2,xmm6,xmm4,0x11
1650	vpclmulqdq	xmm3,xmm6,xmm4,0x00
1651	vpclmulqdq	xmm4,xmm6,xmm4,0x01
1652	vpxor	xmm1,xmm1,xmm4
1653
1654	vmovdqu	xmm6,XMMWORD[((-16))+rax]
1655	vmovdqu	xmm13,XMMWORD[((-16))+rcx]
1656
1657	vpclmulqdq	xmm4,xmm6,xmm13,0x10
1658	vpxor	xmm1,xmm1,xmm4
1659	vpclmulqdq	xmm4,xmm6,xmm13,0x11
1660	vpxor	xmm2,xmm2,xmm4
1661	vpclmulqdq	xmm4,xmm6,xmm13,0x00
1662	vpxor	xmm3,xmm3,xmm4
1663	vpclmulqdq	xmm4,xmm6,xmm13,0x01
1664	vpxor	xmm1,xmm1,xmm4
1665
1666	vmovdqu	xmm6,XMMWORD[rax]
1667	vmovdqu	xmm13,XMMWORD[rcx]
1668
1669	vpclmulqdq	xmm4,xmm6,xmm13,0x10
1670	vpxor	xmm1,xmm1,xmm4
1671	vpclmulqdq	xmm4,xmm6,xmm13,0x11
1672	vpxor	xmm2,xmm2,xmm4
1673	vpclmulqdq	xmm4,xmm6,xmm13,0x00
1674	vpxor	xmm3,xmm3,xmm4
1675	vpclmulqdq	xmm4,xmm6,xmm13,0x01
1676	vpxor	xmm1,xmm1,xmm4
1677
1678	vmovdqu	xmm6,XMMWORD[16+rax]
1679	vmovdqu	xmm13,XMMWORD[16+rcx]
1680
1681	vpclmulqdq	xmm4,xmm6,xmm13,0x10
1682	vpxor	xmm1,xmm1,xmm4
1683	vpclmulqdq	xmm4,xmm6,xmm13,0x11
1684	vpxor	xmm2,xmm2,xmm4
1685	vpclmulqdq	xmm4,xmm6,xmm13,0x00
1686	vpxor	xmm3,xmm3,xmm4
1687	vpclmulqdq	xmm4,xmm6,xmm13,0x01
1688	vpxor	xmm1,xmm1,xmm4
1689
1690	vmovdqu	xmm6,XMMWORD[32+rax]
1691	vmovdqu	xmm13,XMMWORD[32+rcx]
1692
1693	vpclmulqdq	xmm4,xmm6,xmm13,0x10
1694	vpxor	xmm1,xmm1,xmm4
1695	vpclmulqdq	xmm4,xmm6,xmm13,0x11
1696	vpxor	xmm2,xmm2,xmm4
1697	vpclmulqdq	xmm4,xmm6,xmm13,0x00
1698	vpxor	xmm3,xmm3,xmm4
1699	vpclmulqdq	xmm4,xmm6,xmm13,0x01
1700	vpxor	xmm1,xmm1,xmm4
1701
1702
1703	vmovdqu	xmm6,XMMWORD[((80-32))+rax]
1704	vpxor	xmm6,xmm6,xmm0
1705	vmovdqu	xmm5,XMMWORD[((80-32))+rcx]
1706	vpclmulqdq	xmm4,xmm6,xmm5,0x11
1707	vpxor	xmm2,xmm2,xmm4
1708	vpclmulqdq	xmm4,xmm6,xmm5,0x00
1709	vpxor	xmm3,xmm3,xmm4
1710	vpclmulqdq	xmm4,xmm6,xmm5,0x10
1711	vpxor	xmm1,xmm1,xmm4
1712	vpclmulqdq	xmm4,xmm6,xmm5,0x01
1713	vpxor	xmm1,xmm1,xmm4
1714
1715	vpsrldq	xmm4,xmm1,8
1716	vpxor	xmm5,xmm2,xmm4
1717	vpslldq	xmm4,xmm1,8
1718	vpxor	xmm0,xmm3,xmm4
1719
1720	vmovdqa	xmm3,XMMWORD[poly]
1721
1722	vpalignr	xmm2,xmm0,xmm0,8
1723	vpclmulqdq	xmm0,xmm0,xmm3,0x10
1724	vpxor	xmm0,xmm2,xmm0
1725
1726	vpalignr	xmm2,xmm0,xmm0,8
1727	vpclmulqdq	xmm0,xmm0,xmm3,0x10
1728	vpxor	xmm0,xmm2,xmm0
1729
1730	vpxor	xmm0,xmm0,xmm5
1731
1732$L$128_dec_loop2:
1733
1734
1735
1736	cmp	r9,16
1737	jb	NEAR $L$128_dec_out
1738	sub	r9,16
1739
1740	vmovdqa	xmm2,xmm15
1741	vpaddd	xmm15,xmm15,XMMWORD[one]
1742
1743	vpxor	xmm2,xmm2,XMMWORD[r8]
1744	vaesenc	xmm2,xmm2,XMMWORD[16+r8]
1745	vaesenc	xmm2,xmm2,XMMWORD[32+r8]
1746	vaesenc	xmm2,xmm2,XMMWORD[48+r8]
1747	vaesenc	xmm2,xmm2,XMMWORD[64+r8]
1748	vaesenc	xmm2,xmm2,XMMWORD[80+r8]
1749	vaesenc	xmm2,xmm2,XMMWORD[96+r8]
1750	vaesenc	xmm2,xmm2,XMMWORD[112+r8]
1751	vaesenc	xmm2,xmm2,XMMWORD[128+r8]
1752	vaesenc	xmm2,xmm2,XMMWORD[144+r8]
1753	vaesenclast	xmm2,xmm2,XMMWORD[160+r8]
1754	vpxor	xmm2,xmm2,XMMWORD[rdi]
1755	vmovdqu	XMMWORD[rsi],xmm2
1756	add	rdi,16
1757	add	rsi,16
1758
1759	vpxor	xmm0,xmm0,xmm2
1760	vmovdqa	xmm1,XMMWORD[((-32))+rcx]
1761	call	GFMUL
1762
1763	jmp	NEAR $L$128_dec_loop2
1764
1765$L$128_dec_out:
1766	vmovdqu	XMMWORD[rdx],xmm0
1767	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1768	mov	rsi,QWORD[16+rsp]
1769	ret
1770
1771$L$SEH_end_aes128gcmsiv_dec:
global	aes128gcmsiv_ecb_enc_block

;-----------------------------------------------------------------------
; aes128gcmsiv_ecb_enc_block(in, out, key_schedule)
; Encrypts a single 16-byte block with AES-128 in ECB mode.
; ABI:   Win64 entry; args are moved into SysV registers below.
; In:    rcx -> rdi = input block (16 bytes, 16-byte aligned: vmovdqa)
;        rdx -> rsi = output block (16 bytes, 16-byte aligned)
;        r8  -> rdx = expanded AES-128 key schedule (11 round keys,
;                     176 bytes: whitening key + 9 vaesenc + vaesenclast)
; Out:   [rsi] = AES-128-ECB(key, [rdi])
; Clobb: xmm1, rax
;-----------------------------------------------------------------------
ALIGN	16
aes128gcmsiv_ecb_enc_block:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aes128gcmsiv_ecb_enc_block:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8



_CET_ENDBR
	vmovdqa	xmm1,XMMWORD[rdi]

; Whitening XOR followed by the 10 AES-128 rounds.
	vpxor	xmm1,xmm1,XMMWORD[rdx]
	vaesenc	xmm1,xmm1,XMMWORD[16+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[32+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[48+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[64+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[80+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[96+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[112+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[128+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[144+rdx]
	vaesenclast	xmm1,xmm1,XMMWORD[160+rdx]

	vmovdqa	XMMWORD[rsi],xmm1

	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aes128gcmsiv_ecb_enc_block:
global	aes256gcmsiv_aes_ks_enc_x1

;-----------------------------------------------------------------------
; aes256gcmsiv_aes_ks_enc_x1(in, out, expanded_keys, user_key)
; Expands a 256-bit AES key into the full 15-round-key schedule and,
; fused with the expansion, encrypts one 16-byte block with it.
; ABI:   Win64 entry; args are moved into SysV registers below.
; In:    rcx -> rdi = input block (16 bytes, aligned)
;        rdx -> rsi = output block (16 bytes, aligned)
;        r8  -> rdx = output key schedule (15 x 16 = 240 bytes, written
;                     at offsets 0..224)
;        r9  -> rcx = user key (32 bytes: two 16-byte halves)
; Out:   [rsi] = AES-256-ECB(user_key, [rdi]); [rdx..224+rdx] = schedule
; Clobb: xmm0-xmm4, xmm8, xmm14, xmm15, rax
; Notes: xmm1/xmm3 hold the even/odd schedule halves; xmm0 walks the
;        round constants (con1, doubled via vpslld each even round);
;        xmm15 = byte-rotate mask for RotWord; xmm14 = 0 so
;        vaesenclast against it performs SubWord only. xmm8 carries the
;        block being encrypted, consuming each round key as produced.
;-----------------------------------------------------------------------
ALIGN	16
aes256gcmsiv_aes_ks_enc_x1:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aes256gcmsiv_aes_ks_enc_x1:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9



_CET_ENDBR
	vmovdqa	xmm0,XMMWORD[con1]
	vmovdqa	xmm15,XMMWORD[mask]
	vmovdqa	xmm8,XMMWORD[rdi]
	vmovdqa	xmm1,XMMWORD[rcx]
	vmovdqa	xmm3,XMMWORD[16+rcx]
	vpxor	xmm8,xmm8,xmm1
	vaesenc	xmm8,xmm8,xmm3
; Round keys 0 and 1 are the raw user key halves.
	vmovdqu	XMMWORD[rdx],xmm1
	vmovdqu	XMMWORD[16+rdx],xmm3
	vpxor	xmm14,xmm14,xmm14

; Round key 2: RotWord+SubWord+Rcon on xmm3, folded into xmm1.
	vpshufb	xmm2,xmm3,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpslldq	xmm4,xmm1,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpxor	xmm1,xmm1,xmm2
	vaesenc	xmm8,xmm8,xmm1
	vmovdqu	XMMWORD[32+rdx],xmm1

; Round key 3: SubWord only (xmm14 = 0) on xmm1, folded into xmm3.
	vpshufd	xmm2,xmm1,0xff
	vaesenclast	xmm2,xmm2,xmm14
	vpslldq	xmm4,xmm3,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpxor	xmm3,xmm3,xmm2
	vaesenc	xmm8,xmm8,xmm3
	vmovdqu	XMMWORD[48+rdx],xmm3

; Round key 4.
	vpshufb	xmm2,xmm3,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpslldq	xmm4,xmm1,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpxor	xmm1,xmm1,xmm2
	vaesenc	xmm8,xmm8,xmm1
	vmovdqu	XMMWORD[64+rdx],xmm1

; Round key 5.
	vpshufd	xmm2,xmm1,0xff
	vaesenclast	xmm2,xmm2,xmm14
	vpslldq	xmm4,xmm3,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpxor	xmm3,xmm3,xmm2
	vaesenc	xmm8,xmm8,xmm3
	vmovdqu	XMMWORD[80+rdx],xmm3

; Round key 6.
	vpshufb	xmm2,xmm3,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpslldq	xmm4,xmm1,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpxor	xmm1,xmm1,xmm2
	vaesenc	xmm8,xmm8,xmm1
	vmovdqu	XMMWORD[96+rdx],xmm1

; Round key 7.
	vpshufd	xmm2,xmm1,0xff
	vaesenclast	xmm2,xmm2,xmm14
	vpslldq	xmm4,xmm3,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpxor	xmm3,xmm3,xmm2
	vaesenc	xmm8,xmm8,xmm3
	vmovdqu	XMMWORD[112+rdx],xmm3

; Round key 8.
	vpshufb	xmm2,xmm3,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpslldq	xmm4,xmm1,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpxor	xmm1,xmm1,xmm2
	vaesenc	xmm8,xmm8,xmm1
	vmovdqu	XMMWORD[128+rdx],xmm1

; Round key 9.
	vpshufd	xmm2,xmm1,0xff
	vaesenclast	xmm2,xmm2,xmm14
	vpslldq	xmm4,xmm3,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpxor	xmm3,xmm3,xmm2
	vaesenc	xmm8,xmm8,xmm3
	vmovdqu	XMMWORD[144+rdx],xmm3

; Round key 10.
	vpshufb	xmm2,xmm3,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpslldq	xmm4,xmm1,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpxor	xmm1,xmm1,xmm2
	vaesenc	xmm8,xmm8,xmm1
	vmovdqu	XMMWORD[160+rdx],xmm1

; Round key 11.
	vpshufd	xmm2,xmm1,0xff
	vaesenclast	xmm2,xmm2,xmm14
	vpslldq	xmm4,xmm3,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpxor	xmm3,xmm3,xmm2
	vaesenc	xmm8,xmm8,xmm3
	vmovdqu	XMMWORD[176+rdx],xmm3

; Round key 12.
	vpshufb	xmm2,xmm3,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslld	xmm0,xmm0,1
	vpslldq	xmm4,xmm1,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpxor	xmm1,xmm1,xmm2
	vaesenc	xmm8,xmm8,xmm1
	vmovdqu	XMMWORD[192+rdx],xmm1

; Round key 13.
	vpshufd	xmm2,xmm1,0xff
	vaesenclast	xmm2,xmm2,xmm14
	vpslldq	xmm4,xmm3,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm3,xmm3,xmm4
	vpxor	xmm3,xmm3,xmm2
	vaesenc	xmm8,xmm8,xmm3
	vmovdqu	XMMWORD[208+rdx],xmm3

; Round key 14 (final): no further Rcon doubling; the block encryption
; finishes with vaesenclast against this key.
	vpshufb	xmm2,xmm3,xmm15
	vaesenclast	xmm2,xmm2,xmm0
	vpslldq	xmm4,xmm1,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpslldq	xmm4,xmm4,4
	vpxor	xmm1,xmm1,xmm4
	vpxor	xmm1,xmm1,xmm2
	vaesenclast	xmm8,xmm8,xmm1
	vmovdqu	XMMWORD[224+rdx],xmm1

	vmovdqa	XMMWORD[rsi],xmm8
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aes256gcmsiv_aes_ks_enc_x1:
global	aes256gcmsiv_ecb_enc_block

;-----------------------------------------------------------------------
; aes256gcmsiv_ecb_enc_block(in, out, key_schedule)
; Encrypts a single 16-byte block with AES-256 in ECB mode.
; ABI:   Win64 entry; args are moved into SysV registers below.
; In:    rcx -> rdi = input block (16 bytes, aligned: vmovdqa)
;        rdx -> rsi = output block (16 bytes, aligned)
;        r8  -> rdx = expanded AES-256 key schedule (15 round keys,
;                     240 bytes: whitening key + 13 vaesenc + vaesenclast)
; Out:   [rsi] = AES-256-ECB(key, [rdi])
; Clobb: xmm1, rax
;-----------------------------------------------------------------------
ALIGN	16
aes256gcmsiv_ecb_enc_block:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aes256gcmsiv_ecb_enc_block:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8



_CET_ENDBR
	vmovdqa	xmm1,XMMWORD[rdi]
; Whitening XOR followed by the 14 AES-256 rounds.
	vpxor	xmm1,xmm1,XMMWORD[rdx]
	vaesenc	xmm1,xmm1,XMMWORD[16+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[32+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[48+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[64+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[80+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[96+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[112+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[128+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[144+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[160+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[176+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[192+rdx]
	vaesenc	xmm1,xmm1,XMMWORD[208+rdx]
	vaesenclast	xmm1,xmm1,XMMWORD[224+rdx]
	vmovdqa	XMMWORD[rsi],xmm1
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aes256gcmsiv_ecb_enc_block:
global	aes256gcmsiv_enc_msg_x4

;-----------------------------------------------------------------------
; aes256gcmsiv_enc_msg_x4(in, out, ctr_block, key_schedule, len)
; AES-256 counter-mode encryption, processing 4 blocks per iteration.
; ABI:   Win64 entry; args are moved into SysV registers below.
; In:    rcx -> rdi = input (plaintext)
;        rdx -> rsi = output (ciphertext)
;        r8  -> rdx = initial counter block (16 bytes, aligned); its top
;                     dword gets bit 31 set via OR_MASK (GCM-SIV CTR form)
;        r9  -> rcx = AES-256 key schedule (15 round keys)
;        [40+rsp] -> r8 = message length in bytes
; Out:   [rsi..] = [rdi..] XOR AES-CTR keystream; counter dword 0
;                  increments by 1 per block.
; Clobb: xmm0-xmm8, xmm12, xmm15, r8, r10, rax
; Notes: r8 becomes ceil(len/16) blocks; r10 = that count mod 4
;        (remainder blocks); the main loop runs count/4 times.
;-----------------------------------------------------------------------
ALIGN	16
aes256gcmsiv_enc_msg_x4:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aes256gcmsiv_enc_msg_x4:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]



_CET_ENDBR
	test	r8,r8
	jnz	NEAR $L$256_enc_msg_x4_start
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$256_enc_msg_x4_start:
; r8 = number of 16-byte blocks, rounding a partial tail block up.
	mov	r10,r8
	shr	r8,4
	shl	r10,60
	jz	NEAR $L$256_enc_msg_x4_start2
	add	r8,1

$L$256_enc_msg_x4_start2:
; r10 = block count mod 4 (blocks left over for the single-block loop).
	mov	r10,r8
	shl	r10,62
	shr	r10,62


	vmovdqa	xmm15,XMMWORD[rdx]
	vpor	xmm15,xmm15,XMMWORD[OR_MASK]

; Four staggered counters: xmm0..xmm3 = ctr+0..ctr+3; xmm4 = +4 stride.
	vmovdqa	xmm4,XMMWORD[four]
	vmovdqa	xmm0,xmm15
	vpaddd	xmm1,xmm15,XMMWORD[one]
	vpaddd	xmm2,xmm15,XMMWORD[two]
	vpaddd	xmm3,xmm15,XMMWORD[three]

	shr	r8,2
	je	NEAR $L$256_enc_msg_x4_check_remainder

; Bias pointers down so the loop can bump them at its top.
	sub	rsi,64
	sub	rdi,64

$L$256_enc_msg_x4_loop1:
	add	rsi,64
	add	rdi,64

	vmovdqa	xmm5,xmm0
	vmovdqa	xmm6,xmm1
	vmovdqa	xmm7,xmm2
	vmovdqa	xmm8,xmm3

; Whitening round for all four lanes.
	vpxor	xmm5,xmm5,XMMWORD[rcx]
	vpxor	xmm6,xmm6,XMMWORD[rcx]
	vpxor	xmm7,xmm7,XMMWORD[rcx]
	vpxor	xmm8,xmm8,XMMWORD[rcx]

	vmovdqu	xmm12,XMMWORD[16+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

; Counter updates (+4 each) are interleaved with the AES rounds to
; hide their latency.
	vpaddd	xmm0,xmm0,xmm4
	vmovdqu	xmm12,XMMWORD[32+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vpaddd	xmm1,xmm1,xmm4
	vmovdqu	xmm12,XMMWORD[48+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vpaddd	xmm2,xmm2,xmm4
	vmovdqu	xmm12,XMMWORD[64+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vpaddd	xmm3,xmm3,xmm4

	vmovdqu	xmm12,XMMWORD[80+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[96+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[112+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[128+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[144+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[160+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[176+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[192+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[208+rcx]
	vaesenc	xmm5,xmm5,xmm12
	vaesenc	xmm6,xmm6,xmm12
	vaesenc	xmm7,xmm7,xmm12
	vaesenc	xmm8,xmm8,xmm12

	vmovdqu	xmm12,XMMWORD[224+rcx]
	vaesenclast	xmm5,xmm5,xmm12
	vaesenclast	xmm6,xmm6,xmm12
	vaesenclast	xmm7,xmm7,xmm12
	vaesenclast	xmm8,xmm8,xmm12


; XOR keystream with plaintext and store ciphertext.
	vpxor	xmm5,xmm5,XMMWORD[rdi]
	vpxor	xmm6,xmm6,XMMWORD[16+rdi]
	vpxor	xmm7,xmm7,XMMWORD[32+rdi]
	vpxor	xmm8,xmm8,XMMWORD[48+rdi]

	sub	r8,1

	vmovdqu	XMMWORD[rsi],xmm5
	vmovdqu	XMMWORD[16+rsi],xmm6
	vmovdqu	XMMWORD[32+rsi],xmm7
	vmovdqu	XMMWORD[48+rsi],xmm8

	jne	NEAR $L$256_enc_msg_x4_loop1

	add	rsi,64
	add	rdi,64

$L$256_enc_msg_x4_check_remainder:
	cmp	r10,0
	je	NEAR $L$256_enc_msg_x4_out

; Single-block tail loop for the remaining r10 (< 4) blocks.
$L$256_enc_msg_x4_loop2:



	vmovdqa	xmm5,xmm0
	vpaddd	xmm0,xmm0,XMMWORD[one]
	vpxor	xmm5,xmm5,XMMWORD[rcx]
	vaesenc	xmm5,xmm5,XMMWORD[16+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[32+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[48+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[64+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[80+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[96+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[112+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[128+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[144+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[160+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[176+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[192+rcx]
	vaesenc	xmm5,xmm5,XMMWORD[208+rcx]
	vaesenclast	xmm5,xmm5,XMMWORD[224+rcx]


	vpxor	xmm5,xmm5,XMMWORD[rdi]

	vmovdqu	XMMWORD[rsi],xmm5

	add	rdi,16
	add	rsi,16

	sub	r10,1
	jne	NEAR $L$256_enc_msg_x4_loop2

$L$256_enc_msg_x4_out:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$SEH_end_aes256gcmsiv_enc_msg_x4:
global	aes256gcmsiv_enc_msg_x8

;-----------------------------------------------------------------------
; aes256gcmsiv_enc_msg_x8(in, out, ctr_block, key_schedule, len)
; AES-256 counter-mode encryption, processing 8 blocks per iteration.
; ABI:   Win64 entry; args are moved into SysV registers below.
; In:    rcx -> rdi = input (plaintext)
;        rdx -> rsi = output (ciphertext)
;        r8  -> rdx = initial counter block (16 bytes, aligned); its top
;                     dword gets bit 31 set via OR_MASK (GCM-SIV CTR form)
;        r9  -> rcx = AES-256 key schedule (15 round keys)
;        [40+rsp] -> r8 = message length in bytes
; Out:   [rsi..] = [rdi..] XOR AES-CTR keystream; counter dword 0
;                  increments by 1 per block.
; Clobb: xmm0-xmm15, r8, r10, r11, rax
; Notes: seven counters live in xmm0, xmm9-xmm14; the eighth (ctr+7) is
;        kept in a 64-byte-aligned stack slot at r11 (carved below rsp,
;        no rsp adjustment). r10 = block count mod 8 (tail blocks).
;-----------------------------------------------------------------------
ALIGN	16
aes256gcmsiv_enc_msg_x8:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_aes256gcmsiv_enc_msg_x8:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]



_CET_ENDBR
	test	r8,r8
	jnz	NEAR $L$256_enc_msg_x8_start
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret

$L$256_enc_msg_x8_start:

; r11 = 64-byte-aligned scratch slot below rsp for the 8th counter.
	mov	r11,rsp
	sub	r11,16
	and	r11,-64

; r8 = number of 16-byte blocks, rounding a partial tail block up.
	mov	r10,r8
	shr	r8,4
	shl	r10,60
	jz	NEAR $L$256_enc_msg_x8_start2
	add	r8,1

$L$256_enc_msg_x8_start2:
; r10 = block count mod 8 (blocks left over for the single-block loop).
	mov	r10,r8
	shl	r10,61
	shr	r10,61


	vmovdqa	xmm1,XMMWORD[rdx]
	vpor	xmm1,xmm1,XMMWORD[OR_MASK]

; Eight staggered counters: xmm0 = ctr, xmm9..xmm14 = ctr+1..+6,
; [r11] = ctr+7.
	vpaddd	xmm0,xmm1,XMMWORD[seven]
	vmovdqa	XMMWORD[r11],xmm0
	vpaddd	xmm9,xmm1,XMMWORD[one]
	vpaddd	xmm10,xmm1,XMMWORD[two]
	vpaddd	xmm11,xmm1,XMMWORD[three]
	vpaddd	xmm12,xmm1,XMMWORD[four]
	vpaddd	xmm13,xmm1,XMMWORD[five]
	vpaddd	xmm14,xmm1,XMMWORD[six]
	vmovdqa	xmm0,xmm1

	shr	r8,3
	jz	NEAR $L$256_enc_msg_x8_check_remainder

; Bias pointers down so the loop can bump them at its top.
	sub	rsi,128
	sub	rdi,128

$L$256_enc_msg_x8_loop1:
	add	rsi,128
	add	rdi,128

	vmovdqa	xmm1,xmm0
	vmovdqa	xmm2,xmm9
	vmovdqa	xmm3,xmm10
	vmovdqa	xmm4,xmm11
	vmovdqa	xmm5,xmm12
	vmovdqa	xmm6,xmm13
	vmovdqa	xmm7,xmm14

	vmovdqa	xmm8,XMMWORD[r11]

; Whitening round for all eight lanes.
	vpxor	xmm1,xmm1,XMMWORD[rcx]
	vpxor	xmm2,xmm2,XMMWORD[rcx]
	vpxor	xmm3,xmm3,XMMWORD[rcx]
	vpxor	xmm4,xmm4,XMMWORD[rcx]
	vpxor	xmm5,xmm5,XMMWORD[rcx]
	vpxor	xmm6,xmm6,XMMWORD[rcx]
	vpxor	xmm7,xmm7,XMMWORD[rcx]
	vpxor	xmm8,xmm8,XMMWORD[rcx]

	vmovdqu	xmm15,XMMWORD[16+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

; Counter updates (+8 each) are interleaved with the AES rounds. The
; stack counter is advanced, then xmm14's next value is derived from it
; (+8-1 = +7 ahead of the slot's previous value).
	vmovdqa	xmm14,XMMWORD[r11]
	vpaddd	xmm14,xmm14,XMMWORD[eight]
	vmovdqa	XMMWORD[r11],xmm14
	vmovdqu	xmm15,XMMWORD[32+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpsubd	xmm14,xmm14,XMMWORD[one]
	vmovdqu	xmm15,XMMWORD[48+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpaddd	xmm0,xmm0,XMMWORD[eight]
	vmovdqu	xmm15,XMMWORD[64+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpaddd	xmm9,xmm9,XMMWORD[eight]
	vmovdqu	xmm15,XMMWORD[80+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpaddd	xmm10,xmm10,XMMWORD[eight]
	vmovdqu	xmm15,XMMWORD[96+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpaddd	xmm11,xmm11,XMMWORD[eight]
	vmovdqu	xmm15,XMMWORD[112+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpaddd	xmm12,xmm12,XMMWORD[eight]
	vmovdqu	xmm15,XMMWORD[128+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vpaddd	xmm13,xmm13,XMMWORD[eight]
	vmovdqu	xmm15,XMMWORD[144+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vmovdqu	xmm15,XMMWORD[160+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vmovdqu	xmm15,XMMWORD[176+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vmovdqu	xmm15,XMMWORD[192+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vmovdqu	xmm15,XMMWORD[208+rcx]
	vaesenc	xmm1,xmm1,xmm15
	vaesenc	xmm2,xmm2,xmm15
	vaesenc	xmm3,xmm3,xmm15
	vaesenc	xmm4,xmm4,xmm15
	vaesenc	xmm5,xmm5,xmm15
	vaesenc	xmm6,xmm6,xmm15
	vaesenc	xmm7,xmm7,xmm15
	vaesenc	xmm8,xmm8,xmm15

	vmovdqu	xmm15,XMMWORD[224+rcx]
	vaesenclast	xmm1,xmm1,xmm15
	vaesenclast	xmm2,xmm2,xmm15
	vaesenclast	xmm3,xmm3,xmm15
	vaesenclast	xmm4,xmm4,xmm15
	vaesenclast	xmm5,xmm5,xmm15
	vaesenclast	xmm6,xmm6,xmm15
	vaesenclast	xmm7,xmm7,xmm15
	vaesenclast	xmm8,xmm8,xmm15


; XOR keystream with plaintext and store ciphertext.
	vpxor	xmm1,xmm1,XMMWORD[rdi]
	vpxor	xmm2,xmm2,XMMWORD[16+rdi]
	vpxor	xmm3,xmm3,XMMWORD[32+rdi]
	vpxor	xmm4,xmm4,XMMWORD[48+rdi]
	vpxor	xmm5,xmm5,XMMWORD[64+rdi]
	vpxor	xmm6,xmm6,XMMWORD[80+rdi]
	vpxor	xmm7,xmm7,XMMWORD[96+rdi]
	vpxor	xmm8,xmm8,XMMWORD[112+rdi]

	sub	r8,1

	vmovdqu	XMMWORD[rsi],xmm1
	vmovdqu	XMMWORD[16+rsi],xmm2
	vmovdqu	XMMWORD[32+rsi],xmm3
	vmovdqu	XMMWORD[48+rsi],xmm4
	vmovdqu	XMMWORD[64+rsi],xmm5
	vmovdqu	XMMWORD[80+rsi],xmm6
	vmovdqu	XMMWORD[96+rsi],xmm7
	vmovdqu	XMMWORD[112+rsi],xmm8

	jne	NEAR $L$256_enc_msg_x8_loop1

	add	rsi,128
	add	rdi,128

$L$256_enc_msg_x8_check_remainder:
	cmp	r10,0
	je	NEAR $L$256_enc_msg_x8_out

; Single-block tail loop for the remaining r10 (< 8) blocks.
$L$256_enc_msg_x8_loop2:


	vmovdqa	xmm1,xmm0
	vpaddd	xmm0,xmm0,XMMWORD[one]

	vpxor	xmm1,xmm1,XMMWORD[rcx]
	vaesenc	xmm1,xmm1,XMMWORD[16+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[32+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[48+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[64+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[80+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[96+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[112+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[128+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[144+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[160+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[176+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[192+rcx]
	vaesenc	xmm1,xmm1,XMMWORD[208+rcx]
	vaesenclast	xmm1,xmm1,XMMWORD[224+rcx]


	vpxor	xmm1,xmm1,XMMWORD[rdi]

	vmovdqu	XMMWORD[rsi],xmm1

	add	rdi,16
	add	rsi,16
	sub	r10,1
	jnz	NEAR $L$256_enc_msg_x8_loop2

$L$256_enc_msg_x8_out:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	ret


$L$SEH_end_aes256gcmsiv_enc_msg_x8:
2560global	aes256gcmsiv_dec
2561
2562ALIGN	16
2563aes256gcmsiv_dec:
2564	mov	QWORD[8+rsp],rdi	;WIN64 prologue
2565	mov	QWORD[16+rsp],rsi
2566	mov	rax,rsp
2567$L$SEH_begin_aes256gcmsiv_dec:
2568	mov	rdi,rcx
2569	mov	rsi,rdx
2570	mov	rdx,r8
2571	mov	rcx,r9
2572	mov	r8,QWORD[40+rsp]
2573	mov	r9,QWORD[48+rsp]
2574
2575
2576
2577_CET_ENDBR
2578	test	r9,~15
2579	jnz	NEAR $L$256_dec_start
2580	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
2581	mov	rsi,QWORD[16+rsp]
2582	ret
2583
2584$L$256_dec_start:
2585	vzeroupper
2586	vmovdqa	xmm0,XMMWORD[rdx]
2587
2588
2589	vmovdqu	xmm15,XMMWORD[16+rdx]
2590	vpor	xmm15,xmm15,XMMWORD[OR_MASK]
2591	mov	rax,rdx
2592
2593	lea	rax,[32+rax]
2594	lea	rcx,[32+rcx]
2595
2596	and	r9,~15
2597
2598
2599	cmp	r9,96
2600	jb	NEAR $L$256_dec_loop2
2601
2602
2603	sub	r9,96
2604	vmovdqa	xmm7,xmm15
2605	vpaddd	xmm8,xmm7,XMMWORD[one]
2606	vpaddd	xmm9,xmm7,XMMWORD[two]
2607	vpaddd	xmm10,xmm9,XMMWORD[one]
2608	vpaddd	xmm11,xmm9,XMMWORD[two]
2609	vpaddd	xmm12,xmm11,XMMWORD[one]
2610	vpaddd	xmm15,xmm11,XMMWORD[two]
2611
2612	vpxor	xmm7,xmm7,XMMWORD[r8]
2613	vpxor	xmm8,xmm8,XMMWORD[r8]
2614	vpxor	xmm9,xmm9,XMMWORD[r8]
2615	vpxor	xmm10,xmm10,XMMWORD[r8]
2616	vpxor	xmm11,xmm11,XMMWORD[r8]
2617	vpxor	xmm12,xmm12,XMMWORD[r8]
2618
2619	vmovdqu	xmm4,XMMWORD[16+r8]
2620	vaesenc	xmm7,xmm7,xmm4
2621	vaesenc	xmm8,xmm8,xmm4
2622	vaesenc	xmm9,xmm9,xmm4
2623	vaesenc	xmm10,xmm10,xmm4
2624	vaesenc	xmm11,xmm11,xmm4
2625	vaesenc	xmm12,xmm12,xmm4
2626
2627	vmovdqu	xmm4,XMMWORD[32+r8]
2628	vaesenc	xmm7,xmm7,xmm4
2629	vaesenc	xmm8,xmm8,xmm4
2630	vaesenc	xmm9,xmm9,xmm4
2631	vaesenc	xmm10,xmm10,xmm4
2632	vaesenc	xmm11,xmm11,xmm4
2633	vaesenc	xmm12,xmm12,xmm4
2634
2635	vmovdqu	xmm4,XMMWORD[48+r8]
2636	vaesenc	xmm7,xmm7,xmm4
2637	vaesenc	xmm8,xmm8,xmm4
2638	vaesenc	xmm9,xmm9,xmm4
2639	vaesenc	xmm10,xmm10,xmm4
2640	vaesenc	xmm11,xmm11,xmm4
2641	vaesenc	xmm12,xmm12,xmm4
2642
2643	vmovdqu	xmm4,XMMWORD[64+r8]
2644	vaesenc	xmm7,xmm7,xmm4
2645	vaesenc	xmm8,xmm8,xmm4
2646	vaesenc	xmm9,xmm9,xmm4
2647	vaesenc	xmm10,xmm10,xmm4
2648	vaesenc	xmm11,xmm11,xmm4
2649	vaesenc	xmm12,xmm12,xmm4
2650
2651	vmovdqu	xmm4,XMMWORD[80+r8]
2652	vaesenc	xmm7,xmm7,xmm4
2653	vaesenc	xmm8,xmm8,xmm4
2654	vaesenc	xmm9,xmm9,xmm4
2655	vaesenc	xmm10,xmm10,xmm4
2656	vaesenc	xmm11,xmm11,xmm4
2657	vaesenc	xmm12,xmm12,xmm4
2658
2659	vmovdqu	xmm4,XMMWORD[96+r8]
2660	vaesenc	xmm7,xmm7,xmm4
2661	vaesenc	xmm8,xmm8,xmm4
2662	vaesenc	xmm9,xmm9,xmm4
2663	vaesenc	xmm10,xmm10,xmm4
2664	vaesenc	xmm11,xmm11,xmm4
2665	vaesenc	xmm12,xmm12,xmm4
2666
2667	vmovdqu	xmm4,XMMWORD[112+r8]
2668	vaesenc	xmm7,xmm7,xmm4
2669	vaesenc	xmm8,xmm8,xmm4
2670	vaesenc	xmm9,xmm9,xmm4
2671	vaesenc	xmm10,xmm10,xmm4
2672	vaesenc	xmm11,xmm11,xmm4
2673	vaesenc	xmm12,xmm12,xmm4
2674
2675	vmovdqu	xmm4,XMMWORD[128+r8]
2676	vaesenc	xmm7,xmm7,xmm4
2677	vaesenc	xmm8,xmm8,xmm4
2678	vaesenc	xmm9,xmm9,xmm4
2679	vaesenc	xmm10,xmm10,xmm4
2680	vaesenc	xmm11,xmm11,xmm4
2681	vaesenc	xmm12,xmm12,xmm4
2682
2683	vmovdqu	xmm4,XMMWORD[144+r8]
2684	vaesenc	xmm7,xmm7,xmm4
2685	vaesenc	xmm8,xmm8,xmm4
2686	vaesenc	xmm9,xmm9,xmm4
2687	vaesenc	xmm10,xmm10,xmm4
2688	vaesenc	xmm11,xmm11,xmm4
2689	vaesenc	xmm12,xmm12,xmm4
2690
2691	vmovdqu	xmm4,XMMWORD[160+r8]
2692	vaesenc	xmm7,xmm7,xmm4
2693	vaesenc	xmm8,xmm8,xmm4
2694	vaesenc	xmm9,xmm9,xmm4
2695	vaesenc	xmm10,xmm10,xmm4
2696	vaesenc	xmm11,xmm11,xmm4
2697	vaesenc	xmm12,xmm12,xmm4
2698
2699	vmovdqu	xmm4,XMMWORD[176+r8]
2700	vaesenc	xmm7,xmm7,xmm4
2701	vaesenc	xmm8,xmm8,xmm4
2702	vaesenc	xmm9,xmm9,xmm4
2703	vaesenc	xmm10,xmm10,xmm4
2704	vaesenc	xmm11,xmm11,xmm4
2705	vaesenc	xmm12,xmm12,xmm4
2706
2707	vmovdqu	xmm4,XMMWORD[192+r8]
2708	vaesenc	xmm7,xmm7,xmm4
2709	vaesenc	xmm8,xmm8,xmm4
2710	vaesenc	xmm9,xmm9,xmm4
2711	vaesenc	xmm10,xmm10,xmm4
2712	vaesenc	xmm11,xmm11,xmm4
2713	vaesenc	xmm12,xmm12,xmm4
2714
2715	vmovdqu	xmm4,XMMWORD[208+r8]
2716	vaesenc	xmm7,xmm7,xmm4
2717	vaesenc	xmm8,xmm8,xmm4
2718	vaesenc	xmm9,xmm9,xmm4
2719	vaesenc	xmm10,xmm10,xmm4
2720	vaesenc	xmm11,xmm11,xmm4
2721	vaesenc	xmm12,xmm12,xmm4
2722
2723	vmovdqu	xmm4,XMMWORD[224+r8]
2724	vaesenclast	xmm7,xmm7,xmm4
2725	vaesenclast	xmm8,xmm8,xmm4
2726	vaesenclast	xmm9,xmm9,xmm4
2727	vaesenclast	xmm10,xmm10,xmm4
2728	vaesenclast	xmm11,xmm11,xmm4
2729	vaesenclast	xmm12,xmm12,xmm4
2730
2731
2732	vpxor	xmm7,xmm7,XMMWORD[rdi]
2733	vpxor	xmm8,xmm8,XMMWORD[16+rdi]
2734	vpxor	xmm9,xmm9,XMMWORD[32+rdi]
2735	vpxor	xmm10,xmm10,XMMWORD[48+rdi]
2736	vpxor	xmm11,xmm11,XMMWORD[64+rdi]
2737	vpxor	xmm12,xmm12,XMMWORD[80+rdi]
2738
2739	vmovdqu	XMMWORD[rsi],xmm7
2740	vmovdqu	XMMWORD[16+rsi],xmm8
2741	vmovdqu	XMMWORD[32+rsi],xmm9
2742	vmovdqu	XMMWORD[48+rsi],xmm10
2743	vmovdqu	XMMWORD[64+rsi],xmm11
2744	vmovdqu	XMMWORD[80+rsi],xmm12
2745
2746	add	rdi,96
2747	add	rsi,96
2748	jmp	NEAR $L$256_dec_loop1
2749
2750
2751ALIGN	64
2752$L$256_dec_loop1:
2753	cmp	r9,96
2754	jb	NEAR $L$256_dec_finish_96
2755	sub	r9,96
2756
2757	vmovdqa	xmm6,xmm12
2758	vmovdqa	XMMWORD[(16-32)+rax],xmm11
2759	vmovdqa	XMMWORD[(32-32)+rax],xmm10
2760	vmovdqa	XMMWORD[(48-32)+rax],xmm9
2761	vmovdqa	XMMWORD[(64-32)+rax],xmm8
2762	vmovdqa	XMMWORD[(80-32)+rax],xmm7
2763
2764	vmovdqa	xmm7,xmm15
2765	vpaddd	xmm8,xmm7,XMMWORD[one]
2766	vpaddd	xmm9,xmm7,XMMWORD[two]
2767	vpaddd	xmm10,xmm9,XMMWORD[one]
2768	vpaddd	xmm11,xmm9,XMMWORD[two]
2769	vpaddd	xmm12,xmm11,XMMWORD[one]
2770	vpaddd	xmm15,xmm11,XMMWORD[two]
2771
2772	vmovdqa	xmm4,XMMWORD[r8]
2773	vpxor	xmm7,xmm7,xmm4
2774	vpxor	xmm8,xmm8,xmm4
2775	vpxor	xmm9,xmm9,xmm4
2776	vpxor	xmm10,xmm10,xmm4
2777	vpxor	xmm11,xmm11,xmm4
2778	vpxor	xmm12,xmm12,xmm4
2779
2780	vmovdqu	xmm4,XMMWORD[((0-32))+rcx]
2781	vpclmulqdq	xmm2,xmm6,xmm4,0x11
2782	vpclmulqdq	xmm3,xmm6,xmm4,0x00
2783	vpclmulqdq	xmm1,xmm6,xmm4,0x01
2784	vpclmulqdq	xmm4,xmm6,xmm4,0x10
2785	vpxor	xmm1,xmm1,xmm4
2786
2787	vmovdqu	xmm4,XMMWORD[16+r8]
2788	vaesenc	xmm7,xmm7,xmm4
2789	vaesenc	xmm8,xmm8,xmm4
2790	vaesenc	xmm9,xmm9,xmm4
2791	vaesenc	xmm10,xmm10,xmm4
2792	vaesenc	xmm11,xmm11,xmm4
2793	vaesenc	xmm12,xmm12,xmm4
2794
2795	vmovdqu	xmm6,XMMWORD[((-16))+rax]
2796	vmovdqu	xmm13,XMMWORD[((-16))+rcx]
2797
2798	vpclmulqdq	xmm4,xmm6,xmm13,0x10
2799	vpxor	xmm1,xmm1,xmm4
2800	vpclmulqdq	xmm4,xmm6,xmm13,0x11
2801	vpxor	xmm2,xmm2,xmm4
2802	vpclmulqdq	xmm4,xmm6,xmm13,0x00
2803	vpxor	xmm3,xmm3,xmm4
2804	vpclmulqdq	xmm4,xmm6,xmm13,0x01
2805	vpxor	xmm1,xmm1,xmm4
2806
2807
2808	vmovdqu	xmm4,XMMWORD[32+r8]
2809	vaesenc	xmm7,xmm7,xmm4
2810	vaesenc	xmm8,xmm8,xmm4
2811	vaesenc	xmm9,xmm9,xmm4
2812	vaesenc	xmm10,xmm10,xmm4
2813	vaesenc	xmm11,xmm11,xmm4
2814	vaesenc	xmm12,xmm12,xmm4
2815
2816	vmovdqu	xmm6,XMMWORD[rax]
2817	vmovdqu	xmm13,XMMWORD[rcx]
2818
2819	vpclmulqdq	xmm4,xmm6,xmm13,0x10
2820	vpxor	xmm1,xmm1,xmm4
2821	vpclmulqdq	xmm4,xmm6,xmm13,0x11
2822	vpxor	xmm2,xmm2,xmm4
2823	vpclmulqdq	xmm4,xmm6,xmm13,0x00
2824	vpxor	xmm3,xmm3,xmm4
2825	vpclmulqdq	xmm4,xmm6,xmm13,0x01
2826	vpxor	xmm1,xmm1,xmm4
2827
2828
2829	vmovdqu	xmm4,XMMWORD[48+r8]
2830	vaesenc	xmm7,xmm7,xmm4
2831	vaesenc	xmm8,xmm8,xmm4
2832	vaesenc	xmm9,xmm9,xmm4
2833	vaesenc	xmm10,xmm10,xmm4
2834	vaesenc	xmm11,xmm11,xmm4
2835	vaesenc	xmm12,xmm12,xmm4
2836
2837	vmovdqu	xmm6,XMMWORD[16+rax]
2838	vmovdqu	xmm13,XMMWORD[16+rcx]
2839
2840	vpclmulqdq	xmm4,xmm6,xmm13,0x10
2841	vpxor	xmm1,xmm1,xmm4
2842	vpclmulqdq	xmm4,xmm6,xmm13,0x11
2843	vpxor	xmm2,xmm2,xmm4
2844	vpclmulqdq	xmm4,xmm6,xmm13,0x00
2845	vpxor	xmm3,xmm3,xmm4
2846	vpclmulqdq	xmm4,xmm6,xmm13,0x01
2847	vpxor	xmm1,xmm1,xmm4
2848
2849
2850	vmovdqu	xmm4,XMMWORD[64+r8]
2851	vaesenc	xmm7,xmm7,xmm4
2852	vaesenc	xmm8,xmm8,xmm4
2853	vaesenc	xmm9,xmm9,xmm4
2854	vaesenc	xmm10,xmm10,xmm4
2855	vaesenc	xmm11,xmm11,xmm4
2856	vaesenc	xmm12,xmm12,xmm4
2857
2858	vmovdqu	xmm6,XMMWORD[32+rax]
2859	vmovdqu	xmm13,XMMWORD[32+rcx]
2860
2861	vpclmulqdq	xmm4,xmm6,xmm13,0x10
2862	vpxor	xmm1,xmm1,xmm4
2863	vpclmulqdq	xmm4,xmm6,xmm13,0x11
2864	vpxor	xmm2,xmm2,xmm4
2865	vpclmulqdq	xmm4,xmm6,xmm13,0x00
2866	vpxor	xmm3,xmm3,xmm4
2867	vpclmulqdq	xmm4,xmm6,xmm13,0x01
2868	vpxor	xmm1,xmm1,xmm4
2869
2870
2871	vmovdqu	xmm4,XMMWORD[80+r8]
2872	vaesenc	xmm7,xmm7,xmm4
2873	vaesenc	xmm8,xmm8,xmm4
2874	vaesenc	xmm9,xmm9,xmm4
2875	vaesenc	xmm10,xmm10,xmm4
2876	vaesenc	xmm11,xmm11,xmm4
2877	vaesenc	xmm12,xmm12,xmm4
2878
2879	vmovdqu	xmm4,XMMWORD[96+r8]
2880	vaesenc	xmm7,xmm7,xmm4
2881	vaesenc	xmm8,xmm8,xmm4
2882	vaesenc	xmm9,xmm9,xmm4
2883	vaesenc	xmm10,xmm10,xmm4
2884	vaesenc	xmm11,xmm11,xmm4
2885	vaesenc	xmm12,xmm12,xmm4
2886
2887	vmovdqu	xmm4,XMMWORD[112+r8]
2888	vaesenc	xmm7,xmm7,xmm4
2889	vaesenc	xmm8,xmm8,xmm4
2890	vaesenc	xmm9,xmm9,xmm4
2891	vaesenc	xmm10,xmm10,xmm4
2892	vaesenc	xmm11,xmm11,xmm4
2893	vaesenc	xmm12,xmm12,xmm4
2894
2895
2896	vmovdqa	xmm6,XMMWORD[((80-32))+rax]
2897	vpxor	xmm6,xmm6,xmm0
2898	vmovdqu	xmm5,XMMWORD[((80-32))+rcx]
2899
2900	vpclmulqdq	xmm4,xmm6,xmm5,0x01
2901	vpxor	xmm1,xmm1,xmm4
2902	vpclmulqdq	xmm4,xmm6,xmm5,0x11
2903	vpxor	xmm2,xmm2,xmm4
2904	vpclmulqdq	xmm4,xmm6,xmm5,0x00
2905	vpxor	xmm3,xmm3,xmm4
2906	vpclmulqdq	xmm4,xmm6,xmm5,0x10
2907	vpxor	xmm1,xmm1,xmm4
2908
2909	vmovdqu	xmm4,XMMWORD[128+r8]
2910	vaesenc	xmm7,xmm7,xmm4
2911	vaesenc	xmm8,xmm8,xmm4
2912	vaesenc	xmm9,xmm9,xmm4
2913	vaesenc	xmm10,xmm10,xmm4
2914	vaesenc	xmm11,xmm11,xmm4
2915	vaesenc	xmm12,xmm12,xmm4
2916
2917
2918	vpsrldq	xmm4,xmm1,8
2919	vpxor	xmm5,xmm2,xmm4
2920	vpslldq	xmm4,xmm1,8
2921	vpxor	xmm0,xmm3,xmm4
2922
2923	vmovdqa	xmm3,XMMWORD[poly]
2924
2925	vmovdqu	xmm4,XMMWORD[144+r8]
2926	vaesenc	xmm7,xmm7,xmm4
2927	vaesenc	xmm8,xmm8,xmm4
2928	vaesenc	xmm9,xmm9,xmm4
2929	vaesenc	xmm10,xmm10,xmm4
2930	vaesenc	xmm11,xmm11,xmm4
2931	vaesenc	xmm12,xmm12,xmm4
2932
2933	vmovdqu	xmm4,XMMWORD[160+r8]
2934	vaesenc	xmm7,xmm7,xmm4
2935	vaesenc	xmm8,xmm8,xmm4
2936	vaesenc	xmm9,xmm9,xmm4
2937	vaesenc	xmm10,xmm10,xmm4
2938	vaesenc	xmm11,xmm11,xmm4
2939	vaesenc	xmm12,xmm12,xmm4
2940
2941	vmovdqu	xmm4,XMMWORD[176+r8]
2942	vaesenc	xmm7,xmm7,xmm4
2943	vaesenc	xmm8,xmm8,xmm4
2944	vaesenc	xmm9,xmm9,xmm4
2945	vaesenc	xmm10,xmm10,xmm4
2946	vaesenc	xmm11,xmm11,xmm4
2947	vaesenc	xmm12,xmm12,xmm4
2948
2949	vmovdqu	xmm4,XMMWORD[192+r8]
2950	vaesenc	xmm7,xmm7,xmm4
2951	vaesenc	xmm8,xmm8,xmm4
2952	vaesenc	xmm9,xmm9,xmm4
2953	vaesenc	xmm10,xmm10,xmm4
2954	vaesenc	xmm11,xmm11,xmm4
2955	vaesenc	xmm12,xmm12,xmm4
2956
2957	vmovdqu	xmm4,XMMWORD[208+r8]
2958	vaesenc	xmm7,xmm7,xmm4
2959	vaesenc	xmm8,xmm8,xmm4
2960	vaesenc	xmm9,xmm9,xmm4
2961	vaesenc	xmm10,xmm10,xmm4
2962	vaesenc	xmm11,xmm11,xmm4
2963	vaesenc	xmm12,xmm12,xmm4
2964
2965	vmovdqu	xmm6,XMMWORD[224+r8]
2966	vpalignr	xmm2,xmm0,xmm0,8
2967	vpclmulqdq	xmm0,xmm0,xmm3,0x10
2968	vpxor	xmm0,xmm2,xmm0
2969
2970	vpxor	xmm4,xmm6,XMMWORD[rdi]
2971	vaesenclast	xmm7,xmm7,xmm4
2972	vpxor	xmm4,xmm6,XMMWORD[16+rdi]
2973	vaesenclast	xmm8,xmm8,xmm4
2974	vpxor	xmm4,xmm6,XMMWORD[32+rdi]
2975	vaesenclast	xmm9,xmm9,xmm4
2976	vpxor	xmm4,xmm6,XMMWORD[48+rdi]
2977	vaesenclast	xmm10,xmm10,xmm4
2978	vpxor	xmm4,xmm6,XMMWORD[64+rdi]
2979	vaesenclast	xmm11,xmm11,xmm4
2980	vpxor	xmm4,xmm6,XMMWORD[80+rdi]
2981	vaesenclast	xmm12,xmm12,xmm4
2982
2983	vpalignr	xmm2,xmm0,xmm0,8
2984	vpclmulqdq	xmm0,xmm0,xmm3,0x10
2985	vpxor	xmm0,xmm2,xmm0
2986
2987	vmovdqu	XMMWORD[rsi],xmm7
2988	vmovdqu	XMMWORD[16+rsi],xmm8
2989	vmovdqu	XMMWORD[32+rsi],xmm9
2990	vmovdqu	XMMWORD[48+rsi],xmm10
2991	vmovdqu	XMMWORD[64+rsi],xmm11
2992	vmovdqu	XMMWORD[80+rsi],xmm12
2993
2994	vpxor	xmm0,xmm0,xmm5
2995
2996	lea	rdi,[96+rdi]
2997	lea	rsi,[96+rsi]
2998	jmp	NEAR $L$256_dec_loop1
2999
3000$L$256_dec_finish_96:
3001	vmovdqa	xmm6,xmm12
3002	vmovdqa	XMMWORD[(16-32)+rax],xmm11
3003	vmovdqa	XMMWORD[(32-32)+rax],xmm10
3004	vmovdqa	XMMWORD[(48-32)+rax],xmm9
3005	vmovdqa	XMMWORD[(64-32)+rax],xmm8
3006	vmovdqa	XMMWORD[(80-32)+rax],xmm7
3007
3008	vmovdqu	xmm4,XMMWORD[((0-32))+rcx]
3009	vpclmulqdq	xmm1,xmm6,xmm4,0x10
3010	vpclmulqdq	xmm2,xmm6,xmm4,0x11
3011	vpclmulqdq	xmm3,xmm6,xmm4,0x00
3012	vpclmulqdq	xmm4,xmm6,xmm4,0x01
3013	vpxor	xmm1,xmm1,xmm4
3014
3015	vmovdqu	xmm6,XMMWORD[((-16))+rax]
3016	vmovdqu	xmm13,XMMWORD[((-16))+rcx]
3017
3018	vpclmulqdq	xmm4,xmm6,xmm13,0x10
3019	vpxor	xmm1,xmm1,xmm4
3020	vpclmulqdq	xmm4,xmm6,xmm13,0x11
3021	vpxor	xmm2,xmm2,xmm4
3022	vpclmulqdq	xmm4,xmm6,xmm13,0x00
3023	vpxor	xmm3,xmm3,xmm4
3024	vpclmulqdq	xmm4,xmm6,xmm13,0x01
3025	vpxor	xmm1,xmm1,xmm4
3026
3027	vmovdqu	xmm6,XMMWORD[rax]
3028	vmovdqu	xmm13,XMMWORD[rcx]
3029
3030	vpclmulqdq	xmm4,xmm6,xmm13,0x10
3031	vpxor	xmm1,xmm1,xmm4
3032	vpclmulqdq	xmm4,xmm6,xmm13,0x11
3033	vpxor	xmm2,xmm2,xmm4
3034	vpclmulqdq	xmm4,xmm6,xmm13,0x00
3035	vpxor	xmm3,xmm3,xmm4
3036	vpclmulqdq	xmm4,xmm6,xmm13,0x01
3037	vpxor	xmm1,xmm1,xmm4
3038
3039	vmovdqu	xmm6,XMMWORD[16+rax]
3040	vmovdqu	xmm13,XMMWORD[16+rcx]
3041
3042	vpclmulqdq	xmm4,xmm6,xmm13,0x10
3043	vpxor	xmm1,xmm1,xmm4
3044	vpclmulqdq	xmm4,xmm6,xmm13,0x11
3045	vpxor	xmm2,xmm2,xmm4
3046	vpclmulqdq	xmm4,xmm6,xmm13,0x00
3047	vpxor	xmm3,xmm3,xmm4
3048	vpclmulqdq	xmm4,xmm6,xmm13,0x01
3049	vpxor	xmm1,xmm1,xmm4
3050
3051	vmovdqu	xmm6,XMMWORD[32+rax]
3052	vmovdqu	xmm13,XMMWORD[32+rcx]
3053
3054	vpclmulqdq	xmm4,xmm6,xmm13,0x10
3055	vpxor	xmm1,xmm1,xmm4
3056	vpclmulqdq	xmm4,xmm6,xmm13,0x11
3057	vpxor	xmm2,xmm2,xmm4
3058	vpclmulqdq	xmm4,xmm6,xmm13,0x00
3059	vpxor	xmm3,xmm3,xmm4
3060	vpclmulqdq	xmm4,xmm6,xmm13,0x01
3061	vpxor	xmm1,xmm1,xmm4
3062
3063
3064	vmovdqu	xmm6,XMMWORD[((80-32))+rax]
3065	vpxor	xmm6,xmm6,xmm0
3066	vmovdqu	xmm5,XMMWORD[((80-32))+rcx]
3067	vpclmulqdq	xmm4,xmm6,xmm5,0x11
3068	vpxor	xmm2,xmm2,xmm4
3069	vpclmulqdq	xmm4,xmm6,xmm5,0x00
3070	vpxor	xmm3,xmm3,xmm4
3071	vpclmulqdq	xmm4,xmm6,xmm5,0x10
3072	vpxor	xmm1,xmm1,xmm4
3073	vpclmulqdq	xmm4,xmm6,xmm5,0x01
3074	vpxor	xmm1,xmm1,xmm4
3075
3076	vpsrldq	xmm4,xmm1,8
3077	vpxor	xmm5,xmm2,xmm4
3078	vpslldq	xmm4,xmm1,8
3079	vpxor	xmm0,xmm3,xmm4
3080
3081	vmovdqa	xmm3,XMMWORD[poly]
3082
3083	vpalignr	xmm2,xmm0,xmm0,8
3084	vpclmulqdq	xmm0,xmm0,xmm3,0x10
3085	vpxor	xmm0,xmm2,xmm0
3086
3087	vpalignr	xmm2,xmm0,xmm0,8
3088	vpclmulqdq	xmm0,xmm0,xmm3,0x10
3089	vpxor	xmm0,xmm2,xmm0
3090
3091	vpxor	xmm0,xmm0,xmm5
3092
3093$L$256_dec_loop2:
3094
3095
3096
3097	cmp	r9,16
3098	jb	NEAR $L$256_dec_out
3099	sub	r9,16
3100
3101	vmovdqa	xmm2,xmm15
3102	vpaddd	xmm15,xmm15,XMMWORD[one]
3103
3104	vpxor	xmm2,xmm2,XMMWORD[r8]
3105	vaesenc	xmm2,xmm2,XMMWORD[16+r8]
3106	vaesenc	xmm2,xmm2,XMMWORD[32+r8]
3107	vaesenc	xmm2,xmm2,XMMWORD[48+r8]
3108	vaesenc	xmm2,xmm2,XMMWORD[64+r8]
3109	vaesenc	xmm2,xmm2,XMMWORD[80+r8]
3110	vaesenc	xmm2,xmm2,XMMWORD[96+r8]
3111	vaesenc	xmm2,xmm2,XMMWORD[112+r8]
3112	vaesenc	xmm2,xmm2,XMMWORD[128+r8]
3113	vaesenc	xmm2,xmm2,XMMWORD[144+r8]
3114	vaesenc	xmm2,xmm2,XMMWORD[160+r8]
3115	vaesenc	xmm2,xmm2,XMMWORD[176+r8]
3116	vaesenc	xmm2,xmm2,XMMWORD[192+r8]
3117	vaesenc	xmm2,xmm2,XMMWORD[208+r8]
3118	vaesenclast	xmm2,xmm2,XMMWORD[224+r8]
3119	vpxor	xmm2,xmm2,XMMWORD[rdi]
3120	vmovdqu	XMMWORD[rsi],xmm2
3121	add	rdi,16
3122	add	rsi,16
3123
3124	vpxor	xmm0,xmm0,xmm2
3125	vmovdqa	xmm1,XMMWORD[((-32))+rcx]
3126	call	GFMUL
3127
3128	jmp	NEAR $L$256_dec_loop2
3129
3130$L$256_dec_out:
3131	vmovdqu	XMMWORD[rdx],xmm0
3132	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
3133	mov	rsi,QWORD[16+rsp]
3134	ret
3135
3136$L$SEH_end_aes256gcmsiv_dec:
;-----------------------------------------------------------------------
; aes256gcmsiv_kdf — AES-256-GCM-SIV key-derivation helper.
;
; Win64 ABI entry (args arrive in rcx, rdx, r8 and are moved into the
; SysV-style registers rdi, rsi, rdx that the generated body uses):
;   rcx -> rdi : pointer to a 16-byte block (the nonce/counter seed)
;   rdx -> rsi : output buffer, receives 6 x 16 = 96 bytes
;   r8  -> rdx : AES-256 key schedule (15 round keys at offsets 0..224)
;
; The routine builds six counter blocks from the input block — the base
; block with its first 32-bit lane masked per `and_mask` and its dwords
; permuted by vpshufd 0x90, then five successors obtained by repeatedly
; adding the `one` constant — and runs all six through a full 14-round
; AES-256 encryption in parallel, storing the six ciphertext blocks
; contiguously to the output.
; NOTE(review): per the GCM-SIV construction this derives the record
; authentication/encryption keys from the master key and nonce
; (RFC 8452) — semantics inferred from context; the code itself only
; shows the masked-counter ECB encryption described above.
; Clobbers: rax, xmm1, xmm2, xmm4, xmm6, xmm7, xmm8, xmm11, xmm12, xmm13.
; NOTE(review): xmm6/xmm7/xmm11-xmm13 are callee-saved under Win64 and
; are not preserved here — presumably acceptable for BoringSSL's
; internal callers; confirm against the calling code.
;-----------------------------------------------------------------------
3137global	aes256gcmsiv_kdf
3138
3139ALIGN	16
3140aes256gcmsiv_kdf:
3141	mov	QWORD[8+rsp],rdi	;WIN64 prologue
3142	mov	QWORD[16+rsp],rsi
3143	mov	rax,rsp
3144$L$SEH_begin_aes256gcmsiv_kdf:
3145	mov	rdi,rcx	; rdi = input block (arg1)
3146	mov	rsi,rdx	; rsi = output buffer (arg2)
3147	mov	rdx,r8	; rdx = AES-256 key schedule (arg3)
3148
3149
3150
3151_CET_ENDBR
3152
3153
3154
3155
	; Load round key 0, the seed block, and the two constants.
3156	vmovdqa	xmm1,XMMWORD[rdx]	; xmm1 = round key 0
3157	vmovdqa	xmm4,XMMWORD[rdi]	; xmm4 = input block
3158	vmovdqa	xmm11,XMMWORD[and_mask]	; clears the first dword lane
3159	vmovdqa	xmm8,XMMWORD[one]	; counter increment (+1 in low qword)
3160	vpshufd	xmm4,xmm4,0x90	; permute dwords of the seed block
3161	vpand	xmm4,xmm4,xmm11	; zero the counter lane -> block 0
	; Blocks 1..5 = block 0 plus 1..5 (dword add in the low lane).
3162	vpaddd	xmm6,xmm4,xmm8
3163	vpaddd	xmm7,xmm6,xmm8
3164	vpaddd	xmm11,xmm7,xmm8
3165	vpaddd	xmm12,xmm11,xmm8
3166	vpaddd	xmm13,xmm12,xmm8
3167
	; AddRoundKey: XOR round key 0 into all six blocks.
3168	vpxor	xmm4,xmm4,xmm1
3169	vpxor	xmm6,xmm6,xmm1
3170	vpxor	xmm7,xmm7,xmm1
3171	vpxor	xmm11,xmm11,xmm1
3172	vpxor	xmm12,xmm12,xmm1
3173	vpxor	xmm13,xmm13,xmm1
3174
	; Rounds 1..13: six-way interleaved vaesenc with round keys at
	; rdx+16 .. rdx+208 (alternating xmm1/xmm2 as the key register).
3175	vmovdqa	xmm1,XMMWORD[16+rdx]	; round 1 key
3176	vaesenc	xmm4,xmm4,xmm1
3177	vaesenc	xmm6,xmm6,xmm1
3178	vaesenc	xmm7,xmm7,xmm1
3179	vaesenc	xmm11,xmm11,xmm1
3180	vaesenc	xmm12,xmm12,xmm1
3181	vaesenc	xmm13,xmm13,xmm1
3182
3183	vmovdqa	xmm2,XMMWORD[32+rdx]	; round 2 key
3184	vaesenc	xmm4,xmm4,xmm2
3185	vaesenc	xmm6,xmm6,xmm2
3186	vaesenc	xmm7,xmm7,xmm2
3187	vaesenc	xmm11,xmm11,xmm2
3188	vaesenc	xmm12,xmm12,xmm2
3189	vaesenc	xmm13,xmm13,xmm2
3190
3191	vmovdqa	xmm1,XMMWORD[48+rdx]	; round 3 key
3192	vaesenc	xmm4,xmm4,xmm1
3193	vaesenc	xmm6,xmm6,xmm1
3194	vaesenc	xmm7,xmm7,xmm1
3195	vaesenc	xmm11,xmm11,xmm1
3196	vaesenc	xmm12,xmm12,xmm1
3197	vaesenc	xmm13,xmm13,xmm1
3198
3199	vmovdqa	xmm2,XMMWORD[64+rdx]	; round 4 key
3200	vaesenc	xmm4,xmm4,xmm2
3201	vaesenc	xmm6,xmm6,xmm2
3202	vaesenc	xmm7,xmm7,xmm2
3203	vaesenc	xmm11,xmm11,xmm2
3204	vaesenc	xmm12,xmm12,xmm2
3205	vaesenc	xmm13,xmm13,xmm2
3206
3207	vmovdqa	xmm1,XMMWORD[80+rdx]	; round 5 key
3208	vaesenc	xmm4,xmm4,xmm1
3209	vaesenc	xmm6,xmm6,xmm1
3210	vaesenc	xmm7,xmm7,xmm1
3211	vaesenc	xmm11,xmm11,xmm1
3212	vaesenc	xmm12,xmm12,xmm1
3213	vaesenc	xmm13,xmm13,xmm1
3214
3215	vmovdqa	xmm2,XMMWORD[96+rdx]	; round 6 key
3216	vaesenc	xmm4,xmm4,xmm2
3217	vaesenc	xmm6,xmm6,xmm2
3218	vaesenc	xmm7,xmm7,xmm2
3219	vaesenc	xmm11,xmm11,xmm2
3220	vaesenc	xmm12,xmm12,xmm2
3221	vaesenc	xmm13,xmm13,xmm2
3222
3223	vmovdqa	xmm1,XMMWORD[112+rdx]	; round 7 key
3224	vaesenc	xmm4,xmm4,xmm1
3225	vaesenc	xmm6,xmm6,xmm1
3226	vaesenc	xmm7,xmm7,xmm1
3227	vaesenc	xmm11,xmm11,xmm1
3228	vaesenc	xmm12,xmm12,xmm1
3229	vaesenc	xmm13,xmm13,xmm1
3230
3231	vmovdqa	xmm2,XMMWORD[128+rdx]	; round 8 key
3232	vaesenc	xmm4,xmm4,xmm2
3233	vaesenc	xmm6,xmm6,xmm2
3234	vaesenc	xmm7,xmm7,xmm2
3235	vaesenc	xmm11,xmm11,xmm2
3236	vaesenc	xmm12,xmm12,xmm2
3237	vaesenc	xmm13,xmm13,xmm2
3238
3239	vmovdqa	xmm1,XMMWORD[144+rdx]	; round 9 key
3240	vaesenc	xmm4,xmm4,xmm1
3241	vaesenc	xmm6,xmm6,xmm1
3242	vaesenc	xmm7,xmm7,xmm1
3243	vaesenc	xmm11,xmm11,xmm1
3244	vaesenc	xmm12,xmm12,xmm1
3245	vaesenc	xmm13,xmm13,xmm1
3246
3247	vmovdqa	xmm2,XMMWORD[160+rdx]	; round 10 key
3248	vaesenc	xmm4,xmm4,xmm2
3249	vaesenc	xmm6,xmm6,xmm2
3250	vaesenc	xmm7,xmm7,xmm2
3251	vaesenc	xmm11,xmm11,xmm2
3252	vaesenc	xmm12,xmm12,xmm2
3253	vaesenc	xmm13,xmm13,xmm2
3254
3255	vmovdqa	xmm1,XMMWORD[176+rdx]	; round 11 key
3256	vaesenc	xmm4,xmm4,xmm1
3257	vaesenc	xmm6,xmm6,xmm1
3258	vaesenc	xmm7,xmm7,xmm1
3259	vaesenc	xmm11,xmm11,xmm1
3260	vaesenc	xmm12,xmm12,xmm1
3261	vaesenc	xmm13,xmm13,xmm1
3262
3263	vmovdqa	xmm2,XMMWORD[192+rdx]	; round 12 key
3264	vaesenc	xmm4,xmm4,xmm2
3265	vaesenc	xmm6,xmm6,xmm2
3266	vaesenc	xmm7,xmm7,xmm2
3267	vaesenc	xmm11,xmm11,xmm2
3268	vaesenc	xmm12,xmm12,xmm2
3269	vaesenc	xmm13,xmm13,xmm2
3270
3271	vmovdqa	xmm1,XMMWORD[208+rdx]	; round 13 key
3272	vaesenc	xmm4,xmm4,xmm1
3273	vaesenc	xmm6,xmm6,xmm1
3274	vaesenc	xmm7,xmm7,xmm1
3275	vaesenc	xmm11,xmm11,xmm1
3276	vaesenc	xmm12,xmm12,xmm1
3277	vaesenc	xmm13,xmm13,xmm1
3278
	; Final round (14): vaesenclast with the last round key (rdx+224).
3279	vmovdqa	xmm2,XMMWORD[224+rdx]
3280	vaesenclast	xmm4,xmm4,xmm2
3281	vaesenclast	xmm6,xmm6,xmm2
3282	vaesenclast	xmm7,xmm7,xmm2
3283	vaesenclast	xmm11,xmm11,xmm2
3284	vaesenclast	xmm12,xmm12,xmm2
3285	vaesenclast	xmm13,xmm13,xmm2
3286
3287
	; Store the six ciphertext blocks (96 bytes) to the output buffer.
	; vmovdqa requires rsi to be 16-byte aligned — caller's contract.
3288	vmovdqa	XMMWORD[rsi],xmm4
3289	vmovdqa	XMMWORD[16+rsi],xmm6
3290	vmovdqa	XMMWORD[32+rsi],xmm7
3291	vmovdqa	XMMWORD[48+rsi],xmm11
3292	vmovdqa	XMMWORD[64+rsi],xmm12
3293	vmovdqa	XMMWORD[80+rsi],xmm13
3294	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
3295	mov	rsi,QWORD[16+rsp]
3296	ret
3297
3298$L$SEH_end_aes256gcmsiv_kdf:
3299%else
3300; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
3301ret
3302%endif
3303