1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build !purego
6
7#include "textflag.h"
8
9// SHA512 block routine. See sha512block.go for Go equivalent.
10//
11// The algorithm is detailed in FIPS 180-4:
12//
13//  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
14//
15// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
17//
18// a = H0
19// b = H1
20// c = H2
21// d = H3
22// e = H4
23// f = H5
24// g = H6
25// h = H7
26//
27// for t = 0 to 79 {
28//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
29//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
30//    h = g
31//    g = f
32//    f = e
33//    e = d + T1
34//    d = c
35//    c = b
36//    b = a
37//    a = T1 + T2
38// }
39//
40// H0 = a + H0
41// H1 = b + H1
42// H2 = c + H2
43// H3 = d + H3
44// H4 = e + H4
45// H5 = f + H5
46// H6 = g + H6
47// H7 = h + H7
48
49// Wt = Mt; for 0 <= t <= 15
50#define MSGSCHEDULE0(index) \
51	MOVQ	(index*8)(SI), AX; \
52	BSWAPQ	AX; \
53	MOVQ	AX, (index*8)(BP)
54
55// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
56//   SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
57//   SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
58#define MSGSCHEDULE1(index) \
59	MOVQ	((index-2)*8)(BP), AX; \
60	MOVQ	AX, CX; \
61	RORQ	$19, AX; \
62	MOVQ	CX, DX; \
63	RORQ	$61, CX; \
64	SHRQ	$6, DX; \
65	MOVQ	((index-15)*8)(BP), BX; \
66	XORQ	CX, AX; \
67	MOVQ	BX, CX; \
68	XORQ	DX, AX; \
69	RORQ	$1, BX; \
70	MOVQ	CX, DX; \
71	SHRQ	$7, DX; \
72	RORQ	$8, CX; \
73	ADDQ	((index-7)*8)(BP), AX; \
74	XORQ	CX, BX; \
75	XORQ	DX, BX; \
76	ADDQ	((index-16)*8)(BP), BX; \
77	ADDQ	BX, AX; \
78	MOVQ	AX, ((index)*8)(BP)
79
80// Calculate T1 in AX - uses AX, CX and DX registers.
81// h is also used as an accumulator. Wt is passed in AX.
82//   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
83//     BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
84//     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
85#define SHA512T1(const, e, f, g, h) \
86	MOVQ	$const, DX; \
87	ADDQ	AX, h; \
88	MOVQ	e, AX; \
89	ADDQ	DX, h; \
90	MOVQ	e, CX; \
91	RORQ	$14, AX; \
92	MOVQ	e, DX; \
93	RORQ	$18, CX; \
94	XORQ	CX, AX; \
95	MOVQ	e, CX; \
96	RORQ	$41, DX; \
97	ANDQ	f, CX; \
98	XORQ	AX, DX; \
99	MOVQ	e, AX; \
100	NOTQ	AX; \
101	ADDQ	DX, h; \
102	ANDQ	g, AX; \
103	XORQ	CX, AX; \
104	ADDQ	h, AX
105
106// Calculate T2 in BX - uses BX, CX, DX and DI registers.
107//   T2 = BIGSIGMA0(a) + Maj(a, b, c)
108//     BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
109//     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
110#define SHA512T2(a, b, c) \
111	MOVQ	a, DI; \
112	MOVQ	c, BX; \
113	RORQ	$28, DI; \
114	MOVQ	a, DX; \
115	ANDQ	b, BX; \
116	RORQ	$34, DX; \
117	MOVQ	a, CX; \
118	ANDQ	c, CX; \
119	XORQ	DX, DI; \
120	XORQ	CX, BX; \
121	MOVQ	a, DX; \
122	MOVQ	b, CX; \
123	RORQ	$39, DX; \
124	ANDQ	a, CX; \
125	XORQ	CX, BX; \
126	XORQ	DX, DI; \
127	ADDQ	DI, BX
128
129// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
130// The values for e and a are stored in d and h, ready for rotation.
131#define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \
132	SHA512T1(const, e, f, g, h); \
133	SHA512T2(a, b, c); \
134	MOVQ	BX, h; \
135	ADDQ	AX, d; \
136	ADDQ	AX, h
137
138#define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \
139	MSGSCHEDULE0(index); \
140	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
141
142#define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \
143	MSGSCHEDULE1(index); \
144	SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
145
146TEXT ·blockAMD64(SB),0,$648-32
147	MOVQ	p_base+8(FP), SI
148	MOVQ	p_len+16(FP), DX
149	SHRQ	$7, DX
150	SHLQ	$7, DX
151
152	LEAQ	(SI)(DX*1), DI
153	MOVQ	DI, 640(SP)
154	CMPQ	SI, DI
155	JEQ	end
156
157	MOVQ	dig+0(FP), BP
158	MOVQ	(0*8)(BP), R8		// a = H0
159	MOVQ	(1*8)(BP), R9		// b = H1
160	MOVQ	(2*8)(BP), R10		// c = H2
161	MOVQ	(3*8)(BP), R11		// d = H3
162	MOVQ	(4*8)(BP), R12		// e = H4
163	MOVQ	(5*8)(BP), R13		// f = H5
164	MOVQ	(6*8)(BP), R14		// g = H6
165	MOVQ	(7*8)(BP), R15		// h = H7
166
167loop:
168	MOVQ	SP, BP			// message schedule
169
170	SHA512ROUND0(0, 0x428a2f98d728ae22, R8, R9, R10, R11, R12, R13, R14, R15)
171	SHA512ROUND0(1, 0x7137449123ef65cd, R15, R8, R9, R10, R11, R12, R13, R14)
172	SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R14, R15, R8, R9, R10, R11, R12, R13)
173	SHA512ROUND0(3, 0xe9b5dba58189dbbc, R13, R14, R15, R8, R9, R10, R11, R12)
174	SHA512ROUND0(4, 0x3956c25bf348b538, R12, R13, R14, R15, R8, R9, R10, R11)
175	SHA512ROUND0(5, 0x59f111f1b605d019, R11, R12, R13, R14, R15, R8, R9, R10)
176	SHA512ROUND0(6, 0x923f82a4af194f9b, R10, R11, R12, R13, R14, R15, R8, R9)
177	SHA512ROUND0(7, 0xab1c5ed5da6d8118, R9, R10, R11, R12, R13, R14, R15, R8)
178	SHA512ROUND0(8, 0xd807aa98a3030242, R8, R9, R10, R11, R12, R13, R14, R15)
179	SHA512ROUND0(9, 0x12835b0145706fbe, R15, R8, R9, R10, R11, R12, R13, R14)
180	SHA512ROUND0(10, 0x243185be4ee4b28c, R14, R15, R8, R9, R10, R11, R12, R13)
181	SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R13, R14, R15, R8, R9, R10, R11, R12)
182	SHA512ROUND0(12, 0x72be5d74f27b896f, R12, R13, R14, R15, R8, R9, R10, R11)
183	SHA512ROUND0(13, 0x80deb1fe3b1696b1, R11, R12, R13, R14, R15, R8, R9, R10)
184	SHA512ROUND0(14, 0x9bdc06a725c71235, R10, R11, R12, R13, R14, R15, R8, R9)
185	SHA512ROUND0(15, 0xc19bf174cf692694, R9, R10, R11, R12, R13, R14, R15, R8)
186
187	SHA512ROUND1(16, 0xe49b69c19ef14ad2, R8, R9, R10, R11, R12, R13, R14, R15)
188	SHA512ROUND1(17, 0xefbe4786384f25e3, R15, R8, R9, R10, R11, R12, R13, R14)
189	SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R14, R15, R8, R9, R10, R11, R12, R13)
190	SHA512ROUND1(19, 0x240ca1cc77ac9c65, R13, R14, R15, R8, R9, R10, R11, R12)
191	SHA512ROUND1(20, 0x2de92c6f592b0275, R12, R13, R14, R15, R8, R9, R10, R11)
192	SHA512ROUND1(21, 0x4a7484aa6ea6e483, R11, R12, R13, R14, R15, R8, R9, R10)
193	SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R10, R11, R12, R13, R14, R15, R8, R9)
194	SHA512ROUND1(23, 0x76f988da831153b5, R9, R10, R11, R12, R13, R14, R15, R8)
195	SHA512ROUND1(24, 0x983e5152ee66dfab, R8, R9, R10, R11, R12, R13, R14, R15)
196	SHA512ROUND1(25, 0xa831c66d2db43210, R15, R8, R9, R10, R11, R12, R13, R14)
197	SHA512ROUND1(26, 0xb00327c898fb213f, R14, R15, R8, R9, R10, R11, R12, R13)
198	SHA512ROUND1(27, 0xbf597fc7beef0ee4, R13, R14, R15, R8, R9, R10, R11, R12)
199	SHA512ROUND1(28, 0xc6e00bf33da88fc2, R12, R13, R14, R15, R8, R9, R10, R11)
200	SHA512ROUND1(29, 0xd5a79147930aa725, R11, R12, R13, R14, R15, R8, R9, R10)
201	SHA512ROUND1(30, 0x06ca6351e003826f, R10, R11, R12, R13, R14, R15, R8, R9)
202	SHA512ROUND1(31, 0x142929670a0e6e70, R9, R10, R11, R12, R13, R14, R15, R8)
203	SHA512ROUND1(32, 0x27b70a8546d22ffc, R8, R9, R10, R11, R12, R13, R14, R15)
204	SHA512ROUND1(33, 0x2e1b21385c26c926, R15, R8, R9, R10, R11, R12, R13, R14)
205	SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R14, R15, R8, R9, R10, R11, R12, R13)
206	SHA512ROUND1(35, 0x53380d139d95b3df, R13, R14, R15, R8, R9, R10, R11, R12)
207	SHA512ROUND1(36, 0x650a73548baf63de, R12, R13, R14, R15, R8, R9, R10, R11)
208	SHA512ROUND1(37, 0x766a0abb3c77b2a8, R11, R12, R13, R14, R15, R8, R9, R10)
209	SHA512ROUND1(38, 0x81c2c92e47edaee6, R10, R11, R12, R13, R14, R15, R8, R9)
210	SHA512ROUND1(39, 0x92722c851482353b, R9, R10, R11, R12, R13, R14, R15, R8)
211	SHA512ROUND1(40, 0xa2bfe8a14cf10364, R8, R9, R10, R11, R12, R13, R14, R15)
212	SHA512ROUND1(41, 0xa81a664bbc423001, R15, R8, R9, R10, R11, R12, R13, R14)
213	SHA512ROUND1(42, 0xc24b8b70d0f89791, R14, R15, R8, R9, R10, R11, R12, R13)
214	SHA512ROUND1(43, 0xc76c51a30654be30, R13, R14, R15, R8, R9, R10, R11, R12)
215	SHA512ROUND1(44, 0xd192e819d6ef5218, R12, R13, R14, R15, R8, R9, R10, R11)
216	SHA512ROUND1(45, 0xd69906245565a910, R11, R12, R13, R14, R15, R8, R9, R10)
217	SHA512ROUND1(46, 0xf40e35855771202a, R10, R11, R12, R13, R14, R15, R8, R9)
218	SHA512ROUND1(47, 0x106aa07032bbd1b8, R9, R10, R11, R12, R13, R14, R15, R8)
219	SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R8, R9, R10, R11, R12, R13, R14, R15)
220	SHA512ROUND1(49, 0x1e376c085141ab53, R15, R8, R9, R10, R11, R12, R13, R14)
221	SHA512ROUND1(50, 0x2748774cdf8eeb99, R14, R15, R8, R9, R10, R11, R12, R13)
222	SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R13, R14, R15, R8, R9, R10, R11, R12)
223	SHA512ROUND1(52, 0x391c0cb3c5c95a63, R12, R13, R14, R15, R8, R9, R10, R11)
224	SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R11, R12, R13, R14, R15, R8, R9, R10)
225	SHA512ROUND1(54, 0x5b9cca4f7763e373, R10, R11, R12, R13, R14, R15, R8, R9)
226	SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R9, R10, R11, R12, R13, R14, R15, R8)
227	SHA512ROUND1(56, 0x748f82ee5defb2fc, R8, R9, R10, R11, R12, R13, R14, R15)
228	SHA512ROUND1(57, 0x78a5636f43172f60, R15, R8, R9, R10, R11, R12, R13, R14)
229	SHA512ROUND1(58, 0x84c87814a1f0ab72, R14, R15, R8, R9, R10, R11, R12, R13)
230	SHA512ROUND1(59, 0x8cc702081a6439ec, R13, R14, R15, R8, R9, R10, R11, R12)
231	SHA512ROUND1(60, 0x90befffa23631e28, R12, R13, R14, R15, R8, R9, R10, R11)
232	SHA512ROUND1(61, 0xa4506cebde82bde9, R11, R12, R13, R14, R15, R8, R9, R10)
233	SHA512ROUND1(62, 0xbef9a3f7b2c67915, R10, R11, R12, R13, R14, R15, R8, R9)
234	SHA512ROUND1(63, 0xc67178f2e372532b, R9, R10, R11, R12, R13, R14, R15, R8)
235	SHA512ROUND1(64, 0xca273eceea26619c, R8, R9, R10, R11, R12, R13, R14, R15)
236	SHA512ROUND1(65, 0xd186b8c721c0c207, R15, R8, R9, R10, R11, R12, R13, R14)
237	SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R14, R15, R8, R9, R10, R11, R12, R13)
238	SHA512ROUND1(67, 0xf57d4f7fee6ed178, R13, R14, R15, R8, R9, R10, R11, R12)
239	SHA512ROUND1(68, 0x06f067aa72176fba, R12, R13, R14, R15, R8, R9, R10, R11)
240	SHA512ROUND1(69, 0x0a637dc5a2c898a6, R11, R12, R13, R14, R15, R8, R9, R10)
241	SHA512ROUND1(70, 0x113f9804bef90dae, R10, R11, R12, R13, R14, R15, R8, R9)
242	SHA512ROUND1(71, 0x1b710b35131c471b, R9, R10, R11, R12, R13, R14, R15, R8)
243	SHA512ROUND1(72, 0x28db77f523047d84, R8, R9, R10, R11, R12, R13, R14, R15)
244	SHA512ROUND1(73, 0x32caab7b40c72493, R15, R8, R9, R10, R11, R12, R13, R14)
245	SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R14, R15, R8, R9, R10, R11, R12, R13)
246	SHA512ROUND1(75, 0x431d67c49c100d4c, R13, R14, R15, R8, R9, R10, R11, R12)
247	SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R12, R13, R14, R15, R8, R9, R10, R11)
248	SHA512ROUND1(77, 0x597f299cfc657e2a, R11, R12, R13, R14, R15, R8, R9, R10)
249	SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R10, R11, R12, R13, R14, R15, R8, R9)
250	SHA512ROUND1(79, 0x6c44198c4a475817, R9, R10, R11, R12, R13, R14, R15, R8)
251
252	MOVQ	dig+0(FP), BP
253	ADDQ	(0*8)(BP), R8	// H0 = a + H0
254	MOVQ	R8, (0*8)(BP)
255	ADDQ	(1*8)(BP), R9	// H1 = b + H1
256	MOVQ	R9, (1*8)(BP)
257	ADDQ	(2*8)(BP), R10	// H2 = c + H2
258	MOVQ	R10, (2*8)(BP)
259	ADDQ	(3*8)(BP), R11	// H3 = d + H3
260	MOVQ	R11, (3*8)(BP)
261	ADDQ	(4*8)(BP), R12	// H4 = e + H4
262	MOVQ	R12, (4*8)(BP)
263	ADDQ	(5*8)(BP), R13	// H5 = f + H5
264	MOVQ	R13, (5*8)(BP)
265	ADDQ	(6*8)(BP), R14	// H6 = g + H6
266	MOVQ	R14, (6*8)(BP)
267	ADDQ	(7*8)(BP), R15	// H7 = h + H7
268	MOVQ	R15, (7*8)(BP)
269
270	ADDQ	$128, SI
271	CMPQ	SI, 640(SP)
272	JB	loop
273
274end:
275	RET
276
277// Version below is based on "Fast SHA512 Implementations on Intel
278// Architecture Processors" White-paper
279// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf
280// AVX2 version by Intel, same algorithm in Linux kernel:
281// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S
282
283// James Guilford <[email protected]>
284// Kirk Yap <[email protected]>
285// Tim Chen <[email protected]>
286// David Cote <[email protected]>
287// Aleksey Sidorov <[email protected]>
288
289#define YFER_SIZE (4*8)
290#define SRND_SIZE (1*8)
291#define INP_SIZE (1*8)
292
293#define frame_YFER (0)
294#define frame_SRND (frame_YFER + YFER_SIZE)
295#define frame_INP (frame_SRND + SRND_SIZE)
296#define frame_INPEND (frame_INP + INP_SIZE)
297
298#define addm(p1, p2) \
299	ADDQ p1, p2; \
300	MOVQ p2, p1
301
302#define COPY_YMM_AND_BSWAP(p1, p2, p3) \
303	VMOVDQU p2, p1;    \
304	VPSHUFB p3, p1, p1
305
306#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
307	VPERM2F128 $0x3, YSRC2, YSRC1, YDST; \
308	VPALIGNR   $RVAL, YSRC2, YDST, YDST
309
310DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607
311DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f
312DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617
313DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, $0x18191a1b1c1d1e1f
314
315GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32
316
317DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000
318DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000
319DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
320DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
321
322GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32
323
324TEXT ·blockAVX2(SB), NOSPLIT, $56-32
325	MOVQ dig+0(FP), SI
326	MOVQ p_base+8(FP), DI
327	MOVQ p_len+16(FP), DX
328
329	SHRQ $7, DX
330	SHLQ $7, DX
331
332	JZ   done_hash
333	ADDQ DI, DX
334	MOVQ DX, frame_INPEND(SP)
335
336	MOVQ (0*8)(SI), AX
337	MOVQ (1*8)(SI), BX
338	MOVQ (2*8)(SI), CX
339	MOVQ (3*8)(SI), R8
340	MOVQ (4*8)(SI), DX
341	MOVQ (5*8)(SI), R9
342	MOVQ (6*8)(SI), R10
343	MOVQ (7*8)(SI), R11
344
345	VMOVDQU PSHUFFLE_BYTE_FLIP_MASK<>(SB), Y9
346
347loop0:
348	MOVQ ·_K+0(SB), BP
349
350	// byte swap first 16 dwords
351	COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9)
352	COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9)
353	COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9)
354	COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9)
355
356	MOVQ DI, frame_INP(SP)
357
358	// schedule 64 input dwords, by doing 12 rounds of 4 each
359	MOVQ $4, frame_SRND(SP)
360
361loop1:
362	VPADDQ  (BP), Y4, Y0
363	VMOVDQU Y0, frame_YFER(SP)
364
365	MY_VPALIGNR(Y0, Y7, Y6, 8)
366
367	VPADDQ Y4, Y0, Y0
368
369	MY_VPALIGNR(Y1, Y5, Y4, 8)
370
371	VPSRLQ $1, Y1, Y2
372	VPSLLQ $(64-1), Y1, Y3
373	VPOR   Y2, Y3, Y3
374
375	VPSRLQ $7, Y1, Y8
376
377	MOVQ  AX, DI
378	RORXQ $41, DX, R13
379	RORXQ $18, DX, R14
380	ADDQ  frame_YFER(SP), R11
381	ORQ   CX, DI
382	MOVQ  R9, R15
383	RORXQ $34, AX, R12
384
385	XORQ  R14, R13
386	XORQ  R10, R15
387	RORXQ $14, DX, R14
388
389	ANDQ  DX, R15
390	XORQ  R14, R13
391	RORXQ $39, AX, R14
392	ADDQ  R11, R8
393
394	ANDQ  BX, DI
395	XORQ  R12, R14
396	RORXQ $28, AX, R12
397
398	XORQ R10, R15
399	XORQ R12, R14
400	MOVQ AX, R12
401	ANDQ CX, R12
402
403	ADDQ R13, R15
404	ORQ  R12, DI
405	ADDQ R14, R11
406
407	ADDQ R15, R8
408
409	ADDQ R15, R11
410	ADDQ DI, R11
411
412	VPSRLQ $8, Y1, Y2
413	VPSLLQ $(64-8), Y1, Y1
414	VPOR   Y2, Y1, Y1
415
416	VPXOR Y8, Y3, Y3
417	VPXOR Y1, Y3, Y1
418
419	VPADDQ Y1, Y0, Y0
420
421	VPERM2F128 $0x0, Y0, Y0, Y4
422
423	VPAND MASK_YMM_LO<>(SB), Y0, Y0
424
425	VPERM2F128 $0x11, Y7, Y7, Y2
426	VPSRLQ     $6, Y2, Y8
427
428	MOVQ  R11, DI
429	RORXQ $41, R8, R13
430	RORXQ $18, R8, R14
431	ADDQ  1*8+frame_YFER(SP), R10
432	ORQ   BX, DI
433
434	MOVQ  DX, R15
435	RORXQ $34, R11, R12
436	XORQ  R14, R13
437	XORQ  R9, R15
438
439	RORXQ $14, R8, R14
440	XORQ  R14, R13
441	RORXQ $39, R11, R14
442	ANDQ  R8, R15
443	ADDQ  R10, CX
444
445	ANDQ AX, DI
446	XORQ R12, R14
447
448	RORXQ $28, R11, R12
449	XORQ  R9, R15
450
451	XORQ R12, R14
452	MOVQ R11, R12
453	ANDQ BX, R12
454	ADDQ R13, R15
455
456	ORQ  R12, DI
457	ADDQ R14, R10
458
459	ADDQ R15, CX
460	ADDQ R15, R10
461	ADDQ DI, R10
462
463	VPSRLQ $19, Y2, Y3
464	VPSLLQ $(64-19), Y2, Y1
465	VPOR   Y1, Y3, Y3
466	VPXOR  Y3, Y8, Y8
467	VPSRLQ $61, Y2, Y3
468	VPSLLQ $(64-61), Y2, Y1
469	VPOR   Y1, Y3, Y3
470	VPXOR  Y3, Y8, Y8
471
472	VPADDQ Y8, Y4, Y4
473
474	VPSRLQ $6, Y4, Y8
475
476	MOVQ  R10, DI
477	RORXQ $41, CX, R13
478	ADDQ  2*8+frame_YFER(SP), R9
479
480	RORXQ $18, CX, R14
481	ORQ   AX, DI
482	MOVQ  R8, R15
483	XORQ  DX, R15
484
485	RORXQ $34, R10, R12
486	XORQ  R14, R13
487	ANDQ  CX, R15
488
489	RORXQ $14, CX, R14
490	ADDQ  R9, BX
491	ANDQ  R11, DI
492
493	XORQ  R14, R13
494	RORXQ $39, R10, R14
495	XORQ  DX, R15
496
497	XORQ  R12, R14
498	RORXQ $28, R10, R12
499
500	XORQ R12, R14
501	MOVQ R10, R12
502	ANDQ AX, R12
503	ADDQ R13, R15
504
505	ORQ  R12, DI
506	ADDQ R14, R9
507	ADDQ R15, BX
508	ADDQ R15, R9
509
510	ADDQ DI, R9
511
512	VPSRLQ $19, Y4, Y3
513	VPSLLQ $(64-19), Y4, Y1
514	VPOR   Y1, Y3, Y3
515	VPXOR  Y3, Y8, Y8
516	VPSRLQ $61, Y4, Y3
517	VPSLLQ $(64-61), Y4, Y1
518	VPOR   Y1, Y3, Y3
519	VPXOR  Y3, Y8, Y8
520
521	VPADDQ Y8, Y0, Y2
522
523	VPBLENDD $0xF0, Y2, Y4, Y4
524
525	MOVQ  R9, DI
526	RORXQ $41, BX, R13
527	RORXQ $18, BX, R14
528	ADDQ  3*8+frame_YFER(SP), DX
529	ORQ   R11, DI
530
531	MOVQ  CX, R15
532	RORXQ $34, R9, R12
533	XORQ  R14, R13
534	XORQ  R8, R15
535
536	RORXQ $14, BX, R14
537	ANDQ  BX, R15
538	ADDQ  DX, AX
539	ANDQ  R10, DI
540
541	XORQ R14, R13
542	XORQ R8, R15
543
544	RORXQ $39, R9, R14
545	ADDQ  R13, R15
546
547	XORQ R12, R14
548	ADDQ R15, AX
549
550	RORXQ $28, R9, R12
551
552	XORQ R12, R14
553	MOVQ R9, R12
554	ANDQ R11, R12
555	ORQ  R12, DI
556
557	ADDQ R14, DX
558	ADDQ R15, DX
559	ADDQ DI, DX
560
561	VPADDQ  1*32(BP), Y5, Y0
562	VMOVDQU Y0, frame_YFER(SP)
563
564	MY_VPALIGNR(Y0, Y4, Y7, 8)
565
566	VPADDQ Y5, Y0, Y0
567
568	MY_VPALIGNR(Y1, Y6, Y5, 8)
569
570	VPSRLQ $1, Y1, Y2
571	VPSLLQ $(64-1), Y1, Y3
572	VPOR   Y2, Y3, Y3
573
574	VPSRLQ $7, Y1, Y8
575
576	MOVQ  DX, DI
577	RORXQ $41, AX, R13
578	RORXQ $18, AX, R14
579	ADDQ  frame_YFER(SP), R8
580	ORQ   R10, DI
581	MOVQ  BX, R15
582	RORXQ $34, DX, R12
583
584	XORQ  R14, R13
585	XORQ  CX, R15
586	RORXQ $14, AX, R14
587
588	ANDQ  AX, R15
589	XORQ  R14, R13
590	RORXQ $39, DX, R14
591	ADDQ  R8, R11
592
593	ANDQ  R9, DI
594	XORQ  R12, R14
595	RORXQ $28, DX, R12
596
597	XORQ CX, R15
598	XORQ R12, R14
599	MOVQ DX, R12
600	ANDQ R10, R12
601
602	ADDQ R13, R15
603	ORQ  R12, DI
604	ADDQ R14, R8
605
606	ADDQ R15, R11
607
608	ADDQ R15, R8
609	ADDQ DI, R8
610
611	VPSRLQ $8, Y1, Y2
612	VPSLLQ $(64-8), Y1, Y1
613	VPOR   Y2, Y1, Y1
614
615	VPXOR Y8, Y3, Y3
616	VPXOR Y1, Y3, Y1
617
618	VPADDQ Y1, Y0, Y0
619
620	VPERM2F128 $0x0, Y0, Y0, Y5
621
622	VPAND MASK_YMM_LO<>(SB), Y0, Y0
623
624	VPERM2F128 $0x11, Y4, Y4, Y2
625	VPSRLQ     $6, Y2, Y8
626
627	MOVQ  R8, DI
628	RORXQ $41, R11, R13
629	RORXQ $18, R11, R14
630	ADDQ  1*8+frame_YFER(SP), CX
631	ORQ   R9, DI
632
633	MOVQ  AX, R15
634	RORXQ $34, R8, R12
635	XORQ  R14, R13
636	XORQ  BX, R15
637
638	RORXQ $14, R11, R14
639	XORQ  R14, R13
640	RORXQ $39, R8, R14
641	ANDQ  R11, R15
642	ADDQ  CX, R10
643
644	ANDQ DX, DI
645	XORQ R12, R14
646
647	RORXQ $28, R8, R12
648	XORQ  BX, R15
649
650	XORQ R12, R14
651	MOVQ R8, R12
652	ANDQ R9, R12
653	ADDQ R13, R15
654
655	ORQ  R12, DI
656	ADDQ R14, CX
657
658	ADDQ R15, R10
659	ADDQ R15, CX
660	ADDQ DI, CX
661
662	VPSRLQ $19, Y2, Y3
663	VPSLLQ $(64-19), Y2, Y1
664	VPOR   Y1, Y3, Y3
665	VPXOR  Y3, Y8, Y8
666	VPSRLQ $61, Y2, Y3
667	VPSLLQ $(64-61), Y2, Y1
668	VPOR   Y1, Y3, Y3
669	VPXOR  Y3, Y8, Y8
670
671	VPADDQ Y8, Y5, Y5
672
673	VPSRLQ $6, Y5, Y8
674
675	MOVQ  CX, DI
676	RORXQ $41, R10, R13
677	ADDQ  2*8+frame_YFER(SP), BX
678
679	RORXQ $18, R10, R14
680	ORQ   DX, DI
681	MOVQ  R11, R15
682	XORQ  AX, R15
683
684	RORXQ $34, CX, R12
685	XORQ  R14, R13
686	ANDQ  R10, R15
687
688	RORXQ $14, R10, R14
689	ADDQ  BX, R9
690	ANDQ  R8, DI
691
692	XORQ  R14, R13
693	RORXQ $39, CX, R14
694	XORQ  AX, R15
695
696	XORQ  R12, R14
697	RORXQ $28, CX, R12
698
699	XORQ R12, R14
700	MOVQ CX, R12
701	ANDQ DX, R12
702	ADDQ R13, R15
703
704	ORQ  R12, DI
705	ADDQ R14, BX
706	ADDQ R15, R9
707	ADDQ R15, BX
708
709	ADDQ DI, BX
710
711	VPSRLQ $19, Y5, Y3
712	VPSLLQ $(64-19), Y5, Y1
713	VPOR   Y1, Y3, Y3
714	VPXOR  Y3, Y8, Y8
715	VPSRLQ $61, Y5, Y3
716	VPSLLQ $(64-61), Y5, Y1
717	VPOR   Y1, Y3, Y3
718	VPXOR  Y3, Y8, Y8
719
720	VPADDQ Y8, Y0, Y2
721
722	VPBLENDD $0xF0, Y2, Y5, Y5
723
724	MOVQ  BX, DI
725	RORXQ $41, R9, R13
726	RORXQ $18, R9, R14
727	ADDQ  3*8+frame_YFER(SP), AX
728	ORQ   R8, DI
729
730	MOVQ  R10, R15
731	RORXQ $34, BX, R12
732	XORQ  R14, R13
733	XORQ  R11, R15
734
735	RORXQ $14, R9, R14
736	ANDQ  R9, R15
737	ADDQ  AX, DX
738	ANDQ  CX, DI
739
740	XORQ R14, R13
741	XORQ R11, R15
742
743	RORXQ $39, BX, R14
744	ADDQ  R13, R15
745
746	XORQ R12, R14
747	ADDQ R15, DX
748
749	RORXQ $28, BX, R12
750
751	XORQ R12, R14
752	MOVQ BX, R12
753	ANDQ R8, R12
754	ORQ  R12, DI
755
756	ADDQ R14, AX
757	ADDQ R15, AX
758	ADDQ DI, AX
759
760	VPADDQ  2*32(BP), Y6, Y0
761	VMOVDQU Y0, frame_YFER(SP)
762
763	MY_VPALIGNR(Y0, Y5, Y4, 8)
764
765	VPADDQ Y6, Y0, Y0
766
767	MY_VPALIGNR(Y1, Y7, Y6, 8)
768
769	VPSRLQ $1, Y1, Y2
770	VPSLLQ $(64-1), Y1, Y3
771	VPOR   Y2, Y3, Y3
772
773	VPSRLQ $7, Y1, Y8
774
775	MOVQ  AX, DI
776	RORXQ $41, DX, R13
777	RORXQ $18, DX, R14
778	ADDQ  frame_YFER(SP), R11
779	ORQ   CX, DI
780	MOVQ  R9, R15
781	RORXQ $34, AX, R12
782
783	XORQ  R14, R13
784	XORQ  R10, R15
785	RORXQ $14, DX, R14
786
787	ANDQ  DX, R15
788	XORQ  R14, R13
789	RORXQ $39, AX, R14
790	ADDQ  R11, R8
791
792	ANDQ  BX, DI
793	XORQ  R12, R14
794	RORXQ $28, AX, R12
795
796	XORQ R10, R15
797	XORQ R12, R14
798	MOVQ AX, R12
799	ANDQ CX, R12
800
801	ADDQ R13, R15
802	ORQ  R12, DI
803	ADDQ R14, R11
804
805	ADDQ R15, R8
806
807	ADDQ R15, R11
808	ADDQ DI, R11
809
810	VPSRLQ $8, Y1, Y2
811	VPSLLQ $(64-8), Y1, Y1
812	VPOR   Y2, Y1, Y1
813
814	VPXOR Y8, Y3, Y3
815	VPXOR Y1, Y3, Y1
816
817	VPADDQ Y1, Y0, Y0
818
819	VPERM2F128 $0x0, Y0, Y0, Y6
820
821	VPAND MASK_YMM_LO<>(SB), Y0, Y0
822
823	VPERM2F128 $0x11, Y5, Y5, Y2
824	VPSRLQ     $6, Y2, Y8
825
826	MOVQ  R11, DI
827	RORXQ $41, R8, R13
828	RORXQ $18, R8, R14
829	ADDQ  1*8+frame_YFER(SP), R10
830	ORQ   BX, DI
831
832	MOVQ  DX, R15
833	RORXQ $34, R11, R12
834	XORQ  R14, R13
835	XORQ  R9, R15
836
837	RORXQ $14, R8, R14
838	XORQ  R14, R13
839	RORXQ $39, R11, R14
840	ANDQ  R8, R15
841	ADDQ  R10, CX
842
843	ANDQ AX, DI
844	XORQ R12, R14
845
846	RORXQ $28, R11, R12
847	XORQ  R9, R15
848
849	XORQ R12, R14
850	MOVQ R11, R12
851	ANDQ BX, R12
852	ADDQ R13, R15
853
854	ORQ  R12, DI
855	ADDQ R14, R10
856
857	ADDQ R15, CX
858	ADDQ R15, R10
859	ADDQ DI, R10
860
861	VPSRLQ $19, Y2, Y3
862	VPSLLQ $(64-19), Y2, Y1
863	VPOR   Y1, Y3, Y3
864	VPXOR  Y3, Y8, Y8
865	VPSRLQ $61, Y2, Y3
866	VPSLLQ $(64-61), Y2, Y1
867	VPOR   Y1, Y3, Y3
868	VPXOR  Y3, Y8, Y8
869
870	VPADDQ Y8, Y6, Y6
871
872	VPSRLQ $6, Y6, Y8
873
874	MOVQ  R10, DI
875	RORXQ $41, CX, R13
876	ADDQ  2*8+frame_YFER(SP), R9
877
878	RORXQ $18, CX, R14
879	ORQ   AX, DI
880	MOVQ  R8, R15
881	XORQ  DX, R15
882
883	RORXQ $34, R10, R12
884	XORQ  R14, R13
885	ANDQ  CX, R15
886
887	RORXQ $14, CX, R14
888	ADDQ  R9, BX
889	ANDQ  R11, DI
890
891	XORQ  R14, R13
892	RORXQ $39, R10, R14
893	XORQ  DX, R15
894
895	XORQ  R12, R14
896	RORXQ $28, R10, R12
897
898	XORQ R12, R14
899	MOVQ R10, R12
900	ANDQ AX, R12
901	ADDQ R13, R15
902
903	ORQ  R12, DI
904	ADDQ R14, R9
905	ADDQ R15, BX
906	ADDQ R15, R9
907
908	ADDQ DI, R9
909
910	VPSRLQ $19, Y6, Y3
911	VPSLLQ $(64-19), Y6, Y1
912	VPOR   Y1, Y3, Y3
913	VPXOR  Y3, Y8, Y8
914	VPSRLQ $61, Y6, Y3
915	VPSLLQ $(64-61), Y6, Y1
916	VPOR   Y1, Y3, Y3
917	VPXOR  Y3, Y8, Y8
918
919	VPADDQ Y8, Y0, Y2
920
921	VPBLENDD $0xF0, Y2, Y6, Y6
922
923	MOVQ  R9, DI
924	RORXQ $41, BX, R13
925	RORXQ $18, BX, R14
926	ADDQ  3*8+frame_YFER(SP), DX
927	ORQ   R11, DI
928
929	MOVQ  CX, R15
930	RORXQ $34, R9, R12
931	XORQ  R14, R13
932	XORQ  R8, R15
933
934	RORXQ $14, BX, R14
935	ANDQ  BX, R15
936	ADDQ  DX, AX
937	ANDQ  R10, DI
938
939	XORQ R14, R13
940	XORQ R8, R15
941
942	RORXQ $39, R9, R14
943	ADDQ  R13, R15
944
945	XORQ R12, R14
946	ADDQ R15, AX
947
948	RORXQ $28, R9, R12
949
950	XORQ R12, R14
951	MOVQ R9, R12
952	ANDQ R11, R12
953	ORQ  R12, DI
954
955	ADDQ R14, DX
956	ADDQ R15, DX
957	ADDQ DI, DX
958
959	VPADDQ  3*32(BP), Y7, Y0
960	VMOVDQU Y0, frame_YFER(SP)
961	ADDQ    $(4*32), BP
962
963	MY_VPALIGNR(Y0, Y6, Y5, 8)
964
965	VPADDQ Y7, Y0, Y0
966
967	MY_VPALIGNR(Y1, Y4, Y7, 8)
968
969	VPSRLQ $1, Y1, Y2
970	VPSLLQ $(64-1), Y1, Y3
971	VPOR   Y2, Y3, Y3
972
973	VPSRLQ $7, Y1, Y8
974
975	MOVQ  DX, DI
976	RORXQ $41, AX, R13
977	RORXQ $18, AX, R14
978	ADDQ  frame_YFER(SP), R8
979	ORQ   R10, DI
980	MOVQ  BX, R15
981	RORXQ $34, DX, R12
982
983	XORQ  R14, R13
984	XORQ  CX, R15
985	RORXQ $14, AX, R14
986
987	ANDQ  AX, R15
988	XORQ  R14, R13
989	RORXQ $39, DX, R14
990	ADDQ  R8, R11
991
992	ANDQ  R9, DI
993	XORQ  R12, R14
994	RORXQ $28, DX, R12
995
996	XORQ CX, R15
997	XORQ R12, R14
998	MOVQ DX, R12
999	ANDQ R10, R12
1000
1001	ADDQ R13, R15
1002	ORQ  R12, DI
1003	ADDQ R14, R8
1004
1005	ADDQ R15, R11
1006
1007	ADDQ R15, R8
1008	ADDQ DI, R8
1009
1010	VPSRLQ $8, Y1, Y2
1011	VPSLLQ $(64-8), Y1, Y1
1012	VPOR   Y2, Y1, Y1
1013
1014	VPXOR Y8, Y3, Y3
1015	VPXOR Y1, Y3, Y1
1016
1017	VPADDQ Y1, Y0, Y0
1018
1019	VPERM2F128 $0x0, Y0, Y0, Y7
1020
1021	VPAND MASK_YMM_LO<>(SB), Y0, Y0
1022
1023	VPERM2F128 $0x11, Y6, Y6, Y2
1024	VPSRLQ     $6, Y2, Y8
1025
1026	MOVQ  R8, DI
1027	RORXQ $41, R11, R13
1028	RORXQ $18, R11, R14
1029	ADDQ  1*8+frame_YFER(SP), CX
1030	ORQ   R9, DI
1031
1032	MOVQ  AX, R15
1033	RORXQ $34, R8, R12
1034	XORQ  R14, R13
1035	XORQ  BX, R15
1036
1037	RORXQ $14, R11, R14
1038	XORQ  R14, R13
1039	RORXQ $39, R8, R14
1040	ANDQ  R11, R15
1041	ADDQ  CX, R10
1042
1043	ANDQ DX, DI
1044	XORQ R12, R14
1045
1046	RORXQ $28, R8, R12
1047	XORQ  BX, R15
1048
1049	XORQ R12, R14
1050	MOVQ R8, R12
1051	ANDQ R9, R12
1052	ADDQ R13, R15
1053
1054	ORQ  R12, DI
1055	ADDQ R14, CX
1056
1057	ADDQ R15, R10
1058	ADDQ R15, CX
1059	ADDQ DI, CX
1060
1061	VPSRLQ $19, Y2, Y3
1062	VPSLLQ $(64-19), Y2, Y1
1063	VPOR   Y1, Y3, Y3
1064	VPXOR  Y3, Y8, Y8
1065	VPSRLQ $61, Y2, Y3
1066	VPSLLQ $(64-61), Y2, Y1
1067	VPOR   Y1, Y3, Y3
1068	VPXOR  Y3, Y8, Y8
1069
1070	VPADDQ Y8, Y7, Y7
1071
1072	VPSRLQ $6, Y7, Y8
1073
1074	MOVQ  CX, DI
1075	RORXQ $41, R10, R13
1076	ADDQ  2*8+frame_YFER(SP), BX
1077
1078	RORXQ $18, R10, R14
1079	ORQ   DX, DI
1080	MOVQ  R11, R15
1081	XORQ  AX, R15
1082
1083	RORXQ $34, CX, R12
1084	XORQ  R14, R13
1085	ANDQ  R10, R15
1086
1087	RORXQ $14, R10, R14
1088	ADDQ  BX, R9
1089	ANDQ  R8, DI
1090
1091	XORQ  R14, R13
1092	RORXQ $39, CX, R14
1093	XORQ  AX, R15
1094
1095	XORQ  R12, R14
1096	RORXQ $28, CX, R12
1097
1098	XORQ R12, R14
1099	MOVQ CX, R12
1100	ANDQ DX, R12
1101	ADDQ R13, R15
1102
1103	ORQ  R12, DI
1104	ADDQ R14, BX
1105	ADDQ R15, R9
1106	ADDQ R15, BX
1107
1108	ADDQ DI, BX
1109
1110	VPSRLQ $19, Y7, Y3
1111	VPSLLQ $(64-19), Y7, Y1
1112	VPOR   Y1, Y3, Y3
1113	VPXOR  Y3, Y8, Y8
1114	VPSRLQ $61, Y7, Y3
1115	VPSLLQ $(64-61), Y7, Y1
1116	VPOR   Y1, Y3, Y3
1117	VPXOR  Y3, Y8, Y8
1118
1119	VPADDQ Y8, Y0, Y2
1120
1121	VPBLENDD $0xF0, Y2, Y7, Y7
1122
1123	MOVQ  BX, DI
1124	RORXQ $41, R9, R13
1125	RORXQ $18, R9, R14
1126	ADDQ  3*8+frame_YFER(SP), AX
1127	ORQ   R8, DI
1128
1129	MOVQ  R10, R15
1130	RORXQ $34, BX, R12
1131	XORQ  R14, R13
1132	XORQ  R11, R15
1133
1134	RORXQ $14, R9, R14
1135	ANDQ  R9, R15
1136	ADDQ  AX, DX
1137	ANDQ  CX, DI
1138
1139	XORQ R14, R13
1140	XORQ R11, R15
1141
1142	RORXQ $39, BX, R14
1143	ADDQ  R13, R15
1144
1145	XORQ R12, R14
1146	ADDQ R15, DX
1147
1148	RORXQ $28, BX, R12
1149
1150	XORQ R12, R14
1151	MOVQ BX, R12
1152	ANDQ R8, R12
1153	ORQ  R12, DI
1154
1155	ADDQ R14, AX
1156	ADDQ R15, AX
1157	ADDQ DI, AX
1158
1159	SUBQ $1, frame_SRND(SP)
1160	JNE  loop1
1161
1162	MOVQ $2, frame_SRND(SP)
1163
1164loop2:
1165	VPADDQ  (BP), Y4, Y0
1166	VMOVDQU Y0, frame_YFER(SP)
1167
1168	MOVQ  R9, R15
1169	RORXQ $41, DX, R13
1170	RORXQ $18, DX, R14
1171	XORQ  R10, R15
1172
1173	XORQ  R14, R13
1174	RORXQ $14, DX, R14
1175	ANDQ  DX, R15
1176
1177	XORQ  R14, R13
1178	RORXQ $34, AX, R12
1179	XORQ  R10, R15
1180	RORXQ $39, AX, R14
1181	MOVQ  AX, DI
1182
1183	XORQ  R12, R14
1184	RORXQ $28, AX, R12
1185	ADDQ  frame_YFER(SP), R11
1186	ORQ   CX, DI
1187
1188	XORQ R12, R14
1189	MOVQ AX, R12
1190	ANDQ BX, DI
1191	ANDQ CX, R12
1192	ADDQ R13, R15
1193
1194	ADDQ R11, R8
1195	ORQ  R12, DI
1196	ADDQ R14, R11
1197
1198	ADDQ R15, R8
1199
1200	ADDQ  R15, R11
1201	MOVQ  DX, R15
1202	RORXQ $41, R8, R13
1203	RORXQ $18, R8, R14
1204	XORQ  R9, R15
1205
1206	XORQ  R14, R13
1207	RORXQ $14, R8, R14
1208	ANDQ  R8, R15
1209	ADDQ  DI, R11
1210
1211	XORQ  R14, R13
1212	RORXQ $34, R11, R12
1213	XORQ  R9, R15
1214	RORXQ $39, R11, R14
1215	MOVQ  R11, DI
1216
1217	XORQ  R12, R14
1218	RORXQ $28, R11, R12
1219	ADDQ  8*1+frame_YFER(SP), R10
1220	ORQ   BX, DI
1221
1222	XORQ R12, R14
1223	MOVQ R11, R12
1224	ANDQ AX, DI
1225	ANDQ BX, R12
1226	ADDQ R13, R15
1227
1228	ADDQ R10, CX
1229	ORQ  R12, DI
1230	ADDQ R14, R10
1231
1232	ADDQ R15, CX
1233
1234	ADDQ  R15, R10
1235	MOVQ  R8, R15
1236	RORXQ $41, CX, R13
1237	RORXQ $18, CX, R14
1238	XORQ  DX, R15
1239
1240	XORQ  R14, R13
1241	RORXQ $14, CX, R14
1242	ANDQ  CX, R15
1243	ADDQ  DI, R10
1244
1245	XORQ  R14, R13
1246	RORXQ $34, R10, R12
1247	XORQ  DX, R15
1248	RORXQ $39, R10, R14
1249	MOVQ  R10, DI
1250
1251	XORQ  R12, R14
1252	RORXQ $28, R10, R12
1253	ADDQ  8*2+frame_YFER(SP), R9
1254	ORQ   AX, DI
1255
1256	XORQ R12, R14
1257	MOVQ R10, R12
1258	ANDQ R11, DI
1259	ANDQ AX, R12
1260	ADDQ R13, R15
1261
1262	ADDQ R9, BX
1263	ORQ  R12, DI
1264	ADDQ R14, R9
1265
1266	ADDQ R15, BX
1267
1268	ADDQ  R15, R9
1269	MOVQ  CX, R15
1270	RORXQ $41, BX, R13
1271	RORXQ $18, BX, R14
1272	XORQ  R8, R15
1273
1274	XORQ  R14, R13
1275	RORXQ $14, BX, R14
1276	ANDQ  BX, R15
1277	ADDQ  DI, R9
1278
1279	XORQ  R14, R13
1280	RORXQ $34, R9, R12
1281	XORQ  R8, R15
1282	RORXQ $39, R9, R14
1283	MOVQ  R9, DI
1284
1285	XORQ  R12, R14
1286	RORXQ $28, R9, R12
1287	ADDQ  8*3+frame_YFER(SP), DX
1288	ORQ   R11, DI
1289
1290	XORQ R12, R14
1291	MOVQ R9, R12
1292	ANDQ R10, DI
1293	ANDQ R11, R12
1294	ADDQ R13, R15
1295
1296	ADDQ DX, AX
1297	ORQ  R12, DI
1298	ADDQ R14, DX
1299
1300	ADDQ R15, AX
1301
1302	ADDQ R15, DX
1303
1304	ADDQ DI, DX
1305
1306	VPADDQ  1*32(BP), Y5, Y0
1307	VMOVDQU Y0, frame_YFER(SP)
1308	ADDQ    $(2*32), BP
1309
1310	MOVQ  BX, R15
1311	RORXQ $41, AX, R13
1312	RORXQ $18, AX, R14
1313	XORQ  CX, R15
1314
1315	XORQ  R14, R13
1316	RORXQ $14, AX, R14
1317	ANDQ  AX, R15
1318
1319	XORQ  R14, R13
1320	RORXQ $34, DX, R12
1321	XORQ  CX, R15
1322	RORXQ $39, DX, R14
1323	MOVQ  DX, DI
1324
1325	XORQ  R12, R14
1326	RORXQ $28, DX, R12
1327	ADDQ  frame_YFER(SP), R8
1328	ORQ   R10, DI
1329
1330	XORQ R12, R14
1331	MOVQ DX, R12
1332	ANDQ R9, DI
1333	ANDQ R10, R12
1334	ADDQ R13, R15
1335
1336	ADDQ R8, R11
1337	ORQ  R12, DI
1338	ADDQ R14, R8
1339
1340	ADDQ R15, R11
1341
1342	ADDQ  R15, R8
1343	MOVQ  AX, R15
1344	RORXQ $41, R11, R13
1345	RORXQ $18, R11, R14
1346	XORQ  BX, R15
1347
1348	XORQ  R14, R13
1349	RORXQ $14, R11, R14
1350	ANDQ  R11, R15
1351	ADDQ  DI, R8
1352
1353	XORQ  R14, R13
1354	RORXQ $34, R8, R12
1355	XORQ  BX, R15
1356	RORXQ $39, R8, R14
1357	MOVQ  R8, DI
1358
1359	XORQ  R12, R14
1360	RORXQ $28, R8, R12
1361	ADDQ  8*1+frame_YFER(SP), CX
1362	ORQ   R9, DI
1363
1364	XORQ R12, R14
1365	MOVQ R8, R12
1366	ANDQ DX, DI
1367	ANDQ R9, R12
1368	ADDQ R13, R15
1369
1370	ADDQ CX, R10
1371	ORQ  R12, DI
1372	ADDQ R14, CX
1373
1374	ADDQ R15, R10
1375
1376	ADDQ  R15, CX
1377	MOVQ  R11, R15
1378	RORXQ $41, R10, R13
1379	RORXQ $18, R10, R14
1380	XORQ  AX, R15
1381
1382	XORQ  R14, R13
1383	RORXQ $14, R10, R14
1384	ANDQ  R10, R15
1385	ADDQ  DI, CX
1386
1387	XORQ  R14, R13
1388	RORXQ $34, CX, R12
1389	XORQ  AX, R15
1390	RORXQ $39, CX, R14
1391	MOVQ  CX, DI
1392
1393	XORQ  R12, R14
1394	RORXQ $28, CX, R12
1395	ADDQ  8*2+frame_YFER(SP), BX
1396	ORQ   DX, DI
1397
1398	XORQ R12, R14
1399	MOVQ CX, R12
1400	ANDQ R8, DI
1401	ANDQ DX, R12
1402	ADDQ R13, R15
1403
1404	ADDQ BX, R9
1405	ORQ  R12, DI
1406	ADDQ R14, BX
1407
1408	ADDQ R15, R9
1409
1410	ADDQ  R15, BX
1411	MOVQ  R10, R15
1412	RORXQ $41, R9, R13
1413	RORXQ $18, R9, R14
1414	XORQ  R11, R15
1415
1416	XORQ  R14, R13
1417	RORXQ $14, R9, R14
1418	ANDQ  R9, R15
1419	ADDQ  DI, BX
1420
1421	XORQ  R14, R13
1422	RORXQ $34, BX, R12
1423	XORQ  R11, R15
1424	RORXQ $39, BX, R14
1425	MOVQ  BX, DI
1426
1427	XORQ  R12, R14
1428	RORXQ $28, BX, R12
1429	ADDQ  8*3+frame_YFER(SP), AX
1430	ORQ   R8, DI
1431
1432	XORQ R12, R14
1433	MOVQ BX, R12
1434	ANDQ CX, DI
1435	ANDQ R8, R12
1436	ADDQ R13, R15
1437
1438	ADDQ AX, DX
1439	ORQ  R12, DI
1440	ADDQ R14, AX
1441
1442	ADDQ R15, DX
1443
1444	ADDQ R15, AX
1445
1446	ADDQ DI, AX
1447
1448	VMOVDQU Y6, Y4
1449	VMOVDQU Y7, Y5
1450
1451	SUBQ $1, frame_SRND(SP)
1452	JNE  loop2
1453
1454	addm(8*0(SI),AX)
1455	addm(8*1(SI),BX)
1456	addm(8*2(SI),CX)
1457	addm(8*3(SI),R8)
1458	addm(8*4(SI),DX)
1459	addm(8*5(SI),R9)
1460	addm(8*6(SI),R10)
1461	addm(8*7(SI),R11)
1462
1463	MOVQ frame_INP(SP), DI
1464	ADDQ $128, DI
1465	CMPQ DI, frame_INPEND(SP)
1466	JNE  loop0
1467
1468done_hash:
1469	VZEROUPPER
1470	RET
1471