// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"

// SHA256 block routine. See sha256block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
//  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf

// The AVX2 version is described in an Intel white paper:
// "Fast SHA-256 Implementations on Intel Architecture Processors",
// available via http://www.intel.com/p/en_US/embedded (search for that title).
// The AVX2 code is by Intel and uses the same algorithm as the Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
// by
//     James Guilford <[email protected]>
//     Kirk Yap <[email protected]>
//     Tim Chen <[email protected]>

// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 63 {
//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
//    h = g
//    g = f
//    f = e
//    e = d + T1
//    d = c
//    c = b
//    b = a
//    a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7
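//
// For reference, a minimal Go sketch of the same compression step
// (illustrative only; the helper and constant names below are not taken
// from sha256block.go):
//
//	for t := 0; t < 64; t++ {
//		t1 := h + bigSigma1(e) + ch(e, f, g) + _K[t] + w[t]
//		t2 := bigSigma0(a) + maj(a, b, c)
//		h, g, f, e, d, c, b, a = g, f, e, d+t1, c, b, a, t1+t2
//	}
//	h0, h1, h2, h3, h4, h5, h6, h7 = h0+a, h1+b, h2+c, h3+d, h4+e, h5+f, h6+g, h7+h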

// Wt = Mt; for 0 <= t <= 15
#define MSGSCHEDULE0(index) \
	MOVL	(index*4)(SI), AX; \
	BSWAPL	AX; \
	MOVL	AX, (index*4)(BP)

// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//   SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
//   SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
#define MSGSCHEDULE1(index) \
	MOVL	((index-2)*4)(BP), AX; \
	MOVL	AX, CX; \
	RORL	$17, AX; \
	MOVL	CX, DX; \
	RORL	$19, CX; \
	SHRL	$10, DX; \
	MOVL	((index-15)*4)(BP), BX; \
	XORL	CX, AX; \
	MOVL	BX, CX; \
	XORL	DX, AX; \
	RORL	$7, BX; \
	MOVL	CX, DX; \
	SHRL	$3, DX; \
	RORL	$18, CX; \
	ADDL	((index-7)*4)(BP), AX; \
	XORL	CX, BX; \
	XORL	DX, BX; \
	ADDL	((index-16)*4)(BP), BX; \
	ADDL	BX, AX; \
	MOVL	AX, ((index)*4)(BP)
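
// A small Go sketch of the same schedule step, assuming the helpers below
// (bits is math/bits; the helper names are illustrative):
//
//	func sigma0(x uint32) uint32 { return bits.RotateLeft32(x, -7) ^ bits.RotateLeft32(x, -18) ^ x>>3 }
//	func sigma1(x uint32) uint32 { return bits.RotateLeft32(x, -17) ^ bits.RotateLeft32(x, -19) ^ x>>10 }
//
//	w[t] = sigma1(w[t-2]) + w[t-7] + sigma0(w[t-15]) + w[t-16]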

// Calculate T1 in AX - uses AX, CX and DX registers.
// h is also used as an accumulator. Wt is passed in AX.
//   T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
//     BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
//     Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
#define SHA256T1(const, e, f, g, h) \
	ADDL	AX, h; \
	MOVL	e, AX; \
	ADDL	$const, h; \
	MOVL	e, CX; \
	RORL	$6, AX; \
	MOVL	e, DX; \
	RORL	$11, CX; \
	XORL	CX, AX; \
	MOVL	e, CX; \
	RORL	$25, DX; \
	ANDL	f, CX; \
	XORL	AX, DX; \
	MOVL	e, AX; \
	NOTL	AX; \
	ADDL	DX, h; \
	ANDL	g, AX; \
	XORL	CX, AX; \
	ADDL	h, AX

// Calculate T2 in BX - uses BX, CX, DX and DI registers.
//   T2 = BIGSIGMA0(a) + Maj(a, b, c)
//     BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
//     Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
#define SHA256T2(a, b, c) \
	MOVL	a, DI; \
	MOVL	c, BX; \
	RORL	$2, DI; \
	MOVL	a, DX; \
	ANDL	b, BX; \
	RORL	$13, DX; \
	MOVL	a, CX; \
	ANDL	c, CX; \
	XORL	DX, DI; \
	XORL	CX, BX; \
	MOVL	a, DX; \
	MOVL	b, CX; \
	RORL	$22, DX; \
	ANDL	a, CX; \
	XORL	CX, BX; \
	XORL	DX, DI; \
	ADDL	DI, BX

// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
#define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
	SHA256T1(const, e, f, g, h); \
	SHA256T2(a, b, c); \
	MOVL	BX, h; \
	ADDL	AX, d; \
	ADDL	AX, h
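
// The corresponding Go helpers, following the FIPS 180-4 definitions used in
// the macros above (a sketch; bits is math/bits and the names are illustrative):
//
//	func ch(x, y, z uint32) uint32  { return (x & y) ^ (^x & z) }
//	func maj(x, y, z uint32) uint32 { return (x & y) ^ (x & z) ^ (y & z) }
//	func bigSigma0(x uint32) uint32 { return bits.RotateLeft32(x, -2) ^ bits.RotateLeft32(x, -13) ^ bits.RotateLeft32(x, -22) }
//	func bigSigma1(x uint32) uint32 { return bits.RotateLeft32(x, -6) ^ bits.RotateLeft32(x, -11) ^ bits.RotateLeft32(x, -25) }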

#define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE0(index); \
	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)

#define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE1(index); \
	SHA256ROUND(index, const, a, b, c, d, e, f, g, h)


// Definitions for AVX2 version

// addm (mem), reg
// Add reg to mem using reg-mem add and store
#define addm(P1, P2) \
	ADDL P2, P1; \
	MOVL P1, P2

#define XDWORD0 Y4
#define XDWORD1 Y5
#define XDWORD2 Y6
#define XDWORD3 Y7

#define XWORD0 X4
#define XWORD1 X5
#define XWORD2 X6
#define XWORD3 X7

#define XTMP0 Y0
#define XTMP1 Y1
#define XTMP2 Y2
#define XTMP3 Y3
#define XTMP4 Y8
#define XTMP5 Y11

#define XFER  Y9

#define BYTE_FLIP_MASK	Y13 // mask to convert LE -> BE
#define X_BYTE_FLIP_MASK X13

#define NUM_BYTES DX
#define INP	DI

#define CTX SI // Beginning of digest in memory (a, b, c, ... , h)

#define a AX
#define b BX
#define c CX
#define d R8
#define e DX
#define f R9
#define g R10
#define h R11

#define old_h R11

#define TBL BP

#define SRND SI // SRND is same register as CTX

#define T1 R12

#define y0 R13
#define y1 R14
#define y2 R15
#define y3 DI

// Offsets
#define XFER_SIZE 2*64*4
#define INP_END_SIZE 8
#define INP_SIZE 8

#define _XFER 0
#define _INP_END _XFER + XFER_SIZE
#define _INP _INP_END + INP_END_SIZE
#define STACK_SIZE _INP + INP_SIZE
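
// Resulting stack layout (offsets from SP):
//
//	_XFER     [  0, 512)  message schedule plus round constants (W+K), XFER_SIZE = 2*64*4
//	_INP_END  [512, 520)  pointer to the last input block
//	_INP      [520, 528)  current input pointer
//
// so STACK_SIZE = 528.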

#define ROUND_AND_SCHED_N_0(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	;                                     \ // #############################  RND N + 0 ############################//
	MOVL     a, y3;                       \ // y3 = a					// MAJA
	RORXL    $25, e, y0;                  \ // y0 = e >> 25				// S1A
	RORXL    $11, e, y1;                  \ // y1 = e >> 11				// S1B
	;                                     \
	ADDL     (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h        // disp = k + w
	ORL      c, y3;                       \ // y3 = a|c				// MAJA
	VPALIGNR $4, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-7]
	MOVL     f, y2;                       \ // y2 = f				// CH
	RORXL    $13, a, T1;                  \ // T1 = a >> 13			// S0B
	;                                     \
	XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)					// S1
	XORL     g, y2;                       \ // y2 = f^g	// CH
	VPADDD   XDWORD0, XTMP0, XTMP0;       \ // XTMP0 = W[-7] + W[-16]	// y1 = (e >> 6)	// S1
	RORXL    $6, e, y1;                   \ // y1 = (e >> 6)						// S1
	;                                     \
	ANDL     e, y2;                       \ // y2 = (f^g)&e                         // CH
	XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
	RORXL    $22, a, y1;                  \ // y1 = a >> 22							// S0A
	ADDL     h, d;                        \ // d = k + w + h + d	// --
	;                                     \
	ANDL     b, y3;                       \ // y3 = (a|c)&b							// MAJA
	VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
	XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
	RORXL    $2, a, T1;                   \ // T1 = (a >> 2)						// S0
	;                                     \
	XORL     g, y2;                       \ // y2 = CH = ((f^g)&e)^g				// CH
	VPSRLD   $7, XTMP1, XTMP2;            \
	XORL     T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
	MOVL     a, T1;                       \ // T1 = a								// MAJB
	ANDL     c, T1;                       \ // T1 = a&c								// MAJB
	;                                     \
	ADDL     y0, y2;                      \ // y2 = S1 + CH							// --
	VPSLLD   $(32-7), XTMP1, XTMP3;       \
	ORL      T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
	ADDL     y1, h;                       \ // h = k + w + h + S0					// --
	;                                     \
	ADDL     y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
	VPOR     XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7
	;                                     \
	VPSRLD   $18, XTMP1, XTMP2;           \
	ADDL     y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	ADDL     y3, h                        // h = t1 + S0 + MAJ                     // --

#define ROUND_AND_SCHED_N_1(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	;                                    \ // ################################### RND N + 1 ############################
	;                                    \
	MOVL    a, y3;                       \ // y3 = a                       // MAJA
	RORXL   $25, e, y0;                  \ // y0 = e >> 25					// S1A
	RORXL   $11, e, y1;                  \ // y1 = e >> 11					// S1B
	ADDL    (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h		// --
	ORL     c, y3;                       \ // y3 = a|c						// MAJA
	;                                    \
	VPSRLD  $3, XTMP1, XTMP4;            \ // XTMP4 = W[-15] >> 3
	MOVL    f, y2;                       \ // y2 = f						// CH
	RORXL   $13, a, T1;                  \ // T1 = a >> 13					// S0B
	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)		// S1
	XORL    g, y2;                       \ // y2 = f^g						// CH
	;                                    \
	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)				// S1
	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
	RORXL   $22, a, y1;                  \ // y1 = a >> 22						// S0A
	ANDL    e, y2;                       \ // y2 = (f^g)&e						// CH
	ADDL    h, d;                        \ // d = k + w + h + d				// --
	;                                    \
	VPSLLD  $(32-18), XTMP1, XTMP1;      \
	ANDL    b, y3;                       \ // y3 = (a|c)&b					// MAJA
	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
	;                                    \
	VPXOR   XTMP1, XTMP3, XTMP3;         \
	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g		// CH
	;                                    \
	VPXOR   XTMP2, XTMP3, XTMP3;         \ // XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
	MOVL    a, T1;                       \ // T1 = a						// MAJB
	ANDL    c, T1;                       \ // T1 = a&c						// MAJB
	ADDL    y0, y2;                      \ // y2 = S1 + CH					// --
	;                                    \
	VPXOR   XTMP4, XTMP3, XTMP1;         \ // XTMP1 = s0
	VPSHUFD $0xFA, XDWORD3, XTMP2;       \ // XTMP2 = W[-2] {BBAA}
	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
	ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
	;                                    \
	VPADDD  XTMP1, XTMP0, XTMP0;         \ // XTMP0 = W[-16] + W[-7] + s0
	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	ADDL    y3, h;                       \ // h = t1 + S0 + MAJ                     // --
	;                                    \
	VPSRLD  $10, XTMP2, XTMP4            // XTMP4 = W[-2] >> 10 {BBAA}

#define ROUND_AND_SCHED_N_2(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	;                                    \ // ################################### RND N + 2 ############################
	;                                    \
	MOVL    a, y3;                       \ // y3 = a							// MAJA
	RORXL   $25, e, y0;                  \ // y0 = e >> 25						// S1A
	ADDL    (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h			// --
	;                                    \
	VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xBxA}
	RORXL   $11, e, y1;                  \ // y1 = e >> 11						// S1B
	ORL     c, y3;                       \ // y3 = a|c                         // MAJA
	MOVL    f, y2;                       \ // y2 = f                           // CH
	XORL    g, y2;                       \ // y2 = f^g                         // CH
	;                                    \
	RORXL   $13, a, T1;                  \ // T1 = a >> 13						// S0B
	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)			// S1
	VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xBxA}
	ANDL    e, y2;                       \ // y2 = (f^g)&e						// CH
	;                                    \
	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)					// S1
	VPXOR   XTMP3, XTMP2, XTMP2;         \
	ADDL    h, d;                        \ // d = k + w + h + d				// --
	ANDL    b, y3;                       \ // y3 = (a|c)&b						// MAJA
	;                                    \
	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
	RORXL   $22, a, y1;                  \ // y1 = a >> 22						// S0A
	VPXOR   XTMP2, XTMP4, XTMP4;         \ // XTMP4 = s1 {xBxA}
	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
	;                                    \
	VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4;\ // XTMP4 = s1 {00BA}
	;                                    \
	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
	VPADDD  XTMP4, XTMP0, XTMP0;         \ // XTMP0 = {..., ..., W[1], W[0]}
	;                                    \
	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
	MOVL    a, T1;                       \ // T1 = a                                // MAJB
	ANDL    c, T1;                       \ // T1 = a&c                              // MAJB
	ADDL    y0, y2;                      \ // y2 = S1 + CH                          // --
	VPSHUFD $80, XTMP0, XTMP2;           \ // XTMP2 = W[-2] {DDCC}
	;                                    \
	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)            // MAJ
	ADDL    y1, h;                       \ // h = k + w + h + S0                    // --
	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	;                                    \
	ADDL    y3, h                        // h = t1 + S0 + MAJ                     // --

#define ROUND_AND_SCHED_N_3(disp, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3) \
	;                                    \ // ################################### RND N + 3 ############################
	;                                    \
	MOVL    a, y3;                       \ // y3 = a						// MAJA
	RORXL   $25, e, y0;                  \ // y0 = e >> 25					// S1A
	RORXL   $11, e, y1;                  \ // y1 = e >> 11					// S1B
	ADDL    (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h				// --
	ORL     c, y3;                       \ // y3 = a|c                     // MAJA
	;                                    \
	VPSRLD  $10, XTMP2, XTMP5;           \ // XTMP5 = W[-2] >> 10 {DDCC}
	MOVL    f, y2;                       \ // y2 = f						// CH
	RORXL   $13, a, T1;                  \ // T1 = a >> 13					// S0B
	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)		// S1
	XORL    g, y2;                       \ // y2 = f^g						// CH
	;                                    \
	VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xDxC}
	RORXL   $6, e, y1;                   \ // y1 = (e >> 6)				// S1
	ANDL    e, y2;                       \ // y2 = (f^g)&e					// CH
	ADDL    h, d;                        \ // d = k + w + h + d			// --
	ANDL    b, y3;                       \ // y3 = (a|c)&b					// MAJA
	;                                    \
	VPSRLQ  $17, XTMP2, XTMP2;           \ // XTMP2 = W[-2] ror 17 {xDxC}
	XORL    y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
	XORL    g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
	;                                    \
	VPXOR   XTMP3, XTMP2, XTMP2;         \
	RORXL   $22, a, y1;                  \ // y1 = a >> 22					// S0A
	ADDL    y0, y2;                      \ // y2 = S1 + CH					// --
	;                                    \
	VPXOR   XTMP2, XTMP5, XTMP5;         \ // XTMP5 = s1 {xDxC}
	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)		// S0
	ADDL    y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1  // --
	;                                    \
	RORXL   $2, a, T1;                   \ // T1 = (a >> 2)				// S0
	;                                    \
	VPSHUFB shuff_DC00<>(SB), XTMP5, XTMP5;\ // XTMP5 = s1 {DC00}
	;                                    \
	VPADDD  XTMP0, XTMP5, XDWORD0;       \ // XDWORD0 = {W[3], W[2], W[1], W[0]}
	XORL    T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
	MOVL    a, T1;                       \ // T1 = a							// MAJB
	ANDL    c, T1;                       \ // T1 = a&c							// MAJB
	ORL     T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)		// MAJ
	;                                    \
	ADDL    y1, h;                       \ // h = k + w + h + S0				// --
	ADDL    y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	ADDL    y3, h                        // h = t1 + S0 + MAJ				// --

#define DO_ROUND_N_0(disp, a, b, c, d, e, f, g, h, old_h) \
	;                                  \ // ################################### RND N + 0 ###########################
	MOVL  f, y2;                       \ // y2 = f					// CH
	RORXL $25, e, y0;                  \ // y0 = e >> 25				// S1A
	RORXL $11, e, y1;                  \ // y1 = e >> 11				// S1B
	XORL  g, y2;                       \ // y2 = f^g					// CH
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)	// S1
	RORXL $6, e, y1;                   \ // y1 = (e >> 6)			// S1
	ANDL  e, y2;                       \ // y2 = (f^g)&e				// CH
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)	// S1
	RORXL $13, a, T1;                  \ // T1 = a >> 13						// S0B
	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g			// CH
	RORXL $22, a, y1;                  \ // y1 = a >> 22						// S0A
	MOVL  a, y3;                       \ // y3 = a							// MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)			// S0
	RORXL $2, a, T1;                   \ // T1 = (a >> 2)					// S0
	ADDL  (disp + 0*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL   c, y3;                       \ // y3 = a|c							// MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)	// S0
	MOVL  a, T1;                       \ // T1 = a							// MAJB
	ANDL  b, y3;                       \ // y3 = (a|c)&b						// MAJA
	ANDL  c, T1;                       \ // T1 = a&c							// MAJB
	ADDL  y0, y2;                      \ // y2 = S1 + CH						// --
	;                                  \
	ADDL  h, d;                        \ // d = k + w + h + d					// --
	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1	// --

#define DO_ROUND_N_1(disp, a, b, c, d, e, f, g, h, old_h) \
	;                                  \ // ################################### RND N + 1 ###########################
	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0 // --
	MOVL  f, y2;                       \ // y2 = f                                // CH
	RORXL $25, e, y0;                  \ // y0 = e >> 25				// S1A
	RORXL $11, e, y1;                  \ // y1 = e >> 11				// S1B
	XORL  g, y2;                       \ // y2 = f^g                             // CH
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
	ANDL  e, y2;                       \ // y2 = (f^g)&e                         // CH
	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ                    // --
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                // CH
	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
	MOVL  a, y3;                       \ // y3 = a                               // MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
	ADDL  (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h // --
	ORL   c, y3;                       \ // y3 = a|c                             // MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
	MOVL  a, T1;                       \ // T1 = a                               // MAJB
	ANDL  b, y3;                       \ // y3 = (a|c)&b                         // MAJA
	ANDL  c, T1;                       \ // T1 = a&c                             // MAJB
	ADDL  y0, y2;                      \ // y2 = S1 + CH                         // --
	;                                  \
	ADDL  h, d;                        \ // d = k + w + h + d                    // --
	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)           // MAJ
	ADDL  y1, h;                       \ // h = k + w + h + S0                   // --
	;                                  \
	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_2(disp, a, b, c, d, e, f, g, h, old_h) \
	;                                  \ // ################################### RND N + 2 ##############################
	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	MOVL  f, y2;                       \ // y2 = f								// CH
	RORXL $25, e, y0;                  \ // y0 = e >> 25							// S1A
	RORXL $11, e, y1;                  \ // y1 = e >> 11							// S1B
	XORL  g, y2;                       \ // y2 = f^g								// CH
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
	ANDL  e, y2;                       \ // y2 = (f^g)&e							// CH
	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ					// --
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g                // CH
	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
	MOVL  a, y3;                       \ // y3 = a								// MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
	ADDL  (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h	// --
	ORL   c, y3;                       \ // y3 = a|c								// MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
	MOVL  a, T1;                       \ // T1 = a								// MAJB
	ANDL  b, y3;                       \ // y3 = (a|c)&b							// MAJA
	ANDL  c, T1;                       \ // T1 = a&c								// MAJB
	ADDL  y0, y2;                      \ // y2 = S1 + CH							// --
	;                                  \
	ADDL  h, d;                        \ // d = k + w + h + d					// --
	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
	;                                  \
	ADDL  y2, d                        // d = k + w + h + d + S1 + CH = d + t1 // --

#define DO_ROUND_N_3(disp, a, b, c, d, e, f, g, h, old_h) \
	;                                  \ // ################################### RND N + 3 ###########################
	ADDL  y2, old_h;                   \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	MOVL  f, y2;                       \ // y2 = f								// CH
	RORXL $25, e, y0;                  \ // y0 = e >> 25							// S1A
	RORXL $11, e, y1;                  \ // y1 = e >> 11							// S1B
	XORL  g, y2;                       \ // y2 = f^g								// CH
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)				// S1
	RORXL $6, e, y1;                   \ // y1 = (e >> 6)						// S1
	ANDL  e, y2;                       \ // y2 = (f^g)&e							// CH
	ADDL  y3, old_h;                   \ // h = t1 + S0 + MAJ					// --
	;                                  \
	XORL  y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
	RORXL $13, a, T1;                  \ // T1 = a >> 13							// S0B
	XORL  g, y2;                       \ // y2 = CH = ((f^g)&e)^g				// CH
	RORXL $22, a, y1;                  \ // y1 = a >> 22							// S0A
	MOVL  a, y3;                       \ // y3 = a								// MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
	ADDL  (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h	// --
	ORL   c, y3;                       \ // y3 = a|c								// MAJA
	;                                  \
	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
	MOVL  a, T1;                       \ // T1 = a								// MAJB
	ANDL  b, y3;                       \ // y3 = (a|c)&b							// MAJA
	ANDL  c, T1;                       \ // T1 = a&c								// MAJB
	ADDL  y0, y2;                      \ // y2 = S1 + CH							// --
	;                                  \
	ADDL  h, d;                        \ // d = k + w + h + d					// --
	ORL   T1, y3;                      \ // y3 = MAJ = ((a|c)&b)|(a&c)			// MAJ
	ADDL  y1, h;                       \ // h = k + w + h + S0					// --
	;                                  \
	ADDL  y2, d;                       \ // d = k + w + h + d + S1 + CH = d + t1	// --
	;                                  \
	ADDL  y2, h;                       \ // h = k + w + h + S0 + S1 + CH = t1 + S0// --
	;                                  \
	ADDL  y3, h                        // h = t1 + S0 + MAJ					// --

// Definitions for the sha-ni version
//
// The sha-ni implementation uses the Intel(R) SHA extensions instructions
// SHA256RNDS2, SHA256MSG1, and SHA256MSG2.
// It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version.
//
// Reference
// S. Gulley, et al, "New Instructions Supporting the Secure Hash
// Algorithm on Intel® Architecture Processors", July 2013
// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
//
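// Note that the K256 table below stores each group of four constants twice,
// once per 128-bit lane for the AVX2 code; the sha-ni rounds therefore index
// it with a 32-byte stride, (c*32)(sha256Constants), skipping the duplicated
// upper lane.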

#define digestPtr	DI	// input/output, base pointer to digest hash vector H0, H1, ..., H7
#define dataPtr		SI	// input, base pointer to first input data block
#define numBytes	DX	// input, number of input bytes to be processed
#define sha256Constants	AX	// round constants from K256 table, indexed by round number x 32
#define msg		X0	// input data
#define state0		X1	// round intermediates and outputs
#define state1		X2
#define m0		X3	// m0, m1,... m4 -- round message temps
#define m1		X4
#define m2		X5
#define m3		X6
#define m4		X7
#define shufMask	X8	// input data endian conversion control mask
#define abefSave	X9	// digest hash vector inter-block buffer abef
#define cdghSave	X10	// digest hash vector inter-block buffer cdgh

#define nop(m,a)		// nop instead of final SHA256MSG1 for first and last few rounds

#define sha256msg1(m,a) \	// final SHA256MSG1 for middle rounds that require it
	SHA256MSG1		m, a

#define vmov(a,b) \		// msg copy for all but rounds 12-15
	VMOVDQA		a, b

#define vmovrev(a,b) \		// reverse copy for rounds 12-15
	VMOVDQA		b, a

// sha rounds 0 to 11
// identical with the exception of the final msg op
// which is replaced with a nop for rounds where it is not needed
// refer to Gulley, et al for more information
#define rounds0to11(m,a,c,sha256Msg1)				\
	VMOVDQU			c*16(dataPtr), msg		\
	PSHUFB			shufMask, msg			\
	VMOVDQA			msg, m				\
	PADDD			(c*32)(sha256Constants), msg	\
	SHA256RNDS2		msg, state0, state1		\
	PSHUFD			$0x0e, msg, msg			\
	SHA256RNDS2		msg, state1, state0		\
	sha256Msg1		(m,a)

// sha rounds 12 to 59
// identical with the exception of the final msg op
// and the reverse copy(m,msg) in round 12 which is required
// after the last data load
// refer to Gulley, et al for more information
#define rounds12to59(m,c,a,t,sha256Msg1,movop)			\
	movop			(m,msg)				\
	PADDD			(c*32)(sha256Constants), msg	\
	SHA256RNDS2		msg, state0, state1		\
	VMOVDQA			m, m4				\
	PALIGNR			$4, a, m4			\
	PADDD			m4, t				\
	SHA256MSG2		m, t				\
	PSHUFD			$0x0e, msg, msg			\
	SHA256RNDS2		msg, state1, state0		\
	sha256Msg1		(m,a)
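
// In both macros, msg (X0) holds four message+constant words: the first
// SHA256RNDS2 consumes the low two, PSHUFD $0x0e moves the high two into the
// low half, and the second SHA256RNDS2, with the two state registers swapped,
// completes the group of four rounds.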

TEXT ·block(SB), 0, $536-32
	CMPB	·useSHA(SB), $1
	JE	sha_ni
	CMPB	·useAVX2(SB), $1
	JE	avx2

	MOVQ p_base+8(FP), SI
	MOVQ p_len+16(FP), DX
	SHRQ $6, DX
	SHLQ $6, DX

	LEAQ (SI)(DX*1), DI
	MOVQ DI, 256(SP)
	CMPQ SI, DI
	JEQ  end

	MOVQ dig+0(FP), BP
	MOVL (0*4)(BP), R8  // a = H0
	MOVL (1*4)(BP), R9  // b = H1
	MOVL (2*4)(BP), R10 // c = H2
	MOVL (3*4)(BP), R11 // d = H3
	MOVL (4*4)(BP), R12 // e = H4
	MOVL (5*4)(BP), R13 // f = H5
	MOVL (6*4)(BP), R14 // g = H6
	MOVL (7*4)(BP), R15 // h = H7

loop:
	MOVQ SP, BP

	SHA256ROUND0(0, 0x428a2f98, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND0(1, 0x71374491, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND0(2, 0xb5c0fbcf, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND0(3, 0xe9b5dba5, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND0(4, 0x3956c25b, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND0(5, 0x59f111f1, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND0(6, 0x923f82a4, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND0(7, 0xab1c5ed5, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND0(8, 0xd807aa98, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND0(9, 0x12835b01, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND0(10, 0x243185be, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND0(11, 0x550c7dc3, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND0(12, 0x72be5d74, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND0(13, 0x80deb1fe, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND0(14, 0x9bdc06a7, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND0(15, 0xc19bf174, R9, R10, R11, R12, R13, R14, R15, R8)

	SHA256ROUND1(16, 0xe49b69c1, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(17, 0xefbe4786, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(18, 0x0fc19dc6, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(19, 0x240ca1cc, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(20, 0x2de92c6f, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(21, 0x4a7484aa, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(22, 0x5cb0a9dc, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(23, 0x76f988da, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(24, 0x983e5152, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(25, 0xa831c66d, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(26, 0xb00327c8, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(27, 0xbf597fc7, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(28, 0xc6e00bf3, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(29, 0xd5a79147, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(30, 0x06ca6351, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(31, 0x14292967, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(32, 0x27b70a85, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(33, 0x2e1b2138, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(34, 0x4d2c6dfc, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(35, 0x53380d13, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(36, 0x650a7354, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(37, 0x766a0abb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(38, 0x81c2c92e, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(39, 0x92722c85, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(40, 0xa2bfe8a1, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(41, 0xa81a664b, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(42, 0xc24b8b70, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(43, 0xc76c51a3, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(44, 0xd192e819, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(45, 0xd6990624, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(46, 0xf40e3585, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(47, 0x106aa070, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(48, 0x19a4c116, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(49, 0x1e376c08, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(50, 0x2748774c, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(51, 0x34b0bcb5, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(52, 0x391c0cb3, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(53, 0x4ed8aa4a, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(54, 0x5b9cca4f, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(55, 0x682e6ff3, R9, R10, R11, R12, R13, R14, R15, R8)
	SHA256ROUND1(56, 0x748f82ee, R8, R9, R10, R11, R12, R13, R14, R15)
	SHA256ROUND1(57, 0x78a5636f, R15, R8, R9, R10, R11, R12, R13, R14)
	SHA256ROUND1(58, 0x84c87814, R14, R15, R8, R9, R10, R11, R12, R13)
	SHA256ROUND1(59, 0x8cc70208, R13, R14, R15, R8, R9, R10, R11, R12)
	SHA256ROUND1(60, 0x90befffa, R12, R13, R14, R15, R8, R9, R10, R11)
	SHA256ROUND1(61, 0xa4506ceb, R11, R12, R13, R14, R15, R8, R9, R10)
	SHA256ROUND1(62, 0xbef9a3f7, R10, R11, R12, R13, R14, R15, R8, R9)
	SHA256ROUND1(63, 0xc67178f2, R9, R10, R11, R12, R13, R14, R15, R8)

	MOVQ dig+0(FP), BP
	ADDL (0*4)(BP), R8  // H0 = a + H0
	MOVL R8, (0*4)(BP)
	ADDL (1*4)(BP), R9  // H1 = b + H1
	MOVL R9, (1*4)(BP)
	ADDL (2*4)(BP), R10 // H2 = c + H2
	MOVL R10, (2*4)(BP)
	ADDL (3*4)(BP), R11 // H3 = d + H3
	MOVL R11, (3*4)(BP)
	ADDL (4*4)(BP), R12 // H4 = e + H4
	MOVL R12, (4*4)(BP)
	ADDL (5*4)(BP), R13 // H5 = f + H5
	MOVL R13, (5*4)(BP)
	ADDL (6*4)(BP), R14 // H6 = g + H6
	MOVL R14, (6*4)(BP)
	ADDL (7*4)(BP), R15 // H7 = h + H7
	MOVL R15, (7*4)(BP)

	ADDQ $64, SI
	CMPQ SI, 256(SP)
	JB   loop

end:
	RET

avx2:
	MOVQ dig+0(FP), CTX          // d.h[8]
	MOVQ p_base+8(FP), INP
	MOVQ p_len+16(FP), NUM_BYTES

	LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
	MOVQ NUM_BYTES, _INP_END(SP)

	CMPQ NUM_BYTES, INP
	JE   avx2_only_one_block

	// Load initial digest
	MOVL 0(CTX), a  // a = H0
	MOVL 4(CTX), b  // b = H1
	MOVL 8(CTX), c  // c = H2
	MOVL 12(CTX), d // d = H3
	MOVL 16(CTX), e // e = H4
	MOVL 20(CTX), f // f = H5
	MOVL 24(CTX), g // g = H6
	MOVL 28(CTX), h // h = H7

avx2_loop0: // each iteration processes one 512-bit block

	VMOVDQU (0*32)(INP), XTMP0
	VMOVDQU (1*32)(INP), XTMP1
	VMOVDQU (2*32)(INP), XTMP2
	VMOVDQU (3*32)(INP), XTMP3

	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
	VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
	VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
	VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3

	// Transpose data into high/low parts
	VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
	VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
	VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
	VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12

	MOVQ $K256<>(SB), TBL // Loading address of table with round-specific constants

avx2_last_block_enter:
	ADDQ $64, INP
	MOVQ INP, _INP(SP)
	XORQ SRND, SRND

avx2_loop1: // for w0 - w47
	// Do 4 rounds and scheduling
	VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	ROUND_AND_SCHED_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	// Do 4 rounds and scheduling
	VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	ROUND_AND_SCHED_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

	// Do 4 rounds and scheduling
	VPADDD  2*32(TBL)(SRND*1), XDWORD2, XFER
	VMOVDQU XFER, (_XFER + 2*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_1(_XFER + 2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_2(_XFER + 2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	ROUND_AND_SCHED_N_3(_XFER + 2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

	// Do 4 rounds and scheduling
	VPADDD  3*32(TBL)(SRND*1), XDWORD3, XFER
	VMOVDQU XFER, (_XFER + 3*32)(SP)(SRND*1)
	ROUND_AND_SCHED_N_0(_XFER + 3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_1(_XFER + 3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_2(_XFER + 3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	ROUND_AND_SCHED_N_3(_XFER + 3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

	ADDQ $4*32, SRND
	CMPQ SRND, $3*4*32
	JB   avx2_loop1

avx2_loop2:
	// w48 - w63 processed with no scheduling (last 16 rounds)
	VPADDD  0*32(TBL)(SRND*1), XDWORD0, XFER
	VMOVDQU XFER, (_XFER + 0*32)(SP)(SRND*1)
	DO_ROUND_N_0(_XFER + 0*32, a, b, c, d, e, f, g, h, h)
	DO_ROUND_N_1(_XFER + 0*32, h, a, b, c, d, e, f, g, h)
	DO_ROUND_N_2(_XFER + 0*32, g, h, a, b, c, d, e, f, g)
	DO_ROUND_N_3(_XFER + 0*32, f, g, h, a, b, c, d, e, f)

	VPADDD  1*32(TBL)(SRND*1), XDWORD1, XFER
	VMOVDQU XFER, (_XFER + 1*32)(SP)(SRND*1)
	DO_ROUND_N_0(_XFER + 1*32, e, f, g, h, a, b, c, d, e)
	DO_ROUND_N_1(_XFER + 1*32, d, e, f, g, h, a, b, c, d)
	DO_ROUND_N_2(_XFER + 1*32, c, d, e, f, g, h, a, b, c)
	DO_ROUND_N_3(_XFER + 1*32, b, c, d, e, f, g, h, a, b)

	ADDQ $2*32, SRND

	VMOVDQU XDWORD2, XDWORD0
	VMOVDQU XDWORD3, XDWORD1

	CMPQ SRND, $4*4*32
	JB   avx2_loop2

	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ _INP(SP), INP

	addm(  0(CTX), a)
	addm(  4(CTX), b)
	addm(  8(CTX), c)
	addm( 12(CTX), d)
	addm( 16(CTX), e)
	addm( 20(CTX), f)
	addm( 24(CTX), g)
	addm( 28(CTX), h)

	CMPQ _INP_END(SP), INP
	JB   done_hash

	XORQ SRND, SRND

avx2_loop3: // Do second block using previously scheduled results
	DO_ROUND_N_0(_XFER + 0*32 + 16, a, b, c, d, e, f, g, h, a)
	DO_ROUND_N_1(_XFER + 0*32 + 16, h, a, b, c, d, e, f, g, h)
	DO_ROUND_N_2(_XFER + 0*32 + 16, g, h, a, b, c, d, e, f, g)
	DO_ROUND_N_3(_XFER + 0*32 + 16, f, g, h, a, b, c, d, e, f)

	DO_ROUND_N_0(_XFER + 1*32 + 16, e, f, g, h, a, b, c, d, e)
	DO_ROUND_N_1(_XFER + 1*32 + 16, d, e, f, g, h, a, b, c, d)
	DO_ROUND_N_2(_XFER + 1*32 + 16, c, d, e, f, g, h, a, b, c)
	DO_ROUND_N_3(_XFER + 1*32 + 16, b, c, d, e, f, g, h, a, b)

	ADDQ $2*32, SRND
	CMPQ SRND, $4*4*32
	JB   avx2_loop3

	MOVQ dig+0(FP), CTX // d.h[8]
	MOVQ _INP(SP), INP
	ADDQ $64, INP

	addm(  0(CTX), a)
	addm(  4(CTX), b)
	addm(  8(CTX), c)
	addm( 12(CTX), d)
	addm( 16(CTX), e)
	addm( 20(CTX), f)
	addm( 24(CTX), g)
	addm( 28(CTX), h)

	CMPQ _INP_END(SP), INP
	JA   avx2_loop0
	JB   done_hash

avx2_do_last_block:

	VMOVDQU 0(INP), XWORD0
	VMOVDQU 16(INP), XWORD1
	VMOVDQU 32(INP), XWORD2
	VMOVDQU 48(INP), XWORD3

	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK

	VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
	VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
	VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
	VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3

	MOVQ $K256<>(SB), TBL

	JMP avx2_last_block_enter

avx2_only_one_block:
	// Load initial digest
	MOVL 0(CTX), a  // a = H0
	MOVL 4(CTX), b  // b = H1
	MOVL 8(CTX), c  // c = H2
	MOVL 12(CTX), d // d = H3
	MOVL 16(CTX), e // e = H4
	MOVL 20(CTX), f // f = H5
	MOVL 24(CTX), g // g = H6
	MOVL 28(CTX), h // h = H7

	JMP avx2_do_last_block

done_hash:
	VZEROUPPER
	RET

sha_ni:
	MOVQ		dig+0(FP), digestPtr		// init digest hash vector H0, H1,..., H7 pointer
	MOVQ		p_base+8(FP), dataPtr		// init input data base pointer
	MOVQ		p_len+16(FP), numBytes		// get number of input bytes to hash
	SHRQ		$6, numBytes			// force modulo 64 input buffer length
	SHLQ		$6, numBytes
	CMPQ		numBytes, $0			// exit early for zero-length input buffer
	JEQ		done
	ADDQ		dataPtr, numBytes		// point numBytes to end of input buffer
	VMOVDQU		(0*16)(digestPtr), state0	// load initial hash values and reorder
	VMOVDQU		(1*16)(digestPtr), state1	// DCBA, HGFE -> ABEF, CDGH
	PSHUFD		$0xb1, state0, state0		// CDAB
	PSHUFD		$0x1b, state1, state1		// EFGH
	VMOVDQA		state0, m4
	PALIGNR		$8, state1, state0		// ABEF
	PBLENDW		$0xf0, m4, state1		// CDGH
	VMOVDQA		flip_mask<>(SB), shufMask
	LEAQ		K256<>(SB), sha256Constants

roundLoop:
	// save hash values for addition after rounds
	VMOVDQA		state0, abefSave
	VMOVDQA		state1, cdghSave

	// do rounds 0-59
	rounds0to11	(m0,-,0,nop)			// 0-3
	rounds0to11	(m1,m0,1,sha256msg1)		// 4-7
	rounds0to11	(m2,m1,2,sha256msg1)		// 8-11
	VMOVDQU		(3*16)(dataPtr), msg
	PSHUFB		shufMask, msg
	rounds12to59	(m3,3,m2,m0,sha256msg1,vmovrev)	// 12-15
	rounds12to59	(m0,4,m3,m1,sha256msg1,vmov)    // 16-19
	rounds12to59	(m1,5,m0,m2,sha256msg1,vmov)    // 20-23
	rounds12to59	(m2,6,m1,m3,sha256msg1,vmov)    // 24-27
	rounds12to59	(m3,7,m2,m0,sha256msg1,vmov)    // 28-31
	rounds12to59	(m0,8,m3,m1,sha256msg1,vmov)    // 32-35
	rounds12to59	(m1,9,m0,m2,sha256msg1,vmov)    // 36-39
	rounds12to59	(m2,10,m1,m3,sha256msg1,vmov)   // 40-43
	rounds12to59	(m3,11,m2,m0,sha256msg1,vmov)   // 44-47
	rounds12to59	(m0,12,m3,m1,sha256msg1,vmov)   // 48-51
	rounds12to59	(m1,13,m0,m2,nop,vmov)          // 52-55
	rounds12to59	(m2,14,m1,m3,nop,vmov)		// 56-59

	// do rounds 60-63
	VMOVDQA		m3, msg
	PADDD		(15*32)(sha256Constants), msg
	SHA256RNDS2	msg, state0, state1
	PSHUFD		$0x0e, msg, msg
	SHA256RNDS2	msg, state1, state0

	// add current hash values with previously saved
	PADDD		abefSave, state0
	PADDD		cdghSave, state1

	// advance data pointer; loop until buffer empty
	ADDQ		$64, dataPtr
	CMPQ		numBytes, dataPtr
	JNE		roundLoop

	// write hash values back in the correct order
	PSHUFD		$0x1b, state0, state0		// FEBA
	PSHUFD		$0xb1, state1, state1		// DCHG
	VMOVDQA		state0, m4
	PBLENDW		$0xf0, state1, state0		// DCBA
	PALIGNR		$8, m4, state1			// HGFE
	VMOVDQU		state0, (0*16)(digestPtr)
	VMOVDQU		state1, (1*16)(digestPtr)

done:
	RET

// shuffle byte order from LE to BE
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), 8, $32

// shuffle xBxA -> 00BA
DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
GLOBL shuff_00BA<>(SB), 8, $32

// shuffle xDxC -> DC00
DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
GLOBL shuff_DC00<>(SB), 8, $32

// Round specific constants
DATA K256<>+0x00(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x04(SB)/4, $0x71374491 // k2
DATA K256<>+0x08(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x0c(SB)/4, $0xe9b5dba5 // k4
DATA K256<>+0x10(SB)/4, $0x428a2f98 // k1
DATA K256<>+0x14(SB)/4, $0x71374491 // k2
DATA K256<>+0x18(SB)/4, $0xb5c0fbcf // k3
DATA K256<>+0x1c(SB)/4, $0xe9b5dba5 // k4

DATA K256<>+0x20(SB)/4, $0x3956c25b // k5 - k8
DATA K256<>+0x24(SB)/4, $0x59f111f1
DATA K256<>+0x28(SB)/4, $0x923f82a4
DATA K256<>+0x2c(SB)/4, $0xab1c5ed5
DATA K256<>+0x30(SB)/4, $0x3956c25b
DATA K256<>+0x34(SB)/4, $0x59f111f1
DATA K256<>+0x38(SB)/4, $0x923f82a4
DATA K256<>+0x3c(SB)/4, $0xab1c5ed5

DATA K256<>+0x40(SB)/4, $0xd807aa98 // k9 - k12
DATA K256<>+0x44(SB)/4, $0x12835b01
DATA K256<>+0x48(SB)/4, $0x243185be
DATA K256<>+0x4c(SB)/4, $0x550c7dc3
DATA K256<>+0x50(SB)/4, $0xd807aa98
DATA K256<>+0x54(SB)/4, $0x12835b01
DATA K256<>+0x58(SB)/4, $0x243185be
DATA K256<>+0x5c(SB)/4, $0x550c7dc3

DATA K256<>+0x60(SB)/4, $0x72be5d74 // k13 - k16
DATA K256<>+0x64(SB)/4, $0x80deb1fe
DATA K256<>+0x68(SB)/4, $0x9bdc06a7
DATA K256<>+0x6c(SB)/4, $0xc19bf174
DATA K256<>+0x70(SB)/4, $0x72be5d74
DATA K256<>+0x74(SB)/4, $0x80deb1fe
DATA K256<>+0x78(SB)/4, $0x9bdc06a7
DATA K256<>+0x7c(SB)/4, $0xc19bf174

DATA K256<>+0x80(SB)/4, $0xe49b69c1 // k17 - k20
DATA K256<>+0x84(SB)/4, $0xefbe4786
DATA K256<>+0x88(SB)/4, $0x0fc19dc6
DATA K256<>+0x8c(SB)/4, $0x240ca1cc
DATA K256<>+0x90(SB)/4, $0xe49b69c1
DATA K256<>+0x94(SB)/4, $0xefbe4786
DATA K256<>+0x98(SB)/4, $0x0fc19dc6
DATA K256<>+0x9c(SB)/4, $0x240ca1cc

DATA K256<>+0xa0(SB)/4, $0x2de92c6f // k21 - k24
DATA K256<>+0xa4(SB)/4, $0x4a7484aa
DATA K256<>+0xa8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xac(SB)/4, $0x76f988da
DATA K256<>+0xb0(SB)/4, $0x2de92c6f
DATA K256<>+0xb4(SB)/4, $0x4a7484aa
DATA K256<>+0xb8(SB)/4, $0x5cb0a9dc
DATA K256<>+0xbc(SB)/4, $0x76f988da

DATA K256<>+0xc0(SB)/4, $0x983e5152 // k25 - k28
DATA K256<>+0xc4(SB)/4, $0xa831c66d
DATA K256<>+0xc8(SB)/4, $0xb00327c8
DATA K256<>+0xcc(SB)/4, $0xbf597fc7
DATA K256<>+0xd0(SB)/4, $0x983e5152
DATA K256<>+0xd4(SB)/4, $0xa831c66d
DATA K256<>+0xd8(SB)/4, $0xb00327c8
DATA K256<>+0xdc(SB)/4, $0xbf597fc7

DATA K256<>+0xe0(SB)/4, $0xc6e00bf3 // k29 - k32
DATA K256<>+0xe4(SB)/4, $0xd5a79147
DATA K256<>+0xe8(SB)/4, $0x06ca6351
DATA K256<>+0xec(SB)/4, $0x14292967
DATA K256<>+0xf0(SB)/4, $0xc6e00bf3
DATA K256<>+0xf4(SB)/4, $0xd5a79147
DATA K256<>+0xf8(SB)/4, $0x06ca6351
DATA K256<>+0xfc(SB)/4, $0x14292967

DATA K256<>+0x100(SB)/4, $0x27b70a85
DATA K256<>+0x104(SB)/4, $0x2e1b2138
DATA K256<>+0x108(SB)/4, $0x4d2c6dfc
DATA K256<>+0x10c(SB)/4, $0x53380d13
DATA K256<>+0x110(SB)/4, $0x27b70a85
DATA K256<>+0x114(SB)/4, $0x2e1b2138
DATA K256<>+0x118(SB)/4, $0x4d2c6dfc
DATA K256<>+0x11c(SB)/4, $0x53380d13

DATA K256<>+0x120(SB)/4, $0x650a7354
DATA K256<>+0x124(SB)/4, $0x766a0abb
DATA K256<>+0x128(SB)/4, $0x81c2c92e
DATA K256<>+0x12c(SB)/4, $0x92722c85
DATA K256<>+0x130(SB)/4, $0x650a7354
DATA K256<>+0x134(SB)/4, $0x766a0abb
DATA K256<>+0x138(SB)/4, $0x81c2c92e
DATA K256<>+0x13c(SB)/4, $0x92722c85

DATA K256<>+0x140(SB)/4, $0xa2bfe8a1
DATA K256<>+0x144(SB)/4, $0xa81a664b
DATA K256<>+0x148(SB)/4, $0xc24b8b70
DATA K256<>+0x14c(SB)/4, $0xc76c51a3
DATA K256<>+0x150(SB)/4, $0xa2bfe8a1
DATA K256<>+0x154(SB)/4, $0xa81a664b
DATA K256<>+0x158(SB)/4, $0xc24b8b70
DATA K256<>+0x15c(SB)/4, $0xc76c51a3

DATA K256<>+0x160(SB)/4, $0xd192e819
DATA K256<>+0x164(SB)/4, $0xd6990624
DATA K256<>+0x168(SB)/4, $0xf40e3585
DATA K256<>+0x16c(SB)/4, $0x106aa070
DATA K256<>+0x170(SB)/4, $0xd192e819
DATA K256<>+0x174(SB)/4, $0xd6990624
DATA K256<>+0x178(SB)/4, $0xf40e3585
DATA K256<>+0x17c(SB)/4, $0x106aa070

DATA K256<>+0x180(SB)/4, $0x19a4c116
DATA K256<>+0x184(SB)/4, $0x1e376c08
DATA K256<>+0x188(SB)/4, $0x2748774c
DATA K256<>+0x18c(SB)/4, $0x34b0bcb5
DATA K256<>+0x190(SB)/4, $0x19a4c116
DATA K256<>+0x194(SB)/4, $0x1e376c08
DATA K256<>+0x198(SB)/4, $0x2748774c
DATA K256<>+0x19c(SB)/4, $0x34b0bcb5

DATA K256<>+0x1a0(SB)/4, $0x391c0cb3
DATA K256<>+0x1a4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1a8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1ac(SB)/4, $0x682e6ff3
DATA K256<>+0x1b0(SB)/4, $0x391c0cb3
DATA K256<>+0x1b4(SB)/4, $0x4ed8aa4a
DATA K256<>+0x1b8(SB)/4, $0x5b9cca4f
DATA K256<>+0x1bc(SB)/4, $0x682e6ff3

DATA K256<>+0x1c0(SB)/4, $0x748f82ee
DATA K256<>+0x1c4(SB)/4, $0x78a5636f
DATA K256<>+0x1c8(SB)/4, $0x84c87814
DATA K256<>+0x1cc(SB)/4, $0x8cc70208
DATA K256<>+0x1d0(SB)/4, $0x748f82ee
DATA K256<>+0x1d4(SB)/4, $0x78a5636f
DATA K256<>+0x1d8(SB)/4, $0x84c87814
DATA K256<>+0x1dc(SB)/4, $0x8cc70208

DATA K256<>+0x1e0(SB)/4, $0x90befffa
DATA K256<>+0x1e4(SB)/4, $0xa4506ceb
DATA K256<>+0x1e8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1ec(SB)/4, $0xc67178f2
DATA K256<>+0x1f0(SB)/4, $0x90befffa
DATA K256<>+0x1f4(SB)/4, $0xa4506ceb
DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
DATA K256<>+0x1fc(SB)/4, $0xc67178f2

GLOBL K256<>(SB), (NOPTR + RODATA), $512
