
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
# This software is available to you under a choice of one of two
# - Redistributions of source code must retain the above
# - Redistributions in binary form must reproduce the above
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
# This code schedules 2 blocks at a time, with 4 lanes per block
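#
# Layout sketch (inferred from this excerpt, not stated verbatim in
# the matched lines): each YMM register keeps 4 dwords of block 1 in
# its low 128-bit lane and the matching 4 dwords of block 2 in its
# high lane, so every vector instruction below advances the message
# schedule of both blocks at once.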
# Add reg to mem using reg-mem add and store
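#
# The macro body itself is not among the matched lines; a minimal
# sketch of such an add-and-store macro, assuming the conventional
# reg-mem form:
#
#	.macro addm p1 p2
#		add	\p1, \p2	# reg += [mem]
#		mov	\p2, \p1	# [mem] = reg
#	.endm
#
# Usage below: "addm (4*0)(CTX), a" folds digest word 0 back into the
# context while keeping the sum live in register a.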
SHUF_00BA = %ymm10			# shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12			# shuffle xDxC -> DC00
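# These masks (defined in the .rodata section at the end of this file)
# compact the sigma1 results: vpshufb with SHUF_00BA moves the valid
# {xBxA} dwords into {00BA}, and SHUF_DC00 moves {xDxC} into {DC00},
# so the two halves can be merged into a full {DCBA} group of four
# schedule words.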
e = %edx				# clobbers NUM_BLKS
a = %eax

_XMM_SAVE_SIZE = 0
_XFER = 0
# Rotate values of symbols a...h
f = e
e = d
b = a
a = TMP_
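# The lines of this block that the search did not match rotate the
# remaining symbols the same way (a sketch: TMP_ = h; h = g; g = f;
# f = e; e = d; d = c; c = b; b = a; a = TMP_). Re-binding assembler
# symbols renames the registers for the next round without spending
# any instructions on moves.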
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]

	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1

	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e				# CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d			# --

	and	b, y3		# y3 = (a|c)&b				# MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	vpsrld	$7, XTMP1, XTMP2	# XTMP2 = W[-15] >> 7
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB

	add	y0, y2		# y2 = S1 + CH				# --
	vpslld	$(32-7), XTMP1, XTMP3	# XTMP3 = W[-15] << (32-7)
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --
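# Reference for the scalar work interleaved above (rorx is a rotate,
# so ">> n" in the per-line comments means "ror n"):
#	S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)		# y0
#	CH  = ((f ^ g) & e) ^ g  (= (e&f) ^ (~e&g))		# y2
#	S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)		# y1
#	MAJ = ((a | c) & b) | (a & c)				# y3
#	t1 = h + S1 + CH + k + w;  d += t1;  h = t1 + S0 + MAJ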
################################### RND N + 1 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --

	vpslld	$(32-18), XTMP1, XTMP1	# XTMP1 = W[-15] << (32-18)
	and	b, y3		# y3 = (a|c)&b				# MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --

	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}
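# The vector ops threaded through rounds N+0/N+1 compute sigma0:
#	s0 = (W[-15] ror 7) ^ (W[-15] ror 18) ^ (W[-15] >> 3)
# AVX2 has no packed dword rotate, so each rotate is built from a
# shift pair (e.g. ror 7 = (x >> 7) | (x << 25)); the vpsrld $10
# above begins sigma1 of W[-2].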
################################### RND N + 2 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A

	addl	offset(%rsp, SRND), h	# h = k + w + h			# --

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c				# MAJA

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e				# CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A

	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ			# --
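# sigma1 trick: vpshufd $0b11111010 duplicated each W[-2] dword into a
# qword pair ({BBAA}), so the 64-bit vpsrlq by 17 and 19 lands the
# 32-bit rotates of the low copy in the {xBxA} dword positions:
#	s1 = (W[-2] ror 17) ^ (W[-2] ror 19) ^ (W[-2] >> 10)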
################################### RND N + 3 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH				# --

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ

	add	y1, h		# h = k + w + h + S0			# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --
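# After these four rounds X0 holds the next four schedule words,
#	W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
# and the X0..X3 roles rotate (in lines not matched here) before the
# next FOUR_ROUNDS_AND_SCHED invocation.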
################################### RND N + 0 ###########################
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --

	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
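# In DO_4ROUNDS the final two additions into h are deferred: they are
# issued as the "old_h" updates at the top of the following round,
# hiding their latency behind that round's independent work.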
################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --

	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
################################### RND N + 2 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --

	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --

	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --
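# DO_4ROUNDS performs plain rounds with no schedule computation; it is
# used once the needed W[t]+K[t] values are already spilled to the
# stack transfer area (the final rounds here, and the second block's
# rounds, which read from the +16 lane offsets further down).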
	and	$-32, %rsp	# align rsp to 32 byte boundary
	lea	-64(INP, NUM_BLKS), NUM_BLKS	# pointer to last block
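# NUM_BLKS was converted from a block count to a byte count in lines
# not matched by this excerpt, so INP + NUM_BLKS - 64 addresses the
# final 64-byte block; an odd trailing block is handled by the
# single-block path below.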
	mov	(CTX), a
	mov	4*4(CTX), e
	mov	4*7(CTX), h
	VMOVDQ	0*32(INP), XTMP0

	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3
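# vperm2i128 $0x20 pairs the low 128-bit halves of its two sources and
# $0x31 the high halves, so X0..X3 come out with block 1's words in
# the low lane and block 2's words in the high lane, matching the
# 2-blocks/4-lanes layout described in the file header.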
	leaq	K256+0*32(%rip), INP	## reuse INP as scratch reg
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 0*32)
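# Pattern of the main loop (the vpaddd between the leaq and the
# vmovdqa is not among the matched lines): add the round constants to
# the current schedule words (XFER = W + K), spill XFER to the stack
# transfer area, then let the round macros consume it via
# \disp(%rsp, SRND).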
	leaq	K256+0*32(%rip), INP
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	(_XFER + 0*32)
	addm	(4*0)(CTX), a
	addm	(4*4)(CTX), e
	addm	(4*7)(CTX), h
	DO_4ROUNDS	(_XFER + 0*32 + 16)

	addm	(4*0)(CTX), a
	addm	(4*4)(CTX), e
	addm	(4*7)(CTX), h
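# The addm sequence is the Davies-Meyer feed-forward: each working
# variable is added back into the digest state in CTX. Only the a, e,
# and h updates matched the search; b, c, d, f, and g are updated the
# same way in the surrounding lines.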
	VMOVDQ	0*16(INP), XWORD0
	mov	(4*0)(CTX), a
	mov	(4*4)(CTX), e
	mov	(4*7)(CTX), h
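# Round-constant table: every 16-byte group of four K values is stored
# twice per 32-byte row, so a single 256-bit load feeds the same
# constants to both 128-bit lanes (one lane per message block).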
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
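# Byte-swap mask: each dword selects its bytes in reverse order
# (03 02 01 00, 07 06 05 04, ...), converting the big-endian message
# words to the host's little-endian representation on load.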
	.octa	0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
# shuffle xBxA -> 00BA
	.octa	0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
	.octa	0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
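# In both masks a 0xFF index byte makes vpshufb write zero, so the
# shuffled {00BA} result keeps dwords A and B in the low half with
# zeros above, and {DC00} places C and D in the high half with zeros
# below; adding each shuffled result into the partially built schedule
# vector therefore cannot disturb the other half.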