Lines Matching +full:0 +full:- +full:7 +full:a +full:- +full:e
2 # Implement fast SHA-256 with AVX2 instructions. (x86_64)
11 # This software is available to you under a choice of one of two
21 # - Redistributions of source code must retain the above
25 # - Redistributions in binary form must reproduce the above
32 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
41 # This code is described in an Intel White-Paper:
42 # "Fast SHA-256 Implementations on Intel Architecture Processors"
48 # This code schedules 2 blocks at a time, with 4 lanes per block
59 # Add reg to mem using reg-mem add and store
86 SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
87 SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
97 e = %edx # clobbers NUM_BLKS define
102 a = %eax define
116 _XMM_SAVE_SIZE = 0
121 _XFER = 0
139 # Rotate values of symbols a...h
145 f = e
146 e = d define
149 b = a
150 a = TMP_ define
154 ################################### RND N + 0 ############################
156 mov a, y3 # y3 = a # MAJA
157 rorx $25, e, y0 # y0 = e >> 25 # S1A
158 rorx $11, e, y1 # y1 = e >> 11 # S1B
160 addl \disp(%rsp, SRND), h # h = k + w + h # --
161 or c, y3 # y3 = a|c # MAJA
162 vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
164 rorx $13, a, T1 # T1 = a >> 13 # S0B
166 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
168 vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
169 rorx $6, e, y1 # y1 = (e >> 6) # S1
171 and e, y2 # y2 = (f^g)&e # CH
172 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
173 rorx $22, a, y1 # y1 = a >> 22 # S0A
174 add h, d # d = k + w + h + d # --
176 and b, y3 # y3 = (a|c)&b # MAJA
177 vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
178 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
179 rorx $2, a, T1 # T1 = (a >> 2) # S0
181 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
182 vpsrld $7, XTMP1, XTMP2
183 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
184 mov a, T1 # T1 = a # MAJB
185 and c, T1 # T1 = a&c # MAJB
187 add y0, y2 # y2 = S1 + CH # --
188 vpslld $(32-7), XTMP1, XTMP3
189 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
190 add y1, h # h = k + w + h + S0 # --
192 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
193 vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
196 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
197 add y3, h # h = t1 + S0 + MAJ # --
204 mov a, y3 # y3 = a # MAJA
205 rorx $25, e, y0 # y0 = e >> 25 # S1A
206 rorx $11, e, y1 # y1 = e >> 11 # S1B
208 addl offset(%rsp, SRND), h # h = k + w + h # --
209 or c, y3 # y3 = a|c # MAJA
212 vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
214 rorx $13, a, T1 # T1 = a >> 13 # S0B
215 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
219 rorx $6, e, y1 # y1 = (e >> 6) # S1
220 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
221 rorx $22, a, y1 # y1 = a >> 22 # S0A
222 and e, y2 # y2 = (f^g)&e # CH
223 add h, d # d = k + w + h + d # --
225 vpslld $(32-18), XTMP1, XTMP1
226 and b, y3 # y3 = (a|c)&b # MAJA
227 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
230 rorx $2, a, T1 # T1 = (a >> 2) # S0
231 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
233 vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
234 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
235 mov a, T1 # T1 = a # MAJB
236 and c, T1 # T1 = a&c # MAJB
237 add y0, y2 # y2 = S1 + CH # --
240 vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
241 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
242 add y1, h # h = k + w + h + S0 # --
244 vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
245 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
246 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
247 add y3, h # h = t1 + S0 + MAJ # --
249 vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
256 mov a, y3 # y3 = a # MAJA
257 rorx $25, e, y0 # y0 = e >> 25 # S1A
259 addl offset(%rsp, SRND), h # h = k + w + h # --
261 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
262 rorx $11, e, y1 # y1 = e >> 11 # S1B
263 or c, y3 # y3 = a|c # MAJA
267 rorx $13, a, T1 # T1 = a >> 13 # S0B
268 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
269 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
270 and e, y2 # y2 = (f^g)&e # CH
272 rorx $6, e, y1 # y1 = (e >> 6) # S1
274 add h, d # d = k + w + h + d # --
275 and b, y3 # y3 = (a|c)&b # MAJA
277 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
278 rorx $22, a, y1 # y1 = a >> 22 # S0A
280 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
283 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
284 rorx $2, a ,T1 # T1 = (a >> 2) # S0
285 vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
287 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
288 mov a, T1 # T1 = a # MAJB
289 and c, T1 # T1 = a&c # MAJB
290 add y0, y2 # y2 = S1 + CH # --
291 vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
293 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
294 add y1,h # h = k + w + h + S0 # --
295 add y2,d # d = k + w + h + d + S1 + CH = d + t1 # --
296 add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
298 add y3,h # h = t1 + S0 + MAJ # --
305 mov a, y3 # y3 = a # MAJA
306 rorx $25, e, y0 # y0 = e >> 25 # S1A
307 rorx $11, e, y1 # y1 = e >> 11 # S1B
309 addl offset(%rsp, SRND), h # h = k + w + h # --
310 or c, y3 # y3 = a|c # MAJA
313 vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
315 rorx $13, a, T1 # T1 = a >> 13 # S0B
316 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
320 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
321 rorx $6, e, y1 # y1 = (e >> 6) # S1
322 and e, y2 # y2 = (f^g)&e # CH
323 add h, d # d = k + w + h + d # --
324 and b, y3 # y3 = (a|c)&b # MAJA
326 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
327 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
328 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
331 rorx $22, a, y1 # y1 = a >> 22 # S0A
332 add y0, y2 # y2 = S1 + CH # --
335 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
336 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
338 rorx $2, a, T1 # T1 = (a >> 2) # S0
341 vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
342 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
343 mov a, T1 # T1 = a # MAJB
344 and c, T1 # T1 = a&c # MAJB
345 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
347 add y1, h # h = k + w + h + S0 # --
348 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
349 add y3, h # h = t1 + S0 + MAJ # --
356 ################################### RND N + 0 ###########################
359 rorx $25, e, y0 # y0 = e >> 25 # S1A
360 rorx $11, e, y1 # y1 = e >> 11 # S1B
363 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
364 rorx $6, e, y1 # y1 = (e >> 6) # S1
365 and e, y2 # y2 = (f^g)&e # CH
367 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
368 rorx $13, a, T1 # T1 = a >> 13 # S0B
369 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
370 rorx $22, a, y1 # y1 = a >> 22 # S0A
371 mov a, y3 # y3 = a # MAJA
373 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
374 rorx $2, a, T1 # T1 = (a >> 2) # S0
375 addl \disp(%rsp, SRND), h # h = k + w + h # --
376 or c, y3 # y3 = a|c # MAJA
378 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
379 mov a, T1 # T1 = a # MAJB
380 and b, y3 # y3 = (a|c)&b # MAJA
381 and c, T1 # T1 = a&c # MAJB
382 add y0, y2 # y2 = S1 + CH # --
385 add h, d # d = k + w + h + d # --
386 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
387 add y1, h # h = k + w + h + S0 # --
388 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
394 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
396 rorx $25, e, y0 # y0 = e >> 25 # S1A
397 rorx $11, e, y1 # y1 = e >> 11 # S1B
400 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
401 rorx $6, e, y1 # y1 = (e >> 6) # S1
402 and e, y2 # y2 = (f^g)&e # CH
403 add y3, old_h # h = t1 + S0 + MAJ # --
405 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
406 rorx $13, a, T1 # T1 = a >> 13 # S0B
407 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
408 rorx $22, a, y1 # y1 = a >> 22 # S0A
409 mov a, y3 # y3 = a # MAJA
411 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
412 rorx $2, a, T1 # T1 = (a >> 2) # S0
414 addl offset(%rsp, SRND), h # h = k + w + h # --
415 or c, y3 # y3 = a|c # MAJA
417 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
418 mov a, T1 # T1 = a # MAJB
419 and b, y3 # y3 = (a|c)&b # MAJA
420 and c, T1 # T1 = a&c # MAJB
421 add y0, y2 # y2 = S1 + CH # --
424 add h, d # d = k + w + h + d # --
425 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
426 add y1, h # h = k + w + h + S0 # --
428 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
434 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
436 rorx $25, e, y0 # y0 = e >> 25 # S1A
437 rorx $11, e, y1 # y1 = e >> 11 # S1B
440 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
441 rorx $6, e, y1 # y1 = (e >> 6) # S1
442 and e, y2 # y2 = (f^g)&e # CH
443 add y3, old_h # h = t1 + S0 + MAJ # --
445 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
446 rorx $13, a, T1 # T1 = a >> 13 # S0B
447 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
448 rorx $22, a, y1 # y1 = a >> 22 # S0A
449 mov a, y3 # y3 = a # MAJA
451 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
452 rorx $2, a, T1 # T1 = (a >> 2) # S0
454 addl offset(%rsp, SRND), h # h = k + w + h # --
455 or c, y3 # y3 = a|c # MAJA
457 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
458 mov a, T1 # T1 = a # MAJB
459 and b, y3 # y3 = (a|c)&b # MAJA
460 and c, T1 # T1 = a&c # MAJB
461 add y0, y2 # y2 = S1 + CH # --
464 add h, d # d = k + w + h + d # --
465 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
466 add y1, h # h = k + w + h + S0 # --
468 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
474 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
476 rorx $25, e, y0 # y0 = e >> 25 # S1A
477 rorx $11, e, y1 # y1 = e >> 11 # S1B
480 xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
481 rorx $6, e, y1 # y1 = (e >> 6) # S1
482 and e, y2 # y2 = (f^g)&e # CH
483 add y3, old_h # h = t1 + S0 + MAJ # --
485 xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
486 rorx $13, a, T1 # T1 = a >> 13 # S0B
487 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
488 rorx $22, a, y1 # y1 = a >> 22 # S0A
489 mov a, y3 # y3 = a # MAJA
491 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
492 rorx $2, a, T1 # T1 = (a >> 2) # S0
494 addl offset(%rsp, SRND), h # h = k + w + h # --
495 or c, y3 # y3 = a|c # MAJA
497 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
498 mov a, T1 # T1 = a # MAJB
499 and b, y3 # y3 = (a|c)&b # MAJA
500 and c, T1 # T1 = a&c # MAJB
501 add y0, y2 # y2 = S1 + CH # --
504 add h, d # d = k + w + h + d # --
505 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
506 add y1, h # h = k + w + h + S0 # --
508 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
511 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
513 add y3, h # h = t1 + S0 + MAJ # --
535 and $-32, %rsp # align rsp to 32 byte boundary
538 lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
545 mov (CTX), a
549 mov 4*4(CTX), e
552 mov 4*7(CTX), h
562 VMOVDQ 0*32(INP),XTMP0
574 vperm2i128 $0x20, XTMP2, XTMP0, X0
575 vperm2i128 $0x31, XTMP2, XTMP0, X1
576 vperm2i128 $0x20, XTMP3, XTMP1, X2
577 vperm2i128 $0x31, XTMP3, XTMP1, X3
588 leaq K256+0*32(%rip), INP ## reuse INP as scratch reg
590 vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
591 FOUR_ROUNDS_AND_SCHED (_XFER + 0*32)
614 leaq K256+0*32(%rip), INP
616 vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
617 DO_4ROUNDS (_XFER + 0*32)
634 addm (4*0)(CTX),a
638 addm (4*4)(CTX),e
641 addm (4*7)(CTX),h
650 DO_4ROUNDS (_XFER + 0*32 + 16)
660 addm (4*0)(CTX),a
664 addm (4*4)(CTX),e
667 addm (4*7)(CTX),h
674 VMOVDQ 0*16(INP),XWORD0
689 mov (4*0)(CTX),a
693 mov (4*4)(CTX),e
696 mov (4*7)(CTX),h
722 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
723 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
724 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
725 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
726 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
727 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
728 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
729 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
730 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
731 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
732 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
733 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
734 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
735 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
736 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
737 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
738 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
739 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
740 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
741 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
742 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
743 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
744 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
745 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
746 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
747 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
748 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
749 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
750 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
751 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
752 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
753 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
758 .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
760 # shuffle xBxA -> 00BA
764 .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
766 # shuffle xDxC -> DC00
770 .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF