Lines Matching +full:- +full:2 +full:g
2 # Implement fast SHA-256 with AVX2 instructions. (x86_64)
13 # General Public License (GPL) Version 2, available from the file
21 # - Redistributions of source code must retain the above
25 # - Redistributions in binary form must reproduce the above
41 # This code is described in an Intel White-Paper:
42 # "Fast SHA-256 Implementations on Intel Architecture Processors"
48 # This code schedules 2 blocks at a time, with 4 lanes per block
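This header line summarizes the data layout the rest of the listing relies on: two message blocks are hashed per pass, and each ymm register carries four 32-bit schedule words per block, one block per 128-bit lane. A sketch of the intended layout of X0 (which block lands in which lane is an assumption, not visible in this filtered listing):

    # X0, low 128-bit lane : W[0..3] of block 1
    # X0, high 128-bit lane: W[0..3] of block 2
    # X1..X3 hold W[4..7], W[8..11], W[12..15] the same way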
60 # Add reg to mem using reg-mem add and store
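The comment above describes the addm helper: a register is added to a memory word and the sum is stored back, which is how the hash state in CTX is updated at the end of a block. The macro body itself is not part of this filtered listing; a minimal sketch, assuming GNU as macro syntax and the usual two-operand form addm [mem], reg:

    # addm [mem], reg : mem = mem + reg
    .macro addm p1 p2
        add     \p1, \p2        # reg += mem
        mov     \p2, \p1        # store the sum back to mem
    .endm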
87 SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
88 SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
94 INP = %rsi # 2nd arg
106 g = %r10d
116 _XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
144 h = g
145 g = f
161 addl \disp(%rsp, SRND), h # h = k + w + h # --
163 vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
168 xor g, y2 # y2 = f^g # CH
169 vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
172 and e, y2 # y2 = (f^g)&e # CH
175 add h, d # d = k + w + h + d # --
178 vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
180 rorx $2, a, T1 # T1 = (a >> 2) # S0
182 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
184 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
188 add y0, y2 # y2 = S1 + CH # --
189 vpslld $(32-7), XTMP1, XTMP3
191 add y1, h # h = k + w + h + S0 # --
193 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
194 vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
197 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
198 add y3, h # h = t1 + S0 + MAJ # --
209 addl offset(%rsp, SRND), h # h = k + w + h # --
213 vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
217 xor g, y2 # y2 = f^g # CH
223 and e, y2 # y2 = (f^g)&e # CH
224 add h, d # d = k + w + h + d # --
226 vpslld $(32-18), XTMP1, XTMP1
231 rorx $2, a, T1 # T1 = (a >> 2) # S0
232 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
234 vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
235 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
238 add y0, y2 # y2 = S1 + CH # --
241 vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
243 add y1, h # h = k + w + h + S0 # --
245 vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
246 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
247 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
248 add y3, h # h = t1 + S0 + MAJ # --
250 vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
255 ################################### RND N + 2 ############################
259 offset = \disp + 2*4
260 addl offset(%rsp, SRND), h # h = k + w + h # --
262 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
266 xor g, y2 # y2 = f^g # CH
270 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
271 and e, y2 # y2 = (f^g)&e # CH
275 add h, d # d = k + w + h + d # --
281 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
285 rorx $2, a, T1 # T1 = (a >> 2) # S0
288 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
291 add y0, y2 # y2 = S1 + CH # --
292 vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
295 add y1, h # h = k + w + h + S0 # --
296 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
297 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
299 add y3, h # h = t1 + S0 + MAJ # --
310 addl offset(%rsp, SRND), h # h = k + w + h # --
314 vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
318 xor g, y2 # y2 = f^g # CH
321 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
323 and e, y2 # y2 = (f^g)&e # CH
324 add h, d # d = k + w + h + d # --
327 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
329 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
333 add y0, y2 # y2 = S1 + CH # --
337 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
339 rorx $2, a, T1 # T1 = (a >> 2) # S0
342 vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
343 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
348 add y1, h # h = k + w + h + S0 # --
349 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
350 add y3, h # h = t1 + S0 + MAJ # --
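Taken together, the vector instructions interleaved through the four rounds above implement one step of the SHA-256 message schedule for both blocks at once: the s0 term is built from W[-15] (ror 7, ror 18, shift right 3) and the s1 term from W[-2] (ror 17, ror 19, shift right 10), which are exactly the FIPS 180-4 small sigma functions. In formula form, with all additions modulo 2^32:

    \sigma_0(x) = \mathrm{ROTR}^{7}(x) \oplus \mathrm{ROTR}^{18}(x) \oplus \mathrm{SHR}^{3}(x)
    \sigma_1(x) = \mathrm{ROTR}^{17}(x) \oplus \mathrm{ROTR}^{19}(x) \oplus \mathrm{SHR}^{10}(x)
    W_t = \sigma_1(W_{t-2}) + W_{t-7} + \sigma_0(W_{t-15}) + W_{t-16}

Because the 32-bit rotates of W[-2] are emulated with 64-bit vpsrlq shifts, sigma_1 is evaluated in two halves, first on the {xBxA} words and then on the {xDxC} words, before the results are merged into X0 = {W[3], W[2], W[1], W[0]}.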
362 xor g, y2 # y2 = f^g # CH
366 and e, y2 # y2 = (f^g)&e # CH
370 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
375 rorx $2, a, T1 # T1 = (a >> 2) # S0
376 addl \disp(%rsp, SRND), h # h = k + w + h # --
379 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
383 add y0, y2 # y2 = S1 + CH # --
386 add h, d # d = k + w + h + d # --
388 add y1, h # h = k + w + h + S0 # --
389 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
395 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
399 xor g, y2 # y2 = f^g # CH
403 and e, y2 # y2 = (f^g)&e # CH
404 add y3, old_h # h = t1 + S0 + MAJ # --
408 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
413 rorx $2, a, T1 # T1 = (a >> 2) # S0
415 addl offset(%rsp, SRND), h # h = k + w + h # --
418 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
422 add y0, y2 # y2 = S1 + CH # --
425 add h, d # d = k + w + h + d # --
427 add y1, h # h = k + w + h + S0 # --
429 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
433 ################################### RND N + 2 ##############################
435 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
439 xor g, y2 # y2 = f^g # CH
443 and e, y2 # y2 = (f^g)&e # CH
444 add y3, old_h # h = t1 + S0 + MAJ # --
448 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
453 rorx $2, a, T1 # T1 = (a >> 2) # S0
454 offset = 4*2 + \disp
455 addl offset(%rsp, SRND), h # h = k + w + h # --
458 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
462 add y0, y2 # y2 = S1 + CH # --
465 add h, d # d = k + w + h + d # --
467 add y1, h # h = k + w + h + S0 # --
469 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
475 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
479 xor g, y2 # y2 = f^g # CH
483 and e, y2 # y2 = (f^g)&e # CH
484 add y3, old_h # h = t1 + S0 + MAJ # --
488 xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
493 rorx $2, a, T1 # T1 = (a >> 2) # S0
495 addl offset(%rsp, SRND), h # h = k + w + h # --
498 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
502 add y0, y2 # y2 = S1 + CH # --
505 add h, d # d = k + w + h + d # --
507 add y1, h # h = k + w + h + S0 # --
509 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
512 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
514 add y3, h # h = t1 + S0 + MAJ # --
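The scalar instructions threaded through both macros evaluate one SHA-256 round per group of comments: y0 collects S1 = Sigma_1(e), y2 collects CH (computed as ((f^g)&e)^g, which equals (e&f)^(~e&g)), T1 and y1 build S0 = Sigma_0(a), and y3 ends up holding MAJ. In FIPS 180-4 notation each round computes

    \Sigma_0(a) = \mathrm{ROTR}^{2}(a) \oplus \mathrm{ROTR}^{13}(a) \oplus \mathrm{ROTR}^{22}(a)
    \Sigma_1(e) = \mathrm{ROTR}^{6}(e) \oplus \mathrm{ROTR}^{11}(e) \oplus \mathrm{ROTR}^{25}(e)
    \mathrm{Ch}(e,f,g)  = (e \land f) \oplus (\lnot e \land g)
    \mathrm{Maj}(a,b,c) = (a \land b) \oplus (a \land c) \oplus (b \land c)
    T_1 = h + \Sigma_1(e) + \mathrm{Ch}(e,f,g) + K_t + W_t
    T_2 = \Sigma_0(a) + \mathrm{Maj}(a,b,c)

after which the working variables rotate one position, with the new e = d + T_1 (the "d = d + t1" comments) and the new a = T_1 + T_2 (the "h = t1 + S0 + MAJ" comments); the rotation itself is done by redefining the symbolic register names, as in the "h = g" / "g = f" lines near the top of the listing.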
523 ## arg 2 : pointer to input data
538 and $-32, %rsp # align rsp to 32 byte boundary
542 lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
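The lea above only does address arithmetic: assuming NUM_BLKS has already been scaled from a block count to a byte length (SHA-256 blocks are 64 bytes), the register afterwards holds

    \mathrm{NUM\_BLKS} \leftarrow \mathrm{INP} + \mathrm{NUM\_BLKS} - 64

i.e. the address of the last 64-byte block, which the two-blocks-at-a-time loop can compare against INP to decide whether a single trailing block remains.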
551 mov 4*2(CTX), c
555 mov 4*6(CTX), g
568 VMOVDQ 2*32(INP),XTMP2
602 leaq K256+2*32(%rip), INP
604 vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
605 FOUR_ROUNDS_AND_SCHED (_XFER + 2*32)
627 add $2*32, SRND
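The vmovdqa / FOUR_ROUNDS_AND_SCHED pair above is the hand-off between the vector and scalar halves of the code: for each group of four rounds the schedule words are pre-added to the round constants from K256 and spilled into the _XFER area on the stack, and the scalar round code reads them back with the addl offset(%rsp, SRND) instructions seen earlier, which is why every round comment begins with h = k + w + h. A sketch of one such group (the vpaddd line is not part of this filtered listing, so its operands are an assumption):

    leaq    K256+2*32(%rip), INP            # INP reused as scratch: address of this group's constants
    vpaddd  (INP, SRND), X2, XFER           # XFER = W[] + K[] for a 4-round group (assumed operands)
    vmovdqa XFER, 2*32+_XFER(%rsp, SRND)    # park W+K in the stack transfer area
    FOUR_ROUNDS_AND_SCHED (_XFER + 2*32)    # rounds consume it via addl ...(%rsp, SRND)

SRND then advances so the next iteration addresses the next slice of both K256 and _XFER.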
640 addm (4*2)(CTX),c
644 addm (4*6)(CTX),g
656 add $2*32, SRND
666 addm (4*2)(CTX),c
670 addm (4*6)(CTX),g
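The addm lines above are the per-block feed-forward required by the SHA-256 compression function: each 32-bit word of the state in CTX is updated with the corresponding working variable,

    (H_0, H_1, \dots, H_7) \leftarrow (H_0 + a,\ H_1 + b,\ \dots,\ H_7 + h) \pmod{2^{32}}

so (4*2)(CTX) += c, (4*6)(CTX) += g, and so on for the six words not shown in this filtered listing.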
680 VMOVDQ 2*16(INP),XWORD2
695 mov (4*2)(CTX), c
699 mov (4*6)(CTX), g
764 # shuffle xBxA -> 00BA
770 # shuffle xDxC -> DC00
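These last two comments belong to the byte-shuffle masks SHUF_00BA and SHUF_DC00 defined near the top of the listing: vpshufb with these masks moves the s1 values computed in the {xBxA} and {xDxC} halves into their final dword positions and zeroes the remaining dwords, so the result can simply be added into the partially built schedule (the vpaddd XTMP0, XTMP5, X0 line) without disturbing words that are already complete. A sketch of what such masks look like per 128-bit lane (the AVX2 version would repeat the same 16 bytes in both lanes; directives and values here are illustrative, not copied from the file):

    # shuffle xBxA -> 00BA: pick dwords 0 (A) and 2 (B) into the low half, zero the high half
    _SHUF_00BA:
        .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
    # shuffle xDxC -> DC00: zero the low half, move dwords 0 (C) and 2 (D) into the high half
    _SHUF_DC00:
        .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF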