Lines Matching +full:a +full:- +full:h
2 # Implement fast SHA-512 with AVX2 instructions. (x86_64)
12 # This software is available to you under a choice of one of two
22 # - Redistributions of source code must retain the above
26 # - Redistributions in binary form must reproduce the above
33 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
42 # This code is described in an Intel White-Paper:
43 # "Fast SHA-512 Implementations on Intel Architecture Processors"
49 # This code schedules 1 block at a time, with 4 lanes per block
52 #include <linux/linkage.h>
53 #include <linux/cfi_types.h>
87 a = %rax define
92 h = %r11 define
118 # Add reg to mem using reg-mem add and store
143 # Rotate symbols a..h right
144 old_h = h
145 TMP_ = h
146 h = g define
152 b = a
153 a = TMP_ define
166 # Extract w[t-7]
167 MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
168 # Calculate w[t-16] + w[t-7]
169 vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
170 # Extract w[t-15]
171 MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]
175 # Calculate w[t-15] ror 1
177 vpsllq $(64-1), YTMP1, YTMP3
178 vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
179 # Calculate w[t-15] shr 7
180 vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7
182 mov a, y3 # y3 = a # MAJA
185 add frame_XFER(%rsp),h # h = k + w + h # --
186 or c, y3 # y3 = a|c # MAJA
188 rorx $34, a, T1 # T1 = a >> 34 # S0B
196 rorx $39, a, y1 # y1 = a >> 39 # S0A
197 add h, d # d = k + w + h + d # --
199 and b, y3 # y3 = (a|c)&b # MAJA
200 xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
201 rorx $28, a, T1 # T1 = (a >> 28) # S0
204 xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
205 mov a, T1 # T1 = a # MAJB
206 and c, T1 # T1 = a&c # MAJB
208 add y0, y2 # y2 = S1 + CH # --
209 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
210 add y1, h # h = k + w + h + S0 # --
212 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
214 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
215 add y3, h # h = t1 + S0 + MAJ # --
221 # Calculate w[t-15] ror 8
223 vpsllq $(64-8), YTMP1, YTMP1
224 vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
226 vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
230 # Add three components, w[t-16], w[t-7] and sigma0
231 vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
233 vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
235 vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
240 vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
241 vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
244 mov a, y3 # y3 = a # MAJA
247 add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
248 or c, y3 # y3 = a|c # MAJA
252 rorx $34, a, T1 # T1 = a >> 34 # S0B
259 rorx $39, a, y1 # y1 = a >> 39 # S0A
261 add h, d # d = k + w + h + d # --
263 and b, y3 # y3 = (a|c)&b # MAJA
264 xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
266 rorx $28, a, T1 # T1 = (a >> 28) # S0
269 xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
270 mov a, T1 # T1 = a # MAJB
271 and c, T1 # T1 = a&c # MAJB
272 add y0, y2 # y2 = S1 + CH # --
274 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
275 add y1, h # h = k + w + h + S0 # --
277 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
278 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
279 add y3, h # h = t1 + S0 + MAJ # --
286 vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
287 vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << (64-19) {BABA}
288 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
289 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
290 vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
291 vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << (64-61) {BABA}
292 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
293 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
294 # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
300 vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
302 mov a, y3 # y3 = a # MAJA
304 add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
307 or c, y3 # y3 = a|c # MAJA
311 rorx $34, a, T1 # T1 = a >> 34 # S0B
316 add h, d # d = k + w + h + d # --
317 and b, y3 # y3 = (a|c)&b # MAJA
320 rorx $39, a, y1 # y1 = a >> 39 # S0A
323 xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
324 rorx $28, a, T1 # T1 = (a >> 28) # S0
326 xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
327 mov a, T1 # T1 = a # MAJB
328 and c, T1 # T1 = a&c # MAJB
329 add y0, y2 # y2 = S1 + CH # --
331 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
332 add y1, h # h = k + w + h + S0 # --
333 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
334 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
336 add y3, h # h = t1 + S0 + MAJ # --
342 vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
343 vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << (64-19) {DC--}
344 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
345 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
346 vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
347 vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << (64-61) {DC--}
348 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
349 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
350 # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
352 # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
354 vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
359 mov a, y3 # y3 = a # MAJA
362 add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
363 or c, y3 # y3 = a|c # MAJA
367 rorx $34, a, T1 # T1 = a >> 34 # S0B
374 add h, d # d = k + w + h + d # --
375 and b, y3 # y3 = (a|c)&b # MAJA
380 rorx $39, a, y1 # y1 = a >> 39 # S0A
381 add y0, y2 # y2 = S1 + CH # --
383 xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
384 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
386 rorx $28, a, T1 # T1 = (a >> 28) # S0
388 xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
389 mov a, T1 # T1 = a # MAJB
390 and c, T1 # T1 = a&c # MAJB
391 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
393 add y1, h # h = k + w + h + S0 # --
394 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
395 add y3, h # h = t1 + S0 + MAJ # --
416 rorx $34, a, T1 # T1 = a >> 34 # S0B
418 rorx $39, a, y1 # y1 = a >> 39 # S0A
419 mov a, y3 # y3 = a # MAJA
421 xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
422 rorx $28, a, T1 # T1 = (a >> 28) # S0
423 add frame_XFER(%rsp), h # h = k + w + h # --
424 or c, y3 # y3 = a|c # MAJA
426 xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
427 mov a, T1 # T1 = a # MAJB
428 and b, y3 # y3 = (a|c)&b # MAJA
429 and c, T1 # T1 = a&c # MAJB
430 add y0, y2 # y2 = S1 + CH # --
432 add h, d # d = k + w + h + d # --
433 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
434 add y1, h # h = k + w + h + S0 # --
436 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
442 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
451 add y3, old_h # h = t1 + S0 + MAJ # --
454 rorx $34, a, T1 # T1 = a >> 34 # S0B
456 rorx $39, a, y1 # y1 = a >> 39 # S0A
457 mov a, y3 # y3 = a # MAJA
459 xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
460 rorx $28, a, T1 # T1 = (a >> 28) # S0
461 add 8*1+frame_XFER(%rsp), h # h = k + w + h # --
462 or c, y3 # y3 = a|c # MAJA
464 xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
465 mov a, T1 # T1 = a # MAJB
466 and b, y3 # y3 = (a|c)&b # MAJA
467 and c, T1 # T1 = a&c # MAJB
468 add y0, y2 # y2 = S1 + CH # --
470 add h, d # d = k + w + h + d # --
471 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
472 add y1, h # h = k + w + h + S0 # --
474 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
480 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
489 add y3, old_h # h = t1 + S0 + MAJ # --
492 rorx $34, a, T1 # T1 = a >> 34 # S0B
494 rorx $39, a, y1 # y1 = a >> 39 # S0A
495 mov a, y3 # y3 = a # MAJA
497 xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
498 rorx $28, a, T1 # T1 = (a >> 28) # S0
499 add 8*2+frame_XFER(%rsp), h # h = k + w + h # --
500 or c, y3 # y3 = a|c # MAJA
502 xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
503 mov a, T1 # T1 = a # MAJB
504 and b, y3 # y3 = (a|c)&b # MAJA
505 and c, T1 # T1 = a&c # MAJB
506 add y0, y2 # y2 = S1 + CH # --
508 add h, d # d = k + w + h + d # --
509 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
510 add y1, h # h = k + w + h + S0 # --
512 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
518 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
527 add y3, old_h # h = t1 + S0 + MAJ # --
530 rorx $34, a, T1 # T1 = a >> 34 # S0B
532 rorx $39, a, y1 # y1 = a >> 39 # S0A
533 mov a, y3 # y3 = a # MAJA
535 xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0
536 rorx $28, a, T1 # T1 = (a >> 28) # S0
537 add 8*3+frame_XFER(%rsp), h # h = k + w + h # --
538 or c, y3 # y3 = a|c # MAJA
540 xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0
541 mov a, T1 # T1 = a # MAJB
542 and b, y3 # y3 = (a|c)&b # MAJA
543 and c, T1 # T1 = a&c # MAJB
544 add y0, y2 # y2 = S1 + CH # --
547 add h, d # d = k + w + h + d # --
548 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
549 add y1, h # h = k + w + h + S0 # --
551 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
553 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
555 add y3, h # h = t1 + S0 + MAJ # --
581 and $~(0x20 - 1), %rsp
589 mov 8*0(CTX1), a
596 mov 8*7(CTX1), h
656 addm 8*0(CTX2), a
663 addm 8*7(CTX2), h
691 # Mergeable 640-byte rodata section. This allows the linker to merge the table
692 # with another, identical 640-byte fragment of another rodata section
741 # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.