########################################################################
# Implement fast SHA-512 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     David Cote <david.m.cote@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-512 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################

#include <linux/linkage.h>
#include <linux/cfi_types.h>

.text

# Virtual Registers
# Y_0..Y_3 hold the 16 most recent message-schedule qwords, four per
# register; rotate_Ys renames them as the schedule advances.
Y_0 = %ymm4
Y_1 = %ymm5
Y_2 = %ymm6
Y_3 = %ymm7

YTMP0 = %ymm0
YTMP1 = %ymm1
YTMP2 = %ymm2
YTMP3 = %ymm3
YTMP4 = %ymm8
XFER  = YTMP0

BYTE_FLIP_MASK  = %ymm9

# Several names share one register: CTX1/TBL (%rdi), INP/y3 (%rsi),
# NUM_BLKS/e (%rdx), CTX2/T1 (%r12) and h/old_h (%r11).  Each argument
# name is only valid until the working name on the same register is
# first written.

# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
CTX1        = %rdi
CTX2        = %r12
# 2nd arg
INP         = %rsi
# 3rd arg
NUM_BLKS    = %rdx

c           = %rcx
d           = %r8
e           = %rdx
y3          = %rsi

TBL   = %rdi # clobbers CTX1

a     = %rax
b     = %rbx

f     = %r9
g     = %r10
h     = %r11
old_h = %r11

T1    = %r12 # clobbers CTX2
y0    = %r13
y1    = %r14
y2    = %r15

# Local variables (stack frame)
# Sizes are in bytes; the frame_* values are offsets used as frame_X(%rsp).
XFER_SIZE = 4*8
SRND_SIZE = 1*8
INP_SIZE = 1*8
INPEND_SIZE = 1*8
CTX_SIZE = 1*8

frame_XFER = 0					# four K+W qwords for the rounds in flight
frame_SRND = frame_XFER + XFER_SIZE		# round-group loop counter
frame_INP = frame_SRND + SRND_SIZE		# saved INP (its register doubles as y3)
frame_INPEND = frame_INP + INP_SIZE		# pointer to end of data
frame_CTX = frame_INPEND + INPEND_SIZE		# saved %rdi (its register doubles as TBL)
frame_size = frame_CTX + CTX_SIZE

## assume buffers not aligned
#define	VMOVDQ vmovdqu

# addm [mem], reg
# Add reg to mem using reg-mem add and store
# On exit both the register and the memory operand hold mem + reg.
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm


# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
# Load ymm with mem (unaligned load) and shuffle its bytes with the given
# mask.  With BYTE_FLIP_MASK this reverses the bytes of each 64-bit qword.
.macro COPY_YMM_AND_BSWAP p1 p2 p3
	VMOVDQ \p2, \p1
	vpshufb \p3, \p1, \p1
.endm

# rotate_Ys
# Rotate values of symbols Y0...Y3
# Assemble-time renaming only, no instruction is emitted: the register
# named Y_0 becomes Y_3 and the other three names move down by one.
.macro rotate_Ys
	Y_ = Y_0
	Y_0 = Y_1
	Y_1 = Y_2
	Y_2 = Y_3
	Y_3 = Y_
.endm

# RotateState
# Assemble-time renaming only, no instruction is emitted.  The register
# that was called h becomes a, and old_h keeps naming that same register
# so a following round can still finish the additions into it.
.macro RotateState
	# Rotate symbols a..h right
	old_h  = h
	TMP_   = h
	h      = g
	g      = f
	f      = e
	e      = d
	d      = c
	c      = b
	b      = a
	a      = TMP_
.endm

# macro MY_VPALIGNR	YDST, YSRC1, YSRC2, RVAL
# YDST = {YSRC1, YSRC2} >> RVAL*8
.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
	vperm2f128	$0x3, \YSRC2, \YSRC1, \YDST	# YDST = {YS1_LO, YS2_HI}
	vpalignr	$\RVAL, \YSRC2, \YDST, \YDST	# YDST = {YDS1, YS2} >> RVAL*8
.endm

# FOUR_ROUNDS_AND_SCHED
# Runs four rounds on a..h using the four K+W qwords stored at
# frame_XFER(%rsp), interleaved with the vector code that computes the
# next four message-schedule qwords from Y_0..Y_3.  The new qwords are
# written to Y_0, which the closing rotate_Ys renames to Y_3.
# Clobbers YTMP0-YTMP4 (YTMP0 is XFER), y0-y3, T1 and the flags.
# The scalar and vector instructions are deliberately interleaved; keep
# the instruction order as it is.
.macro FOUR_ROUNDS_AND_SCHED
################################### RND N + 0 #########################################

	# Extract w[t-7]
	MY_VPALIGNR	YTMP0, Y_3, Y_2, 8		# YTMP0 = W[-7]
	# Calculate w[t-16] + w[t-7]
	vpaddq		Y_0, YTMP0, YTMP0		# YTMP0 = W[-7] + W[-16]
	# Extract w[t-15]
	MY_VPALIGNR	YTMP1, Y_1, Y_0, 8		# YTMP1 = W[-15]

	# Calculate sigma0

	# Calculate w[t-15] ror 1
	vpsrlq		$1, YTMP1, YTMP2
	vpsllq		$(64-1), YTMP1, YTMP3
	vpor		YTMP2, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1
	# Calculate w[t-15] shr 7
	vpsrlq		$7, YTMP1, YTMP4		# YTMP4 = W[-15] >> 7

	mov	a, y3			# y3 = a (MAJA)
	rorx	$41, e, y0		# y0 = e ror 41 (S1A)
	rorx	$18, e, y1		# y1 = e ror 18 (S1B)
	add	frame_XFER(%rsp),h	# h = k + w + h
	or	c, y3			# y3 = a|c (MAJA)
	mov	f, y2			# y2 = f (CH)
	rorx	$34, a, T1		# T1 = a ror 34 (S0B)

	xor	y1, y0			# y0 = (e ror 41) ^ (e ror 18) (S1)
	xor	g, y2			# y2 = f^g (CH)
	rorx	$14, e, y1		# y1 = e ror 14 (S1)

	and	e, y2			# y2 = (f^g)&e (CH)
	xor	y1, y0			# y0 = S1 = (e ror 41) ^ (e ror 18) ^ (e ror 14)
	rorx	$39, a, y1		# y1 = a ror 39 (S0A)
	add	h, d			# d = k + w + h + d

	and	b, y3			# y3 = (a|c)&b (MAJA)
	xor	T1, y1			# y1 = (a ror 39) ^ (a ror 34) (S0)
	rorx	$28, a, T1		# T1 = a ror 28 (S0)

	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	xor	T1, y1			# y1 = S0 = (a ror 39) ^ (a ror 34) ^ (a ror 28)
	mov	a, T1			# T1 = a (MAJB)
	and	c, T1			# T1 = a&c (MAJB)

	add	y0, y2			# y2 = S1 + CH
	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)
	add	y1, h			# h = k + w + h + S0

	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1

	add	y2, h			# h = k + w + h + S0 + S1 + CH = t1 + S0
	add	y3, h			# h = t1 + S0 + MAJ

	RotateState

################################### RND N + 1 #########################################

	# Calculate w[t-15] ror 8
	vpsrlq		$8, YTMP1, YTMP2
	vpsllq		$(64-8), YTMP1, YTMP1
	vpor		YTMP2, YTMP1, YTMP1		# YTMP1 = W[-15] ror 8
	# XOR the three components
	vpxor		YTMP4, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
	vpxor		YTMP1, YTMP3, YTMP1		# YTMP1 = s0


	# Add three components, w[t-16], w[t-7] and sigma0
	vpaddq		YTMP1, YTMP0, YTMP0		# YTMP0 = W[-16] + W[-7] + s0
	# Move to appropriate lanes for calculating w[16] and w[17]
	vperm2f128	$0x0, YTMP0, YTMP0, Y_0		# Y_0 = W[-16] + W[-7] + s0 {BABA}
	# Move to appropriate lanes for calculating w[18] and w[19]
	vpand		MASK_YMM_LO(%rip), YTMP0, YTMP0	# YTMP0 = W[-16] + W[-7] + s0 {DC00}

	# Calculate w[16] and w[17] in both 128 bit lanes

	# Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
	vperm2f128	$0x11, Y_3, Y_3, YTMP2		# YTMP2 = W[-2] {BABA}
	vpsrlq		$6, YTMP2, YTMP4		# YTMP4 = W[-2] >> 6 {BABA}


	mov	a, y3			# y3 = a (MAJA)
	rorx	$41, e, y0		# y0 = e ror 41 (S1A)
	rorx	$18, e, y1		# y1 = e ror 18 (S1B)
	add	1*8+frame_XFER(%rsp), h	# h = k + w + h
	or	c, y3			# y3 = a|c (MAJA)


	mov	f, y2			# y2 = f (CH)
	rorx	$34, a, T1		# T1 = a ror 34 (S0B)
	xor	y1, y0			# y0 = (e ror 41) ^ (e ror 18) (S1)
	xor	g, y2			# y2 = f^g (CH)


	rorx	$14, e, y1		# y1 = e ror 14 (S1)
	xor	y1, y0			# y0 = S1 = (e ror 41) ^ (e ror 18) ^ (e ror 14)
	rorx	$39, a, y1		# y1 = a ror 39 (S0A)
	and	e, y2			# y2 = (f^g)&e (CH)
	add	h, d			# d = k + w + h + d

	and	b, y3			# y3 = (a|c)&b (MAJA)
	xor	T1, y1			# y1 = (a ror 39) ^ (a ror 34) (S0)

	rorx	$28, a, T1		# T1 = a ror 28 (S0)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g

	xor	T1, y1			# y1 = S0 = (a ror 39) ^ (a ror 34) ^ (a ror 28)
	mov	a, T1			# T1 = a (MAJB)
	and	c, T1			# T1 = a&c (MAJB)
	add	y0, y2			# y2 = S1 + CH

	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)
	add	y1, h			# h = k + w + h + S0

	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1
	add	y2, h			# h = k + w + h + S0 + S1 + CH = t1 + S0
	add	y3, h			# h = t1 + S0 + MAJ

	RotateState


################################### RND N + 2 #########################################

	vpsrlq		$19, YTMP2, YTMP3		# YTMP3 = W[-2] >> 19 {BABA}
	vpsllq		$(64-19), YTMP2, YTMP1		# YTMP1 = W[-2] << (64-19) {BABA}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {BABA}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
	vpsrlq		$61, YTMP2, YTMP3		# YTMP3 = W[-2] >> 61 {BABA}
	vpsllq		$(64-61), YTMP2, YTMP1		# YTMP1 = W[-2] << (64-61) {BABA}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {BABA}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
							#  (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}

	# Add sigma1 to the other components to get w[16] and w[17]
	vpaddq		YTMP4, Y_0, Y_0			# Y_0 = {W[1], W[0], W[1], W[0]}

	# Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
	vpsrlq		$6, Y_0, YTMP4			# YTMP4 = W[-2] >> 6 {DC--}

	mov	a, y3			# y3 = a (MAJA)
	rorx	$41, e, y0		# y0 = e ror 41 (S1A)
	add	2*8+frame_XFER(%rsp), h	# h = k + w + h

	rorx	$18, e, y1		# y1 = e ror 18 (S1B)
	or	c, y3			# y3 = a|c (MAJA)
	mov	f, y2			# y2 = f (CH)
	xor	g, y2			# y2 = f^g (CH)

	rorx	$34, a, T1		# T1 = a ror 34 (S0B)
	xor	y1, y0			# y0 = (e ror 41) ^ (e ror 18) (S1)
	and	e, y2			# y2 = (f^g)&e (CH)

	rorx	$14, e, y1		# y1 = e ror 14 (S1)
	add	h, d			# d = k + w + h + d
	and	b, y3			# y3 = (a|c)&b (MAJA)

	xor	y1, y0			# y0 = S1 = (e ror 41) ^ (e ror 18) ^ (e ror 14)
	rorx	$39, a, y1		# y1 = a ror 39 (S0A)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g

	xor	T1, y1			# y1 = (a ror 39) ^ (a ror 34) (S0)
	rorx	$28, a, T1		# T1 = a ror 28 (S0)

	xor	T1, y1			# y1 = S0 = (a ror 39) ^ (a ror 34) ^ (a ror 28)
	mov	a, T1			# T1 = a (MAJB)
	and	c, T1			# T1 = a&c (MAJB)
	add	y0, y2			# y2 = S1 + CH

	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)
	add	y1, h			# h = k + w + h + S0
	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1
	add	y2, h			# h = k + w + h + S0 + S1 + CH = t1 + S0

	add	y3, h			# h = t1 + S0 + MAJ

	RotateState

################################### RND N + 3 #########################################

	vpsrlq		$19, Y_0, YTMP3			# YTMP3 = W[-2] >> 19 {DC--}
	vpsllq		$(64-19), Y_0, YTMP1		# YTMP1 = W[-2] << (64-19) {DC--}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {DC--}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
	vpsrlq		$61, Y_0, YTMP3			# YTMP3 = W[-2] >> 61 {DC--}
	vpsllq		$(64-61), Y_0, YTMP1		# YTMP1 = W[-2] << (64-61) {DC--}
	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {DC--}
	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
							#  (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}

	# Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
	# to newly calculated sigma1 to get w[18] and w[19]
	vpaddq		YTMP4, YTMP0, YTMP2		# YTMP2 = {W[3], W[2], --, --}

	# Form w[19], w[18], w[17], w[16]
	vpblendd	$0xF0, YTMP2, Y_0, Y_0		# Y_0 = {W[3], W[2], W[1], W[0]}

	mov	a, y3			# y3 = a (MAJA)
	rorx	$41, e, y0		# y0 = e ror 41 (S1A)
	rorx	$18, e, y1		# y1 = e ror 18 (S1B)
	add	3*8+frame_XFER(%rsp), h	# h = k + w + h
	or	c, y3			# y3 = a|c (MAJA)


	mov	f, y2			# y2 = f (CH)
	rorx	$34, a, T1		# T1 = a ror 34 (S0B)
	xor	y1, y0			# y0 = (e ror 41) ^ (e ror 18) (S1)
	xor	g, y2			# y2 = f^g (CH)


	rorx	$14, e, y1		# y1 = e ror 14 (S1)
	and	e, y2			# y2 = (f^g)&e (CH)
	add	h, d			# d = k + w + h + d
	and	b, y3			# y3 = (a|c)&b (MAJA)

	xor	y1, y0			# y0 = S1 = (e ror 41) ^ (e ror 18) ^ (e ror 14)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g

	rorx	$39, a, y1		# y1 = a ror 39 (S0A)
	add	y0, y2			# y2 = S1 + CH

	xor	T1, y1			# y1 = (a ror 39) ^ (a ror 34) (S0)
	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1

	rorx	$28, a, T1		# T1 = a ror 28 (S0)

	xor	T1, y1			# y1 = S0 = (a ror 39) ^ (a ror 34) ^ (a ror 28)
	mov	a, T1			# T1 = a (MAJB)
	and	c, T1			# T1 = a&c (MAJB)
	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)

	add	y1, h			# h = k + w + h + S0
	add	y2, h			# h = k + w + h + S0 + S1 + CH = t1 + S0
	add	y3, h			# h = t1 + S0 + MAJ

	RotateState

	rotate_Ys
.endm

# DO_4ROUNDS
# Runs four rounds on a..h using the four K+W qwords stored at
# frame_XFER(%rsp), with no message scheduling.  Rounds N+0..N+2 leave
# the last two additions into h pending; the following round issues them
# through old_h, which names the same register after RotateState.
# Round N+3 completes its own h.  Clobbers y0-y3, T1 and the flags.
.macro DO_4ROUNDS

################################### RND N + 0 #########################################

	mov	f, y2			# y2 = f (CH)
	rorx	$41, e, y0		# y0 = e ror 41 (S1A)
	rorx	$18, e, y1		# y1 = e ror 18 (S1B)
	xor	g, y2			# y2 = f^g (CH)

	xor	y1, y0			# y0 = (e ror 41) ^ (e ror 18) (S1)
	rorx	$14, e, y1		# y1 = e ror 14 (S1)
	and	e, y2			# y2 = (f^g)&e (CH)

	xor	y1, y0			# y0 = S1 = (e ror 41) ^ (e ror 18) ^ (e ror 14)
	rorx	$34, a, T1		# T1 = a ror 34 (S0B)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	rorx	$39, a, y1		# y1 = a ror 39 (S0A)
	mov	a, y3			# y3 = a (MAJA)

	xor	T1, y1			# y1 = (a ror 39) ^ (a ror 34) (S0)
	rorx	$28, a, T1		# T1 = a ror 28 (S0)
	add	frame_XFER(%rsp), h	# h = k + w + h
	or	c, y3			# y3 = a|c (MAJA)

	xor	T1, y1			# y1 = S0 = (a ror 39) ^ (a ror 34) ^ (a ror 28)
	mov	a, T1			# T1 = a (MAJB)
	and	b, y3			# y3 = (a|c)&b (MAJA)
	and	c, T1			# T1 = a&c (MAJB)
	add	y0, y2			# y2 = S1 + CH

	add	h, d			# d = k + w + h + d
	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)
	add	y1, h			# h = k + w + h + S0

	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1

	RotateState

################################### RND N + 1 #########################################

	add	y2, old_h		# previous round: h = k + w + h + S0 + S1 + CH = t1 + S0
	mov	f, y2			# y2 = f (CH)
	rorx	$41, e, y0		# y0 = e ror 41 (S1A)
	rorx	$18, e, y1		# y1 = e ror 18 (S1B)
	xor	g, y2			# y2 = f^g (CH)

	xor	y1, y0			# y0 = (e ror 41) ^ (e ror 18) (S1)
	rorx	$14, e, y1		# y1 = e ror 14 (S1)
	and	e, y2			# y2 = (f^g)&e (CH)
	add	y3, old_h		# previous round: h = t1 + S0 + MAJ

	xor	y1, y0			# y0 = S1 = (e ror 41) ^ (e ror 18) ^ (e ror 14)
	rorx	$34, a, T1		# T1 = a ror 34 (S0B)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	rorx	$39, a, y1		# y1 = a ror 39 (S0A)
	mov	a, y3			# y3 = a (MAJA)

	xor	T1, y1			# y1 = (a ror 39) ^ (a ror 34) (S0)
	rorx	$28, a, T1		# T1 = a ror 28 (S0)
	add	8*1+frame_XFER(%rsp), h	# h = k + w + h
	or	c, y3			# y3 = a|c (MAJA)

	xor	T1, y1			# y1 = S0 = (a ror 39) ^ (a ror 34) ^ (a ror 28)
	mov	a, T1			# T1 = a (MAJB)
	and	b, y3			# y3 = (a|c)&b (MAJA)
	and	c, T1			# T1 = a&c (MAJB)
	add	y0, y2			# y2 = S1 + CH

	add	h, d			# d = k + w + h + d
	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)
	add	y1, h			# h = k + w + h + S0

	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1

	RotateState

################################### RND N + 2 #########################################

	add	y2, old_h		# previous round: h = k + w + h + S0 + S1 + CH = t1 + S0
	mov	f, y2			# y2 = f (CH)
	rorx	$41, e, y0		# y0 = e ror 41 (S1A)
	rorx	$18, e, y1		# y1 = e ror 18 (S1B)
	xor	g, y2			# y2 = f^g (CH)

	xor	y1, y0			# y0 = (e ror 41) ^ (e ror 18) (S1)
	rorx	$14, e, y1		# y1 = e ror 14 (S1)
	and	e, y2			# y2 = (f^g)&e (CH)
	add	y3, old_h		# previous round: h = t1 + S0 + MAJ

	xor	y1, y0			# y0 = S1 = (e ror 41) ^ (e ror 18) ^ (e ror 14)
	rorx	$34, a, T1		# T1 = a ror 34 (S0B)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	rorx	$39, a, y1		# y1 = a ror 39 (S0A)
	mov	a, y3			# y3 = a (MAJA)

	xor	T1, y1			# y1 = (a ror 39) ^ (a ror 34) (S0)
	rorx	$28, a, T1		# T1 = a ror 28 (S0)
	add	8*2+frame_XFER(%rsp), h	# h = k + w + h
	or	c, y3			# y3 = a|c (MAJA)

	xor	T1, y1			# y1 = S0 = (a ror 39) ^ (a ror 34) ^ (a ror 28)
	mov	a, T1			# T1 = a (MAJB)
	and	b, y3			# y3 = (a|c)&b (MAJA)
	and	c, T1			# T1 = a&c (MAJB)
	add	y0, y2			# y2 = S1 + CH

	add	h, d			# d = k + w + h + d
	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)
	add	y1, h			# h = k + w + h + S0

	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1

	RotateState

################################### RND N + 3 #########################################

	add	y2, old_h		# previous round: h = k + w + h + S0 + S1 + CH = t1 + S0
	mov	f, y2			# y2 = f (CH)
	rorx	$41, e, y0		# y0 = e ror 41 (S1A)
	rorx	$18, e, y1		# y1 = e ror 18 (S1B)
	xor	g, y2			# y2 = f^g (CH)

	xor	y1, y0			# y0 = (e ror 41) ^ (e ror 18) (S1)
	rorx	$14, e, y1		# y1 = e ror 14 (S1)
	and	e, y2			# y2 = (f^g)&e (CH)
	add	y3, old_h		# previous round: h = t1 + S0 + MAJ

	xor	y1, y0			# y0 = S1 = (e ror 41) ^ (e ror 18) ^ (e ror 14)
	rorx	$34, a, T1		# T1 = a ror 34 (S0B)
	xor	g, y2			# y2 = CH = ((f^g)&e)^g
	rorx	$39, a, y1		# y1 = a ror 39 (S0A)
	mov	a, y3			# y3 = a (MAJA)

	xor	T1, y1			# y1 = (a ror 39) ^ (a ror 34) (S0)
	rorx	$28, a, T1		# T1 = a ror 28 (S0)
	add	8*3+frame_XFER(%rsp), h	# h = k + w + h
	or	c, y3			# y3 = a|c (MAJA)

	xor	T1, y1			# y1 = S0 = (a ror 39) ^ (a ror 34) ^ (a ror 28)
	mov	a, T1			# T1 = a (MAJB)
	and	b, y3			# y3 = (a|c)&b (MAJA)
	and	c, T1			# T1 = a&c (MAJB)
	add	y0, y2			# y2 = S1 + CH


	add	h, d			# d = k + w + h + d
	or	T1, y3			# y3 = MAJ = ((a|c)&b)|(a&c)
	add	y1, h			# h = k + w + h + S0

	add	y2, d			# d = k + w + h + d + S1 + CH = d + t1

	add	y2, h			# h = k + w + h + S0 + S1 + CH = t1 + S0

	add	y3, h			# h = t1 + S0 + MAJ

	RotateState

.endm

########################################################################
# void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks)
# Purpose: Updates the SHA512 digest stored at "state" with the message
# stored in "data".
# The size of the message pointed to by "data" must be an integer multiple
# of SHA512 message blocks.
# "blocks" is the message length in SHA512 blocks
#
# Register arguments: CTX1 (%rdi) = state, INP (%rsi) = data,
# NUM_BLKS (%rdx) = blocks.
# A block count of zero returns without reading or writing the digest.
# Saves and restores %rbx, %rbp and %r12-%r15; ends with vzeroupper.
# NOTE(review): the prototype above declares "blocks" as int, but the
# whole 64-bit %rdx is shifted and used as a byte count.  This presumes
# the upper 32 bits arrive clean - TODO confirm against the C caller.
########################################################################
SYM_TYPED_FUNC_START(sha512_transform_rorx)
	# Save GPRs
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	# Allocate Stack Space
	# %rbp keeps the unaligned %rsp so the epilogue can undo the 32-byte
	# alignment that the vmovdqa stores to frame_XFER rely on.
	push	%rbp
	mov	%rsp, %rbp
	sub	$frame_size, %rsp
	and	$~(0x20 - 1), %rsp

	shl	$7, NUM_BLKS	# convert to bytes
	jz	.Ldone_hash
	add	INP, NUM_BLKS	# pointer to end of data
	mov	NUM_BLKS, frame_INPEND(%rsp)	# saved now: %rdx is reused as e below

	## load initial digest
	mov	8*0(CTX1), a
	mov	8*1(CTX1), b
	mov	8*2(CTX1), c
	mov	8*3(CTX1), d
	mov	8*4(CTX1), e
	mov	8*5(CTX1), f
	mov	8*6(CTX1), g
	mov	8*7(CTX1), h

	# save %rdi (CTX) before it gets clobbered
	mov	%rdi, frame_CTX(%rsp)

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK

	# One pass of .Lloop0 per 128-byte block
.Lloop0:
	lea	K512(%rip), TBL

	## byte swap first 16 qwords
	COPY_YMM_AND_BSWAP	Y_0, (INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_1, 1*32(INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_2, 2*32(INP), BYTE_FLIP_MASK
	COPY_YMM_AND_BSWAP	Y_3, 3*32(INP), BYTE_FLIP_MASK

	mov	INP, frame_INP(%rsp)	# saved: %rsi is reused as y3 in the rounds

	## rounds 0-63 with message scheduling: 4 iterations of 16 rounds
	movq	$4, frame_SRND(%rsp)

.align 16
.Lloop1:
	# Each FOUR_ROUNDS_AND_SCHED ends with rotate_Ys, so the name Y_0
	# refers to a different register in each of the four groups below.
	vpaddq	(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	1*32(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	2*32(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	vpaddq	3*32(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	add	$(4*32), TBL
	FOUR_ROUNDS_AND_SCHED

	subq	$1, frame_SRND(%rsp)
	jne	.Lloop1

	## rounds 64-79 without scheduling: 2 iterations of 8 rounds
	movq	$2, frame_SRND(%rsp)
.Lloop2:
	vpaddq	(TBL), Y_0, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	DO_4ROUNDS
	vpaddq	1*32(TBL), Y_1, XFER
	vmovdqa XFER, frame_XFER(%rsp)
	add	$(2*32), TBL
	DO_4ROUNDS

	# Move the remaining schedule words down for the second iteration
	vmovdqa	Y_2, Y_0
	vmovdqa	Y_3, Y_1

	subq	$1, frame_SRND(%rsp)
	jne	.Lloop2

	# Add the working variables back into the digest
	mov	frame_CTX(%rsp), CTX2
	addm	8*0(CTX2), a
	addm	8*1(CTX2), b
	addm	8*2(CTX2), c
	addm	8*3(CTX2), d
	addm	8*4(CTX2), e
	addm	8*5(CTX2), f
	addm	8*6(CTX2), g
	addm	8*7(CTX2), h

	# Advance to the next block; stop at the saved end pointer
	mov	frame_INP(%rsp), INP
	add	$128, INP
	cmp	frame_INPEND(%rsp), INP
	jne	.Lloop0

.Ldone_hash:

	# Restore Stack Pointer
	mov	%rbp, %rsp
	pop	%rbp

	# Restore GPRs
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx

	vzeroupper
	RET
SYM_FUNC_END(sha512_transform_rorx)

########################################################################
### Binary Data


# Mergeable 640-byte rodata section. This allows linker to merge the table
# with other, exactly the same 640-byte fragment of another rodata section
# (if such section exists).
.section	.rodata.cst640.K512, "aM", @progbits, 640
.align 64
# K[t] used in SHA512 hashing
# 80 qwords, two per line, in round order
K512:
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817

.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
# Mask for byte-swapping each of the four qwords in a YMM register using vpshufb.
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x08090a0b0c0d0e0f0001020304050607
	.octa 0x18191a1b1c1d1e1f1011121314151617

.section	.rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
.align 32
# AND mask that clears the low 128 bits of a YMM register and keeps the
# high 128 bits.
MASK_YMM_LO:
	.octa 0x00000000000000000000000000000000
	.octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF