########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
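#
# In practice this means the message schedule is computed for two 64-byte
# blocks at once, four dwords per block per step: block 1 lives in the low
# 128-bit lane and block 2 in the high 128-bit lane of each ymm register.
# The pre-added K+W values for both blocks are saved on the stack; block 1
# is hashed while the schedule is produced, and block 2 is then hashed from
# the saved values without recomputing the schedule (see .Lloop3 below).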

#include <linux/linkage.h>
#include <linux/cfi_types.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA = %ymm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP	= %rsi	# 2nd arg
CTX	= %rdi	# 1st arg
c	= %ecx
d	= %r8d
e	= %edx	# clobbers NUM_BLKS
y3	= %esi	# clobbers INP

SRND	= CTX	# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
STACK_SIZE	= _CTX      + _CTX_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm
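
# For reference, the scalar message-schedule recurrence that
# FOUR_ROUNDS_AND_SCHED below vectorizes four dwords at a time (a C-like
# sketch using the kernel's ror32(); W[] and i are illustrative names, not
# symbols from this file):
#
#	/* s0 = sigma0(W[i-15]), s1 = sigma1(W[i-2]) */
#	u32 s0 = ror32(W[i-15], 7) ^ ror32(W[i-15], 18) ^ (W[i-15] >> 3);
#	u32 s1 = ror32(W[i-2], 17) ^ ror32(W[i-2], 19) ^ (W[i-2] >> 10);
#	W[i] = W[i-16] + s0 + W[i-7] + s1;
#
# The vector shifts and shuffles in the macro compute s0 and s1 for four
# consecutive schedule words at once, interleaved with four scalar rounds
# so the vector and integer units overlap.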
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c                              # MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e                          # CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB

	add	y0, y2		# y2 = S1 + CH                          # --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h		# --

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	xor	g, y2		# y2 = f^g                              # CH

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e                          # CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS
	rotate_Xs
.endm
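
# For reference, one scalar SHA-256 round in C-like pseudocode (a sketch of
# the FIPS 180-4 round that the DO_4ROUNDS macro below implements without
# any message scheduling; ror32() is the kernel rotate helper, the other
# names are illustrative):
#
#	u32 S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
#	u32 ch  = (e & f) ^ (~e & g);		/* == ((f ^ g) & e) ^ g */
#	u32 S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
#	u32 maj = (a & b) ^ (a & c) ^ (b & c);	/* == ((a | c) & b) | (a & c) */
#	u32 t1  = h + S1 + ch + K[i] + W[i];	/* K[i]+W[i] is preloaded from _XFER */
#	d += t1;
#	h  = t1 + S0 + maj;			/* then rotate a..h, as ROTATE_ARGS does */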
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h		# --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
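# A minimal sketch of how C glue code might call this routine (the real
# caller lives in the SHA-256 glue file; kernel_fpu_begin()/kernel_fpu_end()
# and the asmlinkage declaration shown here are assumptions about that
# environment, not part of this file):
#
#	asmlinkage void sha256_transform_rorx(struct sha256_state *state,
#					      const u8 *data, int blocks);
#
#	kernel_fpu_begin();		/* AVX2 state must be usable */
#	sha256_transform_rorx(state, data, blocks);
#	kernel_fpu_end();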
.text
SYM_TYPED_FUNC_START(sha256_transform_rorx)
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	push	%rbp
	mov	%rsp, %rbp

	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary

	shl	$6, NUM_BLKS	# convert to bytes
	jz	.Ldone_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS	# pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	.Lonly_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

.Lloop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3
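
	## After the transpose, the low 128-bit lane of X0..X3 holds the
	## sixteen message words of the first block and the high lane holds
	## those of the second block, so each vpaddd against K256 below
	## produces K+W for both blocks at once: the first block's values sit
	## at offset +0 and the second block's at offset +16 of each 32-byte
	## _XFER slot (which is what .Lloop3 consumes later).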

.Llast_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, in 3 iterations of 16 rounds each
	xor	SRND, SRND

.align 16
.Lloop1:
	leaq	K256+0*32(%rip), INP		## reuse INP as scratch reg
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	leaq	K256+1*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	leaq	K256+2*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	leaq	K256+3*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	.Lloop1

.Lloop2:
	## Do last 16 rounds with no scheduling
	leaq	K256+0*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	leaq	K256+1*32(%rip), INP
	vpaddd	(INP, SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	.Lloop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	.Ldone_hash

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
.Lloop3:
	DO_4ROUNDS	_XFER + 0*32 + 16
	DO_4ROUNDS	_XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	.Lloop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm	(4*0)(CTX),a
	addm	(4*1)(CTX),b
	addm	(4*2)(CTX),c
	addm	(4*3)(CTX),d
	addm	(4*4)(CTX),e
	addm	(4*5)(CTX),f
	addm	(4*6)(CTX),g
	addm	(4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	.Lloop0
	ja	.Ldone_hash

.Ldo_last_block:
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	.Llast_block_enter

.Lonly_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	.Ldo_last_block

.Ldone_hash:

	mov	%rbp, %rsp
	pop	%rbp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	RET
SYM_FUNC_END(sha256_transform_rorx)
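
# Each row of four round constants below is stored twice so that a single
# 256-bit vpaddd adds the same K values to both 128-bit lanes, i.e. to the
# schedule words of both blocks at once.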
.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa	0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa	0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF