########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################

#include <linux/linkage.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA = %ymm10		# shuffle xBxA -> 00BA
SHUF_DC00 = %ymm12		# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13	# XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx			# 3rd arg
INP	= %rsi			# 2nd arg
CTX	= %rdi			# 1st arg
c	= %ecx
d	= %r8d
e	= %edx			# clobbers NUM_BLKS
y3	= %esi			# clobbers INP

SRND	= CTX			# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END + _INP_END_SIZE
_CTX		= _INP + _INP_SIZE
STACK_SIZE	= _CTX + _CTX_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm

.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e				# CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d			# --

	and	b, y3		# y3 = (a|c)&b				# MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB

	add	y0, y2		# y2 = S1 + CH				# --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --


	ROTATE_ARGS
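	# Each RND N + * block in this macro is one standard SHA-256 round
	# on a..h (the S0/S1/CH/MAJ tags in the instruction comments refer
	# to these terms):
	#	S1  = (e ror 6)  ^ (e ror 11) ^ (e ror 25)
	#	S0  = (a ror 2)  ^ (a ror 13) ^ (a ror 22)
	#	CH  = ((f ^ g) & e) ^ g		# == (e & f) ^ (~e & g)
	#	MAJ = ((a | c) & b) | (a & c)	# == (a&b) ^ (a&c) ^ (b&c)
	#	t1  = h + S1 + CH + k + w ;  d += t1 ;  h = t1 + S0 + MAJ
	# interleaved with a slice of the vector message-schedule update
	# (started above with sigma0 of W[-15]).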

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA


	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH


	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b				# MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c				# MAJA
	mov	f, y2		# y2 = f				# CH
	xor	g, y2		# y2 = f^g				# CH

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e				# CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ			# --


	ROTATE_ARGS
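	# Across rounds N+0..N+3 the interleaved vector instructions compute
	# the next four schedule words with the standard recurrence:
	#	s0   = sigma0(W[-15]) = (W[-15] ror 7)  ^ (W[-15] ror 18) ^ (W[-15] >> 3)
	#	s1   = sigma1(W[-2])  = (W[-2] ror 17)  ^ (W[-2] ror 19)  ^ (W[-2] >> 10)
	#	W[t] = W[t-16] + W[t-7] + s0 + s1
	# s1 is produced in two halves ({BBAA} above, {DDCC} below): the
	# vpsrlq-based rotates leave a valid result only in the low dword of
	# each 64-bit lane, so the two passes are gathered with SHUF_00BA and
	# SHUF_DC00.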

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH				# --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ

	add	y1, h		# h = k + w + h + S0			# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS
	rotate_Xs
.endm

.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS
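	# DO_4ROUNDS performs the same scalar round as above but with no
	# message scheduling.  The final two additions into h (S1 + CH and
	# MAJ) are deferred into the following round, which adds them to
	# old_h (the register that held h before ROTATE_ARGS), keeping them
	# off the critical path of the current round.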

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 2 ############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS

.endm
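
########################################################################
# Processing flow: blocks are handled two at a time.  The 16-byte halves
# of two consecutive blocks are interleaved into the low/high 128-bit
# lanes of X0..X3, so each FOUR_ROUNDS_AND_SCHED step schedules four
# words for both blocks at once and saves w + k for both into the _XFER
# stack area.  The scalar rounds of the first block run during that
# scheduling; the second block is then hashed with DO_4ROUNDS reading
# the already-scheduled values from the high lane (_XFER + 16).  A
# single trailing (or only) block goes through .Ldo_last_block /
# .Lonly_one_block, which load just the low lanes.
########################################################################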

########################################################################
## void sha256_transform_rorx(struct sha256_block_state *state,
##			      const u8 *data, size_t nblocks);
########################################################################
.text
SYM_FUNC_START(sha256_transform_rorx)
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	push	%rbp
	mov	%rsp, %rbp

	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary

	shl	$6, NUM_BLKS	# convert to bytes
	lea	-64(INP, NUM_BLKS), NUM_BLKS	# pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	.Lonly_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

.Lloop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP), XTMP0
	VMOVDQ	1*32(INP), XTMP1
	VMOVDQ	2*32(INP), XTMP2
	VMOVDQ	3*32(INP), XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3

.Llast_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor	SRND, SRND

.align 16
.Lloop1:
	leaq	K256+0*32(%rip), INP	## reuse INP as scratch reg
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 0*32)

	leaq	K256+1*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 1*32)

	leaq	K256+2*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 2*32)

	leaq	K256+3*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	(_XFER + 3*32)

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	.Lloop1

.Lloop2:
	## Do last 16 rounds with no scheduling
	leaq	K256+0*32(%rip), INP
	vpaddd	(INP, SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	(_XFER + 0*32)

	leaq	K256+1*32(%rip), INP
	vpaddd	(INP, SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	(_XFER + 1*32)
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	.Lloop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm	(4*0)(CTX), a
	addm	(4*1)(CTX), b
	addm	(4*2)(CTX), c
	addm	(4*3)(CTX), d
	addm	(4*4)(CTX), e
	addm	(4*5)(CTX), f
	addm	(4*6)(CTX), g
	addm	(4*7)(CTX), h

	cmp	_INP_END(%rsp), INP
	ja	.Ldone_hash

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
.Lloop3:
	DO_4ROUNDS	(_XFER + 0*32 + 16)
	DO_4ROUNDS	(_XFER + 1*32 + 16)
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	.Lloop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm	(4*0)(CTX), a
	addm	(4*1)(CTX), b
	addm	(4*2)(CTX), c
	addm	(4*3)(CTX), d
	addm	(4*4)(CTX), e
	addm	(4*5)(CTX), f
	addm	(4*6)(CTX), g
	addm	(4*7)(CTX), h

	cmp	_INP_END(%rsp), INP
	jb	.Lloop0
	ja	.Ldone_hash
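
	## .Ldo_last_block handles a single remaining (or only) block: it is
	## loaded with 16-byte moves, so only the low 128-bit lanes of X0..X3
	## are meaningful, and the round loops entered via .Llast_block_enter
	## consume the low halves of the _XFER slots (offset +0).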
.Ldo_last_block:
	VMOVDQ	0*16(INP), XWORD0
	VMOVDQ	1*16(INP), XWORD1
	VMOVDQ	2*16(INP), XWORD2
	VMOVDQ	3*16(INP), XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	.Llast_block_enter

.Lonly_one_block:

	## load initial digest
	mov	(4*0)(CTX), a
	mov	(4*1)(CTX), b
	mov	(4*2)(CTX), c
	mov	(4*3)(CTX), d
	mov	(4*4)(CTX), e
	mov	(4*5)(CTX), f
	mov	(4*6)(CTX), g
	mov	(4*7)(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	.Ldo_last_block

.Ldone_hash:

	mov	%rbp, %rsp
	pop	%rbp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	vzeroupper
	RET
SYM_FUNC_END(sha256_transform_rorx)

.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa	0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa	0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa	0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
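
# Each 4-dword row of K256 above is stored twice, so a single 32-byte load
# in the round loops adds identical round constants to both 128-bit lanes
# (the two interleaved blocks).  The _SHUF_00BA/_SHUF_DC00 masks place the
# two valid sigma1 dwords into the low or high word pair and zero the other
# pair, so the two halves combine with plain vpaddd.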