/*
 * Intel SHA Extensions optimized implementation of a SHA-256 update function
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 *	Sean Gulley <sean.m.gulley@intel.com>
 *	Tim Chen <tim.c.chen@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <linux/linkage.h>

/* Scalar register roles (SysV AMD64 ABI argument registers) */
#define STATE_PTR	%rdi	/* 1st arg */
#define DATA_PTR	%rsi	/* 2nd arg */
#define NUM_BLKS	%rdx	/* 3rd arg */

#define SHA256CONSTANTS	%rax

/* Vector register roles */
#define MSG		%xmm0	/* sha256rnds2 implicit operand */
#define STATE0		%xmm1
#define STATE1		%xmm2
#define MSG0		%xmm3
#define MSG1		%xmm4
#define MSG2		%xmm5
#define MSG3		%xmm6
#define TMP		%xmm7

#define SHUF_MASK	%xmm8

#define ABEF_SAVE	%xmm9
#define CDGH_SAVE	%xmm10

/*
 * Do 4 rounds of SHA-256, rounds \i .. \i+3.  \m0 holds the current group of
 * 4 message schedule words; \m1-\m3 hold the next three groups.  For the first
 * 16 rounds the data is loaded and byte-swapped here; for rounds that still
 * need message expansion (see the .if ranges), this macro also advances the
 * message schedule with sha256msg1/sha256msg2, so the caller must rotate the
 * MSG0-MSG3 registers on each invocation.
 */
.macro do_4rounds	i, m0, m1, m2, m3
.if \i < 16
	movdqu		\i*4(DATA_PTR), \m0	/* load 16 message bytes */
	pshufb		SHUF_MASK, \m0		/* convert words to big endian */
.endif
	movdqa		(\i-32)*4(SHA256CONSTANTS), MSG
	paddd		\m0, MSG		/* MSG = schedule words + round constants */
	sha256rnds2	STATE0, STATE1		/* rounds \i and \i+1 */
.if \i >= 12 && \i < 60
	movdqa		\m0, TMP
	palignr		$4, \m3, TMP
	paddd		TMP, \m1
	sha256msg2	\m0, \m1		/* finish the next schedule group in \m1 */
.endif
	punpckhqdq	MSG, MSG		/* move upper half into sha256rnds2 position */
	sha256rnds2	STATE1, STATE0		/* rounds \i+2 and \i+3 */
.if \i >= 4 && \i < 52
	sha256msg1	\m0, \m3		/* begin a later schedule group in \m3 */
.endif
.endm

/*
 * Intel SHA Extensions optimized implementation of a SHA-256 block function
 *
 * This function takes a pointer to the current SHA-256 state, a pointer to the
 * input data, and the number of 64-byte blocks to process.  Once all blocks
 * have been processed, the state is updated with the new state.  This function
 * only processes complete blocks.  State initialization, buffering of partial
 * blocks, and digest finalization is expected to be handled elsewhere.
 *
 * void sha256_ni_transform(struct sha256_block_state *state,
 *			    const u8 *data, size_t nblocks);
 */
.text
SYM_FUNC_START(sha256_ni_transform)

	shl		$6, NUM_BLKS		/* convert to bytes */
	add		DATA_PTR, NUM_BLKS	/* pointer to end of data */

	/*
	 * load initial hash values
	 * Need to reorder these appropriately
	 * DCBA, HGFE -> ABEF, CDGH
	 */
	movdqu		0*16(STATE_PTR), STATE0		/* DCBA */
	movdqu		1*16(STATE_PTR), STATE1		/* HGFE */

	movdqa		STATE0, TMP
	punpcklqdq	STATE1, STATE0			/* FEBA */
	punpckhqdq	TMP, STATE1			/* DCHG */
	pshufd		$0x1B, STATE0, STATE0		/* ABEF */
	pshufd		$0xB1, STATE1, STATE1		/* CDGH */

	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
	lea		K256+32*4(%rip), SHA256CONSTANTS

.Lloop0:
	/* Save hash values for addition after rounds */
	movdqa		STATE0, ABEF_SAVE
	movdqa		STATE1, CDGH_SAVE

	/* 64 rounds, 4 at a time, rotating the message schedule registers */
.irp i, 0, 16, 32, 48
	do_4rounds	(\i + 0),  MSG0, MSG1, MSG2, MSG3
	do_4rounds	(\i + 4),  MSG1, MSG2, MSG3, MSG0
	do_4rounds	(\i + 8),  MSG2, MSG3, MSG0, MSG1
	do_4rounds	(\i + 12), MSG3, MSG0, MSG1, MSG2
.endr

	/* Add current hash values with previously saved */
	paddd		ABEF_SAVE, STATE0
	paddd		CDGH_SAVE, STATE1

	/* Increment data pointer and loop if more to process */
	add		$64, DATA_PTR
	cmp		NUM_BLKS, DATA_PTR
	jne		.Lloop0

	/* Write hash values back in the correct order */
	movdqa		STATE0, TMP
	punpcklqdq	STATE1, STATE0		/* GHEF */
	punpckhqdq	TMP, STATE1		/* ABCD */
	pshufd		$0xB1, STATE0, STATE0	/* HGFE */
	pshufd		$0x1B, STATE1, STATE1	/* DCBA */

	movdqu		STATE1, 0*16(STATE_PTR)
	movdqu		STATE0, 1*16(STATE_PTR)

	RET
SYM_FUNC_END(sha256_ni_transform)

/* Undefine everything defined above; STATE_PTR was previously (and
 * incorrectly) undefined as "DIGEST_PTR", a leftover from a rename. */
#undef STATE_PTR
#undef DATA_PTR
#undef NUM_BLKS
#undef SHA256CONSTANTS
#undef MSG
#undef STATE0
#undef STATE1
#undef MSG0
#undef MSG1
#undef MSG2
#undef MSG3
#undef TMP
#undef SHUF_MASK
#undef ABEF_SAVE
#undef CDGH_SAVE

// parameters for sha256_ni_finup2x()  (SysV AMD64 argument registers)
#define CTX		%rdi
#define DATA1		%rsi
#define DATA2		%rdx
#define LEN		%ecx
#define LEN8		%cl
#define LEN64		%rcx
#define OUT1		%r8
#define OUT2		%r9

// other scalar variables
#define SHA256CONSTANTS	%rax
#define COUNT		%r10
#define COUNT32		%r10d
#define FINAL_STEP	%r11d

// rbx is used as a temporary.

// Vector register roles: the _A registers belong to the first message, the
// _B registers to the second, interleaved throughout.
#define MSG		%xmm0	// sha256rnds2 implicit operand
#define STATE0_A	%xmm1
#define STATE1_A	%xmm2
#define STATE0_B	%xmm3
#define STATE1_B	%xmm4
#define TMP_A		%xmm5
#define TMP_B		%xmm6
#define MSG0_A		%xmm7
#define MSG1_A		%xmm8
#define MSG2_A		%xmm9
#define MSG3_A		%xmm10
#define MSG0_B		%xmm11
#define MSG1_B		%xmm12
#define MSG2_B		%xmm13
#define MSG3_B		%xmm14
#define SHUF_MASK	%xmm15

// Field offsets into the C-side context structure.
#define OFFSETOF_STATE	0	// offsetof(struct __sha256_ctx, state)
#define OFFSETOF_BYTECOUNT	32	// offsetof(struct __sha256_ctx, bytecount)
#define OFFSETOF_BUF	40	// offsetof(struct __sha256_ctx, buf)

// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a and m0_b
// contain the current 4 message schedule words for the first and second message
// respectively.
//
// If not all the message schedule words have been computed yet, then this also
// computes 4 more message schedule words for each message.  m1_a-m3_a contain
// the next 3 groups of 4 message schedule words for the first message, and
// likewise m1_b-m3_b for the second.  After consuming the current value of
// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
// likewise for *_b.  This means that the next (m0_a, m1_a, m2_a, m3_a) is the
// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
// cycle through the registers accordingly.
.macro do_4rounds_2x	i, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b
	movdqa		(\i-32)*4(SHA256CONSTANTS), TMP_A
	movdqa		TMP_A, TMP_B
	paddd		\m0_a, TMP_A		// TMP_A = msg words A + round constants
	paddd		\m0_b, TMP_B		// TMP_B = msg words B + round constants
.if \i < 48
	sha256msg1	\m1_a, \m0_a
	sha256msg1	\m1_b, \m0_b
.endif
	movdqa		TMP_A, MSG
	sha256rnds2	STATE0_A, STATE1_A	// rounds \i, \i+1 of message A
	movdqa		TMP_B, MSG
	sha256rnds2	STATE0_B, STATE1_B	// rounds \i, \i+1 of message B
	pshufd		$0x0E, TMP_A, MSG	// upper 2 words into position
	sha256rnds2	STATE1_A, STATE0_A	// rounds \i+2, \i+3 of message A
	pshufd		$0x0E, TMP_B, MSG
	sha256rnds2	STATE1_B, STATE0_B	// rounds \i+2, \i+3 of message B
.if \i < 48
	movdqa		\m3_a, TMP_A
	movdqa		\m3_b, TMP_B
	palignr		$4, \m2_a, TMP_A
	palignr		$4, \m2_b, TMP_B
	paddd		TMP_A, \m0_a
	paddd		TMP_B, \m0_b
	sha256msg2	\m3_a, \m0_a		// next schedule group for message A
	sha256msg2	\m3_b, \m0_b		// next schedule group for message B
.endif
.endm

//
// void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
//			  const u8 *data1, const u8 *data2, int len,
//			  u8 out1[SHA256_DIGEST_SIZE],
//			  u8 out2[SHA256_DIGEST_SIZE]);
//
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial context
// |ctx|.  |len| must be at least SHA256_BLOCK_SIZE.
//
// The instructions for the two SHA-256 operations are interleaved.  On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
//
// Clobbers: rax, rbx (saved), rcx, r10, r11, xmm0-xmm15, flags.
// Stack: 128 bytes, 16-byte aligned; sp[0..63] holds a message block or the
// saved states during the rounds, sp[64..127] holds padding scratch.
//
SYM_FUNC_START(sha256_ni_finup2x)
	// Allocate 128 bytes of stack space, 16-byte aligned.
	push		%rbx
	push		%rbp
	mov		%rsp, %rbp
	sub		$128, %rsp
	and		$~15, %rsp

	// Load the shuffle mask for swapping the endianness of 32-bit words.
	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK

	// Set up pointer to the round constants.
	lea		K256+32*4(%rip), SHA256CONSTANTS

	// Initially we're not processing the final blocks.
	xor		FINAL_STEP, FINAL_STEP

	// Load the initial state from ctx->state.
	movdqu		OFFSETOF_STATE+0*16(CTX), STATE0_A	// DCBA
	movdqu		OFFSETOF_STATE+1*16(CTX), STATE1_A	// HGFE
	movdqa		STATE0_A, TMP_A
	punpcklqdq	STATE1_A, STATE0_A			// FEBA
	punpckhqdq	TMP_A, STATE1_A				// DCHG
	pshufd		$0x1B, STATE0_A, STATE0_A		// ABEF
	pshufd		$0xB1, STATE1_A, STATE1_A		// CDGH

	// Load ctx->bytecount.  Take the mod 64 of it to get the number of
	// bytes that are buffered in ctx->buf.  Also save it in a register with
	// LEN added to it.
	mov		LEN, LEN	// zero-extend LEN into LEN64
	mov		OFFSETOF_BYTECOUNT(CTX), %rbx
	lea		(%rbx, LEN64, 1), COUNT	// COUNT = bytecount + len
	and		$63, %ebx
	jz		.Lfinup2x_enter_loop	// No bytes buffered?

	// %ebx bytes (1 to 63) are currently buffered in ctx->buf.  Load them
	// followed by the first 64 - %ebx bytes of data.  Since LEN >= 64, we
	// just load 64 bytes from each of ctx->buf, DATA1, and DATA2
	// unconditionally and rearrange the data as needed.

	movdqu		OFFSETOF_BUF+0*16(CTX), MSG0_A
	movdqu		OFFSETOF_BUF+1*16(CTX), MSG1_A
	movdqu		OFFSETOF_BUF+2*16(CTX), MSG2_A
	movdqu		OFFSETOF_BUF+3*16(CTX), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)

	// Overwrite sp[%rbx..] with data, then reload the merged block.
	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqu		MSG0_A, 0*16(%rsp,%rbx)
	movdqu		MSG1_A, 1*16(%rsp,%rbx)
	movdqu		MSG2_A, 2*16(%rsp,%rbx)
	movdqu		MSG3_A, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_A
	movdqa		1*16(%rsp), MSG1_A
	movdqa		2*16(%rsp), MSG2_A
	movdqa		3*16(%rsp), MSG3_A

	// Same merge for the second message (buffered bytes still at sp[0..%rbx)).
	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqu		MSG0_B, 0*16(%rsp,%rbx)
	movdqu		MSG1_B, 1*16(%rsp,%rbx)
	movdqu		MSG2_B, 2*16(%rsp,%rbx)
	movdqu		MSG3_B, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_B
	movdqa		1*16(%rsp), MSG1_B
	movdqa		2*16(%rsp), MSG2_B
	movdqa		3*16(%rsp), MSG3_B

	sub		$64, %rbx	// rbx = buffered - 64
	sub		%rbx, DATA1	// DATA1 += 64 - buffered
	sub		%rbx, DATA2	// DATA2 += 64 - buffered
	add		%ebx, LEN	// LEN += buffered - 64
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
	jmp		.Lfinup2x_loop_have_data

.Lfinup2x_enter_loop:
	sub		$64, LEN
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
.Lfinup2x_loop:
	// Load the next two data blocks.
	movdqu		0*16(DATA1), MSG0_A
	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA1), MSG1_A
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA1), MSG2_A
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA1), MSG3_A
	movdqu		3*16(DATA2), MSG3_B
	add		$64, DATA1
	add		$64, DATA2
.Lfinup2x_loop_have_data:
	// Convert the words of the data blocks from big endian.
	pshufb		SHUF_MASK, MSG0_A
	pshufb		SHUF_MASK, MSG0_B
	pshufb		SHUF_MASK, MSG1_A
	pshufb		SHUF_MASK, MSG1_B
	pshufb		SHUF_MASK, MSG2_A
	pshufb		SHUF_MASK, MSG2_B
	pshufb		SHUF_MASK, MSG3_A
	pshufb		SHUF_MASK, MSG3_B
.Lfinup2x_loop_have_bswapped_data:

	// Save the original state for each block.
	movdqa		STATE0_A, 0*16(%rsp)
	movdqa		STATE0_B, 1*16(%rsp)
	movdqa		STATE1_A, 2*16(%rsp)
	movdqa		STATE1_B, 3*16(%rsp)

	// Do the SHA-256 rounds on each block.
.irp i, 0, 16, 32, 48
	do_4rounds_2x	(\i + 0),  MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
				   MSG0_B, MSG1_B, MSG2_B, MSG3_B
	do_4rounds_2x	(\i + 4),  MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
				   MSG1_B, MSG2_B, MSG3_B, MSG0_B
	do_4rounds_2x	(\i + 8),  MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
				   MSG2_B, MSG3_B, MSG0_B, MSG1_B
	do_4rounds_2x	(\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
				   MSG3_B, MSG0_B, MSG1_B, MSG2_B
.endr

	// Add the original state for each block.
	paddd		0*16(%rsp), STATE0_A
	paddd		1*16(%rsp), STATE0_B
	paddd		2*16(%rsp), STATE1_A
	paddd		3*16(%rsp), STATE1_B

	// Update LEN and loop back if more blocks remain.
	sub		$64, LEN
	jge		.Lfinup2x_loop

	// Check if any final blocks need to be handled.
	// FINAL_STEP = 2: all done
	// FINAL_STEP = 1: need to do count-only padding block
	// FINAL_STEP = 0: need to do the block with 0x80 padding byte
	cmp		$1, FINAL_STEP
	jg		.Lfinup2x_done
	je		.Lfinup2x_finalize_countonly
	add		$64, LEN
	jz		.Lfinup2x_finalize_blockaligned

	// Not block-aligned; 1 <= LEN <= 63 data bytes remain.  Pad the block.
	// To do this, write the padding starting with the 0x80 byte to
	// &sp[64].  Then for each message, copy the last 64 data bytes to sp
	// and load from &sp[64 - LEN] to get the needed padding block.  This
	// code relies on the data buffers being >= 64 bytes in length.
	mov		$64, %ebx
	sub		LEN, %ebx	// ebx = 64 - LEN
	sub		%rbx, DATA1	// DATA1 -= 64 - LEN
	sub		%rbx, DATA2	// DATA2 -= 64 - LEN
	mov		$0x80, FINAL_STEP	// using FINAL_STEP as a temporary
	movd		FINAL_STEP, MSG0_A
	pxor		MSG1_A, MSG1_A
	movdqa		MSG0_A, 4*16(%rsp)	// sp[64..79] = {0x80, 0, ...}
	movdqa		MSG1_A, 5*16(%rsp)	// sp[80..127] = zeroes
	movdqa		MSG1_A, 6*16(%rsp)
	movdqa		MSG1_A, 7*16(%rsp)
	cmp		$56, LEN
	jge		1f	// will COUNT spill into its own block?
	shl		$3, COUNT	// convert byte count to bit count
	bswap		COUNT		// store as __be64
	mov		COUNT, 56(%rsp,%rbx)
	mov		$2, FINAL_STEP	// won't need count-only block
	jmp		2f
1:
	mov		$1, FINAL_STEP	// will need count-only block
2:
	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_A	// reload from &sp[64 - LEN]
	movdqu		1*16(%rsp,%rbx), MSG1_A
	movdqu		2*16(%rsp,%rbx), MSG2_A
	movdqu		3*16(%rsp,%rbx), MSG3_A

	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqa		MSG0_B, 0*16(%rsp)
	movdqa		MSG1_B, 1*16(%rsp)
	movdqa		MSG2_B, 2*16(%rsp)
	movdqa		MSG3_B, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_B
	movdqu		1*16(%rsp,%rbx), MSG1_B
	movdqu		2*16(%rsp,%rbx), MSG2_B
	movdqu		3*16(%rsp,%rbx), MSG3_B
	jmp		.Lfinup2x_loop_have_data

	// Prepare a padding block, either:
	//
	//	{0x80, 0, 0, 0, ..., count (as __be64)}
	//	This is for a block aligned message.
	//
	//	{   0, 0, 0, 0, ..., count (as __be64)}
	//	This is for a message whose length mod 64 is >= 56.
	//
	// Pre-swap the endianness of the words.
.Lfinup2x_finalize_countonly:
	pxor		MSG0_A, MSG0_A
	jmp		1f

.Lfinup2x_finalize_blockaligned:
	mov		$0x80000000, %ebx
	movd		%ebx, MSG0_A
1:
	pxor		MSG1_A, MSG1_A
	pxor		MSG2_A, MSG2_A
	// Bit count with its 32-bit halves swapped: ror $29 == rol $35 ==
	// (<< 3 for the byte-to-bit conversion) followed by a 32-bit rotate,
	// matching the pre-bswapped format this path requires.
	ror		$29, COUNT
	movq		COUNT, MSG3_A
	pslldq		$8, MSG3_A		// count goes in the last 8 bytes
	movdqa		MSG0_A, MSG0_B		// same padding block for message B
	pxor		MSG1_B, MSG1_B
	pxor		MSG2_B, MSG2_B
	movdqa		MSG3_A, MSG3_B
	mov		$2, FINAL_STEP
	jmp		.Lfinup2x_loop_have_bswapped_data

.Lfinup2x_done:
	// Write the two digests with all bytes in the correct order.
	movdqa		STATE0_A, TMP_A
	movdqa		STATE0_B, TMP_B
	punpcklqdq	STATE1_A, STATE0_A		// GHEF
	punpcklqdq	STATE1_B, STATE0_B
	punpckhqdq	TMP_A, STATE1_A			// ABCD
	punpckhqdq	TMP_B, STATE1_B
	pshufd		$0xB1, STATE0_A, STATE0_A	// HGFE
	pshufd		$0xB1, STATE0_B, STATE0_B
	pshufd		$0x1B, STATE1_A, STATE1_A	// DCBA
	pshufd		$0x1B, STATE1_B, STATE1_B
	pshufb		SHUF_MASK, STATE0_A		// convert words to big endian
	pshufb		SHUF_MASK, STATE0_B
	pshufb		SHUF_MASK, STATE1_A
	pshufb		SHUF_MASK, STATE1_B
	movdqu		STATE0_A, 1*16(OUT1)
	movdqu		STATE0_B, 1*16(OUT2)
	movdqu		STATE1_A, 0*16(OUT1)
	movdqu		STATE1_B, 0*16(OUT2)

	// Restore the stack pointer and callee-saved registers.
	mov		%rbp, %rsp
	pop		%rbp
	pop		%rbx
	RET
SYM_FUNC_END(sha256_ni_finup2x)

// The SHA-256 round constants K, duplicated nowhere else in this file.
.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

// Byte-shuffle mask that swaps the endianness of each 32-bit word.
.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203