/*-
 * Copyright (c) 2013 The Go Authors. All rights reserved.
 * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
 *
 * Adapted from Go's crypto/sha1/sha1block_amd64.s.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *   * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *   * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/*
 * SHA-1 block routine. See sha1c.c for C equivalent.
 *
 * There are 80 rounds of 4 types:
 * - rounds 0-15 are type 1 and load data (round1 macro).
 * - rounds 16-19 are type 1 and do not load data (round1x macro).
 * - rounds 20-39 are type 2 and do not load data (round2 macro).
 * - rounds 40-59 are type 3 and do not load data (round3 macro).
 * - rounds 60-79 are type 4 and do not load data (round4 macro).
 *
 * Each round loads or shuffles the data, then computes a per-round
 * function of b, c, d, and then mixes the result into and rotates the
 * five registers a, b, c, d, e holding the intermediate results.
 *
 * The register rotation is implemented by rotating the arguments to
 * the round macros instead of by explicit move instructions.
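 *
 * As a rough C sketch of what one type-1 round amounts to (the names
 * w[] and rotl() are illustrative here, not taken from sha1c.c; w[] is
 * the 16-word schedule kept on the stack):
 *
 *	uint32_t f = (b & c) | (~b & d);		// func1
 *	e += rotl(a, 5) + f + w[i & 15] + 0x5a827999;	// mix
 *	b = rotl(b, 30);
 *	// a, b, c, d, e then shift roles for the next round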
 */
.macro load index
	mov	(\index)*4(%rsi), %r10d
	bswap	%r10d
	mov	%r10d, (\index)*4(%rsp)
.endm

.macro shuffle index
	mov	((\index   )&0xf)*4(%rsp), %r10d
	xor	((\index- 3)&0xf)*4(%rsp), %r10d
	xor	((\index- 8)&0xf)*4(%rsp), %r10d
	xor	((\index-14)&0xf)*4(%rsp), %r10d
	rol	$1, %r10d
	mov	%r10d, ((\index)&0xf)*4(%rsp)
.endm

.macro func1 a, b, c, d, e
	mov	\d, %r9d
	xor	\c, %r9d
	and	\b, %r9d
	xor	\d, %r9d
.endm

.macro func2 a, b, c, d, e
	mov	\b, %r9d
	xor	\c, %r9d
	xor	\d, %r9d
.endm

.macro func3 a, b, c, d, e
	mov	\b, %r8d
	or	\c, %r8d
	and	\d, %r8d
	mov	\b, %r9d
	and	\c, %r9d
	or	%r8d, %r9d
.endm

.macro func4 a, b, c, d, e
	func2	\a, \b, \c, \d, \e
.endm

.macro mix a, b, c, d, e, const
	rol	$30, \b
	add	%r9d, \e
	mov	\a, %r8d
	rol	$5, %r8d
	lea	\const(\e, %r10d, 1), \e
	add	%r8d, \e
.endm

.macro round1 a, b, c, d, e, index
	load	\index
	func1	\a, \b, \c, \d, \e
	mix	\a, \b, \c, \d, \e, 0x5a827999
.endm

.macro round1x a, b, c, d, e, index
	shuffle	\index
	func1	\a, \b, \c, \d, \e
	mix	\a, \b, \c, \d, \e, 0x5a827999
.endm

.macro round2 a, b, c, d, e, index
	shuffle	\index
	func2	\a, \b, \c, \d, \e
	mix	\a, \b, \c, \d, \e, 0x6ed9eba1
.endm

.macro round3 a, b, c, d, e, index
	shuffle	\index
	func3	\a, \b, \c, \d, \e
	mix	\a, \b, \c, \d, \e, 0x8f1bbcdc
.endm

.macro round4 a, b, c, d, e, index
	shuffle	\index
	func4	\a, \b, \c, \d, \e
	mix	\a, \b, \c, \d, \e, 0xca62c1d6
.endm
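/*
 * All of the entry points below share the same calling convention,
 * sketched here as a presumed C prototype (the exact context type is
 * defined in the libmd headers, not in this file):
 *
 *	void sha1block(SHA1_CTX *ctx, const void *buf, size_t len);
 *
 * ctx arrives in %rdi, buf in %rsi, and len in %rdx; len is rounded
 * down to a multiple of the 64-byte block size and only whole blocks
 * are processed.
 */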
	// sha1block(SHA1_CTX, buf, len)
ENTRY(_libmd_sha1block_scalar)
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	push	%rdi			// rdi: SHA1_CTX
	sub	$64+8, %rsp		// 64 bytes for round keys
					// plus alignment

	mov	%rdi, %rbp
					// rsi: buf
	and	$~63, %rdx		// rdx: length, rounded down to whole blocks
	lea	(%rsi, %rdx, 1), %rdi	// rdi: end pointer
	mov	(%rbp), %eax		// c->h0
	mov	4(%rbp), %ebx		// c->h1
	mov	8(%rbp), %ecx		// c->h2
	mov	12(%rbp), %edx		// c->h3
	mov	16(%rbp), %ebp		// c->h4

	cmp	%rsi, %rdi		// any data to process?
	je	.Lend

.Lloop:	mov	%eax, %r11d
	mov	%ebx, %r12d
	mov	%ecx, %r13d
	mov	%edx, %r14d
	mov	%ebp, %r15d

	round1	%eax, %ebx, %ecx, %edx, %ebp, 0
	round1	%ebp, %eax, %ebx, %ecx, %edx, 1
	round1	%edx, %ebp, %eax, %ebx, %ecx, 2
	round1	%ecx, %edx, %ebp, %eax, %ebx, 3
	round1	%ebx, %ecx, %edx, %ebp, %eax, 4

	round1	%eax, %ebx, %ecx, %edx, %ebp, 5
	round1	%ebp, %eax, %ebx, %ecx, %edx, 6
	round1	%edx, %ebp, %eax, %ebx, %ecx, 7
	round1	%ecx, %edx, %ebp, %eax, %ebx, 8
	round1	%ebx, %ecx, %edx, %ebp, %eax, 9

	round1	%eax, %ebx, %ecx, %edx, %ebp, 10
	round1	%ebp, %eax, %ebx, %ecx, %edx, 11
	round1	%edx, %ebp, %eax, %ebx, %ecx, 12
	round1	%ecx, %edx, %ebp, %eax, %ebx, 13
	round1	%ebx, %ecx, %edx, %ebp, %eax, 14

	round1	%eax, %ebx, %ecx, %edx, %ebp, 15
	round1x	%ebp, %eax, %ebx, %ecx, %edx, 16
	round1x	%edx, %ebp, %eax, %ebx, %ecx, 17
	round1x	%ecx, %edx, %ebp, %eax, %ebx, 18
	round1x	%ebx, %ecx, %edx, %ebp, %eax, 19

	round2	%eax, %ebx, %ecx, %edx, %ebp, 20
	round2	%ebp, %eax, %ebx, %ecx, %edx, 21
	round2	%edx, %ebp, %eax, %ebx, %ecx, 22
	round2	%ecx, %edx, %ebp, %eax, %ebx, 23
	round2	%ebx, %ecx, %edx, %ebp, %eax, 24

	round2	%eax, %ebx, %ecx, %edx, %ebp, 25
	round2	%ebp, %eax, %ebx, %ecx, %edx, 26
	round2	%edx, %ebp, %eax, %ebx, %ecx, 27
	round2	%ecx, %edx, %ebp, %eax, %ebx, 28
	round2	%ebx, %ecx, %edx, %ebp, %eax, 29

	round2	%eax, %ebx, %ecx, %edx, %ebp, 30
	round2	%ebp, %eax, %ebx, %ecx, %edx, 31
	round2	%edx, %ebp, %eax, %ebx, %ecx, 32
	round2	%ecx, %edx, %ebp, %eax, %ebx, 33
	round2	%ebx, %ecx, %edx, %ebp, %eax, 34

	round2	%eax, %ebx, %ecx, %edx, %ebp, 35
	round2	%ebp, %eax, %ebx, %ecx, %edx, 36
	round2	%edx, %ebp, %eax, %ebx, %ecx, 37
	round2	%ecx, %edx, %ebp, %eax, %ebx, 38
	round2	%ebx, %ecx, %edx, %ebp, %eax, 39

	round3	%eax, %ebx, %ecx, %edx, %ebp, 40
	round3	%ebp, %eax, %ebx, %ecx, %edx, 41
	round3	%edx, %ebp, %eax, %ebx, %ecx, 42
	round3	%ecx, %edx, %ebp, %eax, %ebx, 43
	round3	%ebx, %ecx, %edx, %ebp, %eax, 44

	round3	%eax, %ebx, %ecx, %edx, %ebp, 45
	round3	%ebp, %eax, %ebx, %ecx, %edx, 46
	round3	%edx, %ebp, %eax, %ebx, %ecx, 47
	round3	%ecx, %edx, %ebp, %eax, %ebx, 48
	round3	%ebx, %ecx, %edx, %ebp, %eax, 49

	round3	%eax, %ebx, %ecx, %edx, %ebp, 50
	round3	%ebp, %eax, %ebx, %ecx, %edx, 51
	round3	%edx, %ebp, %eax, %ebx, %ecx, 52
	round3	%ecx, %edx, %ebp, %eax, %ebx, 53
	round3	%ebx, %ecx, %edx, %ebp, %eax, 54

	round3	%eax, %ebx, %ecx, %edx, %ebp, 55
	round3	%ebp, %eax, %ebx, %ecx, %edx, 56
	round3	%edx, %ebp, %eax, %ebx, %ecx, 57
	round3	%ecx, %edx, %ebp, %eax, %ebx, 58
	round3	%ebx, %ecx, %edx, %ebp, %eax, 59

	round4	%eax, %ebx, %ecx, %edx, %ebp, 60
	round4	%ebp, %eax, %ebx, %ecx, %edx, 61
	round4	%edx, %ebp, %eax, %ebx, %ecx, 62
	round4	%ecx, %edx, %ebp, %eax, %ebx, 63
	round4	%ebx, %ecx, %edx, %ebp, %eax, 64

	round4	%eax, %ebx, %ecx, %edx, %ebp, 65
	round4	%ebp, %eax, %ebx, %ecx, %edx, 66
	round4	%edx, %ebp, %eax, %ebx, %ecx, 67
	round4	%ecx, %edx, %ebp, %eax, %ebx, 68
	round4	%ebx, %ecx, %edx, %ebp, %eax, 69

	round4	%eax, %ebx, %ecx, %edx, %ebp, 70
	round4	%ebp, %eax, %ebx, %ecx, %edx, 71
	round4	%edx, %ebp, %eax, %ebx, %ecx, 72
	round4	%ecx, %edx, %ebp, %eax, %ebx, 73
	round4	%ebx, %ecx, %edx, %ebp, %eax, 74

	round4	%eax, %ebx, %ecx, %edx, %ebp, 75
	round4	%ebp, %eax, %ebx, %ecx, %edx, 76
	round4	%edx, %ebp, %eax, %ebx, %ecx, 77
	round4	%ecx, %edx, %ebp, %eax, %ebx, 78
	round4	%ebx, %ecx, %edx, %ebp, %eax, 79

	add	%r11d, %eax
	add	%r12d, %ebx
	add	%r13d, %ecx
	add	%r14d, %edx
	add	%r15d, %ebp

	add	$64, %rsi
	cmp	%rdi, %rsi
	jb	.Lloop

.Lend:	add	$64+8, %rsp
	pop	%rdi			// SHA1_CTX
	mov	%eax, (%rdi)
	mov	%ebx, 4(%rdi)
	mov	%ecx, 8(%rdi)
	mov	%edx, 12(%rdi)
	mov	%ebp, 16(%rdi)

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
END(_libmd_sha1block_scalar)

/*
 * This is the implementation using AVX2, BMI1 and BMI2. It is based on:
 * "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
 * From http://software.intel.com/en-us/articles
 * (look for improving-the-performance-of-the-secure-hash-algorithm-1)
 * This implementation is 2x unrolled and interleaves the vector
 * instructions used to precompute W with the scalar computation of the
 * current round, for optimal scheduling.
 */

	/* trivial helper macros */
.macro update_hash a, tb, c, d, e
	add	(%r9), \a
	mov	\a, (%r9)
	add	4(%r9), \tb
	mov	\tb, 4(%r9)
	add	8(%r9), \c
	mov	\c, 8(%r9)
	add	12(%r9), \d
	mov	\d, 12(%r9)
	add	16(%r9), \e
	mov	\e, 16(%r9)
.endm

	/* helper macros for precalc, which does precomputations */
.macro precalc0 offset
	vmovdqu	\offset(%r10), %xmm0
.endm

.macro precalc1 offset
	vinserti128 $1, \offset(%r13), %ymm0, %ymm0
.endm

.macro precalc2 yreg
	vpshufb	%ymm10, %ymm0, \yreg
.endm

.macro precalc4 yreg, k_offset
	vpaddd	\k_offset(%r8), \yreg, %ymm0
.endm

.macro precalc7 offset
	vmovdqu	%ymm0, (\offset)*2(%r14)
.endm

/*
 * Message scheduling pre-compute for rounds 0-15
 * r13 is a pointer to the even 64-byte block
 * r10 is a pointer to the odd 64-byte block
 * r14 is a pointer to the temp buffer
 * xmm0 is used as a temp register
 * yreg is clobbered as part of the computation
 * offset chooses a 16 byte chunk within a block
 * r8 is a pointer to the constants block
 * k_offset chooses K constants relevant to this round
 * xmm10 holds the swap mask
 */
.macro precalc00_15 offset, yreg
	precalc0 \offset
	precalc1 \offset
	precalc2 \yreg
	precalc4 \yreg, 0
	precalc7 \offset
.endm

	/* helper macros for precalc16_31 */
.macro precalc16 reg_sub16, reg_sub12, reg_sub4, reg
	vpalignr $8, \reg_sub16, \reg_sub12, \reg	// w[i - 14]
	vpsrldq	$4, \reg_sub4, %ymm0			// w[i - 3]
.endm

.macro precalc17 reg_sub16, reg_sub8, reg
	vpxor	\reg_sub8, \reg, \reg
	vpxor	\reg_sub16, %ymm0, %ymm0
.endm

.macro precalc18 reg
	vpxor	%ymm0, \reg, \reg
	vpslldq	$12, \reg, %ymm9
.endm

.macro precalc19 reg
	vpslld	$1, \reg, %ymm0
	vpsrld	$31, \reg, \reg
.endm

.macro precalc20 reg
	vpor	\reg, %ymm0, %ymm0
	vpslld	$2, %ymm9, \reg
.endm

.macro precalc21 reg
	vpsrld	$30, %ymm9, %ymm9
	vpxor	\reg, %ymm0, %ymm0
.endm

.macro precalc23 reg, k_offset, offset
	vpxor	%ymm9, %ymm0, \reg
	vpaddd	\k_offset(%r8), \reg, %ymm0
	vmovdqu	%ymm0, (\offset)(%r14)
.endm

/*
 * Message scheduling pre-compute for rounds 16-31
 * calculating last 32 w[i] values in 8 XMM registers
 * pre-calculate K+w[i] values and store to mem
 * for later load by ALU add instruction.
 * "brute force" vectorization for rounds 16-31 only
 * due to w[i]->w[i-3] dependency.
 * clobbers 5 input ymm registers REG_SUB*
 * uses xmm0 and xmm9 as temp registers
 * As always, r8 is a pointer to constants block
 * and r14 is a pointer to temp buffer
 */
.macro precalc16_31 reg, reg_sub4, reg_sub8, reg_sub12, reg_sub16, k_offset, offset
	precalc16 \reg_sub16, \reg_sub12, \reg_sub4, \reg
	precalc17 \reg_sub16, \reg_sub8, \reg
	precalc18 \reg
	precalc19 \reg
	precalc20 \reg
	precalc21 \reg
	precalc23 \reg, \k_offset, \offset
.endm
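/*
 * For orientation, a hedged scalar sketch of the schedule that this and
 * the following precalc32_79 macro vectorize (w[], K[] and wk[] are
 * illustrative names; the vector code computes this for two blocks at
 * once, one per 128-bit lane, with w[0..15] being the byte-swapped
 * message words handled by precalc00_15):
 *
 *	for (i = 16; i < 32; i++)
 *		w[i] = rotl(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 *	for (i = 32; i < 80; i++)	// doubled recurrence, no w[i-3] term
 *		w[i] = rotl(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2);
 *	wk[i] = w[i] + K[i / 20];	// stored to the temp buffer at r14
 */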
	/* helper macros for precalc_32_79 */
.macro precalc32 reg_sub8, reg_sub4
	vpalignr $8, \reg_sub8, \reg_sub4, %ymm0
.endm

.macro precalc33 reg_sub28, reg
	vpxor	\reg_sub28, \reg, \reg
.endm

.macro precalc34 reg_sub16
	vpxor	\reg_sub16, %ymm0, %ymm0
.endm

.macro precalc35 reg
	vpxor	%ymm0, \reg, \reg
.endm

.macro precalc36 reg
	vpslld	$2, \reg, %ymm0
.endm

.macro precalc37 reg
	vpsrld	$30, \reg, \reg
	vpor	\reg, %ymm0, \reg
.endm

.macro precalc39 reg, k_offset, offset
	vpaddd	\k_offset(%r8), \reg, %ymm0
	vmovdqu	%ymm0, \offset(%r14)
.endm

.macro precalc32_79 reg, reg_sub4, reg_sub8, reg_sub16, reg_sub28, k_offset, offset
	precalc32 \reg_sub8, \reg_sub4
	precalc33 \reg_sub28, \reg
	precalc34 \reg_sub16
	precalc35 \reg
	precalc36 \reg
	precalc37 \reg
	precalc39 \reg, \k_offset, \offset
.endm

.macro precalc
	precalc00_15 0x00, %ymm15
	precalc00_15 0x10, %ymm14
	precalc00_15 0x20, %ymm13
	precalc00_15 0x30, %ymm12
	precalc16_31 %ymm8, %ymm12, %ymm13, %ymm14, %ymm15, 0x00, 0x080
	precalc16_31 %ymm7, %ymm8, %ymm12, %ymm13, %ymm14, 0x20, 0x0a0
	precalc16_31 %ymm5, %ymm7, %ymm8, %ymm12, %ymm13, 0x20, 0x0c0
	precalc16_31 %ymm3, %ymm5, %ymm7, %ymm8, %ymm12, 0x20, 0x0e0
	precalc32_79 %ymm15, %ymm3, %ymm5, %ymm8, %ymm14, 0x20, 0x100
	precalc32_79 %ymm14, %ymm15, %ymm3, %ymm7, %ymm13, 0x20, 0x120
	precalc32_79 %ymm13, %ymm14, %ymm15, %ymm5, %ymm12, 0x40, 0x140
	precalc32_79 %ymm12, %ymm13, %ymm14, %ymm3, %ymm8, 0x40, 0x160
	precalc32_79 %ymm8, %ymm12, %ymm13, %ymm15, %ymm7, 0x40, 0x180
	precalc32_79 %ymm7, %ymm8, %ymm12, %ymm14, %ymm5, 0x40, 0x1a0
	precalc32_79 %ymm5, %ymm7, %ymm8, %ymm13, %ymm3, 0x40, 0x1c0
	precalc32_79 %ymm3, %ymm5, %ymm7, %ymm12, %ymm15, 0x60, 0x1e0
	precalc32_79 %ymm15, %ymm3, %ymm5, %ymm8, %ymm14, 0x60, 0x200
	precalc32_79 %ymm14, %ymm15, %ymm3, %ymm7, %ymm13, 0x60, 0x220
	precalc32_79 %ymm13, %ymm14, %ymm15, %ymm5, %ymm12, 0x60, 0x240
	precalc32_79 %ymm12, %ymm13, %ymm14, %ymm3, %ymm8, 0x60, 0x260
.endm

/*
 * Macros calculating individual rounds have general form
 * calc_round_pre + precalc_round + calc_round_post
 * calc_round_{pre,post} macros follow
 */
.macro calc_f1_pre offset, reg_a, reg_b, reg_c, reg_e
	add	\offset(%r15), \reg_e
	andn	\reg_c, \reg_a, %ebp
	add	\reg_b, \reg_e		// add F from the previous round
	rorx	$0x1b, \reg_a, %r12d
	rorx	$2, \reg_a, \reg_b	// for the next round
.endm

/*
 * Calculate F for the next round
 */
.macro calc_f1_post reg_a, reg_b, reg_e
	and	\reg_b, \reg_a		// b & c
	xor	%ebp, \reg_a		// F1 = (b&c) ^ (~b&d)
	add	%r12d, \reg_e
.endm
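/*
 * Taken together, calc_f1_pre plus calc_f1_post amount to the usual
 * round update, roughly (wk[] stands for the precomputed K+w values
 * read from the buffer at %r15):
 *
 *	e += rotl(a, 5) + F1(b, c, d) + wk[i];	// F1 = (b & c) ^ (~b & d)
 *	b  = rotl(b, 30);			// rorx $2 == rotate left by 30
 *
 * except that the code is software-pipelined: each round folds in the
 * F value produced by the previous round and uses andn/rorx to start
 * computing F for the next one.
 */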
/*
 * Registers are cyclically rotated:
 * edx -> eax -> edi -> esi -> ebx -> ecx
 */
.macro calc0
	mov	%esi, %ebx		// precalculate first round
	rorx	$2, %esi, %esi
	andn	%eax, %ebx, %ebp
	and	%edi, %ebx
	xor	%ebp, %ebx
	calc_f1_pre 0x0, %ecx, %ebx, %edi, %edx
	precalc0 0x80
	calc_f1_post %ecx, %esi, %edx
.endm

.macro calc1
	calc_f1_pre 0x4, %edx, %ecx, %esi, %eax
	precalc1 0x80
	calc_f1_post %edx, %ebx, %eax
.endm

.macro calc2
	calc_f1_pre 0x8, %eax, %edx, %ebx, %edi
	precalc2 %ymm15
	calc_f1_post %eax, %ecx, %edi
.endm

.macro calc3
	calc_f1_pre 0xc, %edi, %eax, %ecx, %esi
	calc_f1_post %edi, %edx, %esi
.endm

.macro calc4
	calc_f1_pre 0x20, %esi, %edi, %edx, %ebx
	precalc4 %ymm15, 0x0
	calc_f1_post %esi, %eax, %ebx
.endm

.macro calc5
	calc_f1_pre 0x24, %ebx, %esi, %eax, %ecx
	calc_f1_post %ebx, %edi, %ecx
.endm

.macro calc6
	calc_f1_pre 0x28, %ecx, %ebx, %edi, %edx
	calc_f1_post %ecx, %esi, %edx
.endm

.macro calc7
	calc_f1_pre 0x2c, %edx, %ecx, %esi, %eax
	precalc7 0x0
	calc_f1_post %edx, %ebx, %eax
.endm

.macro calc8
	calc_f1_pre 0x40, %eax, %edx, %ebx, %edi
	precalc0 0x90
	calc_f1_post %eax, %ecx, %edi
.endm

.macro calc9
	calc_f1_pre 0x44, %edi, %eax, %ecx, %esi
	precalc1 0x90
	calc_f1_post %edi, %edx, %esi
.endm

.macro calc10
	calc_f1_pre 0x48, %esi, %edi, %edx, %ebx
	precalc2 %ymm14
	calc_f1_post %esi, %eax, %ebx
.endm

.macro calc11
	calc_f1_pre 0x4c, %ebx, %esi, %eax, %ecx
	calc_f1_post %ebx, %edi, %ecx
.endm

.macro calc12
	calc_f1_pre 0x60, %ecx, %ebx, %edi, %edx
	precalc4 %ymm14, 0
	calc_f1_post %ecx, %esi, %edx
.endm

.macro calc13
	calc_f1_pre 0x64, %edx, %ecx, %esi, %eax
	calc_f1_post %edx, %ebx, %eax
.endm

.macro calc14
	calc_f1_pre 0x68, %eax, %edx, %ebx, %edi
	calc_f1_post %eax, %ecx, %edi
.endm

.macro calc15
	calc_f1_pre 0x6c, %edi, %eax, %ecx, %esi
	precalc7 0x10
	calc_f1_post %edi, %edx, %esi
.endm

.macro calc16
	calc_f1_pre 0x80, %esi, %edi, %edx, %ebx
	precalc0 0xa0
	calc_f1_post %esi, %eax, %ebx
.endm

.macro calc17
	calc_f1_pre 0x84, %ebx, %esi, %eax, %ecx
	precalc1 0xa0
	calc_f1_post %ebx, %edi, %ecx
.endm

.macro calc18
	calc_f1_pre 0x88, %ecx, %ebx, %edi, %edx
	precalc2 %ymm13
	calc_f1_post %ecx, %esi, %edx
.endm

.macro calc_f2_pre offset, reg_a, reg_b, reg_e
	add	\offset(%r15), \reg_e
	add	\reg_b, \reg_e		// add F from the previous round
	rorx	$0x1b, \reg_a, %r12d
	rorx	$2, \reg_a, \reg_b	// for next round
.endm

.macro calc_f2_post reg_a, reg_b, reg_c, reg_e
	xor	\reg_b, \reg_a
	add	%r12d, \reg_e
	xor	\reg_c, \reg_a
.endm

.macro calc19
	calc_f2_pre 0x8c, %edx, %ecx, %eax
	calc_f2_post %edx, %ebx, %esi, %eax
.endm

.macro calc20
	calc_f2_pre 0xa0, %eax, %edx, %edi
	precalc4 %ymm13, 0x0
	calc_f2_post %eax, %ecx, %ebx, %edi
.endm

.macro calc21
	calc_f2_pre 0xa4, %edi, %eax, %esi
	calc_f2_post %edi, %edx, %ecx, %esi
.endm

.macro calc22
	calc_f2_pre 0xa8, %esi, %edi, %ebx
	calc_f2_post %esi, %eax, %edx, %ebx
.endm

.macro calc23
	calc_f2_pre 0xac, %ebx, %esi, %ecx
	precalc7 0x20
	calc_f2_post %ebx, %edi, %eax, %ecx
.endm

.macro calc24
	calc_f2_pre 0xc0, %ecx, %ebx, %edx
	precalc0 0xb0
	calc_f2_post %ecx, %esi, %edi, %edx
.endm

.macro calc25
	calc_f2_pre 0xc4, %edx, %ecx, %eax
	precalc1 0xb0
	calc_f2_post %edx, %ebx, %esi, %eax
.endm
.macro calc26
	calc_f2_pre 0xc8, %eax, %edx, %edi
	precalc2 %ymm12
	calc_f2_post %eax, %ecx, %ebx, %edi
.endm

.macro calc27
	calc_f2_pre 0xcc, %edi, %eax, %esi
	calc_f2_post %edi, %edx, %ecx, %esi
.endm

.macro calc28
	calc_f2_pre 0xe0, %esi, %edi, %ebx
	precalc4 %ymm12, 0x0
	calc_f2_post %esi, %eax, %edx, %ebx
.endm

.macro calc29
	calc_f2_pre 0xe4, %ebx, %esi, %ecx
	calc_f2_post %ebx, %edi, %eax, %ecx
.endm

.macro calc30
	calc_f2_pre 0xe8, %ecx, %ebx, %edx
	calc_f2_post %ecx, %esi, %edi, %edx
.endm

.macro calc31
	calc_f2_pre 0xec, %edx, %ecx, %eax
	precalc7 0x30
	calc_f2_post %edx, %ebx, %esi, %eax
.endm

.macro calc32
	calc_f2_pre 0x100, %eax, %edx, %edi
	precalc16 %ymm15, %ymm14, %ymm12, %ymm8
	calc_f2_post %eax, %ecx, %ebx, %edi
.endm

.macro calc33
	calc_f2_pre 0x104, %edi, %eax, %esi
	precalc17 %ymm15, %ymm13, %ymm8
	calc_f2_post %edi, %edx, %ecx, %esi
.endm

.macro calc34
	calc_f2_pre 0x108, %esi, %edi, %ebx
	precalc18 %ymm8
	calc_f2_post %esi, %eax, %edx, %ebx
.endm

.macro calc35
	calc_f2_pre 0x10c, %ebx, %esi, %ecx
	precalc19 %ymm8
	calc_f2_post %ebx, %edi, %eax, %ecx
.endm

.macro calc36
	calc_f2_pre 0x120, %ecx, %ebx, %edx
	precalc20 %ymm8
	calc_f2_post %ecx, %esi, %edi, %edx
.endm

.macro calc37
	calc_f2_pre 0x124, %edx, %ecx, %eax
	precalc21 %ymm8
	calc_f2_post %edx, %ebx, %esi, %eax
.endm

.macro calc38
	calc_f2_pre 0x128, %eax, %edx, %edi
	calc_f2_post %eax, %ecx, %ebx, %edi
.endm

.macro calc_f3_pre offset, reg_e
	add	\offset(%r15), \reg_e
.endm

.macro calc_f3_post reg_a, reg_b, reg_c, reg_e, reg_tb
	add	\reg_tb, \reg_e		// add F from the previous round
	mov	\reg_b, %ebp
	or	\reg_a, %ebp
	rorx	$0x1b, \reg_a, %r12d
	rorx	$2, \reg_a, \reg_tb
	and	\reg_c, %ebp		// calculate F for the next round
	and	\reg_b, \reg_a
	or	%ebp, \reg_a
	add	%r12d, \reg_e
.endm

.macro calc39
	calc_f3_pre 0x12c, %esi
	precalc23 %ymm8, 0x0, 0x80
	calc_f3_post %edi, %edx, %ecx, %esi, %eax
.endm

.macro calc40
	calc_f3_pre 0x140, %ebx
	precalc16 %ymm14, %ymm13, %ymm8, %ymm7
	calc_f3_post %esi, %eax, %edx, %ebx, %edi
.endm

.macro calc41
	calc_f3_pre 0x144, %ecx
	precalc17 %ymm14, %ymm12, %ymm7
	calc_f3_post %ebx, %edi, %eax, %ecx, %esi
.endm

.macro calc42
	calc_f3_pre 0x148, %edx
	precalc18 %ymm7
	calc_f3_post %ecx, %esi, %edi, %edx, %ebx
.endm

.macro calc43
	calc_f3_pre 0x14c, %eax
	precalc19 %ymm7
	calc_f3_post %edx, %ebx, %esi, %eax, %ecx
.endm

.macro calc44
	calc_f3_pre 0x160, %edi
	precalc20 %ymm7
	calc_f3_post %eax, %ecx, %ebx, %edi, %edx
.endm

.macro calc45
	calc_f3_pre 0x164, %esi
	precalc21 %ymm7
	calc_f3_post %edi, %edx, %ecx, %esi, %eax
.endm

.macro calc46
	calc_f3_pre 0x168, %ebx
	calc_f3_post %esi, %eax, %edx, %ebx, %edi
.endm

.macro calc47
	calc_f3_pre 0x16c, %ecx
	vpxor	%ymm9, %ymm0, %ymm7
	vpaddd	0x20(%r8), %ymm7, %ymm0
	vmovdqu	%ymm0, 0xa0(%r14)
	calc_f3_post %ebx, %edi, %eax, %ecx, %esi
.endm

.macro calc48
	calc_f3_pre 0x180, %edx
	precalc16 %ymm13, %ymm12, %ymm7, %ymm5
	calc_f3_post %ecx, %esi, %edi, %edx, %ebx
.endm

.macro calc49
	calc_f3_pre 0x184, %eax
	precalc17 %ymm13, %ymm8, %ymm5
	calc_f3_post %edx, %ebx, %esi, %eax, %ecx
.endm

.macro calc50
	calc_f3_pre 0x188, %edi
	precalc18 %ymm5
	calc_f3_post %eax, %ecx, %ebx, %edi, %edx
.endm

.macro calc51
	calc_f3_pre 0x18c, %esi
	precalc19 %ymm5
	calc_f3_post %edi, %edx, %ecx, %esi, %eax
.endm

.macro calc52
	calc_f3_pre 0x1a0, %ebx
	precalc20 %ymm5
	calc_f3_post %esi, %eax, %edx, %ebx, %edi
.endm

.macro calc53
	calc_f3_pre 0x1a4, %ecx
	precalc21 %ymm5
	calc_f3_post %ebx, %edi, %eax, %ecx, %esi
.endm

.macro calc54
	calc_f3_pre 0x1a8, %edx
	calc_f3_post %ecx, %esi, %edi, %edx, %ebx
.endm

.macro calc55
	calc_f3_pre 0x1ac, %eax
	precalc23 %ymm5, 0x20, 0xc0
	calc_f3_post %edx, %ebx, %esi, %eax, %ecx
.endm

.macro calc56
	calc_f3_pre 0x1c0, %edi
	precalc16 %ymm12, %ymm8, %ymm5, %ymm3
	calc_f3_post %eax, %ecx, %ebx, %edi, %edx
.endm

.macro calc57
	calc_f3_pre 0x1c4, %esi
	precalc17 %ymm12, %ymm7, %ymm3
	calc_f3_post %edi, %edx, %ecx, %esi, %eax
.endm

.macro calc58
	calc_f3_pre 0x1c8, %ebx
	precalc18 %ymm3
	calc_f3_post %esi, %eax, %edx, %ebx, %edi
.endm

.macro calc59
	calc_f2_pre 0x1cc, %ebx, %esi, %ecx
	precalc19 %ymm3
	calc_f2_post %ebx, %edi, %eax, %ecx
.endm

.macro calc60
	calc_f2_pre 0x1e0, %ecx, %ebx, %edx
	precalc20 %ymm3
	calc_f2_post %ecx, %esi, %edi, %edx
.endm

.macro calc61
	calc_f2_pre 0x1e4, %edx, %ecx, %eax
	precalc21 %ymm3
	calc_f2_post %edx, %ebx, %esi, %eax
.endm

.macro calc62
	calc_f2_pre 0x1e8, %eax, %edx, %edi
	calc_f2_post %eax, %ecx, %ebx, %edi
.endm

.macro calc63
	calc_f2_pre 0x1ec, %edi, %eax, %esi
	precalc23 %ymm3, 0x20, 0xe0
	calc_f2_post %edi, %edx, %ecx, %esi
.endm

.macro calc64
	calc_f2_pre 0x200, %esi, %edi, %ebx
	precalc32 %ymm5, %ymm3
	calc_f2_post %esi, %eax, %edx, %ebx
.endm

.macro calc65
	calc_f2_pre 0x204, %ebx, %esi, %ecx
	precalc33 %ymm14, %ymm15
	calc_f2_post %ebx, %edi, %eax, %ecx
.endm

.macro calc66
	calc_f2_pre 0x208, %ecx, %ebx, %edx
	precalc34 %ymm8
	calc_f2_post %ecx, %esi, %edi, %edx
.endm

.macro calc67
	calc_f2_pre 0x20c, %edx, %ecx, %eax
	precalc35 %ymm15
	calc_f2_post %edx, %ebx, %esi, %eax
.endm

.macro calc68
	calc_f2_pre 0x220, %eax, %edx, %edi
	precalc36 %ymm15
	calc_f2_post %eax, %ecx, %ebx, %edi
.endm

.macro calc69
	calc_f2_pre 0x224, %edi, %eax, %esi
	precalc37 %ymm15
	calc_f2_post %edi, %edx, %ecx, %esi
.endm

.macro calc70
	calc_f2_pre 0x228, %esi, %edi, %ebx
	calc_f2_post %esi, %eax, %edx, %ebx
.endm

.macro calc71
	calc_f2_pre 0x22c, %ebx, %esi, %ecx
	precalc39 %ymm15, 0x20, 0x100
	calc_f2_post %ebx, %edi, %eax, %ecx
.endm

.macro calc72
	calc_f2_pre 0x240, %ecx, %ebx, %edx
	precalc32 %ymm3, %ymm15
	calc_f2_post %ecx, %esi, %edi, %edx
.endm

.macro calc73
	calc_f2_pre 0x244, %edx, %ecx, %eax
	precalc33 %ymm13, %ymm14
	calc_f2_post %edx, %ebx, %esi, %eax
.endm

.macro calc74
	calc_f2_pre 0x248, %eax, %edx, %edi
	precalc34 %ymm7
	calc_f2_post %eax, %ecx, %ebx, %edi
.endm

.macro calc75
	calc_f2_pre 0x24c, %edi, %eax, %esi
	precalc35 %ymm14
	calc_f2_post %edi, %edx, %ecx, %esi
.endm

.macro calc76
	calc_f2_pre 0x260, %esi, %edi, %ebx
	precalc36 %ymm14
	calc_f2_post %esi, %eax, %edx, %ebx
.endm

.macro calc77
	calc_f2_pre 0x264, %ebx, %esi, %ecx
	precalc37 %ymm14
	calc_f2_post %ebx, %edi, %eax, %ecx
.endm

.macro calc78
	calc_f2_pre 0x268, %ecx, %ebx, %edx
	calc_f2_post %ecx, %esi, %edi, %edx
.endm

.macro calc79
	add	0x26c(%r15), %eax
	add	%ecx, %eax
	rorx	$0x1b, %edx, %r12d
	precalc39 %ymm14, 0x20, 0x120
	add	%r12d, %eax
.endm

/*
 * Similar to calc0
 */
.macro calc80
	mov	%ecx, %edx		// precalculate first round
	rorx	$2, %ecx, %ecx
	andn	%esi, %edx, %ebp
	and	%ebx, %edx
	xor	%ebp, %edx
	calc_f1_pre 0x10, %eax, %edx, %ebx, %edi
	precalc32 %ymm15, %ymm14
	calc_f1_post %eax, %ecx, %edi
.endm

.macro calc81
	calc_f1_pre 0x14, %edi, %eax, %ecx, %esi
	precalc33 %ymm12, %ymm13
	calc_f1_post %edi, %edx, %esi
.endm

.macro calc82
	calc_f1_pre 0x18, %esi, %edi, %edx, %ebx
	precalc34 %ymm5
	calc_f1_post %esi, %eax, %ebx
.endm

.macro calc83
	calc_f1_pre 0x1c, %ebx, %esi, %eax, %ecx
	precalc35 %ymm13
	calc_f1_post %ebx, %edi, %ecx
.endm

.macro calc84
	calc_f1_pre 0x30, %ecx, %ebx, %edi, %edx
	precalc36 %ymm13
	calc_f1_post %ecx, %esi, %edx
.endm

.macro calc85
	calc_f1_pre 0x34, %edx, %ecx, %esi, %eax
	precalc37 %ymm13
	calc_f1_post %edx, %ebx, %eax
.endm

.macro calc86
	calc_f1_pre 0x38, %eax, %edx, %ebx, %edi
	calc_f1_post %eax, %ecx, %edi
.endm

.macro calc87
	calc_f1_pre 0x3c, %edi, %eax, %ecx, %esi
	precalc39 %ymm13, 0x40, 0x140
	calc_f1_post %edi, %edx, %esi
.endm

.macro calc88
	calc_f1_pre 0x50, %esi, %edi, %edx, %ebx
	precalc32 %ymm14, %ymm13
	calc_f1_post %esi, %eax, %ebx
.endm

.macro calc89
	calc_f1_pre 0x54, %ebx, %esi, %eax, %ecx
	precalc33 %ymm8, %ymm12
	calc_f1_post %ebx, %edi, %ecx
.endm

.macro calc90
	calc_f1_pre 0x58, %ecx, %ebx, %edi, %edx
	precalc34 %ymm3
	calc_f1_post %ecx, %esi, %edx
.endm

.macro calc91
	calc_f1_pre 0x5c, %edx, %ecx, %esi, %eax
	precalc35 %ymm12
	calc_f1_post %edx, %ebx, %eax
.endm

.macro calc92
	calc_f1_pre 0x70, %eax, %edx, %ebx, %edi
	precalc36 %ymm12
	calc_f1_post %eax, %ecx, %edi
.endm

.macro calc93
	calc_f1_pre 0x74, %edi, %eax, %ecx, %esi
	precalc37 %ymm12
	calc_f1_post %edi, %edx, %esi
.endm

.macro calc94
	calc_f1_pre 0x78, %esi, %edi, %edx, %ebx
	calc_f1_post %esi, %eax, %ebx
.endm

.macro calc95
	calc_f1_pre 0x7c, %ebx, %esi, %eax, %ecx
	precalc39 %ymm12, 0x40, 0x160
	calc_f1_post %ebx, %edi, %ecx
.endm

.macro calc96
	calc_f1_pre 0x90, %ecx, %ebx, %edi, %edx
	precalc32 %ymm13, %ymm12
	calc_f1_post %ecx, %esi, %edx
.endm

.macro calc97
	calc_f1_pre 0x94, %edx, %ecx, %esi, %eax
	precalc33 %ymm7, %ymm8
	calc_f1_post %edx, %ebx, %eax
.endm

.macro calc98
	calc_f1_pre 0x98, %eax, %edx, %ebx, %edi
	precalc34 %ymm15
	calc_f1_post %eax, %ecx, %edi
.endm

.macro calc99
	calc_f2_pre 0x9c, %edi, %eax, %esi
	precalc35 %ymm8
	calc_f2_post %edi, %edx, %ecx, %esi
.endm

.macro calc100
	calc_f2_pre 0xb0, %esi, %edi, %ebx
	precalc36 %ymm8
	calc_f2_post %esi, %eax, %edx, %ebx
.endm

.macro calc101
	calc_f2_pre 0xb4, %ebx, %esi, %ecx
	precalc37 %ymm8
	calc_f2_post %ebx, %edi, %eax, %ecx
.endm

.macro calc102
	calc_f2_pre 0xb8, %ecx, %ebx, %edx
	calc_f2_post %ecx, %esi, %edi, %edx
.endm

.macro calc103
	calc_f2_pre 0xbc, %edx, %ecx, %eax
	precalc39 %ymm8, 0x40, 0x180
	calc_f2_post %edx, %ebx, %esi, %eax
.endm

.macro calc104
	calc_f2_pre 0xd0, %eax, %edx, %edi
	precalc32 %ymm12, %ymm8
	calc_f2_post %eax, %ecx, %ebx, %edi
.endm

.macro calc105
	calc_f2_pre 0xd4, %edi, %eax, %esi
	precalc33 %ymm5, %ymm7
	calc_f2_post %edi, %edx, %ecx, %esi
.endm

.macro calc106
	calc_f2_pre 0xd8, %esi, %edi, %ebx
	precalc34 %ymm14
	calc_f2_post %esi, %eax, %edx, %ebx
.endm

.macro calc107
	calc_f2_pre 0xdc, %ebx, %esi, %ecx
	precalc35 %ymm7
	calc_f2_post %ebx, %edi, %eax, %ecx
.endm

.macro calc108
	calc_f2_pre 0xf0, %ecx, %ebx, %edx
	precalc36 %ymm7
	calc_f2_post %ecx, %esi, %edi, %edx
.endm

.macro calc109
	calc_f2_pre 0xf4, %edx, %ecx, %eax
	precalc37 %ymm7
	calc_f2_post %edx, %ebx, %esi, %eax
.endm

.macro calc110
	calc_f2_pre 0xf8, %eax, %edx, %edi
	calc_f2_post %eax, %ecx, %ebx, %edi
.endm

.macro calc111
	calc_f2_pre 0xfc, %edi, %eax, %esi
	precalc39 %ymm7, 0x40, 0x1a0
	calc_f2_post %edi, %edx, %ecx, %esi
.endm

.macro calc112
	calc_f2_pre 0x110, %esi, %edi, %ebx
	precalc32 %ymm8, %ymm7
	calc_f2_post %esi, %eax, %edx, %ebx
.endm

.macro calc113
	calc_f2_pre 0x114, %ebx, %esi, %ecx
	precalc33 %ymm3, %ymm5
	calc_f2_post %ebx, %edi, %eax, %ecx
.endm

.macro calc114
	calc_f2_pre 0x118, %ecx, %ebx, %edx
	precalc34 %ymm13
	calc_f2_post %ecx, %esi, %edi, %edx
.endm

.macro calc115
	calc_f2_pre 0x11c, %edx, %ecx, %eax
	precalc35 %ymm5
	calc_f2_post %edx, %ebx, %esi, %eax
.endm

.macro calc116
	calc_f2_pre 0x130, %eax, %edx, %edi
	precalc36 %ymm5
	calc_f2_post %eax, %ecx, %ebx, %edi
.endm

.macro calc117
	calc_f2_pre 0x134, %edi, %eax, %esi
	precalc37 %ymm5
	calc_f2_post %edi, %edx, %ecx, %esi
.endm

.macro calc118
	calc_f2_pre 0x138, %esi, %edi, %ebx
	calc_f2_post %esi, %eax, %edx, %ebx
.endm

.macro calc119
	calc_f3_pre 0x13c, %ecx
	precalc39 %ymm5, 0x40, 0x1c0
	calc_f3_post %ebx, %edi, %eax, %ecx, %esi
.endm

.macro calc120
	calc_f3_pre 0x150, %edx
	precalc32 %ymm7, %ymm5
	calc_f3_post %ecx, %esi, %edi, %edx, %ebx
.endm

.macro calc121
	calc_f3_pre 0x154, %eax
	precalc33 %ymm15, %ymm3
	calc_f3_post %edx, %ebx, %esi, %eax, %ecx
.endm

.macro calc122
	calc_f3_pre 0x158, %edi
	precalc34 %ymm12
	calc_f3_post %eax, %ecx, %ebx, %edi, %edx
.endm

.macro calc123
	calc_f3_pre 0x15c, %esi
	precalc35 %ymm3
	calc_f3_post %edi, %edx, %ecx, %esi, %eax
.endm

.macro calc124
	calc_f3_pre 0x170, %ebx
	precalc36 %ymm3
	calc_f3_post %esi, %eax, %edx, %ebx, %edi
.endm

.macro calc125
	calc_f3_pre 0x174, %ecx
	precalc37 %ymm3
	calc_f3_post %ebx, %edi, %eax, %ecx, %esi
.endm

.macro calc126
	calc_f3_pre 0x178, %edx
	calc_f3_post %ecx, %esi, %edi, %edx, %ebx
.endm

.macro calc127
	calc_f3_pre 0x17c, %eax
	precalc39 %ymm3, 0x60, 0x1e0
	calc_f3_post %edx, %ebx, %esi, %eax, %ecx
.endm

.macro calc128
	calc_f3_pre 0x190, %edi
	precalc32 %ymm5, %ymm3
	calc_f3_post %eax, %ecx, %ebx, %edi, %edx
.endm

.macro calc129
	calc_f3_pre 0x194, %esi
	precalc33 %ymm14, %ymm15
	calc_f3_post %edi, %edx, %ecx, %esi, %eax
.endm

.macro calc130
	calc_f3_pre 0x198, %ebx
	precalc34 %ymm8
	calc_f3_post %esi, %eax, %edx, %ebx, %edi
.endm

.macro calc131
	calc_f3_pre 0x19c, %ecx
	precalc35 %ymm15
	calc_f3_post %ebx, %edi, %eax, %ecx, %esi
.endm

.macro calc132
	calc_f3_pre 0x1b0, %edx
	precalc36 %ymm15
	calc_f3_post %ecx, %esi, %edi, %edx, %ebx
.endm

.macro calc133
	calc_f3_pre 0x1b4, %eax
	precalc37 %ymm15
	calc_f3_post %edx, %ebx, %esi, %eax, %ecx
.endm

.macro calc134
	calc_f3_pre 0x1b8, %edi
	calc_f3_post %eax, %ecx, %ebx, %edi, %edx
.endm

.macro calc135
	calc_f3_pre 0x1bc, %esi
	precalc39 %ymm15, 0x60, 0x200
	calc_f3_post %edi, %edx, %ecx, %esi, %eax
.endm

.macro calc136
	calc_f3_pre 0x1d0, %ebx
	precalc32 %ymm3, %ymm15
	calc_f3_post %esi, %eax, %edx, %ebx, %edi
.endm

.macro calc137
	calc_f3_pre 0x1d4, %ecx
	precalc33 %ymm13, %ymm14
	calc_f3_post %ebx, %edi, %eax, %ecx, %esi
.endm

.macro calc138
	calc_f3_pre 0x1d8, %edx
	precalc34 %ymm7
	calc_f3_post %ecx, %esi, %edi, %edx, %ebx
.endm

.macro calc139
	calc_f2_pre 0x1dc, %edx, %ecx, %eax
	precalc35 %ymm14
	calc_f2_post %edx, %ebx, %esi, %eax
.endm

.macro calc140
	calc_f2_pre 0x1f0, %eax, %edx, %edi
	precalc36 %ymm14
	calc_f2_post %eax, %ecx, %ebx, %edi
.endm

.macro calc141
	calc_f2_pre 0x1f4, %edi, %eax, %esi
	precalc37 %ymm14
	calc_f2_post %edi, %edx, %ecx, %esi
.endm

.macro calc142
	calc_f2_pre 0x1f8, %esi, %edi, %ebx
	calc_f2_post %esi, %eax, %edx, %ebx
.endm

.macro calc143
	calc_f2_pre 0x1fc, %ebx, %esi, %ecx
	precalc39 %ymm14, 0x60, 0x220
	calc_f2_post %ebx, %edi, %eax, %ecx
.endm

.macro calc144
	calc_f2_pre 0x210, %ecx, %ebx, %edx
	precalc32 %ymm15, %ymm14
	calc_f2_post %ecx, %esi, %edi, %edx
.endm

.macro calc145
	calc_f2_pre 0x214, %edx, %ecx, %eax
	precalc33 %ymm12, %ymm13
	calc_f2_post %edx, %ebx, %esi, %eax
.endm

.macro calc146
	calc_f2_pre 0x218, %eax, %edx, %edi
	precalc34 %ymm5
	calc_f2_post %eax, %ecx, %ebx, %edi
.endm

.macro calc147
	calc_f2_pre 0x21c, %edi, %eax, %esi
	precalc35 %ymm13
	calc_f2_post %edi, %edx, %ecx, %esi
.endm

.macro calc148
	calc_f2_pre 0x230, %esi, %edi, %ebx
	precalc36 %ymm13
	calc_f2_post %esi, %eax, %edx, %ebx
.endm

.macro calc149
	calc_f2_pre 0x234, %ebx, %esi, %ecx
	precalc37 %ymm13
	calc_f2_post %ebx, %edi, %eax, %ecx
.endm

.macro calc150
	calc_f2_pre 0x238, %ecx, %ebx, %edx
	calc_f2_post %ecx, %esi, %edi, %edx
.endm

.macro calc151
	calc_f2_pre 0x23c, %edx, %ecx, %eax
	precalc39 %ymm13, 0x60, 0x240
	calc_f2_post %edx, %ebx, %esi, %eax
.endm

.macro calc152
	calc_f2_pre 0x250, %eax, %edx, %edi
	precalc32 %ymm14, %ymm13
	calc_f2_post %eax, %ecx, %ebx, %edi
.endm

.macro calc153
	calc_f2_pre 0x254, %edi, %eax, %esi
	precalc33 %ymm8, %ymm12
	calc_f2_post %edi, %edx, %ecx, %esi
.endm

.macro calc154
	calc_f2_pre 0x258, %esi, %edi, %ebx
	precalc34 %ymm3
	calc_f2_post %esi, %eax, %edx, %ebx
.endm

.macro calc155
	calc_f2_pre 0x25c, %ebx, %esi, %ecx
	precalc35 %ymm12
	calc_f2_post %ebx, %edi, %eax, %ecx
.endm

.macro calc156
	calc_f2_pre 0x270, %ecx, %ebx, %edx
	precalc36 %ymm12
	calc_f2_post %ecx, %esi, %edi, %edx
.endm

.macro calc157
	calc_f2_pre 0x274, %edx, %ecx, %eax
	precalc37 %ymm12
	calc_f2_post %edx, %ebx, %esi, %eax
.endm

.macro calc158
	calc_f2_pre 0x278, %eax, %edx, %edi
	calc_f2_post %eax, %ecx, %ebx, %edi
.endm

.macro calc159
	add	0x27c(%r15), %esi
	add	%eax, %esi
	rorx	$0x1b, %edi, %r12d
	precalc39 %ymm12, 0x60, 0x260
	add	%r12d, %esi
.endm

	// sha1block(SHA1_CTX, buf, len)
ENTRY(_libmd_sha1block_avx2)
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	$1408+8, %rsp

	and	$~63, %rdx
	lea	k_xmm_ar(%rip), %r8
	mov	%rdi, %r9
	mov	%rsi, %r10
	lea	64(%rsi), %r13
	lea	64(%rsi, %rdx), %r11
	cmp	%r11, %r13
	cmovae	%r8, %r13
	vmovdqu	bswap_shufb_ctl(%rip), %ymm10

	mov	(%r9), %ecx
	mov	4(%r9), %esi
	mov	8(%r9), %edi
	mov	12(%r9), %eax
	mov	16(%r9), %edx
	mov	%rsp, %r14
	lea	2*4*80+32(%rsp), %r15
	precalc				// precalc WK for first 2 blocks
	xchg	%r14, %r15

	// this is unrolled
.Loop:	cmp	%r8, %r10		// we use the value of R8 (set below)
					// as a signal of the last block
	jne	.Lbegin
	add	$1408+8, %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	vzeroupper
	ret

.Lbegin:
	calc0
	calc1
	calc2
	calc3
	calc4
	calc5
	calc6
	calc7
	calc8
	calc9
	calc10
	calc11
	calc12
	calc13
	calc14
	calc15
	calc16
	calc17
	calc18
	calc19
	calc20
	calc21
	calc22
	calc23
	calc24
	calc25
	calc26
	calc27
	calc28
	calc29
	calc30
	calc31
	calc32
	calc33
	calc34
	calc35
	calc36
	calc37
	calc38
	calc39
	calc40
	calc41
	calc42
	calc43
	calc44
	calc45
	calc46
	calc47
	calc48
	calc49
	calc50
	calc51
	calc52
	calc53
	calc54
	calc55
	calc56
	calc57
	calc58
	calc59

	add	$128, %r10		// move to the next even-64-byte block
	cmp	%r11, %r10		// is the current block the last one?
	cmovae	%r8, %r10		// signal the last iteration smartly

	calc60
	calc61
	calc62
	calc63
	calc64
	calc65
	calc66
	calc67
	calc68
	calc69
	calc70
	calc71
	calc72
	calc73
	calc74
	calc75
	calc76
	calc77
	calc78
	calc79

	update_hash %eax, %edx, %ebx, %esi, %edi
	cmp	%r8, %r10		// is the current block the last one?
	je	.Loop
	mov	%edx, %ecx

	calc80
	calc81
	calc82
	calc83
	calc84
	calc85
	calc86
	calc87
	calc88
	calc89
	calc90
	calc91
	calc92
	calc93
	calc94
	calc95
	calc96
	calc97
	calc98
	calc99
	calc100
	calc101
	calc102
	calc103
	calc104
	calc105
	calc106
	calc107
	calc108
	calc109
	calc110
	calc111
	calc112
	calc113
	calc114
	calc115
	calc116
	calc117
	calc118
	calc119
	calc120
	calc121
	calc122
	calc123
	calc124
	calc125
	calc126
	calc127
	calc128
	calc129
	calc130
	calc131
	calc132
	calc133
	calc134
	calc135
	calc136
	calc137
	calc138
	calc139

	add	$128, %r13		// move to the next even-64-byte block
	cmp	%r11, %r13		// is the current block the last one?
	cmovae	%r8, %r10

	calc140
	calc141
	calc142
	calc143
	calc144
	calc145
	calc146
	calc147
	calc148
	calc149
	calc150
	calc151
	calc152
	calc153
	calc154
	calc155
	calc156
	calc157
	calc158
	calc159

	update_hash %esi, %edi, %edx, %ecx, %ebx
	mov	%esi, %r12d		// reset state for AVX2 reg permutation
	mov	%edi, %esi
	mov	%edx, %edi
	mov	%ebx, %edx
	mov	%ecx, %eax
	mov	%r12d, %ecx
	xchg	%r14, %r15
	jmp	.Loop
END(_libmd_sha1block_avx2)

	.section .rodata
	.balign	32
k_xmm_ar:
	.fill	8, 4, 0x5a827999
	.fill	8, 4, 0x6ed9eba1
	.fill	8, 4, 0x8f1bbcdc
	.fill	8, 4, 0xca62c1d6
	.size	k_xmm_ar, .-k_xmm_ar

bswap_shufb_ctl:
	.4byte	0x00010203
	.4byte	0x04050607
	.4byte	0x08090a0b
	.4byte	0x0c0d0e0f
	.4byte	0x00010203
	.4byte	0x04050607
	.4byte	0x08090a0b
	.4byte	0x0c0d0e0f
	.size	bswap_shufb_ctl, .-bswap_shufb_ctl

	/*
	 * SHA1 implementation using the Intel SHA extensions (SHANI).
	 *
	 * Implemented according to the Intel white paper
	 *
	 * S. Gulley, V. Gopal, K. Yap, W. Feghali, J. Guilford,
	 * G. Wolrich: "Intel SHA Extensions: new instructions supporting
	 * the Secure Hash Algorithm on Intel® architecture processors",
	 * July 2013.
	 */
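	/*
	 * A rough intrinsics-level sketch of the four-round pattern used
	 * by the midround macro below (the names msg0..msg3, e0, e1 and
	 * abcd are illustrative; e0 and e1 alternate between xmm5 and
	 * xmm7, abcd lives in xmm6):
	 *
	 *	e1   = _mm_sha1nexte_epu32(e1, msg3);	// fold next 4 w[] into E
	 *	e0   = abcd;				// stash state for the step after
	 *	msg0 = _mm_sha1msg2_epu32(msg0, msg3);	// finish schedule update
	 *	abcd = _mm_sha1rnds4_epu32(abcd, e1, k);// four rounds, constant k
	 *	msg2 = _mm_sha1msg1_epu32(msg2, msg3);	// start schedule update
	 *	msg1 = _mm_xor_si128(msg1, msg3);	// middle of schedule update
	 */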
	// sha1block(SHA1_CTX, buf, len)
ENTRY(_libmd_sha1block_shani)
	and	$~63, %rdx		// round length to block-size multiple
	lea	(%rsi, %rdx, 1), %rcx	// end pointer
	test	%rdx, %rdx		// nothing to do?
	je	1f			// if so, terminate immediately

	movdqu	(%rdi), %xmm6		// h0, h1, h2, h3
	pxor	%xmm7, %xmm7
	pshufd	$0x1b, %xmm6, %xmm6	// h3, h2, h1, h0
	pinsrd	$3, 16(%rdi), %xmm7	// h4 in the highest word of xmm7
	movdqu	shuf_mask(%rip), %xmm4

	// main loop
0:	movdqa	%xmm6, %xmm8		// stash ABCD
	movdqa	%xmm7, %xmm9		// stash E

	// rounds 0--3
	movdqu	0*16(%rsi), %xmm0	// load first message block
	pshufb	%xmm4, %xmm0		// and byte-swap
	paddd	%xmm0, %xmm7		// E += w[0]
	movdqa	%xmm6, %xmm5		// E' = A
	sha1rnds4 $0, %xmm7, %xmm6	// perform rounds 0--3

	// rounds 4--7
	movdqu	1*16(%rsi), %xmm1
	pshufb	%xmm4, %xmm1
	sha1nexte %xmm1, %xmm5
	movdqa	%xmm6, %xmm7
	sha1rnds4 $0, %xmm5, %xmm6
	sha1msg1 %xmm1, %xmm0

	// rounds 8--11
	movdqu	2*16(%rsi), %xmm2
	pshufb	%xmm4, %xmm2
	sha1nexte %xmm2, %xmm7
	movdqa	%xmm6, %xmm5
	sha1rnds4 $0, %xmm7, %xmm6
	sha1msg1 %xmm2, %xmm1
	pxor	%xmm2, %xmm0

.macro midround msg3, msg0, msg1, msg2, e1, e0, k
	sha1nexte \msg3, \e1
	movdqa	%xmm6, \e0
	sha1msg2 \msg3, \msg0
	sha1rnds4 $\k, \e1, %xmm6
	sha1msg1 \msg3, \msg2
	pxor	\msg3, \msg1
.endm

	movdqu	3*16(%rsi), %xmm3	// load third message block
	pshufb	%xmm4, %xmm3

	add	$4*16, %rsi

	midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 0	// 12--15
	midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 0	// 16--19
	midround %xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 1	// 20--23
	midround %xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 1	// 24--27
	midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 1	// 28--31
	midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 1	// 32--35
	midround %xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 1	// 36--39
	midround %xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 2	// 40--43
	midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 2	// 44--47
	midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 2	// 48--51
	midround %xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 2	// 52--55
	midround %xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 2	// 56--59
	midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 3	// 60--63
	midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 3	// 64--67

	// rounds 68--71
	sha1nexte %xmm1, %xmm5
	movdqa	%xmm6, %xmm7
	sha1msg2 %xmm1, %xmm2
	sha1rnds4 $3, %xmm5, %xmm6
	pxor	%xmm1, %xmm3

	// rounds 72--75
	sha1nexte %xmm2, %xmm7
	movdqa	%xmm6, %xmm5
	sha1msg2 %xmm2, %xmm3
	sha1rnds4 $3, %xmm7, %xmm6

	// rounds 76--79
	sha1nexte %xmm3, %xmm5
	movdqa	%xmm6, %xmm7
	sha1rnds4 $3, %xmm5, %xmm6

	sha1nexte %xmm9, %xmm7		// add saved E
	paddd	%xmm8, %xmm6		// add saved ABCD

	cmp	%rsi, %rcx		// end reached?
	jne	0b

	pshufd	$0x1b, %xmm6, %xmm6	// restore order of h0--h3
	movdqu	%xmm6, (%rdi)		// write h0--h3
	pextrd	$3, %xmm7, 16(%rdi)	// write h4
1:	ret
END(_libmd_sha1block_shani)

	.section .rodata
	.balign	16
shuf_mask:
	.8byte	0x08090a0b0c0d0e0f
	.8byte	0x0001020304050607
	.size	shuf_mask, .-shuf_mask

	.section .note.GNU-stack,"",%progbits