/*-
 * Copyright (c) 2013 The Go Authors. All rights reserved.
 * Copyright (c) 2024 Robert Clausecker
 *
 * Adapted from Go's crypto/sha1/sha1block_amd64.s.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above
 *     copyright notice, this list of conditions and the following disclaimer
 *     in the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Google Inc. nor the names of its
 *     contributors may be used to endorse or promote products derived from
 *     this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/*
 * SHA-1 block routine. See sha1c.c for C equivalent.
 *
 * There are 80 rounds of 4 types:
 *   - rounds 0-15 are type 1 and load data (round1 macro).
 *   - rounds 16-19 are type 1 and do not load data (round1x macro).
 *   - rounds 20-39 are type 2 and do not load data (round2 macro).
 *   - rounds 40-59 are type 3 and do not load data (round3 macro).
 *   - rounds 60-79 are type 4 and do not load data (round4 macro).
 *
 * Each round loads or shuffles the data, then computes a per-round
 * function of b, c, d, and then mixes the result into and rotates the
 * five registers a, b, c, d, e holding the intermediate results.
 *
 * The register rotation is implemented by rotating the arguments to
 * the round macros instead of by explicit move instructions.
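 *
 * As a rough orientation only (sha1c.c is the authoritative C version),
 * one round of the scalar code below corresponds to the following C
 * sketch, where rol32() is a hypothetical 32-bit rotate-left helper and
 * w[] is the 16-word circular message schedule kept on the stack:
 *
 *	static uint32_t rol32(uint32_t x, int n) { return x << n | x >> (32 - n); }
 *
 *	// schedule update performed by the shuffle macro (rounds 16-79)
 *	w[i & 15] = rol32(w[(i - 3) & 15] ^ w[(i - 8) & 15] ^
 *	    w[(i - 14) & 15] ^ w[(i - 16) & 15], 1);
 *
 *	// per-round functions and constants (func1-func4)
 *	f1 = (b & c) | (~b & d);	k1 = 0x5a827999;	// rounds  0-19
 *	f2 = b ^ c ^ d;			k2 = 0x6ed9eba1;	// rounds 20-39
 *	f3 = (b & c) | ((b | c) & d);	k3 = 0x8f1bbcdc;	// rounds 40-59
 *	f4 = b ^ c ^ d;			k4 = 0xca62c1d6;	// rounds 60-79
 *
 *	// the mix macro
 *	e += rol32(a, 5) + f + w[i & 15] + k;
 *	b = rol32(b, 30);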
*/ .macro load index mov (\index)*4(%rsi), %r10d bswap %r10d mov %r10d, (\index)*4(%rsp) .endm .macro shuffle index mov ((\index )&0xf)*4(%rsp), %r10d xor ((\index- 3)&0xf)*4(%rsp), %r10d xor ((\index- 8)&0xf)*4(%rsp), %r10d xor ((\index-14)&0xf)*4(%rsp), %r10d rol $1, %r10d mov %r10d, ((\index)&0xf)*4(%rsp) .endm .macro func1 a, b, c, d, e mov \d, %r9d xor \c, %r9d and \b, %r9d xor \d, %r9d .endm .macro func2 a, b, c, d, e mov \b, %r9d xor \c, %r9d xor \d, %r9d .endm .macro func3 a, b, c, d, e mov \b, %r8d or \c, %r8d and \d, %r8d mov \b, %r9d and \c, %r9d or %r8d, %r9d .endm .macro func4 a, b, c, d, e func2 \a, \b, \c, \d, \e .endm .macro mix a, b, c, d, e, const rol $30, \b add %r9d, \e mov \a, %r8d rol $5, %r8d lea \const(\e, %r10d, 1), \e add %r8d, \e .endm .macro round1 a, b, c, d, e, index load \index func1 \a, \b, \c, \d, \e mix \a, \b, \c, \d, \e, 0x5a827999 .endm .macro round1x a, b, c, d, e, index shuffle \index func1 \a, \b, \c, \d, \e mix \a, \b, \c, \d, \e, 0x5a827999 .endm .macro round2 a, b, c, d, e, index shuffle \index func2 \a, \b, \c, \d, \e mix \a, \b, \c, \d, \e, 0x6ed9eba1 .endm .macro round3 a, b, c, d, e, index shuffle \index func3 \a, \b, \c, \d, \e mix \a, \b, \c, \d, \e, 0x8f1bbcdc .endm .macro round4 a, b, c, d, e, index shuffle \index func4 \a, \b, \c, \d, \e mix \a, \b, \c, \d, \e, 0xca62c1d6 .endm // sha1block(SHA1_CTX, buf, len) ENTRY(_libmd_sha1block_scalar) push %rbp push %rbx push %r12 push %r13 push %r14 push %r15 push %rdi // rdi: SHA1_CTX sub $64+8, %rsp // 64 bytes for round keys // plus alignment mov %rdi, %rbp // rsi: buf and $~63, %rdx // rdx: length in blocks lea (%rsi, %rdx, 1), %rdi // rdi: end pointer mov (%rbp), %eax // c->h0 mov 4(%rbp), %ebx // c->h1 mov 8(%rbp), %ecx // c->h2 mov 12(%rbp), %edx // c->h3 mov 16(%rbp), %ebp // c->h4 cmp %rsi, %rdi // any data to process? 
je .Lend .Lloop: mov %eax, %r11d mov %ebx, %r12d mov %ecx, %r13d mov %edx, %r14d mov %ebp, %r15d round1 %eax, %ebx, %ecx, %edx, %ebp, 0 round1 %ebp, %eax, %ebx, %ecx, %edx, 1 round1 %edx, %ebp, %eax, %ebx, %ecx, 2 round1 %ecx, %edx, %ebp, %eax, %ebx, 3 round1 %ebx, %ecx, %edx, %ebp, %eax, 4 round1 %eax, %ebx, %ecx, %edx, %ebp, 5 round1 %ebp, %eax, %ebx, %ecx, %edx, 6 round1 %edx, %ebp, %eax, %ebx, %ecx, 7 round1 %ecx, %edx, %ebp, %eax, %ebx, 8 round1 %ebx, %ecx, %edx, %ebp, %eax, 9 round1 %eax, %ebx, %ecx, %edx, %ebp, 10 round1 %ebp, %eax, %ebx, %ecx, %edx, 11 round1 %edx, %ebp, %eax, %ebx, %ecx, 12 round1 %ecx, %edx, %ebp, %eax, %ebx, 13 round1 %ebx, %ecx, %edx, %ebp, %eax, 14 round1 %eax, %ebx, %ecx, %edx, %ebp, 15 round1x %ebp, %eax, %ebx, %ecx, %edx, 16 round1x %edx, %ebp, %eax, %ebx, %ecx, 17 round1x %ecx, %edx, %ebp, %eax, %ebx, 18 round1x %ebx, %ecx, %edx, %ebp, %eax, 19 round2 %eax, %ebx, %ecx, %edx, %ebp, 20 round2 %ebp, %eax, %ebx, %ecx, %edx, 21 round2 %edx, %ebp, %eax, %ebx, %ecx, 22 round2 %ecx, %edx, %ebp, %eax, %ebx, 23 round2 %ebx, %ecx, %edx, %ebp, %eax, 24 round2 %eax, %ebx, %ecx, %edx, %ebp, 25 round2 %ebp, %eax, %ebx, %ecx, %edx, 26 round2 %edx, %ebp, %eax, %ebx, %ecx, 27 round2 %ecx, %edx, %ebp, %eax, %ebx, 28 round2 %ebx, %ecx, %edx, %ebp, %eax, 29 round2 %eax, %ebx, %ecx, %edx, %ebp, 30 round2 %ebp, %eax, %ebx, %ecx, %edx, 31 round2 %edx, %ebp, %eax, %ebx, %ecx, 32 round2 %ecx, %edx, %ebp, %eax, %ebx, 33 round2 %ebx, %ecx, %edx, %ebp, %eax, 34 round2 %eax, %ebx, %ecx, %edx, %ebp, 35 round2 %ebp, %eax, %ebx, %ecx, %edx, 36 round2 %edx, %ebp, %eax, %ebx, %ecx, 37 round2 %ecx, %edx, %ebp, %eax, %ebx, 38 round2 %ebx, %ecx, %edx, %ebp, %eax, 39 round3 %eax, %ebx, %ecx, %edx, %ebp, 40 round3 %ebp, %eax, %ebx, %ecx, %edx, 41 round3 %edx, %ebp, %eax, %ebx, %ecx, 42 round3 %ecx, %edx, %ebp, %eax, %ebx, 43 round3 %ebx, %ecx, %edx, %ebp, %eax, 44 round3 %eax, %ebx, %ecx, %edx, %ebp, 45 round3 %ebp, %eax, %ebx, %ecx, %edx, 46 round3 %edx, %ebp, %eax, %ebx, %ecx, 47 round3 %ecx, %edx, %ebp, %eax, %ebx, 48 round3 %ebx, %ecx, %edx, %ebp, %eax, 49 round3 %eax, %ebx, %ecx, %edx, %ebp, 50 round3 %ebp, %eax, %ebx, %ecx, %edx, 51 round3 %edx, %ebp, %eax, %ebx, %ecx, 52 round3 %ecx, %edx, %ebp, %eax, %ebx, 53 round3 %ebx, %ecx, %edx, %ebp, %eax, 54 round3 %eax, %ebx, %ecx, %edx, %ebp, 55 round3 %ebp, %eax, %ebx, %ecx, %edx, 56 round3 %edx, %ebp, %eax, %ebx, %ecx, 57 round3 %ecx, %edx, %ebp, %eax, %ebx, 58 round3 %ebx, %ecx, %edx, %ebp, %eax, 59 round4 %eax, %ebx, %ecx, %edx, %ebp, 60 round4 %ebp, %eax, %ebx, %ecx, %edx, 61 round4 %edx, %ebp, %eax, %ebx, %ecx, 62 round4 %ecx, %edx, %ebp, %eax, %ebx, 63 round4 %ebx, %ecx, %edx, %ebp, %eax, 64 round4 %eax, %ebx, %ecx, %edx, %ebp, 65 round4 %ebp, %eax, %ebx, %ecx, %edx, 66 round4 %edx, %ebp, %eax, %ebx, %ecx, 67 round4 %ecx, %edx, %ebp, %eax, %ebx, 68 round4 %ebx, %ecx, %edx, %ebp, %eax, 69 round4 %eax, %ebx, %ecx, %edx, %ebp, 70 round4 %ebp, %eax, %ebx, %ecx, %edx, 71 round4 %edx, %ebp, %eax, %ebx, %ecx, 72 round4 %ecx, %edx, %ebp, %eax, %ebx, 73 round4 %ebx, %ecx, %edx, %ebp, %eax, 74 round4 %eax, %ebx, %ecx, %edx, %ebp, 75 round4 %ebp, %eax, %ebx, %ecx, %edx, 76 round4 %edx, %ebp, %eax, %ebx, %ecx, 77 round4 %ecx, %edx, %ebp, %eax, %ebx, 78 round4 %ebx, %ecx, %edx, %ebp, %eax, 79 add %r11d, %eax add %r12d, %ebx add %r13d, %ecx add %r14d, %edx add %r15d, %ebp add $64, %rsi cmp %rdi, %rsi jb .Lloop .Lend: add $64+8, %rsp pop %rdi // SHA1_CTX mov %eax, (%rdi) mov %ebx, 4(%rdi) mov %ecx, 8(%rdi) mov %edx, 12(%rdi) mov %ebp, 16(%rdi) pop %r15 
pop %r14 pop %r13 pop %r12 pop %rbx pop %rbp ret END(_libmd_sha1block_scalar) /* * This is the implementation using AVX2, BMI1 and BMI2. It is based on: * "SHA-1 implementation with Intel(R) AVX2 instruction set extensions" * From http://software.intel.com/en-us/articles * (look for improving-the-performance-of-the-secure-hash-algorithm-1) * This implementation is 2x unrolled, and interleaves vector instructions, * used to precompute W, with scalar computation of current round * for optimal scheduling. */ /* trivial helper macros */ .macro update_hash a, tb, c, d, e add (%r9), \a mov \a, (%r9) add 4(%r9), \tb mov \tb, 4(%r9) add 8(%r9), \c mov \c, 8(%r9) add 12(%r9), \d mov \d, 12(%r9) add 16(%r9), \e mov \e, 16(%r9) .endm /* help macros for recalc, which does precomputations */ .macro precalc0 offset vmovdqu \offset(%r10), %xmm0 .endm .macro precalc1 offset vinserti128 $1, \offset(%r13), %ymm0, %ymm0 .endm .macro precalc2 yreg vpshufb %ymm10, %ymm0, \yreg .endm .macro precalc4 yreg, k_offset vpaddd \k_offset(%r8), \yreg, %ymm0 .endm .macro precalc7 offset vmovdqu %ymm0, (\offset)*2(%r14) .endm /* * Message scheduling pre-compute for rounds 0-15 * r13 is a pointer to the even 64-byte block * r10 is a pointer to the odd 64-byte block * r14 is a pointer to the temp buffer * xmm0 is used as a temp register * yreg is clobbered as part of the computation * offset chooses a 16 byte chunk within a block * r8 is a pointer to the constants block * k_offset chooses K constants relevant to this round * xmm10 holds the swap mask */ .macro precalc00_15 offset, yreg precalc0 \offset precalc1 \offset precalc2 \yreg precalc4 \yreg, 0 precalc7 \offset .endm /* helper macros for precalc16_31 */ .macro precalc16 reg_sub16, reg_sub12, reg_sub4, reg vpalignr $8, \reg_sub16, \reg_sub12, \reg // w[i - 14] vpsrldq $4, \reg_sub4, %ymm0 // w[i - 3] .endm .macro precalc17 reg_sub16, reg_sub8, reg vpxor \reg_sub8, \reg, \reg vpxor \reg_sub16, %ymm0, %ymm0 .endm .macro precalc18 reg vpxor %ymm0, \reg, \reg vpslldq $12, \reg, %ymm9 .endm .macro precalc19 reg vpslld $1, \reg, %ymm0 vpsrld $31, \reg, \reg .endm .macro precalc20 reg vpor \reg, %ymm0, %ymm0 vpslld $2, %ymm9, \reg .endm .macro precalc21 reg vpsrld $30, %ymm9, %ymm9 vpxor \reg, %ymm0, %ymm0 .endm .macro precalc23 reg, k_offset, offset vpxor %ymm9, %ymm0, \reg vpaddd \k_offset(%r8), \reg, %ymm0 vmovdqu %ymm0, (\offset)(%r14) .endm /* * Message scheduling pre-compute for rounds 16-31 * calculating last 32 w[i] values in 8 XMM registers * pre-calculate K+w[i] values and store to mem * for later load by ALU add instruction. * "brute force" vectorization for rounds 16-31 only * due to w[i]->w[i-3] dependency. 
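 *
 * In scalar terms (a sketch with absolute indices; wk[] and K[] are
 * hypothetical names for the stored K+w values and the four round
 * constants), each vector lane computes
 *
 *	w[i] = rol32(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);
 *	wk[i] = w[i] + K[i / 20];
 *
 * The precalc32_79 macros further down side-step the w[i]->w[i-3]
 * dependency by expanding this recurrence once; for i >= 32 it is
 * equivalent to
 *
 *	w[i] = rol32(w[i - 6] ^ w[i - 16] ^ w[i - 28] ^ w[i - 32], 2);
 *
 * which reaches back at least 6 elements, so the four consecutive w
 * values computed per block in one step no longer depend on each other.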
+ clobbers 5 input ymm registers REG_SUB* * uses xmm0 and xmm9 as temp registers * As always, r8 is a pointer to constants block * and r14 is a pointer to temp buffer */ .macro precalc16_31 reg, reg_sub4, reg_sub8, reg_sub12, reg_sub16, k_offset, offset precalc16 \reg_sub16, \reg_sub12, \reg_sub4, \reg precalc17 \reg_sub16, \reg_sub8, \reg precalc18 \reg precalc19 \reg precalc20 \reg precalc21 \reg precalc23 \reg, \k_offset, \offset .endm /* helper macros for precalc_32_79 */ .macro precalc32 reg_sub8, reg_sub4 vpalignr $8, \reg_sub8, \reg_sub4, %ymm0 .endm .macro precalc33 reg_sub28, reg vpxor \reg_sub28, \reg, \reg .endm .macro precalc34 reg_sub16 vpxor \reg_sub16, %ymm0, %ymm0 .endm .macro precalc35 reg vpxor %ymm0, \reg, \reg .endm .macro precalc36 reg vpslld $2, \reg, %ymm0 .endm .macro precalc37 reg vpsrld $30, \reg, \reg vpor \reg, %ymm0, \reg .endm .macro precalc39 reg, k_offset, offset vpaddd \k_offset(%r8), \reg, %ymm0 vmovdqu %ymm0, \offset(%r14) .endm .macro precalc32_79 reg, reg_sub4, reg_sub8, reg_sub16, reg_sub28, k_offset, offset precalc32 \reg_sub8, \reg_sub4 precalc33 \reg_sub28, \reg precalc34 \reg_sub16 precalc35 \reg precalc36 \reg precalc37 \reg precalc39 \reg, \k_offset, \offset .endm .macro precalc precalc00_15 0x00, %ymm15 precalc00_15 0x10, %ymm14 precalc00_15 0x20, %ymm13 precalc00_15 0x30, %ymm12 precalc16_31 %ymm8, %ymm12, %ymm13, %ymm14, %ymm15, 0x00, 0x080 precalc16_31 %ymm7, %ymm8, %ymm12, %ymm13, %ymm14, 0x20, 0x0a0 precalc16_31 %ymm5, %ymm7, %ymm8, %ymm12, %ymm13, 0x20, 0x0c0 precalc16_31 %ymm3, %ymm5, %ymm7, %ymm8, %ymm12, 0x20, 0x0e0 precalc32_79 %ymm15, %ymm3, %ymm5, %ymm8, %ymm14, 0x20, 0x100 precalc32_79 %ymm14, %ymm15, %ymm3, %ymm7, %ymm13, 0x20, 0x120 precalc32_79 %ymm13, %ymm14, %ymm15, %ymm5, %ymm12, 0x40, 0x140 precalc32_79 %ymm12, %ymm13, %ymm14, %ymm3, %ymm8, 0x40, 0x160 precalc32_79 %ymm8, %ymm12, %ymm13, %ymm15, %ymm7, 0x40, 0x180 precalc32_79 %ymm7, %ymm8, %ymm12, %ymm14, %ymm5, 0x40, 0x1a0 precalc32_79 %ymm5, %ymm7, %ymm8, %ymm13, %ymm3, 0x40, 0x1c0 precalc32_79 %ymm3, %ymm5, %ymm7, %ymm12, %ymm15, 0x60, 0x1e0 precalc32_79 %ymm15, %ymm3, %ymm5, %ymm8, %ymm14, 0x60, 0x200 precalc32_79 %ymm14, %ymm15, %ymm3, %ymm7, %ymm13, 0x60, 0x220 precalc32_79 %ymm13, %ymm14, %ymm15, %ymm5, %ymm12, 0x60, 0x240 precalc32_79 %ymm12, %ymm13, %ymm14, %ymm3, %ymm8, 0x60, 0x260 .endm /* * Macros calculating individual rounds have general form * calc_round_pre + precalc_round + calc_round_post * calc_round_{pre,post} macros follow */ .macro calc_f1_pre offset, reg_a, reg_b, reg_c, reg_e add \offset(%r15), \reg_e andn \reg_c, \reg_a, %ebp add \reg_b, \reg_e // add F from the previous round rorx $0x1b, \reg_a, %r12d rorx $2, \reg_a, \reg_b // for the next round .endm /* * Calculate F for the next round */ .macro calc_f1_post reg_a, reg_b, reg_e and \reg_b, \reg_a // b & c xor %ebp, \reg_a // F1 = (b&c) ^ (~b&d) add %r12d, \reg_e .endm /* * Registers are cyclically rotated: * edx -> eax -> edi -> esi -> ebx -> ecx */ .macro calc0 mov %esi, %ebx // precalculate first round rorx $2, %esi, %esi andn %eax, %ebx, %ebp and %edi, %ebx xor %ebp, %ebx calc_f1_pre 0x0, %ecx, %ebx, %edi, %edx precalc0 0x80 calc_f1_post %ecx, %esi, %edx .endm .macro calc1 calc_f1_pre 0x4, %edx, %ecx, %esi, %eax precalc1 0x80 calc_f1_post %edx, %ebx, %eax .endm .macro calc2 calc_f1_pre 0x8, %eax, %edx, %ebx, %edi precalc2 %ymm15 calc_f1_post %eax, %ecx, %edi .endm .macro calc3 calc_f1_pre 0xc, %edi, %eax, %ecx, %esi calc_f1_post %edi, %edx, %esi .endm .macro calc4 calc_f1_pre 0x20, %esi, %edi, 
%edx, %ebx precalc4 %ymm15, 0x0 calc_f1_post %esi, %eax, %ebx .endm .macro calc5 calc_f1_pre 0x24, %ebx, %esi, %eax, %ecx calc_f1_post %ebx, %edi, %ecx .endm .macro calc6 calc_f1_pre 0x28, %ecx, %ebx, %edi, %edx calc_f1_post %ecx, %esi, %edx .endm .macro calc7 calc_f1_pre 0x2c, %edx, %ecx, %esi, %eax precalc7 0x0 calc_f1_post %edx, %ebx, %eax .endm .macro calc8 calc_f1_pre 0x40, %eax, %edx, %ebx, %edi precalc0 0x90 calc_f1_post %eax, %ecx, %edi .endm .macro calc9 calc_f1_pre 0x44, %edi, %eax, %ecx, %esi precalc1 0x90 calc_f1_post %edi, %edx, %esi .endm .macro calc10 calc_f1_pre 0x48, %esi, %edi, %edx, %ebx precalc2 %ymm14 calc_f1_post %esi, %eax, %ebx .endm .macro calc11 calc_f1_pre 0x4c, %ebx, %esi, %eax, %ecx calc_f1_post %ebx, %edi, %ecx .endm .macro calc12 calc_f1_pre 0x60, %ecx, %ebx, %edi, %edx precalc4 %ymm14, 0 calc_f1_post %ecx, %esi, %edx .endm .macro calc13 calc_f1_pre 0x64, %edx, %ecx, %esi, %eax calc_f1_post %edx, %ebx, %eax .endm .macro calc14 calc_f1_pre 0x68, %eax, %edx, %ebx, %edi calc_f1_post %eax, %ecx, %edi .endm .macro calc15 calc_f1_pre 0x6c, %edi, %eax, %ecx, %esi precalc7 0x10 calc_f1_post %edi, %edx, %esi .endm .macro calc16 calc_f1_pre 0x80, %esi, %edi, %edx, %ebx precalc0 0xa0 calc_f1_post %esi, %eax, %ebx .endm .macro calc17 calc_f1_pre 0x84, %ebx, %esi, %eax, %ecx precalc1 0xa0 calc_f1_post %ebx, %edi, %ecx .endm .macro calc18 calc_f1_pre 0x88, %ecx, %ebx, %edi, %edx precalc2 %ymm13 calc_f1_post %ecx, %esi, %edx .endm .macro calc_f2_pre offset, reg_a, reg_b, reg_e add \offset(%r15), \reg_e add \reg_b, \reg_e // add F from the previous round rorx $0x1b, \reg_a, %r12d rorx $2, \reg_a, \reg_b // for next round .endm .macro calc_f2_post reg_a, reg_b, reg_c, reg_e xor \reg_b, \reg_a add %r12d, \reg_e xor \reg_c, \reg_a .endm .macro calc19 calc_f2_pre 0x8c, %edx, %ecx, %eax calc_f2_post %edx, %ebx, %esi, %eax .endm .macro calc20 calc_f2_pre 0xa0, %eax, %edx, %edi precalc4 %ymm13, 0x0 calc_f2_post %eax, %ecx, %ebx, %edi .endm .macro calc21 calc_f2_pre 0xa4, %edi, %eax, %esi calc_f2_post %edi, %edx, %ecx, %esi .endm .macro calc22 calc_f2_pre 0xa8, %esi, %edi, %ebx calc_f2_post %esi, %eax, %edx, %ebx .endm .macro calc23 calc_f2_pre 0xac, %ebx, %esi, %ecx precalc7 0x20 calc_f2_post %ebx, %edi, %eax, %ecx .endm .macro calc24 calc_f2_pre 0xc0, %ecx, %ebx, %edx precalc0 0xb0 calc_f2_post %ecx, %esi, %edi, %edx .endm .macro calc25 calc_f2_pre 0xc4, %edx, %ecx, %eax precalc1 0xb0 calc_f2_post %edx, %ebx, %esi, %eax .endm .macro calc26 calc_f2_pre 0xc8, %eax, %edx, %edi precalc2 %ymm12 calc_f2_post %eax, %ecx, %ebx, %edi .endm .macro calc27 calc_f2_pre 0xcc, %edi, %eax, %esi calc_f2_post %edi, %edx, %ecx, %esi .endm .macro calc28 calc_f2_pre 0xe0, %esi, %edi, %ebx precalc4 %ymm12, 0x0 calc_f2_post %esi, %eax, %edx, %ebx .endm .macro calc29 calc_f2_pre 0xe4, %ebx, %esi, %ecx calc_f2_post %ebx, %edi, %eax, %ecx .endm .macro calc30 calc_f2_pre 0xe8, %ecx, %ebx, %edx calc_f2_post %ecx, %esi, %edi, %edx .endm .macro calc31 calc_f2_pre 0xec, %edx, %ecx, %eax precalc7 0x30 calc_f2_post %edx, %ebx, %esi, %eax .endm .macro calc32 calc_f2_pre 0x100, %eax, %edx, %edi precalc16 %ymm15, %ymm14, %ymm12, %ymm8 calc_f2_post %eax, %ecx, %ebx, %edi .endm .macro calc33 calc_f2_pre 0x104, %edi, %eax, %esi precalc17 %ymm15, %ymm13, %ymm8 calc_f2_post %edi, %edx, %ecx, %esi .endm .macro calc34 calc_f2_pre 0x108, %esi, %edi, %ebx precalc18 %ymm8 calc_f2_post %esi, %eax, %edx, %ebx .endm .macro calc35 calc_f2_pre 0x10c, %ebx, %esi, %ecx precalc19 %ymm8 calc_f2_post %ebx, %edi, %eax, %ecx .endm .macro 
calc36 calc_f2_pre 0x120, %ecx, %ebx, %edx precalc20 %ymm8 calc_f2_post %ecx, %esi, %edi, %edx .endm .macro calc37 calc_f2_pre 0x124, %edx, %ecx, %eax precalc21 %ymm8 calc_f2_post %edx, %ebx, %esi, %eax .endm .macro calc38 calc_f2_pre 0x128, %eax, %edx, %edi calc_f2_post %eax, %ecx, %ebx, %edi .endm .macro calc_f3_pre offset, reg_e add \offset(%r15), \reg_e .endm .macro calc_f3_post reg_a, reg_b, reg_c, reg_e, reg_tb add \reg_tb, \reg_e // add F from the previous round mov \reg_b, %ebp or \reg_a, %ebp rorx $0x1b, \reg_a, %r12d rorx $2, \reg_a, \reg_tb and \reg_c, %ebp // calculate F for the next round and \reg_b, \reg_a or %ebp, \reg_a add %r12d, \reg_e .endm .macro calc39 calc_f3_pre 0x12c, %esi precalc23 %ymm8, 0x0, 0x80 calc_f3_post %edi, %edx, %ecx, %esi, %eax .endm .macro calc40 calc_f3_pre 0x140, %ebx precalc16 %ymm14, %ymm13, %ymm8, %ymm7 calc_f3_post %esi, %eax, %edx, %ebx, %edi .endm .macro calc41 calc_f3_pre 0x144, %ecx precalc17 %ymm14, %ymm12, %ymm7 calc_f3_post %ebx, %edi, %eax, %ecx, %esi .endm .macro calc42 calc_f3_pre 0x148, %edx precalc18 %ymm7 calc_f3_post %ecx, %esi, %edi, %edx, %ebx .endm .macro calc43 calc_f3_pre 0x14c, %eax precalc19 %ymm7 calc_f3_post %edx, %ebx, %esi, %eax, %ecx .endm .macro calc44 calc_f3_pre 0x160, %edi precalc20 %ymm7 calc_f3_post %eax, %ecx, %ebx, %edi, %edx .endm .macro calc45 calc_f3_pre 0x164, %esi precalc21 %ymm7 calc_f3_post %edi, %edx, %ecx, %esi, %eax .endm .macro calc46 calc_f3_pre 0x168, %ebx calc_f3_post %esi, %eax, %edx, %ebx, %edi .endm .macro calc47 calc_f3_pre 0x16c, %ecx vpxor %ymm9, %ymm0, %ymm7 vpaddd 0x20(%r8), %ymm7, %ymm0 vmovdqu %ymm0, 0xa0(%r14) calc_f3_post %ebx, %edi, %eax, %ecx, %esi .endm .macro calc48 calc_f3_pre 0x180, %edx precalc16 %ymm13, %ymm12, %ymm7, %ymm5 calc_f3_post %ecx, %esi, %edi, %edx, %ebx .endm .macro calc49 calc_f3_pre 0x184, %eax precalc17 %ymm13, %ymm8, %ymm5 calc_f3_post %edx, %ebx, %esi, %eax, %ecx .endm .macro calc50 calc_f3_pre 0x188, %edi precalc18 %ymm5 calc_f3_post %eax, %ecx, %ebx, %edi, %edx .endm .macro calc51 calc_f3_pre 0x18c, %esi precalc19 %ymm5 calc_f3_post %edi, %edx, %ecx, %esi, %eax .endm .macro calc52 calc_f3_pre 0x1a0, %ebx precalc20 %ymm5 calc_f3_post %esi, %eax, %edx, %ebx, %edi .endm .macro calc53 calc_f3_pre 0x1a4, %ecx precalc21 %ymm5 calc_f3_post %ebx, %edi, %eax, %ecx, %esi .endm .macro calc54 calc_f3_pre 0x1a8, %edx calc_f3_post %ecx, %esi, %edi, %edx, %ebx .endm .macro calc55 calc_f3_pre 0x1ac, %eax precalc23 %ymm5, 0x20, 0xc0 calc_f3_post %edx, %ebx, %esi, %eax, %ecx .endm .macro calc56 calc_f3_pre 0x1c0, %edi precalc16 %ymm12, %ymm8, %ymm5, %ymm3 calc_f3_post %eax, %ecx, %ebx, %edi, %edx .endm .macro calc57 calc_f3_pre 0x1c4, %esi precalc17 %ymm12, %ymm7, %ymm3 calc_f3_post %edi, %edx, %ecx, %esi, %eax .endm .macro calc58 calc_f3_pre 0x1c8, %ebx precalc18 %ymm3 calc_f3_post %esi, %eax, %edx, %ebx, %edi .endm .macro calc59 calc_f2_pre 0x1cc, %ebx, %esi, %ecx precalc19 %ymm3 calc_f2_post %ebx, %edi, %eax, %ecx .endm .macro calc60 calc_f2_pre 0x1e0, %ecx, %ebx, %edx precalc20 %ymm3 calc_f2_post %ecx, %esi, %edi, %edx .endm .macro calc61 calc_f2_pre 0x1e4, %edx, %ecx, %eax precalc21 %ymm3 calc_f2_post %edx, %ebx, %esi, %eax .endm .macro calc62 calc_f2_pre 0x1e8, %eax, %edx, %edi calc_f2_post %eax, %ecx, %ebx, %edi .endm .macro calc63 calc_f2_pre 0x1ec, %edi, %eax, %esi precalc23 %ymm3, 0x20, 0xe0 calc_f2_post %edi, %edx, %ecx, %esi .endm .macro calc64 calc_f2_pre 0x200, %esi, %edi, %ebx precalc32 %ymm5, %ymm3 calc_f2_post %esi, %eax, %edx, %ebx .endm .macro calc65 
calc_f2_pre 0x204, %ebx, %esi, %ecx precalc33 %ymm14, %ymm15 calc_f2_post %ebx, %edi, %eax, %ecx .endm .macro calc66 calc_f2_pre 0x208, %ecx, %ebx, %edx precalc34 %ymm8 calc_f2_post %ecx, %esi, %edi, %edx .endm .macro calc67 calc_f2_pre 0x20c, %edx, %ecx, %eax precalc35 %ymm15 calc_f2_post %edx, %ebx, %esi, %eax .endm .macro calc68 calc_f2_pre 0x220, %eax, %edx, %edi precalc36 %ymm15 calc_f2_post %eax, %ecx, %ebx, %edi .endm .macro calc69 calc_f2_pre 0x224, %edi, %eax, %esi precalc37 %ymm15 calc_f2_post %edi, %edx, %ecx, %esi .endm .macro calc70 calc_f2_pre 0x228, %esi, %edi, %ebx calc_f2_post %esi, %eax, %edx, %ebx .endm .macro calc71 calc_f2_pre 0x22c, %ebx, %esi, %ecx precalc39 %ymm15, 0x20, 0x100 calc_f2_post %ebx, %edi, %eax, %ecx .endm .macro calc72 calc_f2_pre 0x240, %ecx, %ebx, %edx precalc32 %ymm3, %ymm15 calc_f2_post %ecx, %esi, %edi, %edx .endm .macro calc73 calc_f2_pre 0x244, %edx, %ecx, %eax precalc33 %ymm13, %ymm14 calc_f2_post %edx, %ebx, %esi, %eax .endm .macro calc74 calc_f2_pre 0x248, %eax, %edx, %edi precalc34 %ymm7 calc_f2_post %eax, %ecx, %ebx, %edi .endm .macro calc75 calc_f2_pre 0x24c, %edi, %eax, %esi precalc35 %ymm14 calc_f2_post %edi, %edx, %ecx, %esi .endm .macro calc76 calc_f2_pre 0x260, %esi, %edi, %ebx precalc36 %ymm14 calc_f2_post %esi, %eax, %edx, %ebx .endm .macro calc77 calc_f2_pre 0x264, %ebx, %esi, %ecx precalc37 %ymm14 calc_f2_post %ebx, %edi, %eax, %ecx .endm .macro calc78 calc_f2_pre 0x268, %ecx, %ebx, %edx calc_f2_post %ecx, %esi, %edi, %edx .endm .macro calc79 add 0x26c(%r15), %eax add %ecx, %eax rorx $0x1b, %edx, %r12d precalc39 %ymm14, 0x20, 0x120 add %r12d, %eax .endm /* * Similar to calc0 */ .macro calc80 mov %ecx, %edx // precalculate first round rorx $2, %ecx, %ecx andn %esi, %edx, %ebp and %ebx, %edx xor %ebp, %edx calc_f1_pre 0x10, %eax, %edx, %ebx, %edi precalc32 %ymm15, %ymm14 calc_f1_post %eax, %ecx, %edi .endm .macro calc81 calc_f1_pre 0x14, %edi, %eax, %ecx, %esi precalc33 %ymm12, %ymm13 calc_f1_post %edi, %edx, %esi .endm .macro calc82 calc_f1_pre 0x18, %esi, %edi, %edx, %ebx precalc34 %ymm5 calc_f1_post %esi, %eax, %ebx .endm .macro calc83 calc_f1_pre 0x1c, %ebx, %esi, %eax, %ecx precalc35 %ymm13 calc_f1_post %ebx, %edi, %ecx .endm .macro calc84 calc_f1_pre 0x30, %ecx, %ebx, %edi, %edx precalc36 %ymm13 calc_f1_post %ecx, %esi, %edx .endm .macro calc85 calc_f1_pre 0x34, %edx, %ecx, %esi, %eax precalc37 %ymm13 calc_f1_post %edx, %ebx, %eax .endm .macro calc86 calc_f1_pre 0x38, %eax, %edx, %ebx, %edi calc_f1_post %eax, %ecx, %edi .endm .macro calc87 calc_f1_pre 0x3c, %edi, %eax, %ecx, %esi precalc39 %ymm13, 0x40, 0x140 calc_f1_post %edi, %edx, %esi .endm .macro calc88 calc_f1_pre 0x50, %esi, %edi, %edx, %ebx precalc32 %ymm14, %ymm13 calc_f1_post %esi, %eax, %ebx .endm .macro calc89 calc_f1_pre 0x54, %ebx, %esi, %eax, %ecx precalc33 %ymm8, %ymm12 calc_f1_post %ebx, %edi, %ecx .endm .macro calc90 calc_f1_pre 0x58, %ecx, %ebx, %edi, %edx precalc34 %ymm3 calc_f1_post %ecx, %esi, %edx .endm .macro calc91 calc_f1_pre 0x5c, %edx, %ecx, %esi, %eax precalc35 %ymm12 calc_f1_post %edx, %ebx, %eax .endm .macro calc92 calc_f1_pre 0x70, %eax, %edx, %ebx, %edi precalc36 %ymm12 calc_f1_post %eax, %ecx, %edi .endm .macro calc93 calc_f1_pre 0x74, %edi, %eax, %ecx, %esi precalc37 %ymm12 calc_f1_post %edi, %edx, %esi .endm .macro calc94 calc_f1_pre 0x78, %esi, %edi, %edx, %ebx calc_f1_post %esi, %eax, %ebx .endm .macro calc95 calc_f1_pre 0x7c, %ebx, %esi, %eax, %ecx precalc39 %ymm12, 0x40, 0x160 calc_f1_post %ebx, %edi, %ecx .endm .macro calc96 calc_f1_pre 
0x90, %ecx, %ebx, %edi, %edx precalc32 %ymm13, %ymm12 calc_f1_post %ecx, %esi, %edx .endm .macro calc97 calc_f1_pre 0x94, %edx, %ecx, %esi, %eax precalc33 %ymm7, %ymm8 calc_f1_post %edx, %ebx, %eax .endm .macro calc98 calc_f1_pre 0x98, %eax, %edx, %ebx, %edi precalc34 %ymm15 calc_f1_post %eax, %ecx, %edi .endm .macro calc99 calc_f2_pre 0x9c, %edi, %eax, %esi precalc35 %ymm8 calc_f2_post %edi, %edx, %ecx, %esi .endm .macro calc100 calc_f2_pre 0xb0, %esi, %edi, %ebx precalc36 %ymm8 calc_f2_post %esi, %eax, %edx, %ebx .endm .macro calc101 calc_f2_pre 0xb4, %ebx, %esi, %ecx precalc37 %ymm8 calc_f2_post %ebx, %edi, %eax, %ecx .endm .macro calc102 calc_f2_pre 0xb8, %ecx, %ebx, %edx calc_f2_post %ecx, %esi, %edi, %edx .endm .macro calc103 calc_f2_pre 0xbc, %edx, %ecx, %eax precalc39 %ymm8, 0x40, 0x180 calc_f2_post %edx, %ebx, %esi, %eax .endm .macro calc104 calc_f2_pre 0xd0, %eax, %edx, %edi precalc32 %ymm12, %ymm8 calc_f2_post %eax, %ecx, %ebx, %edi .endm .macro calc105 calc_f2_pre 0xd4, %edi, %eax, %esi precalc33 %ymm5, %ymm7 calc_f2_post %edi, %edx, %ecx, %esi .endm .macro calc106 calc_f2_pre 0xd8, %esi, %edi, %ebx precalc34 %ymm14 calc_f2_post %esi, %eax, %edx, %ebx .endm .macro calc107 calc_f2_pre 0xdc, %ebx, %esi, %ecx precalc35 %ymm7 calc_f2_post %ebx, %edi, %eax, %ecx .endm .macro calc108 calc_f2_pre 0xf0, %ecx, %ebx, %edx precalc36 %ymm7 calc_f2_post %ecx, %esi, %edi, %edx .endm .macro calc109 calc_f2_pre 0xf4, %edx, %ecx, %eax precalc37 %ymm7 calc_f2_post %edx, %ebx, %esi, %eax .endm .macro calc110 calc_f2_pre 0xf8, %eax, %edx, %edi calc_f2_post %eax, %ecx, %ebx, %edi .endm .macro calc111 calc_f2_pre 0xfc, %edi, %eax, %esi precalc39 %ymm7, 0x40, 0x1a0 calc_f2_post %edi, %edx, %ecx, %esi .endm .macro calc112 calc_f2_pre 0x110, %esi, %edi, %ebx precalc32 %ymm8, %ymm7 calc_f2_post %esi, %eax, %edx, %ebx .endm .macro calc113 calc_f2_pre 0x114, %ebx, %esi, %ecx precalc33 %ymm3, %ymm5 calc_f2_post %ebx, %edi, %eax, %ecx .endm .macro calc114 calc_f2_pre 0x118, %ecx, %ebx, %edx precalc34 %ymm13 calc_f2_post %ecx, %esi, %edi, %edx .endm .macro calc115 calc_f2_pre 0x11c, %edx, %ecx, %eax precalc35 %ymm5 calc_f2_post %edx, %ebx, %esi, %eax .endm .macro calc116 calc_f2_pre 0x130, %eax, %edx, %edi precalc36 %ymm5 calc_f2_post %eax, %ecx, %ebx, %edi .endm .macro calc117 calc_f2_pre 0x134, %edi, %eax, %esi precalc37 %ymm5 calc_f2_post %edi, %edx, %ecx, %esi .endm .macro calc118 calc_f2_pre 0x138, %esi, %edi, %ebx calc_f2_post %esi, %eax, %edx, %ebx .endm .macro calc119 calc_f3_pre 0x13c, %ecx precalc39 %ymm5, 0x40, 0x1c0 calc_f3_post %ebx, %edi, %eax, %ecx, %esi .endm .macro calc120 calc_f3_pre 0x150, %edx precalc32 %ymm7, %ymm5 calc_f3_post %ecx, %esi, %edi, %edx, %ebx .endm .macro calc121 calc_f3_pre 0x154, %eax precalc33 %ymm15, %ymm3 calc_f3_post %edx, %ebx, %esi, %eax, %ecx .endm .macro calc122 calc_f3_pre 0x158, %edi precalc34 %ymm12 calc_f3_post %eax, %ecx, %ebx, %edi, %edx .endm .macro calc123 calc_f3_pre 0x15c, %esi precalc35 %ymm3 calc_f3_post %edi, %edx, %ecx, %esi, %eax .endm .macro calc124 calc_f3_pre 0x170, %ebx precalc36 %ymm3 calc_f3_post %esi, %eax, %edx, %ebx, %edi .endm .macro calc125 calc_f3_pre 0x174, %ecx precalc37 %ymm3 calc_f3_post %ebx, %edi, %eax, %ecx, %esi .endm .macro calc126 calc_f3_pre 0x178, %edx calc_f3_post %ecx, %esi, %edi, %edx, %ebx .endm .macro calc127 calc_f3_pre 0x17c, %eax precalc39 %ymm3, 0x60, 0x1e0 calc_f3_post %edx, %ebx, %esi, %eax, %ecx .endm .macro calc128 calc_f3_pre 0x190, %edi precalc32 %ymm5, %ymm3 calc_f3_post %eax, %ecx, %ebx, %edi, %edx .endm 
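
/*
 * Note on the \offset(%r15) loads in the calc* macros: the precalc step
 * stores K+w for two 64-byte blocks side by side, conceptually (a sketch,
 * names hypothetical):
 *
 *	struct wk_row {
 *		uint32_t even[4];	// K+w for 4 rounds of the even block
 *		uint32_t odd[4];	// K+w for 4 rounds of the odd block
 *	} wk[20];			// 80 rounds, 4 per row
 *
 * This is why the even-block rounds (calc0-calc79) read offsets 0x00,
 * 0x04, 0x08, 0x0c, 0x20, ... while the odd-block rounds (calc80 onwards)
 * read 0x10, 0x14, 0x18, 0x1c, 0x30, ...  The two scratch regions are
 * swapped by the xchg %r14, %r15 in the main loop below.
 */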
.macro calc129 calc_f3_pre 0x194, %esi precalc33 %ymm14, %ymm15 calc_f3_post %edi, %edx, %ecx, %esi, %eax .endm .macro calc130 calc_f3_pre 0x198, %ebx precalc34 %ymm8 calc_f3_post %esi, %eax, %edx, %ebx, %edi .endm .macro calc131 calc_f3_pre 0x19c, %ecx precalc35 %ymm15 calc_f3_post %ebx, %edi, %eax, %ecx, %esi .endm .macro calc132 calc_f3_pre 0x1b0, %edx precalc36 %ymm15 calc_f3_post %ecx, %esi, %edi, %edx, %ebx .endm .macro calc133 calc_f3_pre 0x1b4, %eax precalc37 %ymm15 calc_f3_post %edx, %ebx, %esi, %eax, %ecx .endm .macro calc134 calc_f3_pre 0x1b8, %edi calc_f3_post %eax, %ecx, %ebx, %edi, %edx .endm .macro calc135 calc_f3_pre 0x1bc, %esi precalc39 %ymm15, 0x60, 0x200 calc_f3_post %edi, %edx, %ecx, %esi, %eax .endm .macro calc136 calc_f3_pre 0x1d0, %ebx precalc32 %ymm3, %ymm15 calc_f3_post %esi, %eax, %edx, %ebx, %edi .endm .macro calc137 calc_f3_pre 0x1d4, %ecx precalc33 %ymm13, %ymm14 calc_f3_post %ebx, %edi, %eax, %ecx, %esi .endm .macro calc138 calc_f3_pre 0x1d8, %edx precalc34 %ymm7 calc_f3_post %ecx, %esi, %edi, %edx, %ebx .endm .macro calc139 calc_f2_pre 0x1dc, %edx, %ecx, %eax precalc35 %ymm14 calc_f2_post %edx, %ebx, %esi, %eax .endm .macro calc140 calc_f2_pre 0x1f0, %eax, %edx, %edi precalc36 %ymm14 calc_f2_post %eax, %ecx, %ebx, %edi .endm .macro calc141 calc_f2_pre 0x1f4, %edi, %eax, %esi precalc37 %ymm14 calc_f2_post %edi, %edx, %ecx, %esi .endm .macro calc142 calc_f2_pre 0x1f8, %esi, %edi, %ebx calc_f2_post %esi, %eax, %edx, %ebx .endm .macro calc143 calc_f2_pre 0x1fc, %ebx, %esi, %ecx precalc39 %ymm14, 0x60, 0x220 calc_f2_post %ebx, %edi, %eax, %ecx .endm .macro calc144 calc_f2_pre 0x210, %ecx, %ebx, %edx precalc32 %ymm15, %ymm14 calc_f2_post %ecx, %esi, %edi, %edx .endm .macro calc145 calc_f2_pre 0x214, %edx, %ecx, %eax precalc33 %ymm12, %ymm13 calc_f2_post %edx, %ebx, %esi, %eax .endm .macro calc146 calc_f2_pre 0x218, %eax, %edx, %edi precalc34 %ymm5 calc_f2_post %eax, %ecx, %ebx, %edi .endm .macro calc147 calc_f2_pre 0x21c, %edi, %eax, %esi precalc35 %ymm13 calc_f2_post %edi, %edx, %ecx, %esi .endm .macro calc148 calc_f2_pre 0x230, %esi, %edi, %ebx precalc36 %ymm13 calc_f2_post %esi, %eax, %edx, %ebx .endm .macro calc149 calc_f2_pre 0x234, %ebx, %esi, %ecx precalc37 %ymm13 calc_f2_post %ebx, %edi, %eax, %ecx .endm .macro calc150 calc_f2_pre 0x238, %ecx, %ebx, %edx calc_f2_post %ecx, %esi, %edi, %edx .endm .macro calc151 calc_f2_pre 0x23c, %edx, %ecx, %eax precalc39 %ymm13, 0x60, 0x240 calc_f2_post %edx, %ebx, %esi, %eax .endm .macro calc152 calc_f2_pre 0x250, %eax, %edx, %edi precalc32 %ymm14, %ymm13 calc_f2_post %eax, %ecx, %ebx, %edi .endm .macro calc153 calc_f2_pre 0x254, %edi, %eax, %esi precalc33 %ymm8, %ymm12 calc_f2_post %edi, %edx, %ecx, %esi .endm .macro calc154 calc_f2_pre 0x258, %esi, %edi, %ebx precalc34 %ymm3 calc_f2_post %esi, %eax, %edx, %ebx .endm .macro calc155 calc_f2_pre 0x25c, %ebx, %esi, %ecx precalc35 %ymm12 calc_f2_post %ebx, %edi, %eax, %ecx .endm .macro calc156 calc_f2_pre 0x270, %ecx, %ebx, %edx precalc36 %ymm12 calc_f2_post %ecx, %esi, %edi, %edx .endm .macro calc157 calc_f2_pre 0x274, %edx, %ecx, %eax precalc37 %ymm12 calc_f2_post %edx, %ebx, %esi, %eax .endm .macro calc158 calc_f2_pre 0x278, %eax, %edx, %edi calc_f2_post %eax, %ecx, %ebx, %edi .endm .macro calc159 add 0x27c(%r15), %esi add %eax, %esi rorx $0x1b, %edi, %r12d precalc39 %ymm12, 0x60, 0x260 add %r12d, %esi .endm // sha1block(SHA1_CTX, buf, len) ENTRY(_libmd_sha1block_avx2) push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 sub $1408+8, %rsp and $~63, %rdx lea 
k_xmm_ar(%rip), %r8 mov %rdi, %r9 mov %rsi, %r10 lea 64(%rsi), %r13 lea 64(%rsi, %rdx), %r11 cmp %r11, %r13 cmovae %r8, %r13 vmovdqu bswap_shufb_ctl(%rip), %ymm10 mov (%r9), %ecx mov 4(%r9), %esi mov 8(%r9), %edi mov 12(%r9), %eax mov 16(%r9), %edx mov %rsp, %r14 lea 2*4*80+32(%rsp), %r15 precalc // precalc WK for first 2 blocks xchg %r14, %r15 // this is unrolled .Loop: cmp %r8, %r10 // we use the value of R8 (set below) // as a signal of the last block jne .Lbegin add $1408+8, %rsp pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx vzeroupper ret .Lbegin: calc0 calc1 calc2 calc3 calc4 calc5 calc6 calc7 calc8 calc9 calc10 calc11 calc12 calc13 calc14 calc15 calc16 calc17 calc18 calc19 calc20 calc21 calc22 calc23 calc24 calc25 calc26 calc27 calc28 calc29 calc30 calc31 calc32 calc33 calc34 calc35 calc36 calc37 calc38 calc39 calc40 calc41 calc42 calc43 calc44 calc45 calc46 calc47 calc48 calc49 calc50 calc51 calc52 calc53 calc54 calc55 calc56 calc57 calc58 calc59 add $128, %r10 // move to the next even-64-byte block cmp %r11, %r10 // is the current block the last one? cmovae %r8, %r10 // signal the last iteration smartly calc60 calc61 calc62 calc63 calc64 calc65 calc66 calc67 calc68 calc69 calc70 calc71 calc72 calc73 calc74 calc75 calc76 calc77 calc78 calc79 update_hash %eax, %edx, %ebx, %esi, %edi cmp %r8, %r10 // is the current block the last one? je .Loop mov %edx, %ecx calc80 calc81 calc82 calc83 calc84 calc85 calc86 calc87 calc88 calc89 calc90 calc91 calc92 calc93 calc94 calc95 calc96 calc97 calc98 calc99 calc100 calc101 calc102 calc103 calc104 calc105 calc106 calc107 calc108 calc109 calc110 calc111 calc112 calc113 calc114 calc115 calc116 calc117 calc118 calc119 calc120 calc121 calc122 calc123 calc124 calc125 calc126 calc127 calc128 calc129 calc130 calc131 calc132 calc133 calc134 calc135 calc136 calc137 calc138 calc139 add $128, %r13 // move to the next even-64-byte block cmp %r11, %r13 // is the current block the last one? cmovae %r8, %r10 calc140 calc141 calc142 calc143 calc144 calc145 calc146 calc147 calc148 calc149 calc150 calc151 calc152 calc153 calc154 calc155 calc156 calc157 calc158 calc159 update_hash %esi, %edi, %edx, %ecx, %ebx mov %esi, %r12d // reset state for AVX2 reg permutation mov %edi, %esi mov %edx, %edi mov %ebx, %edx mov %ecx, %eax mov %r12d, %ecx xchg %r14, %r15 jmp .Loop END(_libmd_sha1block_avx2) .section .rodata .balign 32 k_xmm_ar: .fill 8, 4, 0x5a827999 .fill 8, 4, 0x6ed9eba1 .fill 8, 4, 0x8f1bbcdc .fill 8, 4, 0xca62c1d6 .size k_xmm_ar, .-k_xmm_ar bswap_shufb_ctl: .4byte 0x00010203 .4byte 0x04050607 .4byte 0x08090a0b .4byte 0x0c0d0e0f .4byte 0x00010203 .4byte 0x04050607 .4byte 0x08090a0b .4byte 0x0c0d0e0f .size bswap_shufb_ctl, .-bswap_shufb_ctl /* * SHA1 implementation using the Intel SHA extensions (SHANI). * * Imlemented according to the Intel white paper * * S. Gulley, V. Gopal, K. Yap, W. Feghali, J. Guilford, * G. Wolrich: "Intel SHA Extensions: new instruction supporting * the Secure Hash Algorithm on IntelĀ® architecture processors", * July 2013. */ // sha1block(SHA1_CTX, buf, len) ENTRY(_libmd_sha1block_shani) and $~63, %rdx // round length to block-size multiple lea (%rsi, %rdx, 1), %rcx // end pointer test %rdx, %rdx // nothing to do? 
je 1f // if so, terminate immediately movdqu (%rdi), %xmm6 // h0, h1, h2, h3 pxor %xmm7, %xmm7 pshufd $0x1b, %xmm6, %xmm6 // h3, h2, h1, h0 pinsrd $3, 16(%rdi), %xmm7 // h4 in the highest word of xmm7 movdqu shuf_mask(%rip), %xmm4 // main loop 0: movdqa %xmm6, %xmm8 // stash ABCD movdqa %xmm7, %xmm9 // stash E // rounds 0--3 movdqu 0*16(%rsi), %xmm0 // load first message block pshufb %xmm4, %xmm0 // and byte-swap paddd %xmm0, %xmm7 // E += w[0] movdqa %xmm6, %xmm5 // E' = A sha1rnds4 $0, %xmm7, %xmm6 // perform rounds 0--3 // rounds 4--7 movdqu 1*16(%rsi), %xmm1 pshufb %xmm4, %xmm1 sha1nexte %xmm1, %xmm5 movdqa %xmm6, %xmm7 sha1rnds4 $0, %xmm5, %xmm6 sha1msg1 %xmm1, %xmm0 // rounds 8--11 movdqu 2*16(%rsi), %xmm2 pshufb %xmm4, %xmm2 sha1nexte %xmm2, %xmm7 movdqa %xmm6, %xmm5 sha1rnds4 $0, %xmm7, %xmm6 sha1msg1 %xmm2, %xmm1 pxor %xmm2, %xmm0 .macro midround msg3, msg0, msg1, msg2, e1, e0, k sha1nexte \msg3, \e1 movdqa %xmm6, \e0 sha1msg2 \msg3, \msg0 sha1rnds4 $\k, \e1, %xmm6 sha1msg1 \msg3, \msg2 pxor \msg3, \msg1 .endm movdqu 3*16(%rsi), %xmm3 // load third message block pshufb %xmm4, %xmm3 add $4*16, %rsi midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 0 // 12--15 midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 0 // 16--19 midround %xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 1 // 20--23 midround %xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 1 // 24--27 midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 1 // 28--31 midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 1 // 32--35 midround %xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 1 // 36--39 midround %xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 2 // 40--43 midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 2 // 44--47 midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 2 // 48--51 midround %xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 2 // 52--55 midround %xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 2 // 56--59 midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 3 // 60--63 midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 3 // 64--67 // rounds 68--71 sha1nexte %xmm1, %xmm5 movdqa %xmm6, %xmm7 sha1msg2 %xmm1, %xmm2 sha1rnds4 $3, %xmm5, %xmm6 pxor %xmm1, %xmm3 // rounds 72--75 sha1nexte %xmm2, %xmm7 movdqa %xmm6, %xmm5 sha1msg2 %xmm2, %xmm3 sha1rnds4 $3, %xmm7, %xmm6 // rounds 76--79 sha1nexte %xmm3, %xmm5 movdqa %xmm6, %xmm7 sha1rnds4 $3, %xmm5, %xmm6 sha1nexte %xmm9, %xmm7 // add saved E paddd %xmm8, %xmm6 // add saved ABCD cmp %rsi, %rcx // end reached? jne 0b pshufd $0x1b, %xmm6, %xmm6 // restore order of h0--h3 movdqu %xmm6, (%rdi) // write h0--h3 pextrd $3, %xmm7, 16(%rdi) // write h4 1: ret END(_libmd_sha1block_shani) .section .rodata .balign 16 shuf_mask: .8byte 0x08090a0b0c0d0e0f .8byte 0x0001020304050607 .size shuf_mask, .-shuf_mask .section .note.GNU-stack,"",%progbits
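
/*
 * Usage sketch (not part of this file): all three entry points share the
 * calling convention noted above, sha1block(SHA1_CTX, buf, len), i.e. a
 * context whose first five 32-bit words hold h0..h4, a byte buffer, and a
 * byte count that is rounded down to a multiple of 64 internally.
 * Hypothetical C declarations, assuming the SysV amd64 ABI:
 *
 *	extern void _libmd_sha1block_scalar(void *ctx, const void *buf, size_t len);
 *	extern void _libmd_sha1block_avx2(void *ctx, const void *buf, size_t len);
 *	extern void _libmd_sha1block_shani(void *ctx, const void *buf, size_t len);
 */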