/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2s digest algorithm, ARM scalar implementation
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	// Registers used to hold message words temporarily.  There aren't
	// enough ARM registers to hold the whole message block, so we have to
	// load the words on-demand.
	M_0		.req	r12
	M_1		.req	r14

// The BLAKE2s initialization vector
.Lblake2s_IV:
	.word	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
	.word	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19

// Load the two consecutive 32-bit words at [src + offset] and
// [src + offset + 4] into registers a and b.  A single 'ldrd' is used when
// __LINUX_ARM_ARCH__ >= 6; otherwise two separate 'ldr' instructions are used.
.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd		\a, \b, [\src, #\offset]
#else
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#endif
.endm

// Store registers a and b to the two consecutive 32-bit words at
// [dst + offset] and [dst + offset + 4].  A single 'strd' is used when
// __LINUX_ARM_ARCH__ >= 6; otherwise two separate 'str' instructions are used.
.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd		\a, \b, [\dst, #\offset]
#else
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#endif
.endm

// Convert the little-endian 32-bit word in register a to CPU byte order.
// This only emits code on big-endian builds (__ARMEB__), where it invokes
// rev_l with tmp as its second operand; otherwise it expands to nothing.
// NOTE(review): rev_l is not defined in this file -- presumably it comes from
// <asm/assembler.h> and byte-reverses \a using \tmp as scratch; confirm.
.macro _le32_bswap	a, tmp
#ifdef __ARMEB__
	rev_l		\a, \tmp
#endif
.endm

// Apply _le32_bswap to each of the eight registers a-h in turn, sharing the
// single scratch register tmp between all of them.
.macro _le32_bswap_8x	a, b, c, d, e, f, g, h,  tmp
	_le32_bswap	\a, \tmp
	_le32_bswap	\b, \tmp
	_le32_bswap	\c, \tmp
	_le32_bswap	\d, \tmp
	_le32_bswap	\e, \tmp
	_le32_bswap	\f, \tmp
	_le32_bswap	\g, \tmp
	_le32_bswap	\h, \tmp
.endm

// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
// columns/diagonals.  s0-s1 are the word offsets to the message words the first
// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
//
// Note that to save instructions, the rotations don't happen when the
// pseudocode says they should, but rather they are delayed until the values are
// used.  See the comment above _blake2s_round().
.macro _blake2s_quarterround  a0, b0, c0, d0,  a1, b1, c1, d1,  s0, s1, s2, s3

	// Clobbers M_0 and M_1.  Reads the assembler symbols 'brot' and
	// 'drot', which give the right-rotation still owed to the incoming
	// b and d registers.  On exit the b registers are owed a rotation of
	// 7 and the d registers a rotation of 8 (see the final two steps).
	// The two columns/diagonals are interleaved instruction by
	// instruction; they do not depend on each other.

	ldr		M_0, [sp, #32 + 4 * \s0]
	ldr		M_1, [sp, #32 + 4 * \s2]

	// a += b + m[blake2s_sigma[r][2*i + 0]];
	add		\a0, \a0, \b0, ror #brot
	add		\a1, \a1, \b1, ror #brot
	add		\a0, \a0, M_0
	add		\a1, \a1, M_1

	// d = ror32(d ^ a, 16);
	// (only the xor is done here; the rotate by 16 is applied on next use)
	eor		\d0, \a0, \d0, ror #drot
	eor		\d1, \a1, \d1, ror #drot

	// c += d;
	add		\c0, \c0, \d0, ror #16
	add		\c1, \c1, \d1, ror #16

	// b = ror32(b ^ c, 12);
	// (only the xor is done here; the rotate by 12 is applied on next use)
	eor		\b0, \c0, \b0, ror #brot
	eor		\b1, \c1, \b1, ror #brot

	ldr		M_0, [sp, #32 + 4 * \s1]
	ldr		M_1, [sp, #32 + 4 * \s3]

	// a += b + m[blake2s_sigma[r][2*i + 1]];
	add		\a0, \a0, \b0, ror #12
	add		\a1, \a1, \b1, ror #12
	add		\a0, \a0, M_0
	add		\a1, \a1, M_1

	// d = ror32(d ^ a, 8);
	// (the pending rotate by 16 is applied; the rotate by 8 stays owed)
	eor		\d0, \a0, \d0, ror#16
	eor		\d1, \a1, \d1, ror#16

	// c += d;
	add		\c0, \c0, \d0, ror#8
	add		\c1, \c1, \d1, ror#8

	// b = ror32(b ^ c, 7);
	// (the pending rotate by 12 is applied; the rotate by 7 stays owed)
	eor		\b0, \c0, \b0, ror#12
	eor		\b1, \c1, \b1, ror#12
.endm
// Execute one round of BLAKE2s by updating the state matrix v[0..15].  v[0..9]
// are in r0..r9.  The stack pointer points to 8 bytes of scratch space for
// spilling v[8..9], then to v[10..15], then to the message block.  r10-r12 and
// r14 are free to use.  The macro arguments s0-s15 give the order in which the
// message words are used in this round.
//
// All rotates are performed using the implicit rotate operand accepted by the
// 'add' and 'eor' instructions.  This is faster than using explicit rotate
// instructions.  To make this work, we allow the values in the second and last
// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
// wrong rotation amount.  The rotation amount is then fixed up just in time
// when the values are used.  'brot' is the number of bits the values in row 'b'
// need to be rotated right to arrive at the correct values, and 'drot'
// similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
// that they end up as (7, 8) after every round.
.macro _blake2s_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15

	// Mix first two columns:
	// (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
	__ldrd		r10, r11, sp, 16	// load v[12] and v[13]
	_blake2s_quarterround	r0, r4, r8, r10,  r1, r5, r9, r11, \
				\s0, \s1, \s2, \s3
	__strd		r8, r9, sp, 0		// spill v[8] and v[9]
	__strd		r10, r11, sp, 16	// store v[12] and v[13]

	// Mix second two columns:
	// (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
	__ldrd		r8, r9, sp, 8		// load v[10] and v[11]
	__ldrd		r10, r11, sp, 24	// load v[14] and v[15]
	_blake2s_quarterround	r2, r6, r8, r10,  r3, r7, r9, r11, \
				\s4, \s5, \s6, \s7
	str		r10, [sp, #24]		// store v[14]
	// v[10], v[11], and v[15] are used below, so no need to store them yet.

	// Every 'b' and 'd' value has now been through one quarter-round, so
	// from here on they are all owed rotations of 7 and 8 respectively.
	.set brot, 7
	.set drot, 8

	// Mix first two diagonals:
	// (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
	ldr		r10, [sp, #16]		// load v[12]
	_blake2s_quarterround	r0, r5, r8, r11,  r1, r6, r9, r10, \
				\s8, \s9, \s10, \s11
	__strd		r8, r9, sp, 8		// store v[10] and v[11]
	str		r11, [sp, #28]		// store v[15]
	str		r10, [sp, #16]		// store v[12]

	// Mix second two diagonals:
	// (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
	__ldrd		r8, r9, sp, 0		// load v[8] and v[9]
	__ldrd		r10, r11, sp, 20	// load v[13] and v[14]
	_blake2s_quarterround	r2, r7, r8, r10,  r3, r4, r9, r11, \
				\s12, \s13, \s14, \s15
	__strd		r10, r11, sp, 20	// store v[13] and v[14]
.endm

//
// void blake2s_compress(struct blake2s_state *state,
//			 const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_state are used:
//	u32 h[8];	(inout)
//	u32 t[2];	(inout)
//	u32 f[2];	(in)
//
// Stack layout while the rounds execute (offsets from sp):
//	  0 ..   7	scratch space for spilling v[8..9]
//	  8 ..  31	v[10..15]
//	 32 ..  95	copy of the message block (16 words, CPU byte order)
//	 96 .. 107	saved r0-r2 == (state, block, nblocks)
//	108 .. 143	saved r4-r11 and lr
//
	.align		5
ENTRY(blake2s_compress)
	// NOTE(review): an even register count presumably keeps sp 8-byte
	// aligned for the ldrd/strd accesses made relative to sp -- confirm.
	push		{r0-r2,r4-r11,lr}	// keep this an even number

.Lnext_block:
	// r0 is 'state'
	// r1 is 'block'
	// r3 is 'inc'

	// Load and increment the counter t[0..1].
	__ldrd		r10, r11, r0, 32
	adds		r10, r10, r3
	adc		r11, r11, #0		// propagate carry into t[1]
	__strd		r10, r11, r0, 32

	// _blake2s_round is very short on registers, so copy the message block
	// to the stack to save a register during the rounds.  This also has the
	// advantage that misalignment only needs to be dealt with in one place.
	sub		sp, sp, #64
	mov		r12, sp
	tst		r1, #3
	bne		.Lcopy_block_misaligned
	ldmia		r1!, {r2-r9}
	_le32_bswap_8x	r2, r3, r4, r5, r6, r7, r8, r9,  r14
	stmia		r12!, {r2-r9}
	ldmia		r1!, {r2-r9}
	_le32_bswap_8x	r2, r3, r4, r5, r6, r7, r8, r9,  r14
	stmia		r12, {r2-r9}
.Lcopy_block_done:
	// sp + 64 is the start of the saved registers here, so sp + 68 is the
	// slot holding the saved 'block' argument (r1).
	str		r1, [sp, #68]		// Update message pointer

	// Calculate v[8..15].  Push v[10..15] onto the stack, and leave space
	// for spilling v[8..9].  Leave v[8..9] in r8-r9.
	mov		r14, r0			// r14 = state
	adr		r12, .Lblake2s_IV
	ldmia		r12!, {r8-r9}		// load IV[0..1]
	__ldrd		r0, r1, r14, 40		// load f[0..1]
	ldm		r12, {r2-r7}		// load IV[2..7]
	eor		r4, r4, r10		// v[12] = IV[4] ^ t[0]
	eor		r5, r5, r11		// v[13] = IV[5] ^ t[1]
	eor		r6, r6, r0		// v[14] = IV[6] ^ f[0]
	eor		r7, r7, r1		// v[15] = IV[7] ^ f[1]
	push		{r2-r7}			// push v[10..15]
	sub		sp, sp, #8		// leave space for v[8..9]

	// Load h[0..7] == v[0..7].
	ldm		r14, {r0-r7}

	// Execute the rounds.  Each round is provided the order in which it
	// needs to use the message words.
	.set brot, 0
	.set drot, 0
	_blake2s_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2s_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2s_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2s_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2s_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2s_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2s_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2s_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2s_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2s_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	// Rows 'b' (r4-r7) and 'd' (v[12..15]) are still owed their final
	// rotations, so those are applied via 'ror #brot' / 'ror #drot' below.
	ldr		r14, [sp, #96]		// r14 = &h[0]
	add		sp, sp, #8		// v[8..9] are already loaded.
	pop		{r10-r11}		// load v[10..11]
	eor		r0, r0, r8
	eor		r1, r1, r9
	eor		r2, r2, r10
	eor		r3, r3, r11
	ldm		r14, {r8-r11}		// load h[0..3]
	eor		r0, r0, r8
	eor		r1, r1, r9
	eor		r2, r2, r10
	eor		r3, r3, r11
	stmia		r14!, {r0-r3}		// store new h[0..3]
	ldm		r14, {r0-r3}		// load old h[4..7]
	pop		{r8-r11}		// load v[12..15]
	eor		r0, r0, r4, ror #brot
	eor		r1, r1, r5, ror #brot
	eor		r2, r2, r6, ror #brot
	eor		r3, r3, r7, ror #brot
	eor		r0, r0, r8, ror #drot
	eor		r1, r1, r9, ror #drot
	eor		r2, r2, r10, ror #drot
	eor		r3, r3, r11, ror #drot
	add		sp, sp, #64		// skip copy of message block
	stm		r14, {r0-r3}		// store new h[4..7]

	// Advance to the next block, if there is one.  Note that if there are
	// multiple blocks, then 'inc' (the counter increment amount) must be
	// 64.  So we can simply set it to 64 without re-loading it.
	ldm		sp, {r0, r1, r2}	// load (state, block, nblocks)
	mov		r3, #64			// set 'inc'
	subs		r2, r2, #1		// nblocks--
	str		r2, [sp, #8]
	bne		.Lnext_block		// nblocks != 0?

	pop		{r0-r2,r4-r11,pc}

	// The next message block (pointed to by r1) isn't 4-byte aligned, so it
	// can't be loaded using ldmia.  Copy it to the stack buffer (pointed to
	// by r12) using an alternative method.  r2-r9 are free to use.
.Lcopy_block_misaligned:
	mov		r2, #64			// r2 = bytes left to copy
1:
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	ldr		r3, [r1], #4
	_le32_bswap	r3, r4
#else
	// Assemble the little-endian word one byte at a time.
	ldrb		r3, [r1, #0]
	ldrb		r4, [r1, #1]
	ldrb		r5, [r1, #2]
	ldrb		r6, [r1, #3]
	add		r1, r1, #4
	orr		r3, r3, r4, lsl #8
	orr		r3, r3, r5, lsl #16
	orr		r3, r3, r6, lsl #24
#endif
	subs		r2, r2, #4
	str		r3, [r12], #4		// 'str' leaves the flags from 'subs' intact
	bne		1b
	b		.Lcopy_block_done
ENDPROC(blake2s_compress)