/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2b digest algorithm optimized with ARM NEON instructions. On ARM
 * processors that have NEON support but not the ARMv8 Crypto Extensions,
 * typically this BLAKE2b implementation is much faster than the SHA-2 family
 * and slightly faster than SHA-1.
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

        .text
        .fpu            neon

        // The arguments to blake2b_compress_neon()
        CTX             .req    r0
        DATA            .req    r1
        NBLOCKS         .req    r2
        INC             .req    r3

        // Pointers to the rotation tables
        ROR24_TABLE     .req    r4
        ROR16_TABLE     .req    r5

        // The original stack pointer
        ORIG_SP         .req    r6

        // NEON registers which contain the message words of the current block.
        // M_0-M_3 are occasionally used for other purposes too.
        M_0             .req    d16
        M_1             .req    d17
        M_2             .req    d18
        M_3             .req    d19
        M_4             .req    d20
        M_5             .req    d21
        M_6             .req    d22
        M_7             .req    d23
        M_8             .req    d24
        M_9             .req    d25
        M_10            .req    d26
        M_11            .req    d27
        M_12            .req    d28
        M_13            .req    d29
        M_14            .req    d30
        M_15            .req    d31

        .align          4
        // Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
        // instruction. This is the most efficient way to implement these
        // rotation amounts with NEON. (On Cortex-A53 it's the same speed as
        // vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
.Lror24_table:
        .byte           3, 4, 5, 6, 7, 0, 1, 2
.Lror16_table:
        .byte           2, 3, 4, 5, 6, 7, 0, 1
        // The BLAKE2b initialization vector
.Lblake2b_IV:
        .quad           0x6a09e667f3bcc908, 0xbb67ae8584caa73b
        .quad           0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
        .quad           0x510e527fade682d1, 0x9b05688c2b3e6c1f
        .quad           0x1f83d9abfb41bd6b, 0x5be0cd19137e2179

// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack
// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
// (M_0-M_3), so that they can be reloaded if they are used as temporary
// registers. The macro arguments s0-s15 give the order in which the message
// words are used in this round. 'final' is 1 if this is the final round.
.macro  _blake2b_round  s0, s1, s2, s3, s4, s5, s6, s7, \
                        s8, s9, s10, s11, s12, s13, s14, s15, final=0

        // Mix the columns:
        // (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
        // (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).

        // a += b + m[blake2b_sigma[r][2*i + 0]];
        vadd.u64        q0, q0, q2
        vadd.u64        q1, q1, q3
        vadd.u64        d0, d0, M_\s0
        vadd.u64        d1, d1, M_\s2
        vadd.u64        d2, d2, M_\s4
        vadd.u64        d3, d3, M_\s6

        // d = ror64(d ^ a, 32);
        veor            q6, q6, q0
        veor            q7, q7, q1
        vrev64.32       q6, q6
        vrev64.32       q7, q7

        // c += d;
        vadd.u64        q4, q4, q6
        vadd.u64        q5, q5, q7

        // b = ror64(b ^ c, 24);
        vld1.8          {M_0}, [ROR24_TABLE, :64]
        veor            q2, q2, q4
        veor            q3, q3, q5
        vtbl.8          d4, {d4}, M_0
        vtbl.8          d5, {d5}, M_0
        vtbl.8          d6, {d6}, M_0
        vtbl.8          d7, {d7}, M_0

        // a += b + m[blake2b_sigma[r][2*i + 1]];
        //
        // M_0 got clobbered above, so we have to reload it if any of the four
        // message words this step needs happens to be M_0. Otherwise we don't
        // need to reload it here, as it will just get clobbered again below.
.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
        vld1.8          {M_0}, [sp, :64]
.endif
        vadd.u64        q0, q0, q2
        vadd.u64        q1, q1, q3
        vadd.u64        d0, d0, M_\s1
        vadd.u64        d1, d1, M_\s3
        vadd.u64        d2, d2, M_\s5
        vadd.u64        d3, d3, M_\s7

        // d = ror64(d ^ a, 16);
        vld1.8          {M_0}, [ROR16_TABLE, :64]
        veor            q6, q6, q0
        veor            q7, q7, q1
        vtbl.8          d12, {d12}, M_0
        vtbl.8          d13, {d13}, M_0
        vtbl.8          d14, {d14}, M_0
        vtbl.8          d15, {d15}, M_0

        // c += d;
        vadd.u64        q4, q4, q6
        vadd.u64        q5, q5, q7

        // b = ror64(b ^ c, 63);
        //
        // This rotation amount isn't a multiple of 8, so it has to be
        // implemented using a pair of shifts, which requires temporary
        // registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
        veor            q8, q2, q4
        veor            q9, q3, q5
        vshr.u64        q2, q8, #63
        vshr.u64        q3, q9, #63
        vsli.u64        q2, q8, #1
        vsli.u64        q3, q9, #1
        vld1.8          {q8-q9}, [sp, :256]

        // Mix the diagonals:
        // (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
        // (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
        //
        // There are two possible ways to do this: use 'vext' instructions to
        // shift the rows of the matrix so that the diagonals become columns,
        // and undo it afterwards; or just use 64-bit operations on 'd'
        // registers instead of 128-bit operations on 'q' registers. We use the
        // latter approach, as it performs much better on Cortex-A7.

        // a += b + m[blake2b_sigma[r][2*i + 0]];
        vadd.u64        d0, d0, d5
        vadd.u64        d1, d1, d6
        vadd.u64        d2, d2, d7
        vadd.u64        d3, d3, d4
        vadd.u64        d0, d0, M_\s8
        vadd.u64        d1, d1, M_\s10
        vadd.u64        d2, d2, M_\s12
        vadd.u64        d3, d3, M_\s14

        // d = ror64(d ^ a, 32);
        veor            d15, d15, d0
        veor            d12, d12, d1
        veor            d13, d13, d2
        veor            d14, d14, d3
        vrev64.32       d15, d15
        vrev64.32       d12, d12
        vrev64.32       d13, d13
        vrev64.32       d14, d14

        // c += d;
        vadd.u64        d10, d10, d15
        vadd.u64        d11, d11, d12
        vadd.u64        d8, d8, d13
        vadd.u64        d9, d9, d14

        // b = ror64(b ^ c, 24);
        vld1.8          {M_0}, [ROR24_TABLE, :64]
        veor            d5, d5, d10
        veor            d6, d6, d11
        veor            d7, d7, d8
        veor            d4, d4, d9
        vtbl.8          d5, {d5}, M_0
        vtbl.8          d6, {d6}, M_0
        vtbl.8          d7, {d7}, M_0
        vtbl.8          d4, {d4}, M_0

        // a += b + m[blake2b_sigma[r][2*i + 1]];
.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
        vld1.8          {M_0}, [sp, :64]
.endif
        vadd.u64        d0, d0, d5
        vadd.u64        d1, d1, d6
        vadd.u64        d2, d2, d7
        vadd.u64        d3, d3, d4
        vadd.u64        d0, d0, M_\s9
        vadd.u64        d1, d1, M_\s11
        vadd.u64        d2, d2, M_\s13
        vadd.u64        d3, d3, M_\s15

        // d = ror64(d ^ a, 16);
        vld1.8          {M_0}, [ROR16_TABLE, :64]
        veor            d15, d15, d0
        veor            d12, d12, d1
        veor            d13, d13, d2
        veor            d14, d14, d3
        vtbl.8          d12, {d12}, M_0
        vtbl.8          d13, {d13}, M_0
        vtbl.8          d14, {d14}, M_0
        vtbl.8          d15, {d15}, M_0

        // c += d;
        vadd.u64        d10, d10, d15
        vadd.u64        d11, d11, d12
        vadd.u64        d8, d8, d13
        vadd.u64        d9, d9, d14

        // b = ror64(b ^ c, 63);
        veor            d16, d4, d9
        veor            d17, d5, d10
        veor            d18, d6, d11
        veor            d19, d7, d8
        vshr.u64        q2, q8, #63
        vshr.u64        q3, q9, #63
        vsli.u64        q2, q8, #1
        vsli.u64        q3, q9, #1
        // Reloading q8-q9 can be skipped on the final round.
.if ! \final
        vld1.8          {q8-q9}, [sp, :256]
.endif
.endm

//
// void blake2b_compress_neon(struct blake2b_ctx *ctx,
//                            const u8 *data, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2b_ctx are used:
//      u64 h[8];       (inout)
//      u64 t[2];       (inout)
//      u64 f[2];       (in)
//
        .align          5
ENTRY(blake2b_compress_neon)
        push            {r4-r10}

        // Allocate a 32-byte stack buffer that is 32-byte aligned.
        mov             ORIG_SP, sp
        sub             ip, sp, #32
        bic             ip, ip, #31
        mov             sp, ip

        adr             ROR24_TABLE, .Lror24_table
        adr             ROR16_TABLE, .Lror16_table

        mov             ip, CTX
        vld1.64         {q0-q1}, [ip]!          // Load h[0..3]
        vld1.64         {q2-q3}, [ip]!          // Load h[4..7]
.Lnext_block:
        adr             r10, .Lblake2b_IV
        vld1.64         {q14-q15}, [ip]         // Load t[0..1] and f[0..1]
        vld1.64         {q4-q5}, [r10]!         // Load IV[0..3]
        vmov            r7, r8, d28             // Copy t[0] to (r7, r8)
        vld1.64         {q6-q7}, [r10]          // Load IV[4..7]
        adds            r7, r7, INC             // Increment counter
        bcs             .Lslow_inc_ctr
        vmov.i32        d28[0], r7
        vst1.64         {d28}, [ip]             // Update t[0]
.Linc_ctr_done:

        // Load the next message block and finish initializing the state matrix
        // 'v'. Fortunately, there are exactly enough NEON registers to fit the
        // entire state matrix in q0-q7 and the entire message block in q8-15.
        //
        // However, _blake2b_round also needs some extra registers for rotates,
        // so we have to spill some registers. It's better to spill the message
        // registers than the state registers, as the message doesn't change.
        // Therefore we store a copy of the first 32 bytes of the message block
        // (q8-q9) in an aligned buffer on the stack so that they can be
        // reloaded when needed. (We could just reload directly from the
        // message buffer, but it's faster to use aligned loads.)
        vld1.8          {q8-q9}, [DATA]!
        veor            q6, q6, q14     // v[12..13] = IV[4..5] ^ t[0..1]
        vld1.8          {q10-q11}, [DATA]!
        veor            q7, q7, q15     // v[14..15] = IV[6..7] ^ f[0..1]
        vld1.8          {q12-q13}, [DATA]!
        vst1.8          {q8-q9}, [sp, :256]
        mov             ip, CTX
        vld1.8          {q14-q15}, [DATA]!

        // Execute the rounds. Each round is provided the order in which it
        // needs to use the message words.
        _blake2b_round  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        _blake2b_round  14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
        _blake2b_round  11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
        _blake2b_round  7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
        _blake2b_round  9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
        _blake2b_round  2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
        _blake2b_round  12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
        _blake2b_round  13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
        _blake2b_round  6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
        _blake2b_round  10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
        _blake2b_round  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        _blake2b_round  14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
                        final=1

        // Fold the final state matrix into the hash chaining value:
        //
        //      for (i = 0; i < 8; i++)
        //              h[i] ^= v[i] ^ v[i + 8];
        //
        vld1.64         {q8-q9}, [ip]!          // Load old h[0..3]
        veor            q0, q0, q4              // v[0..1] ^= v[8..9]
        veor            q1, q1, q5              // v[2..3] ^= v[10..11]
        vld1.64         {q10-q11}, [ip]         // Load old h[4..7]
        veor            q2, q2, q6              // v[4..5] ^= v[12..13]
        veor            q3, q3, q7              // v[6..7] ^= v[14..15]
        veor            q0, q0, q8              // v[0..1] ^= h[0..1]
        veor            q1, q1, q9              // v[2..3] ^= h[2..3]
        mov             ip, CTX
        subs            NBLOCKS, NBLOCKS, #1    // nblocks--
        vst1.64         {q0-q1}, [ip]!          // Store new h[0..3]
        veor            q2, q2, q10             // v[4..5] ^= h[4..5]
        veor            q3, q3, q11             // v[6..7] ^= h[6..7]
        vst1.64         {q2-q3}, [ip]!          // Store new h[4..7]

        // Advance to the next block, if there is one.
        bne             .Lnext_block            // nblocks != 0?

        mov             sp, ORIG_SP
        pop             {r4-r10}
        mov             pc, lr

.Lslow_inc_ctr:
        // Handle the case where the counter overflowed its low 32 bits, by
        // carrying the overflow bit into the full 128-bit counter.
        vmov            r9, r10, d29
        adcs            r8, r8, #0
        adcs            r9, r9, #0
        adc             r10, r10, #0
        vmov            d28, r7, r8
        vmov            d29, r9, r10
        vst1.64         {q14}, [ip]             // Update t[0] and t[1]
        b               .Linc_ctr_done
ENDPROC(blake2b_compress_neon)
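
/*
 * Reference note (editorial addition, not used by the build): below is a
 * minimal scalar C sketch of what one _blake2b_round invocation computes,
 * i.e. the BLAKE2b G function applied to the four columns and then the four
 * diagonals of v[0..15], matching the "a += b + m[...]" / ror64 comments in
 * the macro above.  The names ror64_ref, G_REF, and blake2b_round_ref are
 * illustrative only and do not exist in this file or in the kernel glue code;
 * s[0..15] corresponds to the s0-s15 arguments of _blake2b_round.
 *
 *	#include <stdint.h>
 *	typedef uint64_t u64;
 *
 *	static u64 ror64_ref(u64 x, unsigned int n)
 *	{
 *		return (x >> n) | (x << (64 - n));
 *	}
 *
 *	#define G_REF(a, b, c, d, x, y) do {			\
 *		a += b + x;  d = ror64_ref(d ^ a, 32);		\
 *		c += d;      b = ror64_ref(b ^ c, 24);		\
 *		a += b + y;  d = ror64_ref(d ^ a, 16);		\
 *		c += d;      b = ror64_ref(b ^ c, 63);		\
 *	} while (0)
 *
 *	static void blake2b_round_ref(u64 v[16], const u64 m[16],
 *				      const unsigned int s[16])
 *	{
 *		// Mix the columns.
 *		G_REF(v[0], v[4], v[8],  v[12], m[s[0]],  m[s[1]]);
 *		G_REF(v[1], v[5], v[9],  v[13], m[s[2]],  m[s[3]]);
 *		G_REF(v[2], v[6], v[10], v[14], m[s[4]],  m[s[5]]);
 *		G_REF(v[3], v[7], v[11], v[15], m[s[6]],  m[s[7]]);
 *		// Mix the diagonals.
 *		G_REF(v[0], v[5], v[10], v[15], m[s[8]],  m[s[9]]);
 *		G_REF(v[1], v[6], v[11], v[12], m[s[10]], m[s[11]]);
 *		G_REF(v[2], v[7], v[8],  v[13], m[s[12]], m[s[13]]);
 *		G_REF(v[3], v[4], v[9],  v[14], m[s[14]], m[s[15]]);
 *	}
 */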