/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2b digest algorithm, NEON accelerated
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	.text
	.fpu		neon

	// The arguments to blake2b_compress_neon()
	STATE		.req	r0
	BLOCK		.req	r1
	NBLOCKS		.req	r2
	INC		.req	r3

	// Pointers to the rotation tables
	ROR24_TABLE	.req	r4
	ROR16_TABLE	.req	r5

	// The original stack pointer
	ORIG_SP		.req	r6

	// NEON registers which contain the message words of the current block.
	// M_0-M_3 are occasionally used for other purposes too.
	M_0		.req	d16
	M_1		.req	d17
	M_2		.req	d18
	M_3		.req	d19
	M_4		.req	d20
	M_5		.req	d21
	M_6		.req	d22
	M_7		.req	d23
	M_8		.req	d24
	M_9		.req	d25
	M_10		.req	d26
	M_11		.req	d27
	M_12		.req	d28
	M_13		.req	d29
	M_14		.req	d30
	M_15		.req	d31

	.align		4
	// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
	// instruction.  This is the most efficient way to implement these
	// rotation amounts with NEON.  (On Cortex-A53 it's the same speed as
	// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
.Lror24_table:
	.byte		3, 4, 5, 6, 7, 0, 1, 2
.Lror16_table:
	.byte		2, 3, 4, 5, 6, 7, 0, 1
	// The BLAKE2b initialization vector
.Lblake2b_IV:
	.quad		0x6a09e667f3bcc908, 0xbb67ae8584caa73b
	.quad		0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
	.quad		0x510e527fade682d1, 0x9b05688c2b3e6c1f
	.quad		0x1f83d9abfb41bd6b, 0x5be0cd19137e2179

// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7.  The message block is in q8..q15 (M_0-M_15).  The
// stack pointer points to a 32-byte aligned buffer containing a copy of q8
// and q9 (M_0-M_3), so that they can be reloaded if they are used as
// temporary registers.  The macro arguments s0-s15 give the order in which
// the message words are used in this round.  'final' is 1 if this is the
// final round.
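//
// For reference, the BLAKE2b mixing function G that each column/diagonal
// step below implements, in rough C (an illustrative sketch, not code from
// this file; the name 'G' and the v[16]/x/y parameters follow RFC 7693):
//
//	static void G(u64 v[16], int a, int b, int c, int d, u64 x, u64 y)
//	{
//		v[a] += v[b] + x;		// x = m[blake2b_sigma[r][2*i + 0]]
//		v[d] = ror64(v[d] ^ v[a], 32);
//		v[c] += v[d];
//		v[b] = ror64(v[b] ^ v[c], 24);
//		v[a] += v[b] + y;		// y = m[blake2b_sigma[r][2*i + 1]]
//		v[d] = ror64(v[d] ^ v[a], 16);
//		v[c] += v[d];
//		v[b] = ror64(v[b] ^ v[c], 63);
//	}
//
// The macro computes four G's at once: first on the four columns, then on
// the four diagonals.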
.macro _blake2b_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15, final=0

	// Mix the columns:
	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
	// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s0
	vadd.u64	d1, d1, M_\s2
	vadd.u64	d2, d2, M_\s4
	vadd.u64	d3, d3, M_\s6

	// d = ror64(d ^ a, 32);
	veor		q6, q6, q0
	veor		q7, q7, q1
	vrev64.32	q6, q6
	vrev64.32	q7, q7

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		q2, q2, q4
	veor		q3, q3, q5
	vtbl.8		d4, {d4}, M_0
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
	//
	// M_0 got clobbered above, so we have to reload it if any of the four
	// message words this step needs happens to be M_0.  Otherwise we don't
	// need to reload it here, as it will just get clobbered again below.
.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	q0, q0, q2
	vadd.u64	q1, q1, q3
	vadd.u64	d0, d0, M_\s1
	vadd.u64	d1, d1, M_\s3
	vadd.u64	d2, d2, M_\s5
	vadd.u64	d3, d3, M_\s7

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		q6, q6, q0
	veor		q7, q7, q1
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	q4, q4, q6
	vadd.u64	q5, q5, q7

	// b = ror64(b ^ c, 63);
	//
	// This rotation amount isn't a multiple of 8, so it has to be
	// implemented using a pair of shifts, which requires temporary
	// registers.  Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
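	//
	// In scalar terms this computes, per 64-bit lane (a sketch for
	// illustration; 'x' names the temporary b ^ c):
	//
	//	x = b ^ c;
	//	b = (x << 1) | (x >> 63);	// ror64(x, 63) == rol64(x, 1)
	//
	// vshr.u64 puts x >> 63 in the destination, then vsli.u64 shifts x
	// left by 1 and inserts it, preserving only bit 0 of the prior value.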
	veor		q8, q2, q4
	veor		q9, q3, q5
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	vld1.8		{q8-q9}, [sp, :256]

	// Mix the diagonals:
	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
	//
	// There are two possible ways to do this: use 'vext' instructions to
	// shift the rows of the matrix so that the diagonals become columns,
	// and undo it afterwards; or just use 64-bit operations on 'd'
	// registers instead of 128-bit operations on 'q' registers.  We use the
	// latter approach, as it performs much better on Cortex-A7.

	// a += b + m[blake2b_sigma[r][2*i + 0]];
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s8
	vadd.u64	d1, d1, M_\s10
	vadd.u64	d2, d2, M_\s12
	vadd.u64	d3, d3, M_\s14

	// d = ror64(d ^ a, 32);
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vrev64.32	d15, d15
	vrev64.32	d12, d12
	vrev64.32	d13, d13
	vrev64.32	d14, d14

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 24);
	vld1.8		{M_0}, [ROR24_TABLE, :64]
	veor		d5, d5, d10
	veor		d6, d6, d11
	veor		d7, d7, d8
	veor		d4, d4, d9
	vtbl.8		d5, {d5}, M_0
	vtbl.8		d6, {d6}, M_0
	vtbl.8		d7, {d7}, M_0
	vtbl.8		d4, {d4}, M_0

	// a += b + m[blake2b_sigma[r][2*i + 1]];
.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
	vld1.8		{M_0}, [sp, :64]
.endif
	vadd.u64	d0, d0, d5
	vadd.u64	d1, d1, d6
	vadd.u64	d2, d2, d7
	vadd.u64	d3, d3, d4
	vadd.u64	d0, d0, M_\s9
	vadd.u64	d1, d1, M_\s11
	vadd.u64	d2, d2, M_\s13
	vadd.u64	d3, d3, M_\s15

	// d = ror64(d ^ a, 16);
	vld1.8		{M_0}, [ROR16_TABLE, :64]
	veor		d15, d15, d0
	veor		d12, d12, d1
	veor		d13, d13, d2
	veor		d14, d14, d3
	vtbl.8		d12, {d12}, M_0
	vtbl.8		d13, {d13}, M_0
	vtbl.8		d14, {d14}, M_0
	vtbl.8		d15, {d15}, M_0

	// c += d;
	vadd.u64	d10, d10, d15
	vadd.u64	d11, d11, d12
	vadd.u64	d8, d8, d13
	vadd.u64	d9, d9, d14

	// b = ror64(b ^ c, 63);
	veor		d16, d4, d9
	veor		d17, d5, d10
	veor		d18, d6, d11
	veor		d19, d7, d8
	vshr.u64	q2, q8, #63
	vshr.u64	q3, q9, #63
	vsli.u64	q2, q8, #1
	vsli.u64	q3, q9, #1
	// Reloading q8-q9 can be skipped on the final round.
.if ! \final
	vld1.8		{q8-q9}, [sp, :256]
.endif
.endm

//
// void blake2b_compress_neon(struct blake2b_state *state,
//			      const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2b_state are used:
//	u64 h[8];	(inout)
//	u64 t[2];	(inout)
//	u64 f[2];	(in)
//
	.align		5
ENTRY(blake2b_compress_neon)
	push		{r4-r10}

	// Allocate a 32-byte stack buffer that is 32-byte aligned.
	mov		ORIG_SP, sp
	sub		ip, sp, #32
	bic		ip, ip, #31
	mov		sp, ip

	adr		ROR24_TABLE, .Lror24_table
	adr		ROR16_TABLE, .Lror16_table

	mov		ip, STATE
	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
.Lnext_block:
	adr		r10, .Lblake2b_IV
	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
	vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
	adds		r7, r7, INC		// Increment counter
	bcs		.Lslow_inc_ctr
	vmov.i32	d28[0], r7
	vst1.64		{d28}, [ip]		// Update t[0]
.Linc_ctr_done:

	// Load the next message block and finish initializing the state matrix
	// 'v'.  Fortunately, there are exactly enough NEON registers to fit the
	// entire state matrix in q0-q7 and the entire message block in q8-q15.
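	//
	// (Layout reminder: v[2*j] and v[2*j+1] live in q<j>, so v[0..1] is
	// in q0, ..., v[14..15] is in q7, and message word m[j] lives in
	// d<16+j>, i.e. M_<j>.)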
	//
	// However, _blake2b_round also needs some extra registers for rotates,
	// so we have to spill some registers.  It's better to spill the message
	// registers than the state registers, as the message doesn't change.
	// Therefore we store a copy of the first 32 bytes of the message block
	// (q8-q9) in an aligned buffer on the stack so that they can be
	// reloaded when needed.  (We could just reload directly from the
	// message buffer, but it's faster to use aligned loads.)
	vld1.8		{q8-q9}, [BLOCK]!
	veor		q6, q6, q14	// v[12..13] = IV[4..5] ^ t[0..1]
	vld1.8		{q10-q11}, [BLOCK]!
	veor		q7, q7, q15	// v[14..15] = IV[6..7] ^ f[0..1]
	vld1.8		{q12-q13}, [BLOCK]!
	vst1.8		{q8-q9}, [sp, :256]
	mov		ip, STATE
	vld1.8		{q14-q15}, [BLOCK]!

	// Execute the rounds.  Each round is provided the order in which it
	// needs to use the message words.
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
			final=1

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	vld1.64		{q8-q9}, [ip]!		// Load old h[0..3]
	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
	veor		q1, q1, q5		// v[2..3] ^= v[10..11]
	vld1.64		{q10-q11}, [ip]		// Load old h[4..7]
	veor		q2, q2, q6		// v[4..5] ^= v[12..13]
	veor		q3, q3, q7		// v[6..7] ^= v[14..15]
	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
	veor		q1, q1, q9		// v[2..3] ^= h[2..3]
	mov		ip, STATE
	subs		NBLOCKS, NBLOCKS, #1	// nblocks--
	vst1.64		{q0-q1}, [ip]!		// Store new h[0..3]
	veor		q2, q2, q10		// v[4..5] ^= h[4..5]
	veor		q3, q3, q11		// v[6..7] ^= h[6..7]
	vst1.64		{q2-q3}, [ip]!		// Store new h[4..7]

	// Advance to the next block, if there is one.
	bne		.Lnext_block	// nblocks != 0?

	mov		sp, ORIG_SP
	pop		{r4-r10}
	mov		pc, lr

.Lslow_inc_ctr:
	// Handle the case where the counter overflowed its low 32 bits, by
	// carrying the overflow bit into the full 128-bit counter.
	vmov		r9, r10, d29
	adcs		r8, r8, #0
	adcs		r9, r9, #0
	adc		r10, r10, #0
	vmov		d28, r7, r8
	vmov		d29, r9, r10
	vst1.64		{q14}, [ip]		// Update t[0] and t[1]
	b		.Linc_ctr_done
ENDPROC(blake2b_compress_neon)
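
// For context, a rough sketch of how C code might invoke this routine
// (illustration only; the actual glue code lives in a separate C file):
//
//	kernel_neon_begin();
//	blake2b_compress_neon(state, block, nblocks, inc);
//	blake2b_increment_counter-style bookkeeping is done by 'inc' above
//	kernel_neon_end();
//
// NEON registers may only be used in kernel mode inside a
// kernel_neon_begin()/kernel_neon_end() section, so the caller is
// responsible for that wrapping.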