1*744bfb21SJohn Baldwin /* SPDX-License-Identifier: MIT 2*744bfb21SJohn Baldwin * 3*744bfb21SJohn Baldwin * Copyright (C) 2015-2021 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 4*744bfb21SJohn Baldwin * Copyright (c) 2022 The FreeBSD Foundation 5*744bfb21SJohn Baldwin */ 6*744bfb21SJohn Baldwin 7*744bfb21SJohn Baldwin #include <sys/types.h> 8*744bfb21SJohn Baldwin #include <sys/systm.h> 9*744bfb21SJohn Baldwin #include <sys/endian.h> 10*744bfb21SJohn Baldwin #include <sys/mbuf.h> 11*744bfb21SJohn Baldwin #include <opencrypto/cryptodev.h> 12*744bfb21SJohn Baldwin 13*744bfb21SJohn Baldwin #include "crypto.h" 14*744bfb21SJohn Baldwin 15*744bfb21SJohn Baldwin #ifndef COMPAT_NEED_CHACHA20POLY1305_MBUF 16*744bfb21SJohn Baldwin static crypto_session_t chacha20_poly1305_sid; 17*744bfb21SJohn Baldwin #endif 18*744bfb21SJohn Baldwin 19*744bfb21SJohn Baldwin #ifndef ARRAY_SIZE 20*744bfb21SJohn Baldwin #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 21*744bfb21SJohn Baldwin #endif 22*744bfb21SJohn Baldwin #ifndef noinline 23*744bfb21SJohn Baldwin #define noinline __attribute__((noinline)) 24*744bfb21SJohn Baldwin #endif 25*744bfb21SJohn Baldwin #ifndef __aligned 26*744bfb21SJohn Baldwin #define __aligned(x) __attribute__((aligned(x))) 27*744bfb21SJohn Baldwin #endif 28*744bfb21SJohn Baldwin #ifndef DIV_ROUND_UP 29*744bfb21SJohn Baldwin #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) 30*744bfb21SJohn Baldwin #endif 31*744bfb21SJohn Baldwin 32*744bfb21SJohn Baldwin #define le32_to_cpup(a) le32toh(*(a)) 33*744bfb21SJohn Baldwin #define le64_to_cpup(a) le64toh(*(a)) 34*744bfb21SJohn Baldwin #define cpu_to_le32(a) htole32(a) 35*744bfb21SJohn Baldwin #define cpu_to_le64(a) htole64(a) 36*744bfb21SJohn Baldwin 37*744bfb21SJohn Baldwin static inline __unused uint32_t get_unaligned_le32(const uint8_t *a) 38*744bfb21SJohn Baldwin { 39*744bfb21SJohn Baldwin uint32_t l; 40*744bfb21SJohn Baldwin __builtin_memcpy(&l, a, sizeof(l)); 41*744bfb21SJohn Baldwin return le32_to_cpup(&l); 42*744bfb21SJohn Baldwin } 43*744bfb21SJohn Baldwin static inline __unused uint64_t get_unaligned_le64(const uint8_t *a) 44*744bfb21SJohn Baldwin { 45*744bfb21SJohn Baldwin uint64_t l; 46*744bfb21SJohn Baldwin __builtin_memcpy(&l, a, sizeof(l)); 47*744bfb21SJohn Baldwin return le64_to_cpup(&l); 48*744bfb21SJohn Baldwin } 49*744bfb21SJohn Baldwin static inline __unused void put_unaligned_le32(uint32_t s, uint8_t *d) 50*744bfb21SJohn Baldwin { 51*744bfb21SJohn Baldwin uint32_t l = cpu_to_le32(s); 52*744bfb21SJohn Baldwin __builtin_memcpy(d, &l, sizeof(l)); 53*744bfb21SJohn Baldwin } 54*744bfb21SJohn Baldwin static inline __unused void cpu_to_le32_array(uint32_t *buf, unsigned int words) 55*744bfb21SJohn Baldwin { 56*744bfb21SJohn Baldwin while (words--) { 57*744bfb21SJohn Baldwin *buf = cpu_to_le32(*buf); 58*744bfb21SJohn Baldwin ++buf; 59*744bfb21SJohn Baldwin } 60*744bfb21SJohn Baldwin } 61*744bfb21SJohn Baldwin static inline __unused void le32_to_cpu_array(uint32_t *buf, unsigned int words) 62*744bfb21SJohn Baldwin { 63*744bfb21SJohn Baldwin while (words--) { 64*744bfb21SJohn Baldwin *buf = le32_to_cpup(buf); 65*744bfb21SJohn Baldwin ++buf; 66*744bfb21SJohn Baldwin } 67*744bfb21SJohn Baldwin } 68*744bfb21SJohn Baldwin static inline __unused uint32_t rol32(uint32_t word, unsigned int shift) 69*744bfb21SJohn Baldwin { 70*744bfb21SJohn Baldwin return (word << (shift & 31)) | (word >> ((-shift) & 31)); 71*744bfb21SJohn Baldwin } 72*744bfb21SJohn Baldwin static inline __unused uint32_t ror32(uint32_t word, unsigned int shift) 73*744bfb21SJohn Baldwin { 74*744bfb21SJohn Baldwin return (word >> (shift & 31)) | (word << ((-shift) & 31)); 75*744bfb21SJohn Baldwin } 76*744bfb21SJohn Baldwin 77*744bfb21SJohn Baldwin #if defined(COMPAT_NEED_CHACHA20POLY1305) || defined(COMPAT_NEED_CHACHA20POLY1305_MBUF) 78*744bfb21SJohn Baldwin static void xor_cpy(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, size_t len) 79*744bfb21SJohn Baldwin { 80*744bfb21SJohn Baldwin size_t i; 81*744bfb21SJohn Baldwin 82*744bfb21SJohn Baldwin for (i = 0; i < len; ++i) 83*744bfb21SJohn Baldwin dst[i] = src1[i] ^ src2[i]; 84*744bfb21SJohn Baldwin } 85*744bfb21SJohn Baldwin 86*744bfb21SJohn Baldwin #define QUARTER_ROUND(x, a, b, c, d) ( \ 87*744bfb21SJohn Baldwin x[a] += x[b], \ 88*744bfb21SJohn Baldwin x[d] = rol32((x[d] ^ x[a]), 16), \ 89*744bfb21SJohn Baldwin x[c] += x[d], \ 90*744bfb21SJohn Baldwin x[b] = rol32((x[b] ^ x[c]), 12), \ 91*744bfb21SJohn Baldwin x[a] += x[b], \ 92*744bfb21SJohn Baldwin x[d] = rol32((x[d] ^ x[a]), 8), \ 93*744bfb21SJohn Baldwin x[c] += x[d], \ 94*744bfb21SJohn Baldwin x[b] = rol32((x[b] ^ x[c]), 7) \ 95*744bfb21SJohn Baldwin ) 96*744bfb21SJohn Baldwin 97*744bfb21SJohn Baldwin #define C(i, j) (i * 4 + j) 98*744bfb21SJohn Baldwin 99*744bfb21SJohn Baldwin #define DOUBLE_ROUND(x) ( \ 100*744bfb21SJohn Baldwin /* Column Round */ \ 101*744bfb21SJohn Baldwin QUARTER_ROUND(x, C(0, 0), C(1, 0), C(2, 0), C(3, 0)), \ 102*744bfb21SJohn Baldwin QUARTER_ROUND(x, C(0, 1), C(1, 1), C(2, 1), C(3, 1)), \ 103*744bfb21SJohn Baldwin QUARTER_ROUND(x, C(0, 2), C(1, 2), C(2, 2), C(3, 2)), \ 104*744bfb21SJohn Baldwin QUARTER_ROUND(x, C(0, 3), C(1, 3), C(2, 3), C(3, 3)), \ 105*744bfb21SJohn Baldwin /* Diagonal Round */ \ 106*744bfb21SJohn Baldwin QUARTER_ROUND(x, C(0, 0), C(1, 1), C(2, 2), C(3, 3)), \ 107*744bfb21SJohn Baldwin QUARTER_ROUND(x, C(0, 1), C(1, 2), C(2, 3), C(3, 0)), \ 108*744bfb21SJohn Baldwin QUARTER_ROUND(x, C(0, 2), C(1, 3), C(2, 0), C(3, 1)), \ 109*744bfb21SJohn Baldwin QUARTER_ROUND(x, C(0, 3), C(1, 0), C(2, 1), C(3, 2)) \ 110*744bfb21SJohn Baldwin ) 111*744bfb21SJohn Baldwin 112*744bfb21SJohn Baldwin #define TWENTY_ROUNDS(x) ( \ 113*744bfb21SJohn Baldwin DOUBLE_ROUND(x), \ 114*744bfb21SJohn Baldwin DOUBLE_ROUND(x), \ 115*744bfb21SJohn Baldwin DOUBLE_ROUND(x), \ 116*744bfb21SJohn Baldwin DOUBLE_ROUND(x), \ 117*744bfb21SJohn Baldwin DOUBLE_ROUND(x), \ 118*744bfb21SJohn Baldwin DOUBLE_ROUND(x), \ 119*744bfb21SJohn Baldwin DOUBLE_ROUND(x), \ 120*744bfb21SJohn Baldwin DOUBLE_ROUND(x), \ 121*744bfb21SJohn Baldwin DOUBLE_ROUND(x), \ 122*744bfb21SJohn Baldwin DOUBLE_ROUND(x) \ 123*744bfb21SJohn Baldwin ) 124*744bfb21SJohn Baldwin 125*744bfb21SJohn Baldwin enum chacha20_lengths { 126*744bfb21SJohn Baldwin CHACHA20_NONCE_SIZE = 16, 127*744bfb21SJohn Baldwin CHACHA20_KEY_SIZE = 32, 128*744bfb21SJohn Baldwin CHACHA20_KEY_WORDS = CHACHA20_KEY_SIZE / sizeof(uint32_t), 129*744bfb21SJohn Baldwin CHACHA20_BLOCK_SIZE = 64, 130*744bfb21SJohn Baldwin CHACHA20_BLOCK_WORDS = CHACHA20_BLOCK_SIZE / sizeof(uint32_t), 131*744bfb21SJohn Baldwin HCHACHA20_NONCE_SIZE = CHACHA20_NONCE_SIZE, 132*744bfb21SJohn Baldwin HCHACHA20_KEY_SIZE = CHACHA20_KEY_SIZE 133*744bfb21SJohn Baldwin }; 134*744bfb21SJohn Baldwin 135*744bfb21SJohn Baldwin enum chacha20_constants { /* expand 32-byte k */ 136*744bfb21SJohn Baldwin CHACHA20_CONSTANT_EXPA = 0x61707865U, 137*744bfb21SJohn Baldwin CHACHA20_CONSTANT_ND_3 = 0x3320646eU, 138*744bfb21SJohn Baldwin CHACHA20_CONSTANT_2_BY = 0x79622d32U, 139*744bfb21SJohn Baldwin CHACHA20_CONSTANT_TE_K = 0x6b206574U 140*744bfb21SJohn Baldwin }; 141*744bfb21SJohn Baldwin 142*744bfb21SJohn Baldwin struct chacha20_ctx { 143*744bfb21SJohn Baldwin union { 144*744bfb21SJohn Baldwin uint32_t state[16]; 145*744bfb21SJohn Baldwin struct { 146*744bfb21SJohn Baldwin uint32_t constant[4]; 147*744bfb21SJohn Baldwin uint32_t key[8]; 148*744bfb21SJohn Baldwin uint32_t counter[4]; 149*744bfb21SJohn Baldwin }; 150*744bfb21SJohn Baldwin }; 151*744bfb21SJohn Baldwin }; 152*744bfb21SJohn Baldwin 153*744bfb21SJohn Baldwin static void chacha20_init(struct chacha20_ctx *ctx, 154*744bfb21SJohn Baldwin const uint8_t key[CHACHA20_KEY_SIZE], 155*744bfb21SJohn Baldwin const uint64_t nonce) 156*744bfb21SJohn Baldwin { 157*744bfb21SJohn Baldwin ctx->constant[0] = CHACHA20_CONSTANT_EXPA; 158*744bfb21SJohn Baldwin ctx->constant[1] = CHACHA20_CONSTANT_ND_3; 159*744bfb21SJohn Baldwin ctx->constant[2] = CHACHA20_CONSTANT_2_BY; 160*744bfb21SJohn Baldwin ctx->constant[3] = CHACHA20_CONSTANT_TE_K; 161*744bfb21SJohn Baldwin ctx->key[0] = get_unaligned_le32(key + 0); 162*744bfb21SJohn Baldwin ctx->key[1] = get_unaligned_le32(key + 4); 163*744bfb21SJohn Baldwin ctx->key[2] = get_unaligned_le32(key + 8); 164*744bfb21SJohn Baldwin ctx->key[3] = get_unaligned_le32(key + 12); 165*744bfb21SJohn Baldwin ctx->key[4] = get_unaligned_le32(key + 16); 166*744bfb21SJohn Baldwin ctx->key[5] = get_unaligned_le32(key + 20); 167*744bfb21SJohn Baldwin ctx->key[6] = get_unaligned_le32(key + 24); 168*744bfb21SJohn Baldwin ctx->key[7] = get_unaligned_le32(key + 28); 169*744bfb21SJohn Baldwin ctx->counter[0] = 0; 170*744bfb21SJohn Baldwin ctx->counter[1] = 0; 171*744bfb21SJohn Baldwin ctx->counter[2] = nonce & 0xffffffffU; 172*744bfb21SJohn Baldwin ctx->counter[3] = nonce >> 32; 173*744bfb21SJohn Baldwin } 174*744bfb21SJohn Baldwin 175*744bfb21SJohn Baldwin static void chacha20_block(struct chacha20_ctx *ctx, uint32_t *stream) 176*744bfb21SJohn Baldwin { 177*744bfb21SJohn Baldwin uint32_t x[CHACHA20_BLOCK_WORDS]; 178*744bfb21SJohn Baldwin int i; 179*744bfb21SJohn Baldwin 180*744bfb21SJohn Baldwin for (i = 0; i < ARRAY_SIZE(x); ++i) 181*744bfb21SJohn Baldwin x[i] = ctx->state[i]; 182*744bfb21SJohn Baldwin 183*744bfb21SJohn Baldwin TWENTY_ROUNDS(x); 184*744bfb21SJohn Baldwin 185*744bfb21SJohn Baldwin for (i = 0; i < ARRAY_SIZE(x); ++i) 186*744bfb21SJohn Baldwin stream[i] = cpu_to_le32(x[i] + ctx->state[i]); 187*744bfb21SJohn Baldwin 188*744bfb21SJohn Baldwin ctx->counter[0] += 1; 189*744bfb21SJohn Baldwin } 190*744bfb21SJohn Baldwin 191*744bfb21SJohn Baldwin static void chacha20(struct chacha20_ctx *ctx, uint8_t *out, const uint8_t *in, 192*744bfb21SJohn Baldwin uint32_t len) 193*744bfb21SJohn Baldwin { 194*744bfb21SJohn Baldwin uint32_t buf[CHACHA20_BLOCK_WORDS]; 195*744bfb21SJohn Baldwin 196*744bfb21SJohn Baldwin while (len >= CHACHA20_BLOCK_SIZE) { 197*744bfb21SJohn Baldwin chacha20_block(ctx, buf); 198*744bfb21SJohn Baldwin xor_cpy(out, in, (uint8_t *)buf, CHACHA20_BLOCK_SIZE); 199*744bfb21SJohn Baldwin len -= CHACHA20_BLOCK_SIZE; 200*744bfb21SJohn Baldwin out += CHACHA20_BLOCK_SIZE; 201*744bfb21SJohn Baldwin in += CHACHA20_BLOCK_SIZE; 202*744bfb21SJohn Baldwin } 203*744bfb21SJohn Baldwin if (len) { 204*744bfb21SJohn Baldwin chacha20_block(ctx, buf); 205*744bfb21SJohn Baldwin xor_cpy(out, in, (uint8_t *)buf, len); 206*744bfb21SJohn Baldwin } 207*744bfb21SJohn Baldwin } 208*744bfb21SJohn Baldwin 209*744bfb21SJohn Baldwin static void hchacha20(uint32_t derived_key[CHACHA20_KEY_WORDS], 210*744bfb21SJohn Baldwin const uint8_t nonce[HCHACHA20_NONCE_SIZE], 211*744bfb21SJohn Baldwin const uint8_t key[HCHACHA20_KEY_SIZE]) 212*744bfb21SJohn Baldwin { 213*744bfb21SJohn Baldwin uint32_t x[] = { CHACHA20_CONSTANT_EXPA, 214*744bfb21SJohn Baldwin CHACHA20_CONSTANT_ND_3, 215*744bfb21SJohn Baldwin CHACHA20_CONSTANT_2_BY, 216*744bfb21SJohn Baldwin CHACHA20_CONSTANT_TE_K, 217*744bfb21SJohn Baldwin get_unaligned_le32(key + 0), 218*744bfb21SJohn Baldwin get_unaligned_le32(key + 4), 219*744bfb21SJohn Baldwin get_unaligned_le32(key + 8), 220*744bfb21SJohn Baldwin get_unaligned_le32(key + 12), 221*744bfb21SJohn Baldwin get_unaligned_le32(key + 16), 222*744bfb21SJohn Baldwin get_unaligned_le32(key + 20), 223*744bfb21SJohn Baldwin get_unaligned_le32(key + 24), 224*744bfb21SJohn Baldwin get_unaligned_le32(key + 28), 225*744bfb21SJohn Baldwin get_unaligned_le32(nonce + 0), 226*744bfb21SJohn Baldwin get_unaligned_le32(nonce + 4), 227*744bfb21SJohn Baldwin get_unaligned_le32(nonce + 8), 228*744bfb21SJohn Baldwin get_unaligned_le32(nonce + 12) 229*744bfb21SJohn Baldwin }; 230*744bfb21SJohn Baldwin 231*744bfb21SJohn Baldwin TWENTY_ROUNDS(x); 232*744bfb21SJohn Baldwin 233*744bfb21SJohn Baldwin memcpy(derived_key + 0, x + 0, sizeof(uint32_t) * 4); 234*744bfb21SJohn Baldwin memcpy(derived_key + 4, x + 12, sizeof(uint32_t) * 4); 235*744bfb21SJohn Baldwin } 236*744bfb21SJohn Baldwin 237*744bfb21SJohn Baldwin enum poly1305_lengths { 238*744bfb21SJohn Baldwin POLY1305_BLOCK_SIZE = 16, 239*744bfb21SJohn Baldwin POLY1305_KEY_SIZE = 32, 240*744bfb21SJohn Baldwin POLY1305_MAC_SIZE = 16 241*744bfb21SJohn Baldwin }; 242*744bfb21SJohn Baldwin 243*744bfb21SJohn Baldwin struct poly1305_internal { 244*744bfb21SJohn Baldwin uint32_t h[5]; 245*744bfb21SJohn Baldwin uint32_t r[5]; 246*744bfb21SJohn Baldwin uint32_t s[4]; 247*744bfb21SJohn Baldwin }; 248*744bfb21SJohn Baldwin 249*744bfb21SJohn Baldwin struct poly1305_ctx { 250*744bfb21SJohn Baldwin struct poly1305_internal state; 251*744bfb21SJohn Baldwin uint32_t nonce[4]; 252*744bfb21SJohn Baldwin uint8_t data[POLY1305_BLOCK_SIZE]; 253*744bfb21SJohn Baldwin size_t num; 254*744bfb21SJohn Baldwin }; 255*744bfb21SJohn Baldwin 256*744bfb21SJohn Baldwin static void poly1305_init_core(struct poly1305_internal *st, 257*744bfb21SJohn Baldwin const uint8_t key[16]) 258*744bfb21SJohn Baldwin { 259*744bfb21SJohn Baldwin /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ 260*744bfb21SJohn Baldwin st->r[0] = (get_unaligned_le32(&key[0])) & 0x3ffffff; 261*744bfb21SJohn Baldwin st->r[1] = (get_unaligned_le32(&key[3]) >> 2) & 0x3ffff03; 262*744bfb21SJohn Baldwin st->r[2] = (get_unaligned_le32(&key[6]) >> 4) & 0x3ffc0ff; 263*744bfb21SJohn Baldwin st->r[3] = (get_unaligned_le32(&key[9]) >> 6) & 0x3f03fff; 264*744bfb21SJohn Baldwin st->r[4] = (get_unaligned_le32(&key[12]) >> 8) & 0x00fffff; 265*744bfb21SJohn Baldwin 266*744bfb21SJohn Baldwin /* s = 5*r */ 267*744bfb21SJohn Baldwin st->s[0] = st->r[1] * 5; 268*744bfb21SJohn Baldwin st->s[1] = st->r[2] * 5; 269*744bfb21SJohn Baldwin st->s[2] = st->r[3] * 5; 270*744bfb21SJohn Baldwin st->s[3] = st->r[4] * 5; 271*744bfb21SJohn Baldwin 272*744bfb21SJohn Baldwin /* h = 0 */ 273*744bfb21SJohn Baldwin st->h[0] = 0; 274*744bfb21SJohn Baldwin st->h[1] = 0; 275*744bfb21SJohn Baldwin st->h[2] = 0; 276*744bfb21SJohn Baldwin st->h[3] = 0; 277*744bfb21SJohn Baldwin st->h[4] = 0; 278*744bfb21SJohn Baldwin } 279*744bfb21SJohn Baldwin 280*744bfb21SJohn Baldwin static void poly1305_blocks_core(struct poly1305_internal *st, 281*744bfb21SJohn Baldwin const uint8_t *input, size_t len, 282*744bfb21SJohn Baldwin const uint32_t padbit) 283*744bfb21SJohn Baldwin { 284*744bfb21SJohn Baldwin const uint32_t hibit = padbit << 24; 285*744bfb21SJohn Baldwin uint32_t r0, r1, r2, r3, r4; 286*744bfb21SJohn Baldwin uint32_t s1, s2, s3, s4; 287*744bfb21SJohn Baldwin uint32_t h0, h1, h2, h3, h4; 288*744bfb21SJohn Baldwin uint64_t d0, d1, d2, d3, d4; 289*744bfb21SJohn Baldwin uint32_t c; 290*744bfb21SJohn Baldwin 291*744bfb21SJohn Baldwin r0 = st->r[0]; 292*744bfb21SJohn Baldwin r1 = st->r[1]; 293*744bfb21SJohn Baldwin r2 = st->r[2]; 294*744bfb21SJohn Baldwin r3 = st->r[3]; 295*744bfb21SJohn Baldwin r4 = st->r[4]; 296*744bfb21SJohn Baldwin 297*744bfb21SJohn Baldwin s1 = st->s[0]; 298*744bfb21SJohn Baldwin s2 = st->s[1]; 299*744bfb21SJohn Baldwin s3 = st->s[2]; 300*744bfb21SJohn Baldwin s4 = st->s[3]; 301*744bfb21SJohn Baldwin 302*744bfb21SJohn Baldwin h0 = st->h[0]; 303*744bfb21SJohn Baldwin h1 = st->h[1]; 304*744bfb21SJohn Baldwin h2 = st->h[2]; 305*744bfb21SJohn Baldwin h3 = st->h[3]; 306*744bfb21SJohn Baldwin h4 = st->h[4]; 307*744bfb21SJohn Baldwin 308*744bfb21SJohn Baldwin while (len >= POLY1305_BLOCK_SIZE) { 309*744bfb21SJohn Baldwin /* h += m[i] */ 310*744bfb21SJohn Baldwin h0 += (get_unaligned_le32(&input[0])) & 0x3ffffff; 311*744bfb21SJohn Baldwin h1 += (get_unaligned_le32(&input[3]) >> 2) & 0x3ffffff; 312*744bfb21SJohn Baldwin h2 += (get_unaligned_le32(&input[6]) >> 4) & 0x3ffffff; 313*744bfb21SJohn Baldwin h3 += (get_unaligned_le32(&input[9]) >> 6) & 0x3ffffff; 314*744bfb21SJohn Baldwin h4 += (get_unaligned_le32(&input[12]) >> 8) | hibit; 315*744bfb21SJohn Baldwin 316*744bfb21SJohn Baldwin /* h *= r */ 317*744bfb21SJohn Baldwin d0 = ((uint64_t)h0 * r0) + ((uint64_t)h1 * s4) + 318*744bfb21SJohn Baldwin ((uint64_t)h2 * s3) + ((uint64_t)h3 * s2) + 319*744bfb21SJohn Baldwin ((uint64_t)h4 * s1); 320*744bfb21SJohn Baldwin d1 = ((uint64_t)h0 * r1) + ((uint64_t)h1 * r0) + 321*744bfb21SJohn Baldwin ((uint64_t)h2 * s4) + ((uint64_t)h3 * s3) + 322*744bfb21SJohn Baldwin ((uint64_t)h4 * s2); 323*744bfb21SJohn Baldwin d2 = ((uint64_t)h0 * r2) + ((uint64_t)h1 * r1) + 324*744bfb21SJohn Baldwin ((uint64_t)h2 * r0) + ((uint64_t)h3 * s4) + 325*744bfb21SJohn Baldwin ((uint64_t)h4 * s3); 326*744bfb21SJohn Baldwin d3 = ((uint64_t)h0 * r3) + ((uint64_t)h1 * r2) + 327*744bfb21SJohn Baldwin ((uint64_t)h2 * r1) + ((uint64_t)h3 * r0) + 328*744bfb21SJohn Baldwin ((uint64_t)h4 * s4); 329*744bfb21SJohn Baldwin d4 = ((uint64_t)h0 * r4) + ((uint64_t)h1 * r3) + 330*744bfb21SJohn Baldwin ((uint64_t)h2 * r2) + ((uint64_t)h3 * r1) + 331*744bfb21SJohn Baldwin ((uint64_t)h4 * r0); 332*744bfb21SJohn Baldwin 333*744bfb21SJohn Baldwin /* (partial) h %= p */ 334*744bfb21SJohn Baldwin c = (uint32_t)(d0 >> 26); 335*744bfb21SJohn Baldwin h0 = (uint32_t)d0 & 0x3ffffff; 336*744bfb21SJohn Baldwin d1 += c; 337*744bfb21SJohn Baldwin c = (uint32_t)(d1 >> 26); 338*744bfb21SJohn Baldwin h1 = (uint32_t)d1 & 0x3ffffff; 339*744bfb21SJohn Baldwin d2 += c; 340*744bfb21SJohn Baldwin c = (uint32_t)(d2 >> 26); 341*744bfb21SJohn Baldwin h2 = (uint32_t)d2 & 0x3ffffff; 342*744bfb21SJohn Baldwin d3 += c; 343*744bfb21SJohn Baldwin c = (uint32_t)(d3 >> 26); 344*744bfb21SJohn Baldwin h3 = (uint32_t)d3 & 0x3ffffff; 345*744bfb21SJohn Baldwin d4 += c; 346*744bfb21SJohn Baldwin c = (uint32_t)(d4 >> 26); 347*744bfb21SJohn Baldwin h4 = (uint32_t)d4 & 0x3ffffff; 348*744bfb21SJohn Baldwin h0 += c * 5; 349*744bfb21SJohn Baldwin c = (h0 >> 26); 350*744bfb21SJohn Baldwin h0 = h0 & 0x3ffffff; 351*744bfb21SJohn Baldwin h1 += c; 352*744bfb21SJohn Baldwin 353*744bfb21SJohn Baldwin input += POLY1305_BLOCK_SIZE; 354*744bfb21SJohn Baldwin len -= POLY1305_BLOCK_SIZE; 355*744bfb21SJohn Baldwin } 356*744bfb21SJohn Baldwin 357*744bfb21SJohn Baldwin st->h[0] = h0; 358*744bfb21SJohn Baldwin st->h[1] = h1; 359*744bfb21SJohn Baldwin st->h[2] = h2; 360*744bfb21SJohn Baldwin st->h[3] = h3; 361*744bfb21SJohn Baldwin st->h[4] = h4; 362*744bfb21SJohn Baldwin } 363*744bfb21SJohn Baldwin 364*744bfb21SJohn Baldwin static void poly1305_emit_core(struct poly1305_internal *st, uint8_t mac[16], 365*744bfb21SJohn Baldwin const uint32_t nonce[4]) 366*744bfb21SJohn Baldwin { 367*744bfb21SJohn Baldwin uint32_t h0, h1, h2, h3, h4, c; 368*744bfb21SJohn Baldwin uint32_t g0, g1, g2, g3, g4; 369*744bfb21SJohn Baldwin uint64_t f; 370*744bfb21SJohn Baldwin uint32_t mask; 371*744bfb21SJohn Baldwin 372*744bfb21SJohn Baldwin /* fully carry h */ 373*744bfb21SJohn Baldwin h0 = st->h[0]; 374*744bfb21SJohn Baldwin h1 = st->h[1]; 375*744bfb21SJohn Baldwin h2 = st->h[2]; 376*744bfb21SJohn Baldwin h3 = st->h[3]; 377*744bfb21SJohn Baldwin h4 = st->h[4]; 378*744bfb21SJohn Baldwin 379*744bfb21SJohn Baldwin c = h1 >> 26; 380*744bfb21SJohn Baldwin h1 = h1 & 0x3ffffff; 381*744bfb21SJohn Baldwin h2 += c; 382*744bfb21SJohn Baldwin c = h2 >> 26; 383*744bfb21SJohn Baldwin h2 = h2 & 0x3ffffff; 384*744bfb21SJohn Baldwin h3 += c; 385*744bfb21SJohn Baldwin c = h3 >> 26; 386*744bfb21SJohn Baldwin h3 = h3 & 0x3ffffff; 387*744bfb21SJohn Baldwin h4 += c; 388*744bfb21SJohn Baldwin c = h4 >> 26; 389*744bfb21SJohn Baldwin h4 = h4 & 0x3ffffff; 390*744bfb21SJohn Baldwin h0 += c * 5; 391*744bfb21SJohn Baldwin c = h0 >> 26; 392*744bfb21SJohn Baldwin h0 = h0 & 0x3ffffff; 393*744bfb21SJohn Baldwin h1 += c; 394*744bfb21SJohn Baldwin 395*744bfb21SJohn Baldwin /* compute h + -p */ 396*744bfb21SJohn Baldwin g0 = h0 + 5; 397*744bfb21SJohn Baldwin c = g0 >> 26; 398*744bfb21SJohn Baldwin g0 &= 0x3ffffff; 399*744bfb21SJohn Baldwin g1 = h1 + c; 400*744bfb21SJohn Baldwin c = g1 >> 26; 401*744bfb21SJohn Baldwin g1 &= 0x3ffffff; 402*744bfb21SJohn Baldwin g2 = h2 + c; 403*744bfb21SJohn Baldwin c = g2 >> 26; 404*744bfb21SJohn Baldwin g2 &= 0x3ffffff; 405*744bfb21SJohn Baldwin g3 = h3 + c; 406*744bfb21SJohn Baldwin c = g3 >> 26; 407*744bfb21SJohn Baldwin g3 &= 0x3ffffff; 408*744bfb21SJohn Baldwin g4 = h4 + c - (1UL << 26); 409*744bfb21SJohn Baldwin 410*744bfb21SJohn Baldwin /* select h if h < p, or h + -p if h >= p */ 411*744bfb21SJohn Baldwin mask = (g4 >> ((sizeof(uint32_t) * 8) - 1)) - 1; 412*744bfb21SJohn Baldwin g0 &= mask; 413*744bfb21SJohn Baldwin g1 &= mask; 414*744bfb21SJohn Baldwin g2 &= mask; 415*744bfb21SJohn Baldwin g3 &= mask; 416*744bfb21SJohn Baldwin g4 &= mask; 417*744bfb21SJohn Baldwin mask = ~mask; 418*744bfb21SJohn Baldwin 419*744bfb21SJohn Baldwin h0 = (h0 & mask) | g0; 420*744bfb21SJohn Baldwin h1 = (h1 & mask) | g1; 421*744bfb21SJohn Baldwin h2 = (h2 & mask) | g2; 422*744bfb21SJohn Baldwin h3 = (h3 & mask) | g3; 423*744bfb21SJohn Baldwin h4 = (h4 & mask) | g4; 424*744bfb21SJohn Baldwin 425*744bfb21SJohn Baldwin /* h = h % (2^128) */ 426*744bfb21SJohn Baldwin h0 = ((h0) | (h1 << 26)) & 0xffffffff; 427*744bfb21SJohn Baldwin h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff; 428*744bfb21SJohn Baldwin h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff; 429*744bfb21SJohn Baldwin h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff; 430*744bfb21SJohn Baldwin 431*744bfb21SJohn Baldwin /* mac = (h + nonce) % (2^128) */ 432*744bfb21SJohn Baldwin f = (uint64_t)h0 + nonce[0]; 433*744bfb21SJohn Baldwin h0 = (uint32_t)f; 434*744bfb21SJohn Baldwin f = (uint64_t)h1 + nonce[1] + (f >> 32); 435*744bfb21SJohn Baldwin h1 = (uint32_t)f; 436*744bfb21SJohn Baldwin f = (uint64_t)h2 + nonce[2] + (f >> 32); 437*744bfb21SJohn Baldwin h2 = (uint32_t)f; 438*744bfb21SJohn Baldwin f = (uint64_t)h3 + nonce[3] + (f >> 32); 439*744bfb21SJohn Baldwin h3 = (uint32_t)f; 440*744bfb21SJohn Baldwin 441*744bfb21SJohn Baldwin put_unaligned_le32(h0, &mac[0]); 442*744bfb21SJohn Baldwin put_unaligned_le32(h1, &mac[4]); 443*744bfb21SJohn Baldwin put_unaligned_le32(h2, &mac[8]); 444*744bfb21SJohn Baldwin put_unaligned_le32(h3, &mac[12]); 445*744bfb21SJohn Baldwin } 446*744bfb21SJohn Baldwin 447*744bfb21SJohn Baldwin static void poly1305_init(struct poly1305_ctx *ctx, 448*744bfb21SJohn Baldwin const uint8_t key[POLY1305_KEY_SIZE]) 449*744bfb21SJohn Baldwin { 450*744bfb21SJohn Baldwin ctx->nonce[0] = get_unaligned_le32(&key[16]); 451*744bfb21SJohn Baldwin ctx->nonce[1] = get_unaligned_le32(&key[20]); 452*744bfb21SJohn Baldwin ctx->nonce[2] = get_unaligned_le32(&key[24]); 453*744bfb21SJohn Baldwin ctx->nonce[3] = get_unaligned_le32(&key[28]); 454*744bfb21SJohn Baldwin 455*744bfb21SJohn Baldwin poly1305_init_core(&ctx->state, key); 456*744bfb21SJohn Baldwin 457*744bfb21SJohn Baldwin ctx->num = 0; 458*744bfb21SJohn Baldwin } 459*744bfb21SJohn Baldwin 460*744bfb21SJohn Baldwin static void poly1305_update(struct poly1305_ctx *ctx, const uint8_t *input, 461*744bfb21SJohn Baldwin size_t len) 462*744bfb21SJohn Baldwin { 463*744bfb21SJohn Baldwin const size_t num = ctx->num; 464*744bfb21SJohn Baldwin size_t rem; 465*744bfb21SJohn Baldwin 466*744bfb21SJohn Baldwin if (num) { 467*744bfb21SJohn Baldwin rem = POLY1305_BLOCK_SIZE - num; 468*744bfb21SJohn Baldwin if (len < rem) { 469*744bfb21SJohn Baldwin memcpy(ctx->data + num, input, len); 470*744bfb21SJohn Baldwin ctx->num = num + len; 471*744bfb21SJohn Baldwin return; 472*744bfb21SJohn Baldwin } 473*744bfb21SJohn Baldwin memcpy(ctx->data + num, input, rem); 474*744bfb21SJohn Baldwin poly1305_blocks_core(&ctx->state, ctx->data, 475*744bfb21SJohn Baldwin POLY1305_BLOCK_SIZE, 1); 476*744bfb21SJohn Baldwin input += rem; 477*744bfb21SJohn Baldwin len -= rem; 478*744bfb21SJohn Baldwin } 479*744bfb21SJohn Baldwin 480*744bfb21SJohn Baldwin rem = len % POLY1305_BLOCK_SIZE; 481*744bfb21SJohn Baldwin len -= rem; 482*744bfb21SJohn Baldwin 483*744bfb21SJohn Baldwin if (len >= POLY1305_BLOCK_SIZE) { 484*744bfb21SJohn Baldwin poly1305_blocks_core(&ctx->state, input, len, 1); 485*744bfb21SJohn Baldwin input += len; 486*744bfb21SJohn Baldwin } 487*744bfb21SJohn Baldwin 488*744bfb21SJohn Baldwin if (rem) 489*744bfb21SJohn Baldwin memcpy(ctx->data, input, rem); 490*744bfb21SJohn Baldwin 491*744bfb21SJohn Baldwin ctx->num = rem; 492*744bfb21SJohn Baldwin } 493*744bfb21SJohn Baldwin 494*744bfb21SJohn Baldwin static void poly1305_final(struct poly1305_ctx *ctx, 495*744bfb21SJohn Baldwin uint8_t mac[POLY1305_MAC_SIZE]) 496*744bfb21SJohn Baldwin { 497*744bfb21SJohn Baldwin size_t num = ctx->num; 498*744bfb21SJohn Baldwin 499*744bfb21SJohn Baldwin if (num) { 500*744bfb21SJohn Baldwin ctx->data[num++] = 1; 501*744bfb21SJohn Baldwin while (num < POLY1305_BLOCK_SIZE) 502*744bfb21SJohn Baldwin ctx->data[num++] = 0; 503*744bfb21SJohn Baldwin poly1305_blocks_core(&ctx->state, ctx->data, 504*744bfb21SJohn Baldwin POLY1305_BLOCK_SIZE, 0); 505*744bfb21SJohn Baldwin } 506*744bfb21SJohn Baldwin 507*744bfb21SJohn Baldwin poly1305_emit_core(&ctx->state, mac, ctx->nonce); 508*744bfb21SJohn Baldwin 509*744bfb21SJohn Baldwin explicit_bzero(ctx, sizeof(*ctx)); 510*744bfb21SJohn Baldwin } 511*744bfb21SJohn Baldwin #endif 512*744bfb21SJohn Baldwin 513*744bfb21SJohn Baldwin #ifdef COMPAT_NEED_CHACHA20POLY1305 514*744bfb21SJohn Baldwin static const uint8_t pad0[16] = { 0 }; 515*744bfb21SJohn Baldwin 516*744bfb21SJohn Baldwin void 517*744bfb21SJohn Baldwin chacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src, const size_t src_len, 518*744bfb21SJohn Baldwin const uint8_t *ad, const size_t ad_len, 519*744bfb21SJohn Baldwin const uint64_t nonce, 520*744bfb21SJohn Baldwin const uint8_t key[CHACHA20POLY1305_KEY_SIZE]) 521*744bfb21SJohn Baldwin { 522*744bfb21SJohn Baldwin struct poly1305_ctx poly1305_state; 523*744bfb21SJohn Baldwin struct chacha20_ctx chacha20_state; 524*744bfb21SJohn Baldwin union { 525*744bfb21SJohn Baldwin uint8_t block0[POLY1305_KEY_SIZE]; 526*744bfb21SJohn Baldwin uint64_t lens[2]; 527*744bfb21SJohn Baldwin } b = { { 0 } }; 528*744bfb21SJohn Baldwin 529*744bfb21SJohn Baldwin chacha20_init(&chacha20_state, key, nonce); 530*744bfb21SJohn Baldwin chacha20(&chacha20_state, b.block0, b.block0, sizeof(b.block0)); 531*744bfb21SJohn Baldwin poly1305_init(&poly1305_state, b.block0); 532*744bfb21SJohn Baldwin 533*744bfb21SJohn Baldwin poly1305_update(&poly1305_state, ad, ad_len); 534*744bfb21SJohn Baldwin poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf); 535*744bfb21SJohn Baldwin 536*744bfb21SJohn Baldwin chacha20(&chacha20_state, dst, src, src_len); 537*744bfb21SJohn Baldwin 538*744bfb21SJohn Baldwin poly1305_update(&poly1305_state, dst, src_len); 539*744bfb21SJohn Baldwin poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf); 540*744bfb21SJohn Baldwin 541*744bfb21SJohn Baldwin b.lens[0] = cpu_to_le64(ad_len); 542*744bfb21SJohn Baldwin b.lens[1] = cpu_to_le64(src_len); 543*744bfb21SJohn Baldwin poly1305_update(&poly1305_state, (uint8_t *)b.lens, sizeof(b.lens)); 544*744bfb21SJohn Baldwin 545*744bfb21SJohn Baldwin poly1305_final(&poly1305_state, dst + src_len); 546*744bfb21SJohn Baldwin 547*744bfb21SJohn Baldwin explicit_bzero(&chacha20_state, sizeof(chacha20_state)); 548*744bfb21SJohn Baldwin explicit_bzero(&b, sizeof(b)); 549*744bfb21SJohn Baldwin } 550*744bfb21SJohn Baldwin 551*744bfb21SJohn Baldwin bool 552*744bfb21SJohn Baldwin chacha20poly1305_decrypt(uint8_t *dst, const uint8_t *src, const size_t src_len, 553*744bfb21SJohn Baldwin const uint8_t *ad, const size_t ad_len, 554*744bfb21SJohn Baldwin const uint64_t nonce, 555*744bfb21SJohn Baldwin const uint8_t key[CHACHA20POLY1305_KEY_SIZE]) 556*744bfb21SJohn Baldwin { 557*744bfb21SJohn Baldwin struct poly1305_ctx poly1305_state; 558*744bfb21SJohn Baldwin struct chacha20_ctx chacha20_state; 559*744bfb21SJohn Baldwin bool ret; 560*744bfb21SJohn Baldwin size_t dst_len; 561*744bfb21SJohn Baldwin union { 562*744bfb21SJohn Baldwin uint8_t block0[POLY1305_KEY_SIZE]; 563*744bfb21SJohn Baldwin uint8_t mac[POLY1305_MAC_SIZE]; 564*744bfb21SJohn Baldwin uint64_t lens[2]; 565*744bfb21SJohn Baldwin } b = { { 0 } }; 566*744bfb21SJohn Baldwin 567*744bfb21SJohn Baldwin if (src_len < POLY1305_MAC_SIZE) 568*744bfb21SJohn Baldwin return false; 569*744bfb21SJohn Baldwin 570*744bfb21SJohn Baldwin chacha20_init(&chacha20_state, key, nonce); 571*744bfb21SJohn Baldwin chacha20(&chacha20_state, b.block0, b.block0, sizeof(b.block0)); 572*744bfb21SJohn Baldwin poly1305_init(&poly1305_state, b.block0); 573*744bfb21SJohn Baldwin 574*744bfb21SJohn Baldwin poly1305_update(&poly1305_state, ad, ad_len); 575*744bfb21SJohn Baldwin poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf); 576*744bfb21SJohn Baldwin 577*744bfb21SJohn Baldwin dst_len = src_len - POLY1305_MAC_SIZE; 578*744bfb21SJohn Baldwin poly1305_update(&poly1305_state, src, dst_len); 579*744bfb21SJohn Baldwin poly1305_update(&poly1305_state, pad0, (0x10 - dst_len) & 0xf); 580*744bfb21SJohn Baldwin 581*744bfb21SJohn Baldwin b.lens[0] = cpu_to_le64(ad_len); 582*744bfb21SJohn Baldwin b.lens[1] = cpu_to_le64(dst_len); 583*744bfb21SJohn Baldwin poly1305_update(&poly1305_state, (uint8_t *)b.lens, sizeof(b.lens)); 584*744bfb21SJohn Baldwin 585*744bfb21SJohn Baldwin poly1305_final(&poly1305_state, b.mac); 586*744bfb21SJohn Baldwin 587*744bfb21SJohn Baldwin ret = timingsafe_bcmp(b.mac, src + dst_len, POLY1305_MAC_SIZE) == 0; 588*744bfb21SJohn Baldwin if (ret) 589*744bfb21SJohn Baldwin chacha20(&chacha20_state, dst, src, dst_len); 590*744bfb21SJohn Baldwin 591*744bfb21SJohn Baldwin explicit_bzero(&chacha20_state, sizeof(chacha20_state)); 592*744bfb21SJohn Baldwin explicit_bzero(&b, sizeof(b)); 593*744bfb21SJohn Baldwin 594*744bfb21SJohn Baldwin return ret; 595*744bfb21SJohn Baldwin } 596*744bfb21SJohn Baldwin 597*744bfb21SJohn Baldwin void 598*744bfb21SJohn Baldwin xchacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src, 599*744bfb21SJohn Baldwin const size_t src_len, const uint8_t *ad, 600*744bfb21SJohn Baldwin const size_t ad_len, 601*744bfb21SJohn Baldwin const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE], 602*744bfb21SJohn Baldwin const uint8_t key[CHACHA20POLY1305_KEY_SIZE]) 603*744bfb21SJohn Baldwin { 604*744bfb21SJohn Baldwin uint32_t derived_key[CHACHA20_KEY_WORDS]; 605*744bfb21SJohn Baldwin 606*744bfb21SJohn Baldwin hchacha20(derived_key, nonce, key); 607*744bfb21SJohn Baldwin cpu_to_le32_array(derived_key, ARRAY_SIZE(derived_key)); 608*744bfb21SJohn Baldwin chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, 609*744bfb21SJohn Baldwin get_unaligned_le64(nonce + 16), 610*744bfb21SJohn Baldwin (uint8_t *)derived_key); 611*744bfb21SJohn Baldwin explicit_bzero(derived_key, CHACHA20POLY1305_KEY_SIZE); 612*744bfb21SJohn Baldwin } 613*744bfb21SJohn Baldwin 614*744bfb21SJohn Baldwin bool 615*744bfb21SJohn Baldwin xchacha20poly1305_decrypt(uint8_t *dst, const uint8_t *src, 616*744bfb21SJohn Baldwin const size_t src_len, const uint8_t *ad, 617*744bfb21SJohn Baldwin const size_t ad_len, 618*744bfb21SJohn Baldwin const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE], 619*744bfb21SJohn Baldwin const uint8_t key[CHACHA20POLY1305_KEY_SIZE]) 620*744bfb21SJohn Baldwin { 621*744bfb21SJohn Baldwin bool ret; 622*744bfb21SJohn Baldwin uint32_t derived_key[CHACHA20_KEY_WORDS]; 623*744bfb21SJohn Baldwin 624*744bfb21SJohn Baldwin hchacha20(derived_key, nonce, key); 625*744bfb21SJohn Baldwin cpu_to_le32_array(derived_key, ARRAY_SIZE(derived_key)); 626*744bfb21SJohn Baldwin ret = chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len, 627*744bfb21SJohn Baldwin get_unaligned_le64(nonce + 16), 628*744bfb21SJohn Baldwin (uint8_t *)derived_key); 629*744bfb21SJohn Baldwin explicit_bzero(derived_key, CHACHA20POLY1305_KEY_SIZE); 630*744bfb21SJohn Baldwin return ret; 631*744bfb21SJohn Baldwin } 632*744bfb21SJohn Baldwin #endif 633*744bfb21SJohn Baldwin 634*744bfb21SJohn Baldwin #ifdef COMPAT_NEED_CHACHA20POLY1305_MBUF 635*744bfb21SJohn Baldwin static inline int 636*744bfb21SJohn Baldwin chacha20poly1305_crypt_mbuf(struct mbuf *m0, uint64_t nonce, 637*744bfb21SJohn Baldwin const uint8_t key[CHACHA20POLY1305_KEY_SIZE], bool encrypt) 638*744bfb21SJohn Baldwin { 639*744bfb21SJohn Baldwin struct poly1305_ctx poly1305_state; 640*744bfb21SJohn Baldwin struct chacha20_ctx chacha20_state; 641*744bfb21SJohn Baldwin uint8_t *buf, mbuf_mac[POLY1305_MAC_SIZE]; 642*744bfb21SJohn Baldwin size_t len, leftover = 0; 643*744bfb21SJohn Baldwin struct mbuf *m; 644*744bfb21SJohn Baldwin int ret; 645*744bfb21SJohn Baldwin union { 646*744bfb21SJohn Baldwin uint32_t stream[CHACHA20_BLOCK_WORDS]; 647*744bfb21SJohn Baldwin uint8_t block0[POLY1305_KEY_SIZE]; 648*744bfb21SJohn Baldwin uint8_t mac[POLY1305_MAC_SIZE]; 649*744bfb21SJohn Baldwin uint64_t lens[2]; 650*744bfb21SJohn Baldwin } b = { { 0 } }; 651*744bfb21SJohn Baldwin 652*744bfb21SJohn Baldwin if (!encrypt) { 653*744bfb21SJohn Baldwin if (m0->m_pkthdr.len < POLY1305_MAC_SIZE) 654*744bfb21SJohn Baldwin return EMSGSIZE; 655*744bfb21SJohn Baldwin m_copydata(m0, m0->m_pkthdr.len - POLY1305_MAC_SIZE, POLY1305_MAC_SIZE, mbuf_mac); 656*744bfb21SJohn Baldwin m_adj(m0, -POLY1305_MAC_SIZE); 657*744bfb21SJohn Baldwin } 658*744bfb21SJohn Baldwin 659*744bfb21SJohn Baldwin chacha20_init(&chacha20_state, key, nonce); 660*744bfb21SJohn Baldwin chacha20(&chacha20_state, b.block0, b.block0, sizeof(b.block0)); 661*744bfb21SJohn Baldwin poly1305_init(&poly1305_state, b.block0); 662*744bfb21SJohn Baldwin 663*744bfb21SJohn Baldwin for (m = m0; m; m = m->m_next) { 664*744bfb21SJohn Baldwin len = m->m_len; 665*744bfb21SJohn Baldwin buf = m->m_data; 666*744bfb21SJohn Baldwin 667*744bfb21SJohn Baldwin if (!encrypt) 668*744bfb21SJohn Baldwin poly1305_update(&poly1305_state, m->m_data, m->m_len); 669*744bfb21SJohn Baldwin 670*744bfb21SJohn Baldwin if (leftover != 0) { 671*744bfb21SJohn Baldwin size_t l = min(len, leftover); 672*744bfb21SJohn Baldwin xor_cpy(buf, buf, ((uint8_t *)b.stream) + (CHACHA20_BLOCK_SIZE - leftover), l); 673*744bfb21SJohn Baldwin leftover -= l; 674*744bfb21SJohn Baldwin buf += l; 675*744bfb21SJohn Baldwin len -= l; 676*744bfb21SJohn Baldwin } 677*744bfb21SJohn Baldwin 678*744bfb21SJohn Baldwin while (len >= CHACHA20_BLOCK_SIZE) { 679*744bfb21SJohn Baldwin chacha20_block(&chacha20_state, b.stream); 680*744bfb21SJohn Baldwin xor_cpy(buf, buf, (uint8_t *)b.stream, CHACHA20_BLOCK_SIZE); 681*744bfb21SJohn Baldwin buf += CHACHA20_BLOCK_SIZE; 682*744bfb21SJohn Baldwin len -= CHACHA20_BLOCK_SIZE; 683*744bfb21SJohn Baldwin } 684*744bfb21SJohn Baldwin 685*744bfb21SJohn Baldwin if (len) { 686*744bfb21SJohn Baldwin chacha20_block(&chacha20_state, b.stream); 687*744bfb21SJohn Baldwin xor_cpy(buf, buf, (uint8_t *)b.stream, len); 688*744bfb21SJohn Baldwin leftover = CHACHA20_BLOCK_SIZE - len; 689*744bfb21SJohn Baldwin } 690*744bfb21SJohn Baldwin 691*744bfb21SJohn Baldwin if (encrypt) 692*744bfb21SJohn Baldwin poly1305_update(&poly1305_state, m->m_data, m->m_len); 693*744bfb21SJohn Baldwin } 694*744bfb21SJohn Baldwin poly1305_update(&poly1305_state, pad0, (0x10 - m0->m_pkthdr.len) & 0xf); 695*744bfb21SJohn Baldwin 696*744bfb21SJohn Baldwin b.lens[0] = 0; 697*744bfb21SJohn Baldwin b.lens[1] = cpu_to_le64(m0->m_pkthdr.len); 698*744bfb21SJohn Baldwin poly1305_update(&poly1305_state, (uint8_t *)b.lens, sizeof(b.lens)); 699*744bfb21SJohn Baldwin 700*744bfb21SJohn Baldwin poly1305_final(&poly1305_state, b.mac); 701*744bfb21SJohn Baldwin 702*744bfb21SJohn Baldwin if (encrypt) 703*744bfb21SJohn Baldwin ret = m_append(m0, POLY1305_MAC_SIZE, b.mac) ? 0 : ENOMEM; 704*744bfb21SJohn Baldwin else 705*744bfb21SJohn Baldwin ret = timingsafe_bcmp(b.mac, mbuf_mac, POLY1305_MAC_SIZE) == 0 ? 0 : EBADMSG; 706*744bfb21SJohn Baldwin 707*744bfb21SJohn Baldwin explicit_bzero(&chacha20_state, sizeof(chacha20_state)); 708*744bfb21SJohn Baldwin explicit_bzero(&b, sizeof(b)); 709*744bfb21SJohn Baldwin 710*744bfb21SJohn Baldwin return ret; 711*744bfb21SJohn Baldwin } 712*744bfb21SJohn Baldwin 713*744bfb21SJohn Baldwin int 714*744bfb21SJohn Baldwin chacha20poly1305_encrypt_mbuf(struct mbuf *m, const uint64_t nonce, 715*744bfb21SJohn Baldwin const uint8_t key[CHACHA20POLY1305_KEY_SIZE]) 716*744bfb21SJohn Baldwin { 717*744bfb21SJohn Baldwin return chacha20poly1305_crypt_mbuf(m, nonce, key, true); 718*744bfb21SJohn Baldwin } 719*744bfb21SJohn Baldwin 720*744bfb21SJohn Baldwin int 721*744bfb21SJohn Baldwin chacha20poly1305_decrypt_mbuf(struct mbuf *m, const uint64_t nonce, 722*744bfb21SJohn Baldwin const uint8_t key[CHACHA20POLY1305_KEY_SIZE]) 723*744bfb21SJohn Baldwin { 724*744bfb21SJohn Baldwin return chacha20poly1305_crypt_mbuf(m, nonce, key, false); 725*744bfb21SJohn Baldwin } 726*744bfb21SJohn Baldwin #else 727*744bfb21SJohn Baldwin static int 728*744bfb21SJohn Baldwin crypto_callback(struct cryptop *crp) 729*744bfb21SJohn Baldwin { 730*744bfb21SJohn Baldwin return (0); 731*744bfb21SJohn Baldwin } 732*744bfb21SJohn Baldwin 733*744bfb21SJohn Baldwin int 734*744bfb21SJohn Baldwin chacha20poly1305_encrypt_mbuf(struct mbuf *m, const uint64_t nonce, 735*744bfb21SJohn Baldwin const uint8_t key[CHACHA20POLY1305_KEY_SIZE]) 736*744bfb21SJohn Baldwin { 737*744bfb21SJohn Baldwin static const char blank_tag[POLY1305_HASH_LEN]; 738*744bfb21SJohn Baldwin struct cryptop crp; 739*744bfb21SJohn Baldwin int ret; 740*744bfb21SJohn Baldwin 741*744bfb21SJohn Baldwin if (!m_append(m, POLY1305_HASH_LEN, blank_tag)) 742*744bfb21SJohn Baldwin return (ENOMEM); 743*744bfb21SJohn Baldwin crypto_initreq(&crp, chacha20_poly1305_sid); 744*744bfb21SJohn Baldwin crp.crp_op = CRYPTO_OP_ENCRYPT | CRYPTO_OP_COMPUTE_DIGEST; 745*744bfb21SJohn Baldwin crp.crp_flags = CRYPTO_F_IV_SEPARATE | CRYPTO_F_CBIMM; 746*744bfb21SJohn Baldwin crypto_use_mbuf(&crp, m); 747*744bfb21SJohn Baldwin crp.crp_payload_length = m->m_pkthdr.len - POLY1305_HASH_LEN; 748*744bfb21SJohn Baldwin crp.crp_digest_start = crp.crp_payload_length; 749*744bfb21SJohn Baldwin le64enc(crp.crp_iv, nonce); 750*744bfb21SJohn Baldwin crp.crp_cipher_key = key; 751*744bfb21SJohn Baldwin crp.crp_callback = crypto_callback; 752*744bfb21SJohn Baldwin ret = crypto_dispatch(&crp); 753*744bfb21SJohn Baldwin crypto_destroyreq(&crp); 754*744bfb21SJohn Baldwin return (ret); 755*744bfb21SJohn Baldwin } 756*744bfb21SJohn Baldwin 757*744bfb21SJohn Baldwin int 758*744bfb21SJohn Baldwin chacha20poly1305_decrypt_mbuf(struct mbuf *m, const uint64_t nonce, 759*744bfb21SJohn Baldwin const uint8_t key[CHACHA20POLY1305_KEY_SIZE]) 760*744bfb21SJohn Baldwin { 761*744bfb21SJohn Baldwin struct cryptop crp; 762*744bfb21SJohn Baldwin int ret; 763*744bfb21SJohn Baldwin 764*744bfb21SJohn Baldwin if (m->m_pkthdr.len < POLY1305_HASH_LEN) 765*744bfb21SJohn Baldwin return (EMSGSIZE); 766*744bfb21SJohn Baldwin crypto_initreq(&crp, chacha20_poly1305_sid); 767*744bfb21SJohn Baldwin crp.crp_op = CRYPTO_OP_DECRYPT | CRYPTO_OP_VERIFY_DIGEST; 768*744bfb21SJohn Baldwin crp.crp_flags = CRYPTO_F_IV_SEPARATE | CRYPTO_F_CBIMM; 769*744bfb21SJohn Baldwin crypto_use_mbuf(&crp, m); 770*744bfb21SJohn Baldwin crp.crp_payload_length = m->m_pkthdr.len - POLY1305_HASH_LEN; 771*744bfb21SJohn Baldwin crp.crp_digest_start = crp.crp_payload_length; 772*744bfb21SJohn Baldwin le64enc(crp.crp_iv, nonce); 773*744bfb21SJohn Baldwin crp.crp_cipher_key = key; 774*744bfb21SJohn Baldwin crp.crp_callback = crypto_callback; 775*744bfb21SJohn Baldwin ret = crypto_dispatch(&crp); 776*744bfb21SJohn Baldwin crypto_destroyreq(&crp); 777*744bfb21SJohn Baldwin if (ret) 778*744bfb21SJohn Baldwin return (ret); 779*744bfb21SJohn Baldwin m_adj(m, -POLY1305_HASH_LEN); 780*744bfb21SJohn Baldwin return (0); 781*744bfb21SJohn Baldwin } 782*744bfb21SJohn Baldwin #endif 783*744bfb21SJohn Baldwin 784*744bfb21SJohn Baldwin #ifdef COMPAT_NEED_BLAKE2S 785*744bfb21SJohn Baldwin static const uint32_t blake2s_iv[8] = { 786*744bfb21SJohn Baldwin 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, 787*744bfb21SJohn Baldwin 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL 788*744bfb21SJohn Baldwin }; 789*744bfb21SJohn Baldwin 790*744bfb21SJohn Baldwin static const uint8_t blake2s_sigma[10][16] = { 791*744bfb21SJohn Baldwin { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, 792*744bfb21SJohn Baldwin { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, 793*744bfb21SJohn Baldwin { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, 794*744bfb21SJohn Baldwin { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, 795*744bfb21SJohn Baldwin { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, 796*744bfb21SJohn Baldwin { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, 797*744bfb21SJohn Baldwin { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, 798*744bfb21SJohn Baldwin { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, 799*744bfb21SJohn Baldwin { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, 800*744bfb21SJohn Baldwin { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, 801*744bfb21SJohn Baldwin }; 802*744bfb21SJohn Baldwin 803*744bfb21SJohn Baldwin static inline void blake2s_set_lastblock(struct blake2s_state *state) 804*744bfb21SJohn Baldwin { 805*744bfb21SJohn Baldwin state->f[0] = -1; 806*744bfb21SJohn Baldwin } 807*744bfb21SJohn Baldwin 808*744bfb21SJohn Baldwin static inline void blake2s_increment_counter(struct blake2s_state *state, 809*744bfb21SJohn Baldwin const uint32_t inc) 810*744bfb21SJohn Baldwin { 811*744bfb21SJohn Baldwin state->t[0] += inc; 812*744bfb21SJohn Baldwin state->t[1] += (state->t[0] < inc); 813*744bfb21SJohn Baldwin } 814*744bfb21SJohn Baldwin 815*744bfb21SJohn Baldwin static inline void blake2s_init_param(struct blake2s_state *state, 816*744bfb21SJohn Baldwin const uint32_t param) 817*744bfb21SJohn Baldwin { 818*744bfb21SJohn Baldwin int i; 819*744bfb21SJohn Baldwin 820*744bfb21SJohn Baldwin memset(state, 0, sizeof(*state)); 821*744bfb21SJohn Baldwin for (i = 0; i < 8; ++i) 822*744bfb21SJohn Baldwin state->h[i] = blake2s_iv[i]; 823*744bfb21SJohn Baldwin state->h[0] ^= param; 824*744bfb21SJohn Baldwin } 825*744bfb21SJohn Baldwin 826*744bfb21SJohn Baldwin void blake2s_init(struct blake2s_state *state, const size_t outlen) 827*744bfb21SJohn Baldwin { 828*744bfb21SJohn Baldwin blake2s_init_param(state, 0x01010000 | outlen); 829*744bfb21SJohn Baldwin state->outlen = outlen; 830*744bfb21SJohn Baldwin } 831*744bfb21SJohn Baldwin 832*744bfb21SJohn Baldwin void blake2s_init_key(struct blake2s_state *state, const size_t outlen, 833*744bfb21SJohn Baldwin const uint8_t *key, const size_t keylen) 834*744bfb21SJohn Baldwin { 835*744bfb21SJohn Baldwin uint8_t block[BLAKE2S_BLOCK_SIZE] = { 0 }; 836*744bfb21SJohn Baldwin 837*744bfb21SJohn Baldwin blake2s_init_param(state, 0x01010000 | keylen << 8 | outlen); 838*744bfb21SJohn Baldwin state->outlen = outlen; 839*744bfb21SJohn Baldwin memcpy(block, key, keylen); 840*744bfb21SJohn Baldwin blake2s_update(state, block, BLAKE2S_BLOCK_SIZE); 841*744bfb21SJohn Baldwin explicit_bzero(block, BLAKE2S_BLOCK_SIZE); 842*744bfb21SJohn Baldwin } 843*744bfb21SJohn Baldwin 844*744bfb21SJohn Baldwin static inline void blake2s_compress(struct blake2s_state *state, 845*744bfb21SJohn Baldwin const uint8_t *block, size_t nblocks, 846*744bfb21SJohn Baldwin const uint32_t inc) 847*744bfb21SJohn Baldwin { 848*744bfb21SJohn Baldwin uint32_t m[16]; 849*744bfb21SJohn Baldwin uint32_t v[16]; 850*744bfb21SJohn Baldwin int i; 851*744bfb21SJohn Baldwin 852*744bfb21SJohn Baldwin while (nblocks > 0) { 853*744bfb21SJohn Baldwin blake2s_increment_counter(state, inc); 854*744bfb21SJohn Baldwin memcpy(m, block, BLAKE2S_BLOCK_SIZE); 855*744bfb21SJohn Baldwin le32_to_cpu_array(m, ARRAY_SIZE(m)); 856*744bfb21SJohn Baldwin memcpy(v, state->h, 32); 857*744bfb21SJohn Baldwin v[ 8] = blake2s_iv[0]; 858*744bfb21SJohn Baldwin v[ 9] = blake2s_iv[1]; 859*744bfb21SJohn Baldwin v[10] = blake2s_iv[2]; 860*744bfb21SJohn Baldwin v[11] = blake2s_iv[3]; 861*744bfb21SJohn Baldwin v[12] = blake2s_iv[4] ^ state->t[0]; 862*744bfb21SJohn Baldwin v[13] = blake2s_iv[5] ^ state->t[1]; 863*744bfb21SJohn Baldwin v[14] = blake2s_iv[6] ^ state->f[0]; 864*744bfb21SJohn Baldwin v[15] = blake2s_iv[7] ^ state->f[1]; 865*744bfb21SJohn Baldwin 866*744bfb21SJohn Baldwin #define G(r, i, a, b, c, d) do { \ 867*744bfb21SJohn Baldwin a += b + m[blake2s_sigma[r][2 * i + 0]]; \ 868*744bfb21SJohn Baldwin d = ror32(d ^ a, 16); \ 869*744bfb21SJohn Baldwin c += d; \ 870*744bfb21SJohn Baldwin b = ror32(b ^ c, 12); \ 871*744bfb21SJohn Baldwin a += b + m[blake2s_sigma[r][2 * i + 1]]; \ 872*744bfb21SJohn Baldwin d = ror32(d ^ a, 8); \ 873*744bfb21SJohn Baldwin c += d; \ 874*744bfb21SJohn Baldwin b = ror32(b ^ c, 7); \ 875*744bfb21SJohn Baldwin } while (0) 876*744bfb21SJohn Baldwin 877*744bfb21SJohn Baldwin #define ROUND(r) do { \ 878*744bfb21SJohn Baldwin G(r, 0, v[0], v[ 4], v[ 8], v[12]); \ 879*744bfb21SJohn Baldwin G(r, 1, v[1], v[ 5], v[ 9], v[13]); \ 880*744bfb21SJohn Baldwin G(r, 2, v[2], v[ 6], v[10], v[14]); \ 881*744bfb21SJohn Baldwin G(r, 3, v[3], v[ 7], v[11], v[15]); \ 882*744bfb21SJohn Baldwin G(r, 4, v[0], v[ 5], v[10], v[15]); \ 883*744bfb21SJohn Baldwin G(r, 5, v[1], v[ 6], v[11], v[12]); \ 884*744bfb21SJohn Baldwin G(r, 6, v[2], v[ 7], v[ 8], v[13]); \ 885*744bfb21SJohn Baldwin G(r, 7, v[3], v[ 4], v[ 9], v[14]); \ 886*744bfb21SJohn Baldwin } while (0) 887*744bfb21SJohn Baldwin ROUND(0); 888*744bfb21SJohn Baldwin ROUND(1); 889*744bfb21SJohn Baldwin ROUND(2); 890*744bfb21SJohn Baldwin ROUND(3); 891*744bfb21SJohn Baldwin ROUND(4); 892*744bfb21SJohn Baldwin ROUND(5); 893*744bfb21SJohn Baldwin ROUND(6); 894*744bfb21SJohn Baldwin ROUND(7); 895*744bfb21SJohn Baldwin ROUND(8); 896*744bfb21SJohn Baldwin ROUND(9); 897*744bfb21SJohn Baldwin 898*744bfb21SJohn Baldwin #undef G 899*744bfb21SJohn Baldwin #undef ROUND 900*744bfb21SJohn Baldwin 901*744bfb21SJohn Baldwin for (i = 0; i < 8; ++i) 902*744bfb21SJohn Baldwin state->h[i] ^= v[i] ^ v[i + 8]; 903*744bfb21SJohn Baldwin 904*744bfb21SJohn Baldwin block += BLAKE2S_BLOCK_SIZE; 905*744bfb21SJohn Baldwin --nblocks; 906*744bfb21SJohn Baldwin } 907*744bfb21SJohn Baldwin } 908*744bfb21SJohn Baldwin 909*744bfb21SJohn Baldwin void blake2s_update(struct blake2s_state *state, const uint8_t *in, size_t inlen) 910*744bfb21SJohn Baldwin { 911*744bfb21SJohn Baldwin const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; 912*744bfb21SJohn Baldwin 913*744bfb21SJohn Baldwin if (!inlen) 914*744bfb21SJohn Baldwin return; 915*744bfb21SJohn Baldwin if (inlen > fill) { 916*744bfb21SJohn Baldwin memcpy(state->buf + state->buflen, in, fill); 917*744bfb21SJohn Baldwin blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE); 918*744bfb21SJohn Baldwin state->buflen = 0; 919*744bfb21SJohn Baldwin in += fill; 920*744bfb21SJohn Baldwin inlen -= fill; 921*744bfb21SJohn Baldwin } 922*744bfb21SJohn Baldwin if (inlen > BLAKE2S_BLOCK_SIZE) { 923*744bfb21SJohn Baldwin const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); 924*744bfb21SJohn Baldwin /* Hash one less (full) block than strictly possible */ 925*744bfb21SJohn Baldwin blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); 926*744bfb21SJohn Baldwin in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); 927*744bfb21SJohn Baldwin inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); 928*744bfb21SJohn Baldwin } 929*744bfb21SJohn Baldwin memcpy(state->buf + state->buflen, in, inlen); 930*744bfb21SJohn Baldwin state->buflen += inlen; 931*744bfb21SJohn Baldwin } 932*744bfb21SJohn Baldwin 933*744bfb21SJohn Baldwin void blake2s_final(struct blake2s_state *state, uint8_t *out) 934*744bfb21SJohn Baldwin { 935*744bfb21SJohn Baldwin blake2s_set_lastblock(state); 936*744bfb21SJohn Baldwin memset(state->buf + state->buflen, 0, 937*744bfb21SJohn Baldwin BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ 938*744bfb21SJohn Baldwin blake2s_compress(state, state->buf, 1, state->buflen); 939*744bfb21SJohn Baldwin cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); 940*744bfb21SJohn Baldwin memcpy(out, state->h, state->outlen); 941*744bfb21SJohn Baldwin explicit_bzero(state, sizeof(*state)); 942*744bfb21SJohn Baldwin } 943*744bfb21SJohn Baldwin #endif 944*744bfb21SJohn Baldwin 945*744bfb21SJohn Baldwin #ifdef COMPAT_NEED_CURVE25519 946*744bfb21SJohn Baldwin /* Below here is fiat's implementation of x25519. 947*744bfb21SJohn Baldwin * 948*744bfb21SJohn Baldwin * Copyright (C) 2015-2016 The fiat-crypto Authors. 949*744bfb21SJohn Baldwin * Copyright (C) 2018-2021 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 950*744bfb21SJohn Baldwin * 951*744bfb21SJohn Baldwin * This is a machine-generated formally verified implementation of Curve25519 952*744bfb21SJohn Baldwin * ECDH from: <https://github.com/mit-plv/fiat-crypto>. Though originally 953*744bfb21SJohn Baldwin * machine generated, it has been tweaked to be suitable for use in the kernel. 954*744bfb21SJohn Baldwin * It is optimized for 32-bit machines and machines that cannot work efficiently 955*744bfb21SJohn Baldwin * with 128-bit integer types. 956*744bfb21SJohn Baldwin */ 957*744bfb21SJohn Baldwin 958*744bfb21SJohn Baldwin /* fe means field element. Here the field is \Z/(2^255-19). An element t, 959*744bfb21SJohn Baldwin * entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77 960*744bfb21SJohn Baldwin * t[3]+2^102 t[4]+...+2^230 t[9]. 961*744bfb21SJohn Baldwin * fe limbs are bounded by 1.125*2^26,1.125*2^25,1.125*2^26,1.125*2^25,etc. 962*744bfb21SJohn Baldwin * Multiplication and carrying produce fe from fe_loose. 963*744bfb21SJohn Baldwin */ 964*744bfb21SJohn Baldwin typedef struct fe { uint32_t v[10]; } fe; 965*744bfb21SJohn Baldwin 966*744bfb21SJohn Baldwin /* fe_loose limbs are bounded by 3.375*2^26,3.375*2^25,3.375*2^26,3.375*2^25,etc 967*744bfb21SJohn Baldwin * Addition and subtraction produce fe_loose from (fe, fe). 968*744bfb21SJohn Baldwin */ 969*744bfb21SJohn Baldwin typedef struct fe_loose { uint32_t v[10]; } fe_loose; 970*744bfb21SJohn Baldwin 971*744bfb21SJohn Baldwin static inline void fe_frombytes_impl(uint32_t h[10], const uint8_t *s) 972*744bfb21SJohn Baldwin { 973*744bfb21SJohn Baldwin /* Ignores top bit of s. */ 974*744bfb21SJohn Baldwin uint32_t a0 = get_unaligned_le32(s); 975*744bfb21SJohn Baldwin uint32_t a1 = get_unaligned_le32(s+4); 976*744bfb21SJohn Baldwin uint32_t a2 = get_unaligned_le32(s+8); 977*744bfb21SJohn Baldwin uint32_t a3 = get_unaligned_le32(s+12); 978*744bfb21SJohn Baldwin uint32_t a4 = get_unaligned_le32(s+16); 979*744bfb21SJohn Baldwin uint32_t a5 = get_unaligned_le32(s+20); 980*744bfb21SJohn Baldwin uint32_t a6 = get_unaligned_le32(s+24); 981*744bfb21SJohn Baldwin uint32_t a7 = get_unaligned_le32(s+28); 982*744bfb21SJohn Baldwin h[0] = a0&((1<<26)-1); /* 26 used, 32-26 left. 26 */ 983*744bfb21SJohn Baldwin h[1] = (a0>>26) | ((a1&((1<<19)-1))<< 6); /* (32-26) + 19 = 6+19 = 25 */ 984*744bfb21SJohn Baldwin h[2] = (a1>>19) | ((a2&((1<<13)-1))<<13); /* (32-19) + 13 = 13+13 = 26 */ 985*744bfb21SJohn Baldwin h[3] = (a2>>13) | ((a3&((1<< 6)-1))<<19); /* (32-13) + 6 = 19+ 6 = 25 */ 986*744bfb21SJohn Baldwin h[4] = (a3>> 6); /* (32- 6) = 26 */ 987*744bfb21SJohn Baldwin h[5] = a4&((1<<25)-1); /* 25 */ 988*744bfb21SJohn Baldwin h[6] = (a4>>25) | ((a5&((1<<19)-1))<< 7); /* (32-25) + 19 = 7+19 = 26 */ 989*744bfb21SJohn Baldwin h[7] = (a5>>19) | ((a6&((1<<12)-1))<<13); /* (32-19) + 12 = 13+12 = 25 */ 990*744bfb21SJohn Baldwin h[8] = (a6>>12) | ((a7&((1<< 6)-1))<<20); /* (32-12) + 6 = 20+ 6 = 26 */ 991*744bfb21SJohn Baldwin h[9] = (a7>> 6)&((1<<25)-1); /* 25 */ 992*744bfb21SJohn Baldwin } 993*744bfb21SJohn Baldwin 994*744bfb21SJohn Baldwin static inline void fe_frombytes(fe *h, const uint8_t *s) 995*744bfb21SJohn Baldwin { 996*744bfb21SJohn Baldwin fe_frombytes_impl(h->v, s); 997*744bfb21SJohn Baldwin } 998*744bfb21SJohn Baldwin 999*744bfb21SJohn Baldwin static inline uint8_t /*bool*/ 1000*744bfb21SJohn Baldwin addcarryx_u25(uint8_t /*bool*/ c, uint32_t a, uint32_t b, uint32_t *low) 1001*744bfb21SJohn Baldwin { 1002*744bfb21SJohn Baldwin /* This function extracts 25 bits of result and 1 bit of carry 1003*744bfb21SJohn Baldwin * (26 total), so a 32-bit intermediate is sufficient. 1004*744bfb21SJohn Baldwin */ 1005*744bfb21SJohn Baldwin uint32_t x = a + b + c; 1006*744bfb21SJohn Baldwin *low = x & ((1 << 25) - 1); 1007*744bfb21SJohn Baldwin return (x >> 25) & 1; 1008*744bfb21SJohn Baldwin } 1009*744bfb21SJohn Baldwin 1010*744bfb21SJohn Baldwin static inline uint8_t /*bool*/ 1011*744bfb21SJohn Baldwin addcarryx_u26(uint8_t /*bool*/ c, uint32_t a, uint32_t b, uint32_t *low) 1012*744bfb21SJohn Baldwin { 1013*744bfb21SJohn Baldwin /* This function extracts 26 bits of result and 1 bit of carry 1014*744bfb21SJohn Baldwin * (27 total), so a 32-bit intermediate is sufficient. 1015*744bfb21SJohn Baldwin */ 1016*744bfb21SJohn Baldwin uint32_t x = a + b + c; 1017*744bfb21SJohn Baldwin *low = x & ((1 << 26) - 1); 1018*744bfb21SJohn Baldwin return (x >> 26) & 1; 1019*744bfb21SJohn Baldwin } 1020*744bfb21SJohn Baldwin 1021*744bfb21SJohn Baldwin static inline uint8_t /*bool*/ 1022*744bfb21SJohn Baldwin subborrow_u25(uint8_t /*bool*/ c, uint32_t a, uint32_t b, uint32_t *low) 1023*744bfb21SJohn Baldwin { 1024*744bfb21SJohn Baldwin /* This function extracts 25 bits of result and 1 bit of borrow 1025*744bfb21SJohn Baldwin * (26 total), so a 32-bit intermediate is sufficient. 1026*744bfb21SJohn Baldwin */ 1027*744bfb21SJohn Baldwin uint32_t x = a - b - c; 1028*744bfb21SJohn Baldwin *low = x & ((1 << 25) - 1); 1029*744bfb21SJohn Baldwin return x >> 31; 1030*744bfb21SJohn Baldwin } 1031*744bfb21SJohn Baldwin 1032*744bfb21SJohn Baldwin static inline uint8_t /*bool*/ 1033*744bfb21SJohn Baldwin subborrow_u26(uint8_t /*bool*/ c, uint32_t a, uint32_t b, uint32_t *low) 1034*744bfb21SJohn Baldwin { 1035*744bfb21SJohn Baldwin /* This function extracts 26 bits of result and 1 bit of borrow 1036*744bfb21SJohn Baldwin *(27 total), so a 32-bit intermediate is sufficient. 1037*744bfb21SJohn Baldwin */ 1038*744bfb21SJohn Baldwin uint32_t x = a - b - c; 1039*744bfb21SJohn Baldwin *low = x & ((1 << 26) - 1); 1040*744bfb21SJohn Baldwin return x >> 31; 1041*744bfb21SJohn Baldwin } 1042*744bfb21SJohn Baldwin 1043*744bfb21SJohn Baldwin static inline uint32_t cmovznz32(uint32_t t, uint32_t z, uint32_t nz) 1044*744bfb21SJohn Baldwin { 1045*744bfb21SJohn Baldwin t = -!!t; /* all set if nonzero, 0 if 0 */ 1046*744bfb21SJohn Baldwin return (t&nz) | ((~t)&z); 1047*744bfb21SJohn Baldwin } 1048*744bfb21SJohn Baldwin 1049*744bfb21SJohn Baldwin static inline void fe_freeze(uint32_t out[10], const uint32_t in1[10]) 1050*744bfb21SJohn Baldwin { 1051*744bfb21SJohn Baldwin const uint32_t x17 = in1[9]; 1052*744bfb21SJohn Baldwin const uint32_t x18 = in1[8]; 1053*744bfb21SJohn Baldwin const uint32_t x16 = in1[7]; 1054*744bfb21SJohn Baldwin const uint32_t x14 = in1[6]; 1055*744bfb21SJohn Baldwin const uint32_t x12 = in1[5]; 1056*744bfb21SJohn Baldwin const uint32_t x10 = in1[4]; 1057*744bfb21SJohn Baldwin const uint32_t x8 = in1[3]; 1058*744bfb21SJohn Baldwin const uint32_t x6 = in1[2]; 1059*744bfb21SJohn Baldwin const uint32_t x4 = in1[1]; 1060*744bfb21SJohn Baldwin const uint32_t x2 = in1[0]; 1061*744bfb21SJohn Baldwin uint32_t x20; uint8_t/*bool*/ x21 = subborrow_u26(0x0, x2, 0x3ffffed, &x20); 1062*744bfb21SJohn Baldwin uint32_t x23; uint8_t/*bool*/ x24 = subborrow_u25(x21, x4, 0x1ffffff, &x23); 1063*744bfb21SJohn Baldwin uint32_t x26; uint8_t/*bool*/ x27 = subborrow_u26(x24, x6, 0x3ffffff, &x26); 1064*744bfb21SJohn Baldwin uint32_t x29; uint8_t/*bool*/ x30 = subborrow_u25(x27, x8, 0x1ffffff, &x29); 1065*744bfb21SJohn Baldwin uint32_t x32; uint8_t/*bool*/ x33 = subborrow_u26(x30, x10, 0x3ffffff, &x32); 1066*744bfb21SJohn Baldwin uint32_t x35; uint8_t/*bool*/ x36 = subborrow_u25(x33, x12, 0x1ffffff, &x35); 1067*744bfb21SJohn Baldwin uint32_t x38; uint8_t/*bool*/ x39 = subborrow_u26(x36, x14, 0x3ffffff, &x38); 1068*744bfb21SJohn Baldwin uint32_t x41; uint8_t/*bool*/ x42 = subborrow_u25(x39, x16, 0x1ffffff, &x41); 1069*744bfb21SJohn Baldwin uint32_t x44; uint8_t/*bool*/ x45 = subborrow_u26(x42, x18, 0x3ffffff, &x44); 1070*744bfb21SJohn Baldwin uint32_t x47; uint8_t/*bool*/ x48 = subborrow_u25(x45, x17, 0x1ffffff, &x47); 1071*744bfb21SJohn Baldwin uint32_t x49 = cmovznz32(x48, 0x0, 0xffffffff); 1072*744bfb21SJohn Baldwin uint32_t x50 = (x49 & 0x3ffffed); 1073*744bfb21SJohn Baldwin uint32_t x52; uint8_t/*bool*/ x53 = addcarryx_u26(0x0, x20, x50, &x52); 1074*744bfb21SJohn Baldwin uint32_t x54 = (x49 & 0x1ffffff); 1075*744bfb21SJohn Baldwin uint32_t x56; uint8_t/*bool*/ x57 = addcarryx_u25(x53, x23, x54, &x56); 1076*744bfb21SJohn Baldwin uint32_t x58 = (x49 & 0x3ffffff); 1077*744bfb21SJohn Baldwin uint32_t x60; uint8_t/*bool*/ x61 = addcarryx_u26(x57, x26, x58, &x60); 1078*744bfb21SJohn Baldwin uint32_t x62 = (x49 & 0x1ffffff); 1079*744bfb21SJohn Baldwin uint32_t x64; uint8_t/*bool*/ x65 = addcarryx_u25(x61, x29, x62, &x64); 1080*744bfb21SJohn Baldwin uint32_t x66 = (x49 & 0x3ffffff); 1081*744bfb21SJohn Baldwin uint32_t x68; uint8_t/*bool*/ x69 = addcarryx_u26(x65, x32, x66, &x68); 1082*744bfb21SJohn Baldwin uint32_t x70 = (x49 & 0x1ffffff); 1083*744bfb21SJohn Baldwin uint32_t x72; uint8_t/*bool*/ x73 = addcarryx_u25(x69, x35, x70, &x72); 1084*744bfb21SJohn Baldwin uint32_t x74 = (x49 & 0x3ffffff); 1085*744bfb21SJohn Baldwin uint32_t x76; uint8_t/*bool*/ x77 = addcarryx_u26(x73, x38, x74, &x76); 1086*744bfb21SJohn Baldwin uint32_t x78 = (x49 & 0x1ffffff); 1087*744bfb21SJohn Baldwin uint32_t x80; uint8_t/*bool*/ x81 = addcarryx_u25(x77, x41, x78, &x80); 1088*744bfb21SJohn Baldwin uint32_t x82 = (x49 & 0x3ffffff); 1089*744bfb21SJohn Baldwin uint32_t x84; uint8_t/*bool*/ x85 = addcarryx_u26(x81, x44, x82, &x84); 1090*744bfb21SJohn Baldwin uint32_t x86 = (x49 & 0x1ffffff); 1091*744bfb21SJohn Baldwin uint32_t x88; addcarryx_u25(x85, x47, x86, &x88); 1092*744bfb21SJohn Baldwin out[0] = x52; 1093*744bfb21SJohn Baldwin out[1] = x56; 1094*744bfb21SJohn Baldwin out[2] = x60; 1095*744bfb21SJohn Baldwin out[3] = x64; 1096*744bfb21SJohn Baldwin out[4] = x68; 1097*744bfb21SJohn Baldwin out[5] = x72; 1098*744bfb21SJohn Baldwin out[6] = x76; 1099*744bfb21SJohn Baldwin out[7] = x80; 1100*744bfb21SJohn Baldwin out[8] = x84; 1101*744bfb21SJohn Baldwin out[9] = x88; 1102*744bfb21SJohn Baldwin } 1103*744bfb21SJohn Baldwin 1104*744bfb21SJohn Baldwin static inline void fe_tobytes(uint8_t s[32], const fe *f) 1105*744bfb21SJohn Baldwin { 1106*744bfb21SJohn Baldwin uint32_t h[10]; 1107*744bfb21SJohn Baldwin fe_freeze(h, f->v); 1108*744bfb21SJohn Baldwin s[0] = h[0] >> 0; 1109*744bfb21SJohn Baldwin s[1] = h[0] >> 8; 1110*744bfb21SJohn Baldwin s[2] = h[0] >> 16; 1111*744bfb21SJohn Baldwin s[3] = (h[0] >> 24) | (h[1] << 2); 1112*744bfb21SJohn Baldwin s[4] = h[1] >> 6; 1113*744bfb21SJohn Baldwin s[5] = h[1] >> 14; 1114*744bfb21SJohn Baldwin s[6] = (h[1] >> 22) | (h[2] << 3); 1115*744bfb21SJohn Baldwin s[7] = h[2] >> 5; 1116*744bfb21SJohn Baldwin s[8] = h[2] >> 13; 1117*744bfb21SJohn Baldwin s[9] = (h[2] >> 21) | (h[3] << 5); 1118*744bfb21SJohn Baldwin s[10] = h[3] >> 3; 1119*744bfb21SJohn Baldwin s[11] = h[3] >> 11; 1120*744bfb21SJohn Baldwin s[12] = (h[3] >> 19) | (h[4] << 6); 1121*744bfb21SJohn Baldwin s[13] = h[4] >> 2; 1122*744bfb21SJohn Baldwin s[14] = h[4] >> 10; 1123*744bfb21SJohn Baldwin s[15] = h[4] >> 18; 1124*744bfb21SJohn Baldwin s[16] = h[5] >> 0; 1125*744bfb21SJohn Baldwin s[17] = h[5] >> 8; 1126*744bfb21SJohn Baldwin s[18] = h[5] >> 16; 1127*744bfb21SJohn Baldwin s[19] = (h[5] >> 24) | (h[6] << 1); 1128*744bfb21SJohn Baldwin s[20] = h[6] >> 7; 1129*744bfb21SJohn Baldwin s[21] = h[6] >> 15; 1130*744bfb21SJohn Baldwin s[22] = (h[6] >> 23) | (h[7] << 3); 1131*744bfb21SJohn Baldwin s[23] = h[7] >> 5; 1132*744bfb21SJohn Baldwin s[24] = h[7] >> 13; 1133*744bfb21SJohn Baldwin s[25] = (h[7] >> 21) | (h[8] << 4); 1134*744bfb21SJohn Baldwin s[26] = h[8] >> 4; 1135*744bfb21SJohn Baldwin s[27] = h[8] >> 12; 1136*744bfb21SJohn Baldwin s[28] = (h[8] >> 20) | (h[9] << 6); 1137*744bfb21SJohn Baldwin s[29] = h[9] >> 2; 1138*744bfb21SJohn Baldwin s[30] = h[9] >> 10; 1139*744bfb21SJohn Baldwin s[31] = h[9] >> 18; 1140*744bfb21SJohn Baldwin } 1141*744bfb21SJohn Baldwin 1142*744bfb21SJohn Baldwin /* h = f */ 1143*744bfb21SJohn Baldwin static inline void fe_copy(fe *h, const fe *f) 1144*744bfb21SJohn Baldwin { 1145*744bfb21SJohn Baldwin memmove(h, f, sizeof(uint32_t) * 10); 1146*744bfb21SJohn Baldwin } 1147*744bfb21SJohn Baldwin 1148*744bfb21SJohn Baldwin static inline void fe_copy_lt(fe_loose *h, const fe *f) 1149*744bfb21SJohn Baldwin { 1150*744bfb21SJohn Baldwin memmove(h, f, sizeof(uint32_t) * 10); 1151*744bfb21SJohn Baldwin } 1152*744bfb21SJohn Baldwin 1153*744bfb21SJohn Baldwin /* h = 0 */ 1154*744bfb21SJohn Baldwin static inline void fe_0(fe *h) 1155*744bfb21SJohn Baldwin { 1156*744bfb21SJohn Baldwin memset(h, 0, sizeof(uint32_t) * 10); 1157*744bfb21SJohn Baldwin } 1158*744bfb21SJohn Baldwin 1159*744bfb21SJohn Baldwin /* h = 1 */ 1160*744bfb21SJohn Baldwin static inline void fe_1(fe *h) 1161*744bfb21SJohn Baldwin { 1162*744bfb21SJohn Baldwin memset(h, 0, sizeof(uint32_t) * 10); 1163*744bfb21SJohn Baldwin h->v[0] = 1; 1164*744bfb21SJohn Baldwin } 1165*744bfb21SJohn Baldwin 1166*744bfb21SJohn Baldwin static void fe_add_impl(uint32_t out[10], const uint32_t in1[10], const uint32_t in2[10]) 1167*744bfb21SJohn Baldwin { 1168*744bfb21SJohn Baldwin const uint32_t x20 = in1[9]; 1169*744bfb21SJohn Baldwin const uint32_t x21 = in1[8]; 1170*744bfb21SJohn Baldwin const uint32_t x19 = in1[7]; 1171*744bfb21SJohn Baldwin const uint32_t x17 = in1[6]; 1172*744bfb21SJohn Baldwin const uint32_t x15 = in1[5]; 1173*744bfb21SJohn Baldwin const uint32_t x13 = in1[4]; 1174*744bfb21SJohn Baldwin const uint32_t x11 = in1[3]; 1175*744bfb21SJohn Baldwin const uint32_t x9 = in1[2]; 1176*744bfb21SJohn Baldwin const uint32_t x7 = in1[1]; 1177*744bfb21SJohn Baldwin const uint32_t x5 = in1[0]; 1178*744bfb21SJohn Baldwin const uint32_t x38 = in2[9]; 1179*744bfb21SJohn Baldwin const uint32_t x39 = in2[8]; 1180*744bfb21SJohn Baldwin const uint32_t x37 = in2[7]; 1181*744bfb21SJohn Baldwin const uint32_t x35 = in2[6]; 1182*744bfb21SJohn Baldwin const uint32_t x33 = in2[5]; 1183*744bfb21SJohn Baldwin const uint32_t x31 = in2[4]; 1184*744bfb21SJohn Baldwin const uint32_t x29 = in2[3]; 1185*744bfb21SJohn Baldwin const uint32_t x27 = in2[2]; 1186*744bfb21SJohn Baldwin const uint32_t x25 = in2[1]; 1187*744bfb21SJohn Baldwin const uint32_t x23 = in2[0]; 1188*744bfb21SJohn Baldwin out[0] = (x5 + x23); 1189*744bfb21SJohn Baldwin out[1] = (x7 + x25); 1190*744bfb21SJohn Baldwin out[2] = (x9 + x27); 1191*744bfb21SJohn Baldwin out[3] = (x11 + x29); 1192*744bfb21SJohn Baldwin out[4] = (x13 + x31); 1193*744bfb21SJohn Baldwin out[5] = (x15 + x33); 1194*744bfb21SJohn Baldwin out[6] = (x17 + x35); 1195*744bfb21SJohn Baldwin out[7] = (x19 + x37); 1196*744bfb21SJohn Baldwin out[8] = (x21 + x39); 1197*744bfb21SJohn Baldwin out[9] = (x20 + x38); 1198*744bfb21SJohn Baldwin } 1199*744bfb21SJohn Baldwin 1200*744bfb21SJohn Baldwin /* h = f + g 1201*744bfb21SJohn Baldwin * Can overlap h with f or g. 1202*744bfb21SJohn Baldwin */ 1203*744bfb21SJohn Baldwin static inline void fe_add(fe_loose *h, const fe *f, const fe *g) 1204*744bfb21SJohn Baldwin { 1205*744bfb21SJohn Baldwin fe_add_impl(h->v, f->v, g->v); 1206*744bfb21SJohn Baldwin } 1207*744bfb21SJohn Baldwin 1208*744bfb21SJohn Baldwin static void fe_sub_impl(uint32_t out[10], const uint32_t in1[10], const uint32_t in2[10]) 1209*744bfb21SJohn Baldwin { 1210*744bfb21SJohn Baldwin const uint32_t x20 = in1[9]; 1211*744bfb21SJohn Baldwin const uint32_t x21 = in1[8]; 1212*744bfb21SJohn Baldwin const uint32_t x19 = in1[7]; 1213*744bfb21SJohn Baldwin const uint32_t x17 = in1[6]; 1214*744bfb21SJohn Baldwin const uint32_t x15 = in1[5]; 1215*744bfb21SJohn Baldwin const uint32_t x13 = in1[4]; 1216*744bfb21SJohn Baldwin const uint32_t x11 = in1[3]; 1217*744bfb21SJohn Baldwin const uint32_t x9 = in1[2]; 1218*744bfb21SJohn Baldwin const uint32_t x7 = in1[1]; 1219*744bfb21SJohn Baldwin const uint32_t x5 = in1[0]; 1220*744bfb21SJohn Baldwin const uint32_t x38 = in2[9]; 1221*744bfb21SJohn Baldwin const uint32_t x39 = in2[8]; 1222*744bfb21SJohn Baldwin const uint32_t x37 = in2[7]; 1223*744bfb21SJohn Baldwin const uint32_t x35 = in2[6]; 1224*744bfb21SJohn Baldwin const uint32_t x33 = in2[5]; 1225*744bfb21SJohn Baldwin const uint32_t x31 = in2[4]; 1226*744bfb21SJohn Baldwin const uint32_t x29 = in2[3]; 1227*744bfb21SJohn Baldwin const uint32_t x27 = in2[2]; 1228*744bfb21SJohn Baldwin const uint32_t x25 = in2[1]; 1229*744bfb21SJohn Baldwin const uint32_t x23 = in2[0]; 1230*744bfb21SJohn Baldwin out[0] = ((0x7ffffda + x5) - x23); 1231*744bfb21SJohn Baldwin out[1] = ((0x3fffffe + x7) - x25); 1232*744bfb21SJohn Baldwin out[2] = ((0x7fffffe + x9) - x27); 1233*744bfb21SJohn Baldwin out[3] = ((0x3fffffe + x11) - x29); 1234*744bfb21SJohn Baldwin out[4] = ((0x7fffffe + x13) - x31); 1235*744bfb21SJohn Baldwin out[5] = ((0x3fffffe + x15) - x33); 1236*744bfb21SJohn Baldwin out[6] = ((0x7fffffe + x17) - x35); 1237*744bfb21SJohn Baldwin out[7] = ((0x3fffffe + x19) - x37); 1238*744bfb21SJohn Baldwin out[8] = ((0x7fffffe + x21) - x39); 1239*744bfb21SJohn Baldwin out[9] = ((0x3fffffe + x20) - x38); 1240*744bfb21SJohn Baldwin } 1241*744bfb21SJohn Baldwin 1242*744bfb21SJohn Baldwin /* h = f - g 1243*744bfb21SJohn Baldwin * Can overlap h with f or g. 1244*744bfb21SJohn Baldwin */ 1245*744bfb21SJohn Baldwin static inline void fe_sub(fe_loose *h, const fe *f, const fe *g) 1246*744bfb21SJohn Baldwin { 1247*744bfb21SJohn Baldwin fe_sub_impl(h->v, f->v, g->v); 1248*744bfb21SJohn Baldwin } 1249*744bfb21SJohn Baldwin 1250*744bfb21SJohn Baldwin static void fe_mul_impl(uint32_t out[10], const uint32_t in1[10], const uint32_t in2[10]) 1251*744bfb21SJohn Baldwin { 1252*744bfb21SJohn Baldwin const uint32_t x20 = in1[9]; 1253*744bfb21SJohn Baldwin const uint32_t x21 = in1[8]; 1254*744bfb21SJohn Baldwin const uint32_t x19 = in1[7]; 1255*744bfb21SJohn Baldwin const uint32_t x17 = in1[6]; 1256*744bfb21SJohn Baldwin const uint32_t x15 = in1[5]; 1257*744bfb21SJohn Baldwin const uint32_t x13 = in1[4]; 1258*744bfb21SJohn Baldwin const uint32_t x11 = in1[3]; 1259*744bfb21SJohn Baldwin const uint32_t x9 = in1[2]; 1260*744bfb21SJohn Baldwin const uint32_t x7 = in1[1]; 1261*744bfb21SJohn Baldwin const uint32_t x5 = in1[0]; 1262*744bfb21SJohn Baldwin const uint32_t x38 = in2[9]; 1263*744bfb21SJohn Baldwin const uint32_t x39 = in2[8]; 1264*744bfb21SJohn Baldwin const uint32_t x37 = in2[7]; 1265*744bfb21SJohn Baldwin const uint32_t x35 = in2[6]; 1266*744bfb21SJohn Baldwin const uint32_t x33 = in2[5]; 1267*744bfb21SJohn Baldwin const uint32_t x31 = in2[4]; 1268*744bfb21SJohn Baldwin const uint32_t x29 = in2[3]; 1269*744bfb21SJohn Baldwin const uint32_t x27 = in2[2]; 1270*744bfb21SJohn Baldwin const uint32_t x25 = in2[1]; 1271*744bfb21SJohn Baldwin const uint32_t x23 = in2[0]; 1272*744bfb21SJohn Baldwin uint64_t x40 = ((uint64_t)x23 * x5); 1273*744bfb21SJohn Baldwin uint64_t x41 = (((uint64_t)x23 * x7) + ((uint64_t)x25 * x5)); 1274*744bfb21SJohn Baldwin uint64_t x42 = ((((uint64_t)(0x2 * x25) * x7) + ((uint64_t)x23 * x9)) + ((uint64_t)x27 * x5)); 1275*744bfb21SJohn Baldwin uint64_t x43 = (((((uint64_t)x25 * x9) + ((uint64_t)x27 * x7)) + ((uint64_t)x23 * x11)) + ((uint64_t)x29 * x5)); 1276*744bfb21SJohn Baldwin uint64_t x44 = (((((uint64_t)x27 * x9) + (0x2 * (((uint64_t)x25 * x11) + ((uint64_t)x29 * x7)))) + ((uint64_t)x23 * x13)) + ((uint64_t)x31 * x5)); 1277*744bfb21SJohn Baldwin uint64_t x45 = (((((((uint64_t)x27 * x11) + ((uint64_t)x29 * x9)) + ((uint64_t)x25 * x13)) + ((uint64_t)x31 * x7)) + ((uint64_t)x23 * x15)) + ((uint64_t)x33 * x5)); 1278*744bfb21SJohn Baldwin uint64_t x46 = (((((0x2 * ((((uint64_t)x29 * x11) + ((uint64_t)x25 * x15)) + ((uint64_t)x33 * x7))) + ((uint64_t)x27 * x13)) + ((uint64_t)x31 * x9)) + ((uint64_t)x23 * x17)) + ((uint64_t)x35 * x5)); 1279*744bfb21SJohn Baldwin uint64_t x47 = (((((((((uint64_t)x29 * x13) + ((uint64_t)x31 * x11)) + ((uint64_t)x27 * x15)) + ((uint64_t)x33 * x9)) + ((uint64_t)x25 * x17)) + ((uint64_t)x35 * x7)) + ((uint64_t)x23 * x19)) + ((uint64_t)x37 * x5)); 1280*744bfb21SJohn Baldwin uint64_t x48 = (((((((uint64_t)x31 * x13) + (0x2 * (((((uint64_t)x29 * x15) + ((uint64_t)x33 * x11)) + ((uint64_t)x25 * x19)) + ((uint64_t)x37 * x7)))) + ((uint64_t)x27 * x17)) + ((uint64_t)x35 * x9)) + ((uint64_t)x23 * x21)) + ((uint64_t)x39 * x5)); 1281*744bfb21SJohn Baldwin uint64_t x49 = (((((((((((uint64_t)x31 * x15) + ((uint64_t)x33 * x13)) + ((uint64_t)x29 * x17)) + ((uint64_t)x35 * x11)) + ((uint64_t)x27 * x19)) + ((uint64_t)x37 * x9)) + ((uint64_t)x25 * x21)) + ((uint64_t)x39 * x7)) + ((uint64_t)x23 * x20)) + ((uint64_t)x38 * x5)); 1282*744bfb21SJohn Baldwin uint64_t x50 = (((((0x2 * ((((((uint64_t)x33 * x15) + ((uint64_t)x29 * x19)) + ((uint64_t)x37 * x11)) + ((uint64_t)x25 * x20)) + ((uint64_t)x38 * x7))) + ((uint64_t)x31 * x17)) + ((uint64_t)x35 * x13)) + ((uint64_t)x27 * x21)) + ((uint64_t)x39 * x9)); 1283*744bfb21SJohn Baldwin uint64_t x51 = (((((((((uint64_t)x33 * x17) + ((uint64_t)x35 * x15)) + ((uint64_t)x31 * x19)) + ((uint64_t)x37 * x13)) + ((uint64_t)x29 * x21)) + ((uint64_t)x39 * x11)) + ((uint64_t)x27 * x20)) + ((uint64_t)x38 * x9)); 1284*744bfb21SJohn Baldwin uint64_t x52 = (((((uint64_t)x35 * x17) + (0x2 * (((((uint64_t)x33 * x19) + ((uint64_t)x37 * x15)) + ((uint64_t)x29 * x20)) + ((uint64_t)x38 * x11)))) + ((uint64_t)x31 * x21)) + ((uint64_t)x39 * x13)); 1285*744bfb21SJohn Baldwin uint64_t x53 = (((((((uint64_t)x35 * x19) + ((uint64_t)x37 * x17)) + ((uint64_t)x33 * x21)) + ((uint64_t)x39 * x15)) + ((uint64_t)x31 * x20)) + ((uint64_t)x38 * x13)); 1286*744bfb21SJohn Baldwin uint64_t x54 = (((0x2 * ((((uint64_t)x37 * x19) + ((uint64_t)x33 * x20)) + ((uint64_t)x38 * x15))) + ((uint64_t)x35 * x21)) + ((uint64_t)x39 * x17)); 1287*744bfb21SJohn Baldwin uint64_t x55 = (((((uint64_t)x37 * x21) + ((uint64_t)x39 * x19)) + ((uint64_t)x35 * x20)) + ((uint64_t)x38 * x17)); 1288*744bfb21SJohn Baldwin uint64_t x56 = (((uint64_t)x39 * x21) + (0x2 * (((uint64_t)x37 * x20) + ((uint64_t)x38 * x19)))); 1289*744bfb21SJohn Baldwin uint64_t x57 = (((uint64_t)x39 * x20) + ((uint64_t)x38 * x21)); 1290*744bfb21SJohn Baldwin uint64_t x58 = ((uint64_t)(0x2 * x38) * x20); 1291*744bfb21SJohn Baldwin uint64_t x59 = (x48 + (x58 << 0x4)); 1292*744bfb21SJohn Baldwin uint64_t x60 = (x59 + (x58 << 0x1)); 1293*744bfb21SJohn Baldwin uint64_t x61 = (x60 + x58); 1294*744bfb21SJohn Baldwin uint64_t x62 = (x47 + (x57 << 0x4)); 1295*744bfb21SJohn Baldwin uint64_t x63 = (x62 + (x57 << 0x1)); 1296*744bfb21SJohn Baldwin uint64_t x64 = (x63 + x57); 1297*744bfb21SJohn Baldwin uint64_t x65 = (x46 + (x56 << 0x4)); 1298*744bfb21SJohn Baldwin uint64_t x66 = (x65 + (x56 << 0x1)); 1299*744bfb21SJohn Baldwin uint64_t x67 = (x66 + x56); 1300*744bfb21SJohn Baldwin uint64_t x68 = (x45 + (x55 << 0x4)); 1301*744bfb21SJohn Baldwin uint64_t x69 = (x68 + (x55 << 0x1)); 1302*744bfb21SJohn Baldwin uint64_t x70 = (x69 + x55); 1303*744bfb21SJohn Baldwin uint64_t x71 = (x44 + (x54 << 0x4)); 1304*744bfb21SJohn Baldwin uint64_t x72 = (x71 + (x54 << 0x1)); 1305*744bfb21SJohn Baldwin uint64_t x73 = (x72 + x54); 1306*744bfb21SJohn Baldwin uint64_t x74 = (x43 + (x53 << 0x4)); 1307*744bfb21SJohn Baldwin uint64_t x75 = (x74 + (x53 << 0x1)); 1308*744bfb21SJohn Baldwin uint64_t x76 = (x75 + x53); 1309*744bfb21SJohn Baldwin uint64_t x77 = (x42 + (x52 << 0x4)); 1310*744bfb21SJohn Baldwin uint64_t x78 = (x77 + (x52 << 0x1)); 1311*744bfb21SJohn Baldwin uint64_t x79 = (x78 + x52); 1312*744bfb21SJohn Baldwin uint64_t x80 = (x41 + (x51 << 0x4)); 1313*744bfb21SJohn Baldwin uint64_t x81 = (x80 + (x51 << 0x1)); 1314*744bfb21SJohn Baldwin uint64_t x82 = (x81 + x51); 1315*744bfb21SJohn Baldwin uint64_t x83 = (x40 + (x50 << 0x4)); 1316*744bfb21SJohn Baldwin uint64_t x84 = (x83 + (x50 << 0x1)); 1317*744bfb21SJohn Baldwin uint64_t x85 = (x84 + x50); 1318*744bfb21SJohn Baldwin uint64_t x86 = (x85 >> 0x1a); 1319*744bfb21SJohn Baldwin uint32_t x87 = ((uint32_t)x85 & 0x3ffffff); 1320*744bfb21SJohn Baldwin uint64_t x88 = (x86 + x82); 1321*744bfb21SJohn Baldwin uint64_t x89 = (x88 >> 0x19); 1322*744bfb21SJohn Baldwin uint32_t x90 = ((uint32_t)x88 & 0x1ffffff); 1323*744bfb21SJohn Baldwin uint64_t x91 = (x89 + x79); 1324*744bfb21SJohn Baldwin uint64_t x92 = (x91 >> 0x1a); 1325*744bfb21SJohn Baldwin uint32_t x93 = ((uint32_t)x91 & 0x3ffffff); 1326*744bfb21SJohn Baldwin uint64_t x94 = (x92 + x76); 1327*744bfb21SJohn Baldwin uint64_t x95 = (x94 >> 0x19); 1328*744bfb21SJohn Baldwin uint32_t x96 = ((uint32_t)x94 & 0x1ffffff); 1329*744bfb21SJohn Baldwin uint64_t x97 = (x95 + x73); 1330*744bfb21SJohn Baldwin uint64_t x98 = (x97 >> 0x1a); 1331*744bfb21SJohn Baldwin uint32_t x99 = ((uint32_t)x97 & 0x3ffffff); 1332*744bfb21SJohn Baldwin uint64_t x100 = (x98 + x70); 1333*744bfb21SJohn Baldwin uint64_t x101 = (x100 >> 0x19); 1334*744bfb21SJohn Baldwin uint32_t x102 = ((uint32_t)x100 & 0x1ffffff); 1335*744bfb21SJohn Baldwin uint64_t x103 = (x101 + x67); 1336*744bfb21SJohn Baldwin uint64_t x104 = (x103 >> 0x1a); 1337*744bfb21SJohn Baldwin uint32_t x105 = ((uint32_t)x103 & 0x3ffffff); 1338*744bfb21SJohn Baldwin uint64_t x106 = (x104 + x64); 1339*744bfb21SJohn Baldwin uint64_t x107 = (x106 >> 0x19); 1340*744bfb21SJohn Baldwin uint32_t x108 = ((uint32_t)x106 & 0x1ffffff); 1341*744bfb21SJohn Baldwin uint64_t x109 = (x107 + x61); 1342*744bfb21SJohn Baldwin uint64_t x110 = (x109 >> 0x1a); 1343*744bfb21SJohn Baldwin uint32_t x111 = ((uint32_t)x109 & 0x3ffffff); 1344*744bfb21SJohn Baldwin uint64_t x112 = (x110 + x49); 1345*744bfb21SJohn Baldwin uint64_t x113 = (x112 >> 0x19); 1346*744bfb21SJohn Baldwin uint32_t x114 = ((uint32_t)x112 & 0x1ffffff); 1347*744bfb21SJohn Baldwin uint64_t x115 = (x87 + (0x13 * x113)); 1348*744bfb21SJohn Baldwin uint32_t x116 = (uint32_t) (x115 >> 0x1a); 1349*744bfb21SJohn Baldwin uint32_t x117 = ((uint32_t)x115 & 0x3ffffff); 1350*744bfb21SJohn Baldwin uint32_t x118 = (x116 + x90); 1351*744bfb21SJohn Baldwin uint32_t x119 = (x118 >> 0x19); 1352*744bfb21SJohn Baldwin uint32_t x120 = (x118 & 0x1ffffff); 1353*744bfb21SJohn Baldwin out[0] = x117; 1354*744bfb21SJohn Baldwin out[1] = x120; 1355*744bfb21SJohn Baldwin out[2] = (x119 + x93); 1356*744bfb21SJohn Baldwin out[3] = x96; 1357*744bfb21SJohn Baldwin out[4] = x99; 1358*744bfb21SJohn Baldwin out[5] = x102; 1359*744bfb21SJohn Baldwin out[6] = x105; 1360*744bfb21SJohn Baldwin out[7] = x108; 1361*744bfb21SJohn Baldwin out[8] = x111; 1362*744bfb21SJohn Baldwin out[9] = x114; 1363*744bfb21SJohn Baldwin } 1364*744bfb21SJohn Baldwin 1365*744bfb21SJohn Baldwin static inline void fe_mul_ttt(fe *h, const fe *f, const fe *g) 1366*744bfb21SJohn Baldwin { 1367*744bfb21SJohn Baldwin fe_mul_impl(h->v, f->v, g->v); 1368*744bfb21SJohn Baldwin } 1369*744bfb21SJohn Baldwin 1370*744bfb21SJohn Baldwin static inline void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) 1371*744bfb21SJohn Baldwin { 1372*744bfb21SJohn Baldwin fe_mul_impl(h->v, f->v, g->v); 1373*744bfb21SJohn Baldwin } 1374*744bfb21SJohn Baldwin 1375*744bfb21SJohn Baldwin static inline void 1376*744bfb21SJohn Baldwin fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) 1377*744bfb21SJohn Baldwin { 1378*744bfb21SJohn Baldwin fe_mul_impl(h->v, f->v, g->v); 1379*744bfb21SJohn Baldwin } 1380*744bfb21SJohn Baldwin 1381*744bfb21SJohn Baldwin static void fe_sqr_impl(uint32_t out[10], const uint32_t in1[10]) 1382*744bfb21SJohn Baldwin { 1383*744bfb21SJohn Baldwin const uint32_t x17 = in1[9]; 1384*744bfb21SJohn Baldwin const uint32_t x18 = in1[8]; 1385*744bfb21SJohn Baldwin const uint32_t x16 = in1[7]; 1386*744bfb21SJohn Baldwin const uint32_t x14 = in1[6]; 1387*744bfb21SJohn Baldwin const uint32_t x12 = in1[5]; 1388*744bfb21SJohn Baldwin const uint32_t x10 = in1[4]; 1389*744bfb21SJohn Baldwin const uint32_t x8 = in1[3]; 1390*744bfb21SJohn Baldwin const uint32_t x6 = in1[2]; 1391*744bfb21SJohn Baldwin const uint32_t x4 = in1[1]; 1392*744bfb21SJohn Baldwin const uint32_t x2 = in1[0]; 1393*744bfb21SJohn Baldwin uint64_t x19 = ((uint64_t)x2 * x2); 1394*744bfb21SJohn Baldwin uint64_t x20 = ((uint64_t)(0x2 * x2) * x4); 1395*744bfb21SJohn Baldwin uint64_t x21 = (0x2 * (((uint64_t)x4 * x4) + ((uint64_t)x2 * x6))); 1396*744bfb21SJohn Baldwin uint64_t x22 = (0x2 * (((uint64_t)x4 * x6) + ((uint64_t)x2 * x8))); 1397*744bfb21SJohn Baldwin uint64_t x23 = ((((uint64_t)x6 * x6) + ((uint64_t)(0x4 * x4) * x8)) + ((uint64_t)(0x2 * x2) * x10)); 1398*744bfb21SJohn Baldwin uint64_t x24 = (0x2 * ((((uint64_t)x6 * x8) + ((uint64_t)x4 * x10)) + ((uint64_t)x2 * x12))); 1399*744bfb21SJohn Baldwin uint64_t x25 = (0x2 * (((((uint64_t)x8 * x8) + ((uint64_t)x6 * x10)) + ((uint64_t)x2 * x14)) + ((uint64_t)(0x2 * x4) * x12))); 1400*744bfb21SJohn Baldwin uint64_t x26 = (0x2 * (((((uint64_t)x8 * x10) + ((uint64_t)x6 * x12)) + ((uint64_t)x4 * x14)) + ((uint64_t)x2 * x16))); 1401*744bfb21SJohn Baldwin uint64_t x27 = (((uint64_t)x10 * x10) + (0x2 * ((((uint64_t)x6 * x14) + ((uint64_t)x2 * x18)) + (0x2 * (((uint64_t)x4 * x16) + ((uint64_t)x8 * x12)))))); 1402*744bfb21SJohn Baldwin uint64_t x28 = (0x2 * ((((((uint64_t)x10 * x12) + ((uint64_t)x8 * x14)) + ((uint64_t)x6 * x16)) + ((uint64_t)x4 * x18)) + ((uint64_t)x2 * x17))); 1403*744bfb21SJohn Baldwin uint64_t x29 = (0x2 * (((((uint64_t)x12 * x12) + ((uint64_t)x10 * x14)) + ((uint64_t)x6 * x18)) + (0x2 * (((uint64_t)x8 * x16) + ((uint64_t)x4 * x17))))); 1404*744bfb21SJohn Baldwin uint64_t x30 = (0x2 * (((((uint64_t)x12 * x14) + ((uint64_t)x10 * x16)) + ((uint64_t)x8 * x18)) + ((uint64_t)x6 * x17))); 1405*744bfb21SJohn Baldwin uint64_t x31 = (((uint64_t)x14 * x14) + (0x2 * (((uint64_t)x10 * x18) + (0x2 * (((uint64_t)x12 * x16) + ((uint64_t)x8 * x17)))))); 1406*744bfb21SJohn Baldwin uint64_t x32 = (0x2 * ((((uint64_t)x14 * x16) + ((uint64_t)x12 * x18)) + ((uint64_t)x10 * x17))); 1407*744bfb21SJohn Baldwin uint64_t x33 = (0x2 * ((((uint64_t)x16 * x16) + ((uint64_t)x14 * x18)) + ((uint64_t)(0x2 * x12) * x17))); 1408*744bfb21SJohn Baldwin uint64_t x34 = (0x2 * (((uint64_t)x16 * x18) + ((uint64_t)x14 * x17))); 1409*744bfb21SJohn Baldwin uint64_t x35 = (((uint64_t)x18 * x18) + ((uint64_t)(0x4 * x16) * x17)); 1410*744bfb21SJohn Baldwin uint64_t x36 = ((uint64_t)(0x2 * x18) * x17); 1411*744bfb21SJohn Baldwin uint64_t x37 = ((uint64_t)(0x2 * x17) * x17); 1412*744bfb21SJohn Baldwin uint64_t x38 = (x27 + (x37 << 0x4)); 1413*744bfb21SJohn Baldwin uint64_t x39 = (x38 + (x37 << 0x1)); 1414*744bfb21SJohn Baldwin uint64_t x40 = (x39 + x37); 1415*744bfb21SJohn Baldwin uint64_t x41 = (x26 + (x36 << 0x4)); 1416*744bfb21SJohn Baldwin uint64_t x42 = (x41 + (x36 << 0x1)); 1417*744bfb21SJohn Baldwin uint64_t x43 = (x42 + x36); 1418*744bfb21SJohn Baldwin uint64_t x44 = (x25 + (x35 << 0x4)); 1419*744bfb21SJohn Baldwin uint64_t x45 = (x44 + (x35 << 0x1)); 1420*744bfb21SJohn Baldwin uint64_t x46 = (x45 + x35); 1421*744bfb21SJohn Baldwin uint64_t x47 = (x24 + (x34 << 0x4)); 1422*744bfb21SJohn Baldwin uint64_t x48 = (x47 + (x34 << 0x1)); 1423*744bfb21SJohn Baldwin uint64_t x49 = (x48 + x34); 1424*744bfb21SJohn Baldwin uint64_t x50 = (x23 + (x33 << 0x4)); 1425*744bfb21SJohn Baldwin uint64_t x51 = (x50 + (x33 << 0x1)); 1426*744bfb21SJohn Baldwin uint64_t x52 = (x51 + x33); 1427*744bfb21SJohn Baldwin uint64_t x53 = (x22 + (x32 << 0x4)); 1428*744bfb21SJohn Baldwin uint64_t x54 = (x53 + (x32 << 0x1)); 1429*744bfb21SJohn Baldwin uint64_t x55 = (x54 + x32); 1430*744bfb21SJohn Baldwin uint64_t x56 = (x21 + (x31 << 0x4)); 1431*744bfb21SJohn Baldwin uint64_t x57 = (x56 + (x31 << 0x1)); 1432*744bfb21SJohn Baldwin uint64_t x58 = (x57 + x31); 1433*744bfb21SJohn Baldwin uint64_t x59 = (x20 + (x30 << 0x4)); 1434*744bfb21SJohn Baldwin uint64_t x60 = (x59 + (x30 << 0x1)); 1435*744bfb21SJohn Baldwin uint64_t x61 = (x60 + x30); 1436*744bfb21SJohn Baldwin uint64_t x62 = (x19 + (x29 << 0x4)); 1437*744bfb21SJohn Baldwin uint64_t x63 = (x62 + (x29 << 0x1)); 1438*744bfb21SJohn Baldwin uint64_t x64 = (x63 + x29); 1439*744bfb21SJohn Baldwin uint64_t x65 = (x64 >> 0x1a); 1440*744bfb21SJohn Baldwin uint32_t x66 = ((uint32_t)x64 & 0x3ffffff); 1441*744bfb21SJohn Baldwin uint64_t x67 = (x65 + x61); 1442*744bfb21SJohn Baldwin uint64_t x68 = (x67 >> 0x19); 1443*744bfb21SJohn Baldwin uint32_t x69 = ((uint32_t)x67 & 0x1ffffff); 1444*744bfb21SJohn Baldwin uint64_t x70 = (x68 + x58); 1445*744bfb21SJohn Baldwin uint64_t x71 = (x70 >> 0x1a); 1446*744bfb21SJohn Baldwin uint32_t x72 = ((uint32_t)x70 & 0x3ffffff); 1447*744bfb21SJohn Baldwin uint64_t x73 = (x71 + x55); 1448*744bfb21SJohn Baldwin uint64_t x74 = (x73 >> 0x19); 1449*744bfb21SJohn Baldwin uint32_t x75 = ((uint32_t)x73 & 0x1ffffff); 1450*744bfb21SJohn Baldwin uint64_t x76 = (x74 + x52); 1451*744bfb21SJohn Baldwin uint64_t x77 = (x76 >> 0x1a); 1452*744bfb21SJohn Baldwin uint32_t x78 = ((uint32_t)x76 & 0x3ffffff); 1453*744bfb21SJohn Baldwin uint64_t x79 = (x77 + x49); 1454*744bfb21SJohn Baldwin uint64_t x80 = (x79 >> 0x19); 1455*744bfb21SJohn Baldwin uint32_t x81 = ((uint32_t)x79 & 0x1ffffff); 1456*744bfb21SJohn Baldwin uint64_t x82 = (x80 + x46); 1457*744bfb21SJohn Baldwin uint64_t x83 = (x82 >> 0x1a); 1458*744bfb21SJohn Baldwin uint32_t x84 = ((uint32_t)x82 & 0x3ffffff); 1459*744bfb21SJohn Baldwin uint64_t x85 = (x83 + x43); 1460*744bfb21SJohn Baldwin uint64_t x86 = (x85 >> 0x19); 1461*744bfb21SJohn Baldwin uint32_t x87 = ((uint32_t)x85 & 0x1ffffff); 1462*744bfb21SJohn Baldwin uint64_t x88 = (x86 + x40); 1463*744bfb21SJohn Baldwin uint64_t x89 = (x88 >> 0x1a); 1464*744bfb21SJohn Baldwin uint32_t x90 = ((uint32_t)x88 & 0x3ffffff); 1465*744bfb21SJohn Baldwin uint64_t x91 = (x89 + x28); 1466*744bfb21SJohn Baldwin uint64_t x92 = (x91 >> 0x19); 1467*744bfb21SJohn Baldwin uint32_t x93 = ((uint32_t)x91 & 0x1ffffff); 1468*744bfb21SJohn Baldwin uint64_t x94 = (x66 + (0x13 * x92)); 1469*744bfb21SJohn Baldwin uint32_t x95 = (uint32_t) (x94 >> 0x1a); 1470*744bfb21SJohn Baldwin uint32_t x96 = ((uint32_t)x94 & 0x3ffffff); 1471*744bfb21SJohn Baldwin uint32_t x97 = (x95 + x69); 1472*744bfb21SJohn Baldwin uint32_t x98 = (x97 >> 0x19); 1473*744bfb21SJohn Baldwin uint32_t x99 = (x97 & 0x1ffffff); 1474*744bfb21SJohn Baldwin out[0] = x96; 1475*744bfb21SJohn Baldwin out[1] = x99; 1476*744bfb21SJohn Baldwin out[2] = (x98 + x72); 1477*744bfb21SJohn Baldwin out[3] = x75; 1478*744bfb21SJohn Baldwin out[4] = x78; 1479*744bfb21SJohn Baldwin out[5] = x81; 1480*744bfb21SJohn Baldwin out[6] = x84; 1481*744bfb21SJohn Baldwin out[7] = x87; 1482*744bfb21SJohn Baldwin out[8] = x90; 1483*744bfb21SJohn Baldwin out[9] = x93; 1484*744bfb21SJohn Baldwin } 1485*744bfb21SJohn Baldwin 1486*744bfb21SJohn Baldwin static inline void fe_sq_tl(fe *h, const fe_loose *f) 1487*744bfb21SJohn Baldwin { 1488*744bfb21SJohn Baldwin fe_sqr_impl(h->v, f->v); 1489*744bfb21SJohn Baldwin } 1490*744bfb21SJohn Baldwin 1491*744bfb21SJohn Baldwin static inline void fe_sq_tt(fe *h, const fe *f) 1492*744bfb21SJohn Baldwin { 1493*744bfb21SJohn Baldwin fe_sqr_impl(h->v, f->v); 1494*744bfb21SJohn Baldwin } 1495*744bfb21SJohn Baldwin 1496*744bfb21SJohn Baldwin static inline void fe_loose_invert(fe *out, const fe_loose *z) 1497*744bfb21SJohn Baldwin { 1498*744bfb21SJohn Baldwin fe t0; 1499*744bfb21SJohn Baldwin fe t1; 1500*744bfb21SJohn Baldwin fe t2; 1501*744bfb21SJohn Baldwin fe t3; 1502*744bfb21SJohn Baldwin int i; 1503*744bfb21SJohn Baldwin 1504*744bfb21SJohn Baldwin fe_sq_tl(&t0, z); 1505*744bfb21SJohn Baldwin fe_sq_tt(&t1, &t0); 1506*744bfb21SJohn Baldwin for (i = 1; i < 2; ++i) 1507*744bfb21SJohn Baldwin fe_sq_tt(&t1, &t1); 1508*744bfb21SJohn Baldwin fe_mul_tlt(&t1, z, &t1); 1509*744bfb21SJohn Baldwin fe_mul_ttt(&t0, &t0, &t1); 1510*744bfb21SJohn Baldwin fe_sq_tt(&t2, &t0); 1511*744bfb21SJohn Baldwin fe_mul_ttt(&t1, &t1, &t2); 1512*744bfb21SJohn Baldwin fe_sq_tt(&t2, &t1); 1513*744bfb21SJohn Baldwin for (i = 1; i < 5; ++i) 1514*744bfb21SJohn Baldwin fe_sq_tt(&t2, &t2); 1515*744bfb21SJohn Baldwin fe_mul_ttt(&t1, &t2, &t1); 1516*744bfb21SJohn Baldwin fe_sq_tt(&t2, &t1); 1517*744bfb21SJohn Baldwin for (i = 1; i < 10; ++i) 1518*744bfb21SJohn Baldwin fe_sq_tt(&t2, &t2); 1519*744bfb21SJohn Baldwin fe_mul_ttt(&t2, &t2, &t1); 1520*744bfb21SJohn Baldwin fe_sq_tt(&t3, &t2); 1521*744bfb21SJohn Baldwin for (i = 1; i < 20; ++i) 1522*744bfb21SJohn Baldwin fe_sq_tt(&t3, &t3); 1523*744bfb21SJohn Baldwin fe_mul_ttt(&t2, &t3, &t2); 1524*744bfb21SJohn Baldwin fe_sq_tt(&t2, &t2); 1525*744bfb21SJohn Baldwin for (i = 1; i < 10; ++i) 1526*744bfb21SJohn Baldwin fe_sq_tt(&t2, &t2); 1527*744bfb21SJohn Baldwin fe_mul_ttt(&t1, &t2, &t1); 1528*744bfb21SJohn Baldwin fe_sq_tt(&t2, &t1); 1529*744bfb21SJohn Baldwin for (i = 1; i < 50; ++i) 1530*744bfb21SJohn Baldwin fe_sq_tt(&t2, &t2); 1531*744bfb21SJohn Baldwin fe_mul_ttt(&t2, &t2, &t1); 1532*744bfb21SJohn Baldwin fe_sq_tt(&t3, &t2); 1533*744bfb21SJohn Baldwin for (i = 1; i < 100; ++i) 1534*744bfb21SJohn Baldwin fe_sq_tt(&t3, &t3); 1535*744bfb21SJohn Baldwin fe_mul_ttt(&t2, &t3, &t2); 1536*744bfb21SJohn Baldwin fe_sq_tt(&t2, &t2); 1537*744bfb21SJohn Baldwin for (i = 1; i < 50; ++i) 1538*744bfb21SJohn Baldwin fe_sq_tt(&t2, &t2); 1539*744bfb21SJohn Baldwin fe_mul_ttt(&t1, &t2, &t1); 1540*744bfb21SJohn Baldwin fe_sq_tt(&t1, &t1); 1541*744bfb21SJohn Baldwin for (i = 1; i < 5; ++i) 1542*744bfb21SJohn Baldwin fe_sq_tt(&t1, &t1); 1543*744bfb21SJohn Baldwin fe_mul_ttt(out, &t1, &t0); 1544*744bfb21SJohn Baldwin } 1545*744bfb21SJohn Baldwin 1546*744bfb21SJohn Baldwin static inline void fe_invert(fe *out, const fe *z) 1547*744bfb21SJohn Baldwin { 1548*744bfb21SJohn Baldwin fe_loose l; 1549*744bfb21SJohn Baldwin fe_copy_lt(&l, z); 1550*744bfb21SJohn Baldwin fe_loose_invert(out, &l); 1551*744bfb21SJohn Baldwin } 1552*744bfb21SJohn Baldwin 1553*744bfb21SJohn Baldwin /* Replace (f,g) with (g,f) if b == 1; 1554*744bfb21SJohn Baldwin * replace (f,g) with (f,g) if b == 0. 1555*744bfb21SJohn Baldwin * 1556*744bfb21SJohn Baldwin * Preconditions: b in {0,1} 1557*744bfb21SJohn Baldwin */ 1558*744bfb21SJohn Baldwin static inline void fe_cswap(fe *f, fe *g, unsigned int b) 1559*744bfb21SJohn Baldwin { 1560*744bfb21SJohn Baldwin unsigned i; 1561*744bfb21SJohn Baldwin b = 0 - b; 1562*744bfb21SJohn Baldwin for (i = 0; i < 10; i++) { 1563*744bfb21SJohn Baldwin uint32_t x = f->v[i] ^ g->v[i]; 1564*744bfb21SJohn Baldwin x &= b; 1565*744bfb21SJohn Baldwin f->v[i] ^= x; 1566*744bfb21SJohn Baldwin g->v[i] ^= x; 1567*744bfb21SJohn Baldwin } 1568*744bfb21SJohn Baldwin } 1569*744bfb21SJohn Baldwin 1570*744bfb21SJohn Baldwin /* NOTE: based on fiat-crypto fe_mul, edited for in2=121666, 0, 0.*/ 1571*744bfb21SJohn Baldwin static inline void fe_mul_121666_impl(uint32_t out[10], const uint32_t in1[10]) 1572*744bfb21SJohn Baldwin { 1573*744bfb21SJohn Baldwin const uint32_t x20 = in1[9]; 1574*744bfb21SJohn Baldwin const uint32_t x21 = in1[8]; 1575*744bfb21SJohn Baldwin const uint32_t x19 = in1[7]; 1576*744bfb21SJohn Baldwin const uint32_t x17 = in1[6]; 1577*744bfb21SJohn Baldwin const uint32_t x15 = in1[5]; 1578*744bfb21SJohn Baldwin const uint32_t x13 = in1[4]; 1579*744bfb21SJohn Baldwin const uint32_t x11 = in1[3]; 1580*744bfb21SJohn Baldwin const uint32_t x9 = in1[2]; 1581*744bfb21SJohn Baldwin const uint32_t x7 = in1[1]; 1582*744bfb21SJohn Baldwin const uint32_t x5 = in1[0]; 1583*744bfb21SJohn Baldwin const uint32_t x38 = 0; 1584*744bfb21SJohn Baldwin const uint32_t x39 = 0; 1585*744bfb21SJohn Baldwin const uint32_t x37 = 0; 1586*744bfb21SJohn Baldwin const uint32_t x35 = 0; 1587*744bfb21SJohn Baldwin const uint32_t x33 = 0; 1588*744bfb21SJohn Baldwin const uint32_t x31 = 0; 1589*744bfb21SJohn Baldwin const uint32_t x29 = 0; 1590*744bfb21SJohn Baldwin const uint32_t x27 = 0; 1591*744bfb21SJohn Baldwin const uint32_t x25 = 0; 1592*744bfb21SJohn Baldwin const uint32_t x23 = 121666; 1593*744bfb21SJohn Baldwin uint64_t x40 = ((uint64_t)x23 * x5); 1594*744bfb21SJohn Baldwin uint64_t x41 = (((uint64_t)x23 * x7) + ((uint64_t)x25 * x5)); 1595*744bfb21SJohn Baldwin uint64_t x42 = ((((uint64_t)(0x2 * x25) * x7) + ((uint64_t)x23 * x9)) + ((uint64_t)x27 * x5)); 1596*744bfb21SJohn Baldwin uint64_t x43 = (((((uint64_t)x25 * x9) + ((uint64_t)x27 * x7)) + ((uint64_t)x23 * x11)) + ((uint64_t)x29 * x5)); 1597*744bfb21SJohn Baldwin uint64_t x44 = (((((uint64_t)x27 * x9) + (0x2 * (((uint64_t)x25 * x11) + ((uint64_t)x29 * x7)))) + ((uint64_t)x23 * x13)) + ((uint64_t)x31 * x5)); 1598*744bfb21SJohn Baldwin uint64_t x45 = (((((((uint64_t)x27 * x11) + ((uint64_t)x29 * x9)) + ((uint64_t)x25 * x13)) + ((uint64_t)x31 * x7)) + ((uint64_t)x23 * x15)) + ((uint64_t)x33 * x5)); 1599*744bfb21SJohn Baldwin uint64_t x46 = (((((0x2 * ((((uint64_t)x29 * x11) + ((uint64_t)x25 * x15)) + ((uint64_t)x33 * x7))) + ((uint64_t)x27 * x13)) + ((uint64_t)x31 * x9)) + ((uint64_t)x23 * x17)) + ((uint64_t)x35 * x5)); 1600*744bfb21SJohn Baldwin uint64_t x47 = (((((((((uint64_t)x29 * x13) + ((uint64_t)x31 * x11)) + ((uint64_t)x27 * x15)) + ((uint64_t)x33 * x9)) + ((uint64_t)x25 * x17)) + ((uint64_t)x35 * x7)) + ((uint64_t)x23 * x19)) + ((uint64_t)x37 * x5)); 1601*744bfb21SJohn Baldwin uint64_t x48 = (((((((uint64_t)x31 * x13) + (0x2 * (((((uint64_t)x29 * x15) + ((uint64_t)x33 * x11)) + ((uint64_t)x25 * x19)) + ((uint64_t)x37 * x7)))) + ((uint64_t)x27 * x17)) + ((uint64_t)x35 * x9)) + ((uint64_t)x23 * x21)) + ((uint64_t)x39 * x5)); 1602*744bfb21SJohn Baldwin uint64_t x49 = (((((((((((uint64_t)x31 * x15) + ((uint64_t)x33 * x13)) + ((uint64_t)x29 * x17)) + ((uint64_t)x35 * x11)) + ((uint64_t)x27 * x19)) + ((uint64_t)x37 * x9)) + ((uint64_t)x25 * x21)) + ((uint64_t)x39 * x7)) + ((uint64_t)x23 * x20)) + ((uint64_t)x38 * x5)); 1603*744bfb21SJohn Baldwin uint64_t x50 = (((((0x2 * ((((((uint64_t)x33 * x15) + ((uint64_t)x29 * x19)) + ((uint64_t)x37 * x11)) + ((uint64_t)x25 * x20)) + ((uint64_t)x38 * x7))) + ((uint64_t)x31 * x17)) + ((uint64_t)x35 * x13)) + ((uint64_t)x27 * x21)) + ((uint64_t)x39 * x9)); 1604*744bfb21SJohn Baldwin uint64_t x51 = (((((((((uint64_t)x33 * x17) + ((uint64_t)x35 * x15)) + ((uint64_t)x31 * x19)) + ((uint64_t)x37 * x13)) + ((uint64_t)x29 * x21)) + ((uint64_t)x39 * x11)) + ((uint64_t)x27 * x20)) + ((uint64_t)x38 * x9)); 1605*744bfb21SJohn Baldwin uint64_t x52 = (((((uint64_t)x35 * x17) + (0x2 * (((((uint64_t)x33 * x19) + ((uint64_t)x37 * x15)) + ((uint64_t)x29 * x20)) + ((uint64_t)x38 * x11)))) + ((uint64_t)x31 * x21)) + ((uint64_t)x39 * x13)); 1606*744bfb21SJohn Baldwin uint64_t x53 = (((((((uint64_t)x35 * x19) + ((uint64_t)x37 * x17)) + ((uint64_t)x33 * x21)) + ((uint64_t)x39 * x15)) + ((uint64_t)x31 * x20)) + ((uint64_t)x38 * x13)); 1607*744bfb21SJohn Baldwin uint64_t x54 = (((0x2 * ((((uint64_t)x37 * x19) + ((uint64_t)x33 * x20)) + ((uint64_t)x38 * x15))) + ((uint64_t)x35 * x21)) + ((uint64_t)x39 * x17)); 1608*744bfb21SJohn Baldwin uint64_t x55 = (((((uint64_t)x37 * x21) + ((uint64_t)x39 * x19)) + ((uint64_t)x35 * x20)) + ((uint64_t)x38 * x17)); 1609*744bfb21SJohn Baldwin uint64_t x56 = (((uint64_t)x39 * x21) + (0x2 * (((uint64_t)x37 * x20) + ((uint64_t)x38 * x19)))); 1610*744bfb21SJohn Baldwin uint64_t x57 = (((uint64_t)x39 * x20) + ((uint64_t)x38 * x21)); 1611*744bfb21SJohn Baldwin uint64_t x58 = ((uint64_t)(0x2 * x38) * x20); 1612*744bfb21SJohn Baldwin uint64_t x59 = (x48 + (x58 << 0x4)); 1613*744bfb21SJohn Baldwin uint64_t x60 = (x59 + (x58 << 0x1)); 1614*744bfb21SJohn Baldwin uint64_t x61 = (x60 + x58); 1615*744bfb21SJohn Baldwin uint64_t x62 = (x47 + (x57 << 0x4)); 1616*744bfb21SJohn Baldwin uint64_t x63 = (x62 + (x57 << 0x1)); 1617*744bfb21SJohn Baldwin uint64_t x64 = (x63 + x57); 1618*744bfb21SJohn Baldwin uint64_t x65 = (x46 + (x56 << 0x4)); 1619*744bfb21SJohn Baldwin uint64_t x66 = (x65 + (x56 << 0x1)); 1620*744bfb21SJohn Baldwin uint64_t x67 = (x66 + x56); 1621*744bfb21SJohn Baldwin uint64_t x68 = (x45 + (x55 << 0x4)); 1622*744bfb21SJohn Baldwin uint64_t x69 = (x68 + (x55 << 0x1)); 1623*744bfb21SJohn Baldwin uint64_t x70 = (x69 + x55); 1624*744bfb21SJohn Baldwin uint64_t x71 = (x44 + (x54 << 0x4)); 1625*744bfb21SJohn Baldwin uint64_t x72 = (x71 + (x54 << 0x1)); 1626*744bfb21SJohn Baldwin uint64_t x73 = (x72 + x54); 1627*744bfb21SJohn Baldwin uint64_t x74 = (x43 + (x53 << 0x4)); 1628*744bfb21SJohn Baldwin uint64_t x75 = (x74 + (x53 << 0x1)); 1629*744bfb21SJohn Baldwin uint64_t x76 = (x75 + x53); 1630*744bfb21SJohn Baldwin uint64_t x77 = (x42 + (x52 << 0x4)); 1631*744bfb21SJohn Baldwin uint64_t x78 = (x77 + (x52 << 0x1)); 1632*744bfb21SJohn Baldwin uint64_t x79 = (x78 + x52); 1633*744bfb21SJohn Baldwin uint64_t x80 = (x41 + (x51 << 0x4)); 1634*744bfb21SJohn Baldwin uint64_t x81 = (x80 + (x51 << 0x1)); 1635*744bfb21SJohn Baldwin uint64_t x82 = (x81 + x51); 1636*744bfb21SJohn Baldwin uint64_t x83 = (x40 + (x50 << 0x4)); 1637*744bfb21SJohn Baldwin uint64_t x84 = (x83 + (x50 << 0x1)); 1638*744bfb21SJohn Baldwin uint64_t x85 = (x84 + x50); 1639*744bfb21SJohn Baldwin uint64_t x86 = (x85 >> 0x1a); 1640*744bfb21SJohn Baldwin uint32_t x87 = ((uint32_t)x85 & 0x3ffffff); 1641*744bfb21SJohn Baldwin uint64_t x88 = (x86 + x82); 1642*744bfb21SJohn Baldwin uint64_t x89 = (x88 >> 0x19); 1643*744bfb21SJohn Baldwin uint32_t x90 = ((uint32_t)x88 & 0x1ffffff); 1644*744bfb21SJohn Baldwin uint64_t x91 = (x89 + x79); 1645*744bfb21SJohn Baldwin uint64_t x92 = (x91 >> 0x1a); 1646*744bfb21SJohn Baldwin uint32_t x93 = ((uint32_t)x91 & 0x3ffffff); 1647*744bfb21SJohn Baldwin uint64_t x94 = (x92 + x76); 1648*744bfb21SJohn Baldwin uint64_t x95 = (x94 >> 0x19); 1649*744bfb21SJohn Baldwin uint32_t x96 = ((uint32_t)x94 & 0x1ffffff); 1650*744bfb21SJohn Baldwin uint64_t x97 = (x95 + x73); 1651*744bfb21SJohn Baldwin uint64_t x98 = (x97 >> 0x1a); 1652*744bfb21SJohn Baldwin uint32_t x99 = ((uint32_t)x97 & 0x3ffffff); 1653*744bfb21SJohn Baldwin uint64_t x100 = (x98 + x70); 1654*744bfb21SJohn Baldwin uint64_t x101 = (x100 >> 0x19); 1655*744bfb21SJohn Baldwin uint32_t x102 = ((uint32_t)x100 & 0x1ffffff); 1656*744bfb21SJohn Baldwin uint64_t x103 = (x101 + x67); 1657*744bfb21SJohn Baldwin uint64_t x104 = (x103 >> 0x1a); 1658*744bfb21SJohn Baldwin uint32_t x105 = ((uint32_t)x103 & 0x3ffffff); 1659*744bfb21SJohn Baldwin uint64_t x106 = (x104 + x64); 1660*744bfb21SJohn Baldwin uint64_t x107 = (x106 >> 0x19); 1661*744bfb21SJohn Baldwin uint32_t x108 = ((uint32_t)x106 & 0x1ffffff); 1662*744bfb21SJohn Baldwin uint64_t x109 = (x107 + x61); 1663*744bfb21SJohn Baldwin uint64_t x110 = (x109 >> 0x1a); 1664*744bfb21SJohn Baldwin uint32_t x111 = ((uint32_t)x109 & 0x3ffffff); 1665*744bfb21SJohn Baldwin uint64_t x112 = (x110 + x49); 1666*744bfb21SJohn Baldwin uint64_t x113 = (x112 >> 0x19); 1667*744bfb21SJohn Baldwin uint32_t x114 = ((uint32_t)x112 & 0x1ffffff); 1668*744bfb21SJohn Baldwin uint64_t x115 = (x87 + (0x13 * x113)); 1669*744bfb21SJohn Baldwin uint32_t x116 = (uint32_t) (x115 >> 0x1a); 1670*744bfb21SJohn Baldwin uint32_t x117 = ((uint32_t)x115 & 0x3ffffff); 1671*744bfb21SJohn Baldwin uint32_t x118 = (x116 + x90); 1672*744bfb21SJohn Baldwin uint32_t x119 = (x118 >> 0x19); 1673*744bfb21SJohn Baldwin uint32_t x120 = (x118 & 0x1ffffff); 1674*744bfb21SJohn Baldwin out[0] = x117; 1675*744bfb21SJohn Baldwin out[1] = x120; 1676*744bfb21SJohn Baldwin out[2] = (x119 + x93); 1677*744bfb21SJohn Baldwin out[3] = x96; 1678*744bfb21SJohn Baldwin out[4] = x99; 1679*744bfb21SJohn Baldwin out[5] = x102; 1680*744bfb21SJohn Baldwin out[6] = x105; 1681*744bfb21SJohn Baldwin out[7] = x108; 1682*744bfb21SJohn Baldwin out[8] = x111; 1683*744bfb21SJohn Baldwin out[9] = x114; 1684*744bfb21SJohn Baldwin } 1685*744bfb21SJohn Baldwin 1686*744bfb21SJohn Baldwin static inline void fe_mul121666(fe *h, const fe_loose *f) 1687*744bfb21SJohn Baldwin { 1688*744bfb21SJohn Baldwin fe_mul_121666_impl(h->v, f->v); 1689*744bfb21SJohn Baldwin } 1690*744bfb21SJohn Baldwin 1691*744bfb21SJohn Baldwin static const uint8_t curve25519_null_point[CURVE25519_KEY_SIZE]; 1692*744bfb21SJohn Baldwin 1693*744bfb21SJohn Baldwin bool curve25519(uint8_t out[CURVE25519_KEY_SIZE], 1694*744bfb21SJohn Baldwin const uint8_t scalar[CURVE25519_KEY_SIZE], 1695*744bfb21SJohn Baldwin const uint8_t point[CURVE25519_KEY_SIZE]) 1696*744bfb21SJohn Baldwin { 1697*744bfb21SJohn Baldwin fe x1, x2, z2, x3, z3; 1698*744bfb21SJohn Baldwin fe_loose x2l, z2l, x3l; 1699*744bfb21SJohn Baldwin unsigned swap = 0; 1700*744bfb21SJohn Baldwin int pos; 1701*744bfb21SJohn Baldwin uint8_t e[32]; 1702*744bfb21SJohn Baldwin 1703*744bfb21SJohn Baldwin memcpy(e, scalar, 32); 1704*744bfb21SJohn Baldwin curve25519_clamp_secret(e); 1705*744bfb21SJohn Baldwin 1706*744bfb21SJohn Baldwin /* The following implementation was transcribed to Coq and proven to 1707*744bfb21SJohn Baldwin * correspond to unary scalar multiplication in affine coordinates given 1708*744bfb21SJohn Baldwin * that x1 != 0 is the x coordinate of some point on the curve. It was 1709*744bfb21SJohn Baldwin * also checked in Coq that doing a ladderstep with x1 = x3 = 0 gives 1710*744bfb21SJohn Baldwin * z2' = z3' = 0, and z2 = z3 = 0 gives z2' = z3' = 0. The statement was 1711*744bfb21SJohn Baldwin * quantified over the underlying field, so it applies to Curve25519 1712*744bfb21SJohn Baldwin * itself and the quadratic twist of Curve25519. It was not proven in 1713*744bfb21SJohn Baldwin * Coq that prime-field arithmetic correctly simulates extension-field 1714*744bfb21SJohn Baldwin * arithmetic on prime-field values. The decoding of the byte array 1715*744bfb21SJohn Baldwin * representation of e was not considered. 1716*744bfb21SJohn Baldwin * 1717*744bfb21SJohn Baldwin * Specification of Montgomery curves in affine coordinates: 1718*744bfb21SJohn Baldwin * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27> 1719*744bfb21SJohn Baldwin * 1720*744bfb21SJohn Baldwin * Proof that these form a group that is isomorphic to a Weierstrass 1721*744bfb21SJohn Baldwin * curve: 1722*744bfb21SJohn Baldwin * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35> 1723*744bfb21SJohn Baldwin * 1724*744bfb21SJohn Baldwin * Coq transcription and correctness proof of the loop 1725*744bfb21SJohn Baldwin * (where scalarbits=255): 1726*744bfb21SJohn Baldwin * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118> 1727*744bfb21SJohn Baldwin * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278> 1728*744bfb21SJohn Baldwin * preconditions: 0 <= e < 2^255 (not necessarily e < order), 1729*744bfb21SJohn Baldwin * fe_invert(0) = 0 1730*744bfb21SJohn Baldwin */ 1731*744bfb21SJohn Baldwin fe_frombytes(&x1, point); 1732*744bfb21SJohn Baldwin fe_1(&x2); 1733*744bfb21SJohn Baldwin fe_0(&z2); 1734*744bfb21SJohn Baldwin fe_copy(&x3, &x1); 1735*744bfb21SJohn Baldwin fe_1(&z3); 1736*744bfb21SJohn Baldwin 1737*744bfb21SJohn Baldwin for (pos = 254; pos >= 0; --pos) { 1738*744bfb21SJohn Baldwin fe tmp0, tmp1; 1739*744bfb21SJohn Baldwin fe_loose tmp0l, tmp1l; 1740*744bfb21SJohn Baldwin /* loop invariant as of right before the test, for the case 1741*744bfb21SJohn Baldwin * where x1 != 0: 1742*744bfb21SJohn Baldwin * pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 1743*744bfb21SJohn Baldwin * is nonzero 1744*744bfb21SJohn Baldwin * let r := e >> (pos+1) in the following equalities of 1745*744bfb21SJohn Baldwin * projective points: 1746*744bfb21SJohn Baldwin * to_xz (r*P) === if swap then (x3, z3) else (x2, z2) 1747*744bfb21SJohn Baldwin * to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3) 1748*744bfb21SJohn Baldwin * x1 is the nonzero x coordinate of the nonzero 1749*744bfb21SJohn Baldwin * point (r*P-(r+1)*P) 1750*744bfb21SJohn Baldwin */ 1751*744bfb21SJohn Baldwin unsigned b = 1 & (e[pos / 8] >> (pos & 7)); 1752*744bfb21SJohn Baldwin swap ^= b; 1753*744bfb21SJohn Baldwin fe_cswap(&x2, &x3, swap); 1754*744bfb21SJohn Baldwin fe_cswap(&z2, &z3, swap); 1755*744bfb21SJohn Baldwin swap = b; 1756*744bfb21SJohn Baldwin /* Coq transcription of ladderstep formula (called from 1757*744bfb21SJohn Baldwin * transcribed loop): 1758*744bfb21SJohn Baldwin * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89> 1759*744bfb21SJohn Baldwin * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131> 1760*744bfb21SJohn Baldwin * x1 != 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217> 1761*744bfb21SJohn Baldwin * x1 = 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147> 1762*744bfb21SJohn Baldwin */ 1763*744bfb21SJohn Baldwin fe_sub(&tmp0l, &x3, &z3); 1764*744bfb21SJohn Baldwin fe_sub(&tmp1l, &x2, &z2); 1765*744bfb21SJohn Baldwin fe_add(&x2l, &x2, &z2); 1766*744bfb21SJohn Baldwin fe_add(&z2l, &x3, &z3); 1767*744bfb21SJohn Baldwin fe_mul_tll(&z3, &tmp0l, &x2l); 1768*744bfb21SJohn Baldwin fe_mul_tll(&z2, &z2l, &tmp1l); 1769*744bfb21SJohn Baldwin fe_sq_tl(&tmp0, &tmp1l); 1770*744bfb21SJohn Baldwin fe_sq_tl(&tmp1, &x2l); 1771*744bfb21SJohn Baldwin fe_add(&x3l, &z3, &z2); 1772*744bfb21SJohn Baldwin fe_sub(&z2l, &z3, &z2); 1773*744bfb21SJohn Baldwin fe_mul_ttt(&x2, &tmp1, &tmp0); 1774*744bfb21SJohn Baldwin fe_sub(&tmp1l, &tmp1, &tmp0); 1775*744bfb21SJohn Baldwin fe_sq_tl(&z2, &z2l); 1776*744bfb21SJohn Baldwin fe_mul121666(&z3, &tmp1l); 1777*744bfb21SJohn Baldwin fe_sq_tl(&x3, &x3l); 1778*744bfb21SJohn Baldwin fe_add(&tmp0l, &tmp0, &z3); 1779*744bfb21SJohn Baldwin fe_mul_ttt(&z3, &x1, &z2); 1780*744bfb21SJohn Baldwin fe_mul_tll(&z2, &tmp1l, &tmp0l); 1781*744bfb21SJohn Baldwin } 1782*744bfb21SJohn Baldwin /* here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) 1783*744bfb21SJohn Baldwin * else (x2, z2) 1784*744bfb21SJohn Baldwin */ 1785*744bfb21SJohn Baldwin fe_cswap(&x2, &x3, swap); 1786*744bfb21SJohn Baldwin fe_cswap(&z2, &z3, swap); 1787*744bfb21SJohn Baldwin 1788*744bfb21SJohn Baldwin fe_invert(&z2, &z2); 1789*744bfb21SJohn Baldwin fe_mul_ttt(&x2, &x2, &z2); 1790*744bfb21SJohn Baldwin fe_tobytes(out, &x2); 1791*744bfb21SJohn Baldwin 1792*744bfb21SJohn Baldwin explicit_bzero(&x1, sizeof(x1)); 1793*744bfb21SJohn Baldwin explicit_bzero(&x2, sizeof(x2)); 1794*744bfb21SJohn Baldwin explicit_bzero(&z2, sizeof(z2)); 1795*744bfb21SJohn Baldwin explicit_bzero(&x3, sizeof(x3)); 1796*744bfb21SJohn Baldwin explicit_bzero(&z3, sizeof(z3)); 1797*744bfb21SJohn Baldwin explicit_bzero(&x2l, sizeof(x2l)); 1798*744bfb21SJohn Baldwin explicit_bzero(&z2l, sizeof(z2l)); 1799*744bfb21SJohn Baldwin explicit_bzero(&x3l, sizeof(x3l)); 1800*744bfb21SJohn Baldwin explicit_bzero(&e, sizeof(e)); 1801*744bfb21SJohn Baldwin 1802*744bfb21SJohn Baldwin return timingsafe_bcmp(out, curve25519_null_point, CURVE25519_KEY_SIZE) != 0; 1803*744bfb21SJohn Baldwin } 1804*744bfb21SJohn Baldwin #endif 1805*744bfb21SJohn Baldwin 1806*744bfb21SJohn Baldwin int 1807*744bfb21SJohn Baldwin crypto_init(void) 1808*744bfb21SJohn Baldwin { 1809*744bfb21SJohn Baldwin #ifndef COMPAT_NEED_CHACHA20POLY1305_MBUF 1810*744bfb21SJohn Baldwin struct crypto_session_params csp = { 1811*744bfb21SJohn Baldwin .csp_mode = CSP_MODE_AEAD, 1812*744bfb21SJohn Baldwin .csp_ivlen = sizeof(uint64_t), 1813*744bfb21SJohn Baldwin .csp_cipher_alg = CRYPTO_CHACHA20_POLY1305, 1814*744bfb21SJohn Baldwin .csp_cipher_klen = CHACHA20POLY1305_KEY_SIZE, 1815*744bfb21SJohn Baldwin .csp_flags = CSP_F_SEPARATE_AAD | CSP_F_SEPARATE_OUTPUT 1816*744bfb21SJohn Baldwin }; 1817*744bfb21SJohn Baldwin int ret = crypto_newsession(&chacha20_poly1305_sid, &csp, CRYPTOCAP_F_SOFTWARE); 1818*744bfb21SJohn Baldwin if (ret != 0) 1819*744bfb21SJohn Baldwin return (ret); 1820*744bfb21SJohn Baldwin #endif 1821*744bfb21SJohn Baldwin return (0); 1822*744bfb21SJohn Baldwin } 1823*744bfb21SJohn Baldwin 1824*744bfb21SJohn Baldwin void 1825*744bfb21SJohn Baldwin crypto_deinit(void) 1826*744bfb21SJohn Baldwin { 1827*744bfb21SJohn Baldwin #ifndef COMPAT_NEED_CHACHA20POLY1305_MBUF 1828*744bfb21SJohn Baldwin crypto_freesession(chacha20_poly1305_sid); 1829*744bfb21SJohn Baldwin #endif 1830*744bfb21SJohn Baldwin } 1831