// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * POLYVAL library functions
 *
 * Copyright 2025 Google LLC
 */

#include <crypto/polyval.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/unaligned.h>

/*
 * POLYVAL is an almost-XOR-universal hash function. Similar to GHASH, POLYVAL
 * interprets the message as the coefficients of a polynomial in GF(2^128) and
 * evaluates that polynomial at a secret point. POLYVAL has a simple
 * mathematical relationship with GHASH, but it uses a better field convention
 * which makes it easier and faster to implement.
 *
 * POLYVAL is not a cryptographic hash function, and it should be used only by
 * algorithms that are specifically designed to use it.
 *
 * POLYVAL is specified by "AES-GCM-SIV: Nonce Misuse-Resistant Authenticated
 * Encryption" (https://datatracker.ietf.org/doc/html/rfc8452).
 *
 * POLYVAL is also used by HCTR2. See "Length-preserving encryption with HCTR2"
 * (https://eprint.iacr.org/2021/1441.pdf).
 *
 * This file provides a library API for POLYVAL. This API can delegate to
 * either a generic implementation or an architecture-optimized implementation.
 *
 * For the generic implementation, we don't use the traditional table approach
 * to GF(2^128) multiplication. That approach is not constant-time and requires
 * a lot of memory. Instead, we use a different approach which emulates
 * carryless multiplication using standard multiplications by spreading the
 * data bits apart using "holes", into which the carries can spill harmlessly.
 * This approach is borrowed from BoringSSL, which in turn credits BearSSL's
 * documentation (https://bearssl.org/constanttime.html#ghash-for-gcm) for the
 * "holes" trick and a presentation by Shay Gueron
 * (https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf) for the
 * 256-bit => 128-bit reduction algorithm.
 */
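/*
 * Concretely, for a key H and 16-byte message blocks X_1, ..., X_s, RFC 8452
 * defines
 *
 *	POLYVAL(H, X_1, ..., X_s) = X_1 . H^s ^ X_2 . H^(s-1) ^ ... ^ X_s . H
 *
 * where "a . b" denotes the field product a * b * x^-128 (implemented by
 * polyval_mul_generic() below), "^" is XOR, and powers of H are taken with
 * respect to ".".
 */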
#ifdef CONFIG_ARCH_SUPPORTS_INT128

/* Do a 64 x 64 => 128 bit carryless multiplication. */
static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
{
	/*
	 * With 64-bit multiplicands and one term every 4 bits, there would be
	 * up to 64 / 4 = 16 one bits per column when each multiplication is
	 * written out as a series of additions in the schoolbook manner.
	 * Unfortunately, that doesn't work since the value 16 is 1 too large
	 * to fit in 4 bits. Carries would sometimes overflow into the next
	 * term.
	 *
	 * Using one term every 5 bits would work. However, that would cost
	 * 5 x 5 = 25 multiplications instead of 4 x 4 = 16.
	 *
	 * Instead, mask off 4 bits from one multiplicand, giving a max of 15
	 * one bits per column. Then handle those 4 bits separately.
	 */
	u64 a0 = a & 0x1111111111111110;
	u64 a1 = a & 0x2222222222222220;
	u64 a2 = a & 0x4444444444444440;
	u64 a3 = a & 0x8888888888888880;

	u64 b0 = b & 0x1111111111111111;
	u64 b1 = b & 0x2222222222222222;
	u64 b2 = b & 0x4444444444444444;
	u64 b3 = b & 0x8888888888888888;

	/* Multiply the high 60 bits of @a by @b. */
	u128 c0 = (a0 * (u128)b0) ^ (a1 * (u128)b3) ^
		  (a2 * (u128)b2) ^ (a3 * (u128)b1);
	u128 c1 = (a0 * (u128)b1) ^ (a1 * (u128)b0) ^
		  (a2 * (u128)b3) ^ (a3 * (u128)b2);
	u128 c2 = (a0 * (u128)b2) ^ (a1 * (u128)b1) ^
		  (a2 * (u128)b0) ^ (a3 * (u128)b3);
	u128 c3 = (a0 * (u128)b3) ^ (a1 * (u128)b2) ^
		  (a2 * (u128)b1) ^ (a3 * (u128)b0);

	/* Multiply the low 4 bits of @a by @b. */
	u64 e0 = -(a & 1) & b;
	u64 e1 = -((a >> 1) & 1) & b;
	u64 e2 = -((a >> 2) & 1) & b;
	u64 e3 = -((a >> 3) & 1) & b;
	u64 extra_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3);
	u64 extra_hi = (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61);

	/* Add all the intermediate products together. */
	*out_lo = (((u64)c0) & 0x1111111111111111) ^
		  (((u64)c1) & 0x2222222222222222) ^
		  (((u64)c2) & 0x4444444444444444) ^
		  (((u64)c3) & 0x8888888888888888) ^ extra_lo;
	*out_hi = (((u64)(c0 >> 64)) & 0x1111111111111111) ^
		  (((u64)(c1 >> 64)) & 0x2222222222222222) ^
		  (((u64)(c2 >> 64)) & 0x4444444444444444) ^
		  (((u64)(c3 >> 64)) & 0x8888888888888888) ^ extra_hi;
}

#else /* CONFIG_ARCH_SUPPORTS_INT128 */

/* Do a 32 x 32 => 64 bit carryless multiplication. */
static u64 clmul32(u32 a, u32 b)
{
	/*
	 * With 32-bit multiplicands and one term every 4 bits, there are up to
	 * 32 / 4 = 8 one bits per column when each multiplication is written
	 * out as a series of additions in the schoolbook manner. The value 8
	 * fits in 4 bits, so the carries don't overflow into the next term.
	 */
	u32 a0 = a & 0x11111111;
	u32 a1 = a & 0x22222222;
	u32 a2 = a & 0x44444444;
	u32 a3 = a & 0x88888888;

	u32 b0 = b & 0x11111111;
	u32 b1 = b & 0x22222222;
	u32 b2 = b & 0x44444444;
	u32 b3 = b & 0x88888888;

	u64 c0 = (a0 * (u64)b0) ^ (a1 * (u64)b3) ^
		 (a2 * (u64)b2) ^ (a3 * (u64)b1);
	u64 c1 = (a0 * (u64)b1) ^ (a1 * (u64)b0) ^
		 (a2 * (u64)b3) ^ (a3 * (u64)b2);
	u64 c2 = (a0 * (u64)b2) ^ (a1 * (u64)b1) ^
		 (a2 * (u64)b0) ^ (a3 * (u64)b3);
	u64 c3 = (a0 * (u64)b3) ^ (a1 * (u64)b2) ^
		 (a2 * (u64)b1) ^ (a3 * (u64)b0);

	/* Add all the intermediate products together. */
	return (c0 & 0x1111111111111111) ^
	       (c1 & 0x2222222222222222) ^
	       (c2 & 0x4444444444444444) ^
	       (c3 & 0x8888888888888888);
}

/* Do a 64 x 64 => 128 bit carryless multiplication. */
static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
{
	u32 a_lo = (u32)a;
	u32 a_hi = a >> 32;
	u32 b_lo = (u32)b;
	u32 b_hi = b >> 32;

	/* Karatsuba multiplication */
	u64 lo = clmul32(a_lo, b_lo);
	u64 hi = clmul32(a_hi, b_hi);
	u64 mi = clmul32(a_lo ^ a_hi, b_lo ^ b_hi) ^ lo ^ hi;

	*out_lo = lo ^ (mi << 32);
	*out_hi = hi ^ (mi >> 32);
}
#endif /* !CONFIG_ARCH_SUPPORTS_INT128 */
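/*
 * To see why plain multiplication cannot be used directly here, consider the
 * small example a = b = 0b11, i.e. the polynomial x + 1 over GF(2). The
 * carryless product is (x + 1)^2 = x^2 + 1 = 0b101, because the two middle
 * terms cancel (1 + 1 = 0 in GF(2)). An ordinary multiplication instead gives
 * 3 * 3 = 9 = 0b1001: the carry out of the middle column corrupts the higher
 * bits. The "holes" in the implementations above keep the terms of each
 * product 4 bits apart, so per-column carries accumulate inside a hole and
 * are discarded by the final masking instead of reaching the next term.
 */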
/* Compute @a = @a * @b * x^-128 in the POLYVAL field. */
static void __maybe_unused
polyval_mul_generic(struct polyval_elem *a, const struct polyval_elem *b)
{
	u64 c0, c1, c2, c3, mi0, mi1;

	/*
	 * Carryless-multiply @a by @b using Karatsuba multiplication. Store
	 * the 256-bit product in @c0 (low) through @c3 (high).
	 */
	clmul64(le64_to_cpu(a->lo), le64_to_cpu(b->lo), &c0, &c1);
	clmul64(le64_to_cpu(a->hi), le64_to_cpu(b->hi), &c2, &c3);
	clmul64(le64_to_cpu(a->lo ^ a->hi), le64_to_cpu(b->lo ^ b->hi),
		&mi0, &mi1);
	mi0 ^= c0 ^ c2;
	mi1 ^= c1 ^ c3;
	c1 ^= mi0;
	c2 ^= mi1;

	/*
	 * Cancel out the low 128 bits of the product by adding multiples of
	 * G(x) = x^128 + x^127 + x^126 + x^121 + 1. Do this in two steps, each
	 * of which cancels out 64 bits. Note that we break G(x) into three
	 * parts: 1, x^64 * (x^63 + x^62 + x^57), and x^128 * 1.
	 */

	/*
	 * First, add G(x) times c0 as follows:
	 *
	 * (c0, c1, c2) = (0,
	 *		   c1 + (c0 * (x^63 + x^62 + x^57) mod x^64),
	 *		   c2 + c0 + floor((c0 * (x^63 + x^62 + x^57)) / x^64))
	 */
	c1 ^= (c0 << 63) ^ (c0 << 62) ^ (c0 << 57);
	c2 ^= c0 ^ (c0 >> 1) ^ (c0 >> 2) ^ (c0 >> 7);

	/*
	 * Second, add G(x) times the new c1:
	 *
	 * (c1, c2, c3) = (0,
	 *		   c2 + (c1 * (x^63 + x^62 + x^57) mod x^64),
	 *		   c3 + c1 + floor((c1 * (x^63 + x^62 + x^57)) / x^64))
	 */
	c2 ^= (c1 << 63) ^ (c1 << 62) ^ (c1 << 57);
	c3 ^= c1 ^ (c1 >> 1) ^ (c1 >> 2) ^ (c1 >> 7);

	/* Return (c2, c3). This implicitly multiplies by x^-128. */
	a->lo = cpu_to_le64(c2);
	a->hi = cpu_to_le64(c3);
}

static void __maybe_unused
polyval_blocks_generic(struct polyval_elem *acc, const struct polyval_elem *key,
		       const u8 *data, size_t nblocks)
{
	do {
		acc->lo ^= get_unaligned((const __le64 *)data);
		acc->hi ^= get_unaligned((const __le64 *)(data + 8));
		polyval_mul_generic(acc, key);
		data += POLYVAL_BLOCK_SIZE;
	} while (--nblocks);
}
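/*
 * The loop in polyval_blocks_generic() is Horner's rule for the POLYVAL sum
 * given at the top of this file: with the dot operation a . b = a * b *
 * x^-128, processing blocks m_1, ..., m_n computes
 *
 *	acc = (...((acc ^ m_1) . H ^ m_2) . H ... ^ m_n) . H
 *
 * which, starting from acc = 0, equals
 * m_1 . H^n ^ m_2 . H^(n-1) ^ ... ^ m_n . H (powers taken with respect
 * to ".").
 */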
/* Include the arch-optimized implementation of POLYVAL, if one is available. */
#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
#include "polyval.h" /* $(SRCARCH)/polyval.h */
void polyval_preparekey(struct polyval_key *key,
			const u8 raw_key[POLYVAL_BLOCK_SIZE])
{
	polyval_preparekey_arch(key, raw_key);
}
EXPORT_SYMBOL_GPL(polyval_preparekey);
#endif /* Else, polyval_preparekey() is an inline function. */

/*
 * polyval_mul_generic() and polyval_blocks_generic() take the key as a
 * polyval_elem rather than a polyval_key, so that arch-optimized
 * implementations with a different key format can use them as a fallback (if
 * they have H^1 stored somewhere in their struct). Thus, the following
 * dispatch code is needed to pass the appropriate key argument.
 */

static void polyval_mul(struct polyval_ctx *ctx)
{
#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
	polyval_mul_arch(&ctx->acc, ctx->key);
#else
	polyval_mul_generic(&ctx->acc, &ctx->key->h);
#endif
}

static void polyval_blocks(struct polyval_ctx *ctx,
			   const u8 *data, size_t nblocks)
{
#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
	polyval_blocks_arch(&ctx->acc, ctx->key, data, nblocks);
#else
	polyval_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks);
#endif
}

void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len)
{
	if (unlikely(ctx->partial)) {
		size_t n = min(len, POLYVAL_BLOCK_SIZE - ctx->partial);

		len -= n;
		while (n--)
			ctx->acc.bytes[ctx->partial++] ^= *data++;
		if (ctx->partial < POLYVAL_BLOCK_SIZE)
			return;
		polyval_mul(ctx);
	}
	if (len >= POLYVAL_BLOCK_SIZE) {
		size_t nblocks = len / POLYVAL_BLOCK_SIZE;

		polyval_blocks(ctx, data, nblocks);
		data += len & ~(POLYVAL_BLOCK_SIZE - 1);
		len &= POLYVAL_BLOCK_SIZE - 1;
	}
	for (size_t i = 0; i < len; i++)
		ctx->acc.bytes[i] ^= data[i];
	ctx->partial = len;
}
EXPORT_SYMBOL_GPL(polyval_update);

void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE])
{
	if (unlikely(ctx->partial))
		polyval_mul(ctx);
	memcpy(out, &ctx->acc, POLYVAL_BLOCK_SIZE);
	memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL_GPL(polyval_final);

#ifdef polyval_mod_init_arch
static int __init polyval_mod_init(void)
{
	polyval_mod_init_arch();
	return 0;
}
subsys_initcall(polyval_mod_init);

static void __exit polyval_mod_exit(void)
{
}
module_exit(polyval_mod_exit);
#endif

MODULE_DESCRIPTION("POLYVAL almost-XOR-universal hash function");
MODULE_LICENSE("GPL");
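/*
 * Example usage of the above API (a sketch, not compiled here; it assumes a
 * polyval_init() helper in <crypto/polyval.h> that binds a prepared key to a
 * zeroed context, matching how ctx->key, ctx->acc, and ctx->partial are used
 * above):
 *
 *	struct polyval_key key;
 *	struct polyval_ctx ctx;
 *	u8 tag[POLYVAL_BLOCK_SIZE];
 *
 *	polyval_preparekey(&key, raw_key);
 *	polyval_init(&ctx, &key);
 *	polyval_update(&ctx, data, data_len);	// may be called repeatedly,
 *						// with arbitrary lengths
 *	polyval_final(&ctx, tag);
 *
 * Note that polyval_update() buffers partial blocks in the accumulator, so
 * feeding 20 bytes and then 12 bytes produces the same tag as feeding the
 * same 32 bytes in one call.
 */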