// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * GF(2^128) polynomial hashing: GHASH and POLYVAL
 *
 * Copyright 2025 Google LLC
 */

#include <crypto/gf128hash.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/unaligned.h>

/*
 * GHASH and POLYVAL are almost-XOR-universal hash functions.  They interpret
 * the message as the coefficients of a polynomial in the finite field GF(2^128)
 * and evaluate that polynomial at a secret point.
 *
 * Neither GHASH nor POLYVAL is a cryptographic hash function.  They should be
 * used only by algorithms that are specifically designed to use them.
 *
 * GHASH is the older variant, defined as part of GCM in NIST SP 800-38D
 * (https://nvlpubs.nist.gov/nistpubs/legacy/sp/nistspecialpublication800-38d.pdf).
 * GHASH is hard to implement directly, due to its backwards mapping between
 * bits and polynomial coefficients.  GHASH implementations typically pre and
 * post-process the inputs and outputs (mainly by byte-swapping) to convert the
 * GHASH computation into an equivalent computation over a different,
 * easier-to-use representation of GF(2^128).
 *
 * POLYVAL is a newer GF(2^128) polynomial hash, originally defined as part of
 * AES-GCM-SIV (https://datatracker.ietf.org/doc/html/rfc8452) and also used by
 * HCTR2 (https://eprint.iacr.org/2021/1441.pdf).  It uses that easier-to-use
 * field representation directly, eliminating the data conversion steps.
 *
 * This file provides library APIs for GHASH and POLYVAL.  These APIs can
 * delegate to either a generic implementation or an architecture-optimized
 * implementation.  Due to the mathematical relationship between GHASH and
 * POLYVAL, in some cases code for one is reused with the other.
 *
 * For the generic implementation, we don't use the traditional table approach
 * to GF(2^128) multiplication.  That approach is not constant-time and requires
 * a lot of memory.  Instead, we use a different approach which emulates
 * carryless multiplication using standard multiplications by spreading the data
 * bits apart using "holes".  This allows the carries to spill harmlessly.  This
 * approach is borrowed from BoringSSL, which in turn credits BearSSL's
 * documentation (https://bearssl.org/constanttime.html#ghash-for-gcm) for the
 * "holes" trick and a presentation by Shay Gueron
 * (https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf) for the
 * 256-bit => 128-bit reduction algorithm.
 */

#ifdef CONFIG_ARCH_SUPPORTS_INT128

/* Do a 64 x 64 => 128 bit carryless multiplication. */
static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
{
	/*
	 * With 64-bit multiplicands and one term every 4 bits, there would be
	 * up to 64 / 4 = 16 one bits per column when each multiplication is
	 * written out as a series of additions in the schoolbook manner.
	 * Unfortunately, that doesn't work since the value 16 is 1 too large to
	 * fit in 4 bits.  Carries would sometimes overflow into the next term.
	 *
	 * Using one term every 5 bits would work.  However, that would cost
	 * 5 x 5 = 25 multiplications instead of 4 x 4 = 16.
	 *
	 * Instead, mask off 4 bits from one multiplicand, giving a max of 15
	 * one bits per column.  Then handle those 4 bits separately.
	 */
	u64 a0 = a & 0x1111111111111110;	/* note: low nibble cleared */
	u64 a1 = a & 0x2222222222222220;
	u64 a2 = a & 0x4444444444444440;
	u64 a3 = a & 0x8888888888888880;

	u64 b0 = b & 0x1111111111111111;
	u64 b1 = b & 0x2222222222222222;
	u64 b2 = b & 0x4444444444444444;
	u64 b3 = b & 0x8888888888888888;

	/* Multiply the high 60 bits of @a by @b. */
	u128 c0 = (a0 * (u128)b0) ^ (a1 * (u128)b3) ^
		  (a2 * (u128)b2) ^ (a3 * (u128)b1);
	u128 c1 = (a0 * (u128)b1) ^ (a1 * (u128)b0) ^
		  (a2 * (u128)b3) ^ (a3 * (u128)b2);
	u128 c2 = (a0 * (u128)b2) ^ (a1 * (u128)b1) ^
		  (a2 * (u128)b0) ^ (a3 * (u128)b3);
	u128 c3 = (a0 * (u128)b3) ^ (a1 * (u128)b2) ^
		  (a2 * (u128)b1) ^ (a3 * (u128)b0);

	/* Multiply the low 4 bits of @a by @b. */
	u64 e0 = -(a & 1) & b;
	u64 e1 = -((a >> 1) & 1) & b;
	u64 e2 = -((a >> 2) & 1) & b;
	u64 e3 = -((a >> 3) & 1) & b;
	u64 extra_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3);
	u64 extra_hi = (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61);

	/* Add all the intermediate products together. */
	*out_lo = (((u64)c0) & 0x1111111111111111) ^
		  (((u64)c1) & 0x2222222222222222) ^
		  (((u64)c2) & 0x4444444444444444) ^
		  (((u64)c3) & 0x8888888888888888) ^ extra_lo;
	*out_hi = (((u64)(c0 >> 64)) & 0x1111111111111111) ^
		  (((u64)(c1 >> 64)) & 0x2222222222222222) ^
		  (((u64)(c2 >> 64)) & 0x4444444444444444) ^
		  (((u64)(c3 >> 64)) & 0x8888888888888888) ^ extra_hi;
}

#else /* CONFIG_ARCH_SUPPORTS_INT128 */

/* Do a 32 x 32 => 64 bit carryless multiplication. */
static u64 clmul32(u32 a, u32 b)
{
	/*
	 * With 32-bit multiplicands and one term every 4 bits, there are up to
	 * 32 / 4 = 8 one bits per column when each multiplication is written
	 * out as a series of additions in the schoolbook manner.  The value 8
	 * fits in 4 bits, so the carries don't overflow into the next term.
	 */
	u32 a0 = a & 0x11111111;
	u32 a1 = a & 0x22222222;
	u32 a2 = a & 0x44444444;
	u32 a3 = a & 0x88888888;

	u32 b0 = b & 0x11111111;
	u32 b1 = b & 0x22222222;
	u32 b2 = b & 0x44444444;
	u32 b3 = b & 0x88888888;

	u64 c0 = (a0 * (u64)b0) ^ (a1 * (u64)b3) ^
		 (a2 * (u64)b2) ^ (a3 * (u64)b1);
	u64 c1 = (a0 * (u64)b1) ^ (a1 * (u64)b0) ^
		 (a2 * (u64)b3) ^ (a3 * (u64)b2);
	u64 c2 = (a0 * (u64)b2) ^ (a1 * (u64)b1) ^
		 (a2 * (u64)b0) ^ (a3 * (u64)b3);
	u64 c3 = (a0 * (u64)b3) ^ (a1 * (u64)b2) ^
		 (a2 * (u64)b1) ^ (a3 * (u64)b0);

	/* Add all the intermediate products together. */
	return (c0 & 0x1111111111111111) ^
	       (c1 & 0x2222222222222222) ^
	       (c2 & 0x4444444444444444) ^
	       (c3 & 0x8888888888888888);
}

/* Do a 64 x 64 => 128 bit carryless multiplication. */
static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
{
	u32 a_lo = (u32)a;
	u32 a_hi = a >> 32;
	u32 b_lo = (u32)b;
	u32 b_hi = b >> 32;

	/* Karatsuba multiplication: 3 clmul32's instead of 4 */
	u64 lo = clmul32(a_lo, b_lo);
	u64 hi = clmul32(a_hi, b_hi);
	u64 mi = clmul32(a_lo ^ a_hi, b_lo ^ b_hi) ^ lo ^ hi;

	*out_lo = lo ^ (mi << 32);
	*out_hi = hi ^ (mi >> 32);
}
#endif /* !CONFIG_ARCH_SUPPORTS_INT128 */

/* Compute @a = @a * @b * x^-128 in the POLYVAL field. */
static void __maybe_unused
polyval_mul_generic(struct polyval_elem *a, const struct polyval_elem *b)
{
	u64 c0, c1, c2, c3, mi0, mi1;

	/*
	 * Carryless-multiply @a by @b using Karatsuba multiplication.  Store
	 * the 256-bit product in @c0 (low) through @c3 (high).
	 */
	clmul64(le64_to_cpu(a->lo), le64_to_cpu(b->lo), &c0, &c1);
	clmul64(le64_to_cpu(a->hi), le64_to_cpu(b->hi), &c2, &c3);
	clmul64(le64_to_cpu(a->lo ^ a->hi), le64_to_cpu(b->lo ^ b->hi),
		&mi0, &mi1);
	mi0 ^= c0 ^ c2;
	mi1 ^= c1 ^ c3;
	c1 ^= mi0;
	c2 ^= mi1;

	/*
	 * Cancel out the low 128 bits of the product by adding multiples of
	 * G(x) = x^128 + x^127 + x^126 + x^121 + 1.  Do this in two steps, each
	 * of which cancels out 64 bits.  Note that we break G(x) into three
	 * parts: 1, x^64 * (x^63 + x^62 + x^57), and x^128 * 1.
	 */

	/*
	 * First, add G(x) times c0 as follows:
	 *
	 * (c0, c1, c2) = (0,
	 *		   c1 + (c0 * (x^63 + x^62 + x^57) mod x^64),
	 *		   c2 + c0 + floor((c0 * (x^63 + x^62 + x^57)) / x^64))
	 */
	c1 ^= (c0 << 63) ^ (c0 << 62) ^ (c0 << 57);
	c2 ^= c0 ^ (c0 >> 1) ^ (c0 >> 2) ^ (c0 >> 7);

	/*
	 * Second, add G(x) times the new c1:
	 *
	 * (c1, c2, c3) = (0,
	 *		   c2 + (c1 * (x^63 + x^62 + x^57) mod x^64),
	 *		   c3 + c1 + floor((c1 * (x^63 + x^62 + x^57)) / x^64))
	 */
	c2 ^= (c1 << 63) ^ (c1 << 62) ^ (c1 << 57);
	c3 ^= c1 ^ (c1 >> 1) ^ (c1 >> 2) ^ (c1 >> 7);

	/* Return (c2, c3).  This implicitly multiplies by x^-128. */
	a->lo = cpu_to_le64(c2);
	a->hi = cpu_to_le64(c3);
}

/*
 * Fold @nblocks GHASH blocks from @data into the accumulator.  The byteswap
 * converts each big-endian GHASH block into POLYVAL element order so that
 * polyval_mul_generic() can be reused.  @nblocks must be >= 1.
 */
static void __maybe_unused ghash_blocks_generic(struct polyval_elem *acc,
						const struct polyval_elem *key,
						const u8 *data, size_t nblocks)
{
	do {
		acc->lo ^=
			cpu_to_le64(get_unaligned_be64((__be64 *)(data + 8)));
		acc->hi ^= cpu_to_le64(get_unaligned_be64((__be64 *)data));
		polyval_mul_generic(acc, key);
		data += GHASH_BLOCK_SIZE;
	} while (--nblocks);
}

/*
 * Fold @nblocks POLYVAL blocks from @data into the accumulator.  POLYVAL
 * blocks are already little-endian, so no byteswapping is needed.
 * @nblocks must be >= 1.
 */
static void __maybe_unused
polyval_blocks_generic(struct polyval_elem *acc, const struct polyval_elem *key,
		       const u8 *data, size_t nblocks)
{
	do {
		acc->lo ^= get_unaligned((__le64 *)data);
		acc->hi ^= get_unaligned((__le64 *)(data + 8));
		polyval_mul_generic(acc, key);
		data += POLYVAL_BLOCK_SIZE;
	} while (--nblocks);
}

/* Convert the key from GHASH format to POLYVAL format. */
static void __maybe_unused ghash_key_to_polyval(const u8 in[GHASH_BLOCK_SIZE],
						struct polyval_elem *out)
{
	u64 hi = get_unaligned_be64(&in[0]);
	u64 lo = get_unaligned_be64(&in[8]);
	u64 mask = (s64)hi >> 63;	/* all-ones iff the top bit is set */

	hi = (hi << 1) ^ (lo >> 63) ^ (mask & ((u64)0xc2 << 56));
	lo = (lo << 1) ^ (mask & 1);
	out->lo = cpu_to_le64(lo);
	out->hi = cpu_to_le64(hi);
}

/* Convert the accumulator from POLYVAL format to GHASH format. */
static void polyval_acc_to_ghash(const struct polyval_elem *in,
				 u8 out[GHASH_BLOCK_SIZE])
{
	put_unaligned_be64(le64_to_cpu(in->hi), &out[0]);
	put_unaligned_be64(le64_to_cpu(in->lo), &out[8]);
}

/* Convert the accumulator from GHASH format to POLYVAL format. */
static void __maybe_unused ghash_acc_to_polyval(const u8 in[GHASH_BLOCK_SIZE],
						struct polyval_elem *out)
{
	out->lo = cpu_to_le64(get_unaligned_be64(&in[8]));
	out->hi = cpu_to_le64(get_unaligned_be64(&in[0]));
}

#ifdef CONFIG_CRYPTO_LIB_GF128HASH_ARCH
#include "gf128hash.h" /* $(SRCARCH)/gf128hash.h */
#endif

void ghash_preparekey(struct ghash_key *key, const u8 raw_key[GHASH_BLOCK_SIZE])
{
#ifdef ghash_preparekey_arch
	ghash_preparekey_arch(key, raw_key);
#else
	ghash_key_to_polyval(raw_key, &key->h);
#endif
}
EXPORT_SYMBOL_GPL(ghash_preparekey);

/* Multiply the accumulator by the hash key, in the GHASH field. */
static void ghash_mul(struct ghash_ctx *ctx)
{
#ifdef ghash_mul_arch
	ghash_mul_arch(&ctx->acc, ctx->key);
#elif defined(ghash_blocks_arch)
	/*
	 * XOR-ing in an all-zeroes block leaves the accumulator unchanged, so
	 * processing one zero block reduces ghash_blocks_arch() to just the
	 * key multiplication.
	 */
	static const u8 zeroes[GHASH_BLOCK_SIZE];

	ghash_blocks_arch(&ctx->acc, ctx->key, zeroes, 1);
#else
	polyval_mul_generic(&ctx->acc, &ctx->key->h);
#endif
}

/* nblocks is always >= 1. */
static void ghash_blocks(struct ghash_ctx *ctx, const u8 *data, size_t nblocks)
{
#ifdef ghash_blocks_arch
	ghash_blocks_arch(&ctx->acc, ctx->key, data, nblocks);
#else
	ghash_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks);
#endif
}

/*
 * Add @len bytes of data to the GHASH computation.  Partial blocks are
 * buffered by XOR-ing their bytes directly into the accumulator (in reversed
 * byte order, matching the GHASH <=> POLYVAL mapping); @ctx->partial tracks
 * how many bytes of the current block have been absorbed so far.
 */
void ghash_update(struct ghash_ctx *ctx, const u8 *data, size_t len)
{
	if (unlikely(ctx->partial)) {
		size_t n = min(len, GHASH_BLOCK_SIZE - ctx->partial);

		len -= n;
		while (n--)
			ctx->acc.bytes[GHASH_BLOCK_SIZE - 1 - ctx->partial++] ^=
				*data++;
		if (ctx->partial < GHASH_BLOCK_SIZE)
			return;
		ghash_mul(ctx);
	}
	if (len >= GHASH_BLOCK_SIZE) {
		size_t nblocks = len / GHASH_BLOCK_SIZE;

		ghash_blocks(ctx, data, nblocks);
		data += len & ~(GHASH_BLOCK_SIZE - 1);
		len &= GHASH_BLOCK_SIZE - 1;
	}
	for (size_t i = 0; i < len; i++)
		ctx->acc.bytes[GHASH_BLOCK_SIZE - 1 - i] ^= data[i];
	ctx->partial = len;
}
EXPORT_SYMBOL_GPL(ghash_update);

void ghash_final(struct ghash_ctx *ctx, u8 out[GHASH_BLOCK_SIZE])
{
	/* A buffered partial block still needs its key multiplication. */
	if (unlikely(ctx->partial))
		ghash_mul(ctx);
	polyval_acc_to_ghash(&ctx->acc, out);
	memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL_GPL(ghash_final);

void polyval_preparekey(struct polyval_key *key,
			const u8 raw_key[POLYVAL_BLOCK_SIZE])
{
#ifdef polyval_preparekey_arch
	polyval_preparekey_arch(key, raw_key);
#else
	memcpy(key->h.bytes, raw_key, POLYVAL_BLOCK_SIZE);
#endif
}
EXPORT_SYMBOL_GPL(polyval_preparekey);

/*
 * polyval_mul_generic() and polyval_blocks_generic() take the key as a
 * polyval_elem rather than a polyval_key, so that arch-optimized
 * implementations with a different key format can use it as a fallback (if they
 * have H^1 stored somewhere in their struct).  Thus, the following dispatch
 * code is needed to pass the appropriate key argument.
 */

/* Multiply the accumulator by the hash key, in the POLYVAL field. */
static void polyval_mul(struct polyval_ctx *ctx)
{
#ifdef polyval_mul_arch
	polyval_mul_arch(&ctx->acc, ctx->key);
#elif defined(polyval_blocks_arch)
	/* As in ghash_mul(): a zero block leaves just the multiplication. */
	static const u8 zeroes[POLYVAL_BLOCK_SIZE];

	polyval_blocks_arch(&ctx->acc, ctx->key, zeroes, 1);
#else
	polyval_mul_generic(&ctx->acc, &ctx->key->h);
#endif
}

/* nblocks is always >= 1. */
static void polyval_blocks(struct polyval_ctx *ctx,
			   const u8 *data, size_t nblocks)
{
#ifdef polyval_blocks_arch
	polyval_blocks_arch(&ctx->acc, ctx->key, data, nblocks);
#else
	polyval_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks);
#endif
}

/*
 * Add @len bytes of data to the POLYVAL computation.  Partial blocks are
 * buffered by XOR-ing their bytes directly into the accumulator (in natural
 * little-endian order); @ctx->partial tracks how many bytes of the current
 * block have been absorbed so far.
 */
void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len)
{
	if (unlikely(ctx->partial)) {
		size_t n = min(len, POLYVAL_BLOCK_SIZE - ctx->partial);

		len -= n;
		while (n--)
			ctx->acc.bytes[ctx->partial++] ^= *data++;
		if (ctx->partial < POLYVAL_BLOCK_SIZE)
			return;
		polyval_mul(ctx);
	}
	if (len >= POLYVAL_BLOCK_SIZE) {
		size_t nblocks = len / POLYVAL_BLOCK_SIZE;

		polyval_blocks(ctx, data, nblocks);
		data += len & ~(POLYVAL_BLOCK_SIZE - 1);
		len &= POLYVAL_BLOCK_SIZE - 1;
	}
	for (size_t i = 0; i < len; i++)
		ctx->acc.bytes[i] ^= data[i];
	ctx->partial = len;
}
EXPORT_SYMBOL_GPL(polyval_update);

void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE])
{
	/* A buffered partial block still needs its key multiplication. */
	if (unlikely(ctx->partial))
		polyval_mul(ctx);
	memcpy(out, &ctx->acc, POLYVAL_BLOCK_SIZE);
	memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL_GPL(polyval_final);

#ifdef gf128hash_mod_init_arch
static int __init gf128hash_mod_init(void)
{
	gf128hash_mod_init_arch();
	return 0;
}
subsys_initcall(gf128hash_mod_init);

static void __exit gf128hash_mod_exit(void)
{
}
module_exit(gf128hash_mod_exit);
#endif

MODULE_DESCRIPTION("GF(2^128) polynomial hashing: GHASH and POLYVAL");
MODULE_LICENSE("GPL");