xref: /freebsd/crypto/openssl/crypto/ml_dsa/ml_dsa_sample.c (revision e7be843b4a162e68651d3911f0357ed464915629)
1 /*
2  * Copyright 2024-2025 The OpenSSL Project Authors. All Rights Reserved.
3  *
4  * Licensed under the Apache License 2.0 (the "License").  You may not use
5  * this file except in compliance with the License.  You can obtain a copy
6  * in the file LICENSE in the source distribution or at
7  * https://www.openssl.org/source/license.html
8  */
9 
10 #include <openssl/byteorder.h>
11 #include "ml_dsa_local.h"
12 #include "ml_dsa_vector.h"
13 #include "ml_dsa_matrix.h"
14 #include "ml_dsa_hash.h"
15 #include "internal/sha3.h"
16 #include "internal/packet.h"
17 
18 #define SHAKE128_BLOCKSIZE SHA3_BLOCKSIZE(128)
19 #define SHAKE256_BLOCKSIZE SHA3_BLOCKSIZE(256)
20 
21 /*
22  * This is a constant time version of n % 5
23  * Note that 0xFFFF / 5 = 0x3333, 2 is added to make an over-estimate of 1/5
24  * and then we divide by (0xFFFF + 1)
25  */
26 #define MOD5(n) ((n) - 5 * (0x3335 * (n) >> 16))
27 
28 #if SHAKE128_BLOCKSIZE % 3 != 0
29 # error "rej_ntt_poly() requires SHAKE128_BLOCKSIZE to be a multiple of 3"
30 #endif
31 
32 typedef int (COEFF_FROM_NIBBLE_FUNC)(uint32_t nibble, uint32_t *out);
33 
34 static COEFF_FROM_NIBBLE_FUNC coeff_from_nibble_4;
35 static COEFF_FROM_NIBBLE_FUNC coeff_from_nibble_2;
36 
37 /**
38  * @brief Combine 3 bytes to form an coefficient.
39  * See FIPS 204, Algorithm 14, CoeffFromThreeBytes()
40  *
41  * This is not constant time as it is used to generate the matrix A which is public.
42  *
43  * @param s A byte array of 3 uniformly distributed bytes.
44  * @param out The returned coefficient in the range 0..q-1.
45  * @returns 1 if the value is less than q or 0 otherwise.
46  *          This is used for rejection sampling.
47  */
coeff_from_three_bytes(const uint8_t * s,uint32_t * out)48 static ossl_inline int coeff_from_three_bytes(const uint8_t *s, uint32_t *out)
49 {
50     /* Zero out the top bit of the 3rd byte to get a value in the range 0..2^23-1) */
51     *out = (uint32_t)s[0] | ((uint32_t)s[1] << 8) | (((uint32_t)s[2] & 0x7f) << 16);
52     return *out < ML_DSA_Q;
53 }
54 
55 /**
56  * @brief Generate a value in the range (q-4..0..4)
57  * See FIPS 204, Algorithm 15, CoeffFromHalfByte() where eta = 4
58  * Note the FIPS 204 code uses the range -4..4 (whereas this code adds q to the
59  * negative numbers).
60  *
61  * @param nibble A value in the range 0..15
62  * @param out The returned value if the range (q-4)..0..4 if nibble is < 9
63  * @returns 1 nibble was in range, or 0 if the nibble was rejected.
64  */
coeff_from_nibble_4(uint32_t nibble,uint32_t * out)65 static ossl_inline int coeff_from_nibble_4(uint32_t nibble, uint32_t *out)
66 {
67     /*
68      * This is not constant time but will not leak any important info since
69      * the value is either chosen or thrown away.
70      */
71     if (value_barrier_32(nibble < 9)) {
72         *out = mod_sub(4, nibble);
73         return 1;
74     }
75     return 0;
76 }
77 
78 /**
79  * @brief Generate a value in the range (q-2..0..2)
80  * See FIPS 204, Algorithm 15, CoeffFromHalfByte() where eta = 2
81  * Note the FIPS 204 code uses the range -2..2 (whereas this code adds q to the
82  * negative numbers).
83  *
84  * @param nibble A value in the range 0..15
85  * @param out The returned value if the range (q-2)..0..2 if nibble is < 15
86  * @returns 1 nibble was in range, or 0 if the nibble was rejected.
87  */
coeff_from_nibble_2(uint32_t nibble,uint32_t * out)88 static ossl_inline int coeff_from_nibble_2(uint32_t nibble, uint32_t *out)
89 {
90     if (value_barrier_32(nibble < 15)) {
91         *out = mod_sub(2, MOD5(nibble));
92         return 1;
93     }
94     return 0;
95 }
96 
97 /**
98  * @brief Use a seed value to generate a polynomial with coefficients in the
99  * range of 0..q-1 using rejection sampling.
100  * SHAKE128 is used to absorb the seed, and then sequences of 3 sample bytes are
101  * squeezed to try to produce coefficients.
102  * The SHAKE128 stream is used to get uniformly distributed elements.
103  * This algorithm is used for matrix expansion and only operates on public inputs.
104  *
105  * See FIPS 204, Algorithm 30, RejNTTPoly()
106  *
107  * @param g_ctx A EVP_MD_CTX object used for sampling the seed.
108  * @param md A pre-fetched SHAKE128 object.
109  * @param seed The seed to use for sampling.
110  * @param seed_len The size of |seed|
111  * @param out The returned polynomial with coefficients in the range of
112  *            0..q-1. This range is required for NTT.
113  * @returns 1 if the polynomial was successfully generated, or 0 if any of the
114  *            digest operations failed.
115  */
rej_ntt_poly(EVP_MD_CTX * g_ctx,const EVP_MD * md,const uint8_t * seed,size_t seed_len,POLY * out)116 static int rej_ntt_poly(EVP_MD_CTX *g_ctx, const EVP_MD *md,
117                         const uint8_t *seed, size_t seed_len, POLY *out)
118 {
119     int j = 0;
120     uint8_t blocks[SHAKE128_BLOCKSIZE], *b, *end = blocks + sizeof(blocks);
121 
122     /*
123      * Instead of just squeezing 3 bytes at a time, we grab a whole block
124      * Note that the shake128 blocksize of 168 is divisible by 3.
125      */
126     if (!shake_xof(g_ctx, md, seed, seed_len, blocks, sizeof(blocks)))
127         return 0;
128 
129     while (1) {
130         for (b = blocks; b < end; b += 3) {
131             if (coeff_from_three_bytes(b, &(out->coeff[j]))) {
132                 if (++j >= ML_DSA_NUM_POLY_COEFFICIENTS)
133                     return 1;   /* finished */
134             }
135         }
136         if (!EVP_DigestSqueeze(g_ctx, blocks, sizeof(blocks)))
137             return 0;
138     }
139 }
140 
141 /**
142  * @brief Use a seed value to generate a polynomial with coefficients in the
143  * range of ((q-eta)..0..eta) using rejection sampling. eta is either 2 or 4.
144  * SHAKE256 is used to absorb the seed, and then samples are squeezed.
145  * See FIPS 204, Algorithm 31, RejBoundedPoly()
146  *
147  * @param h_ctx A EVP_MD_CTX object context used to sample the seed.
148  * @param md A pre-fetched SHAKE256 object.
149  * @param coef_from_nibble A function that is dependent on eta, which takes a
150  *                         nibble and tries to see if it is in the correct range.
151  * @param seed The seed to use for sampling.
152  * @param seed_len The size of |seed|
153  * @param out The returned polynomial with coefficients in the range of
154  *            ((q-eta)..0..eta)
155  * @returns 1 if the polynomial was successfully generated, or 0 if any of the
156  *            digest operations failed.
157  */
rej_bounded_poly(EVP_MD_CTX * h_ctx,const EVP_MD * md,COEFF_FROM_NIBBLE_FUNC * coef_from_nibble,const uint8_t * seed,size_t seed_len,POLY * out)158 static int rej_bounded_poly(EVP_MD_CTX *h_ctx, const EVP_MD *md,
159                             COEFF_FROM_NIBBLE_FUNC *coef_from_nibble,
160                             const uint8_t *seed, size_t seed_len, POLY *out)
161 {
162     int j = 0;
163     uint32_t z0, z1;
164     uint8_t blocks[SHAKE256_BLOCKSIZE], *b, *end = blocks + sizeof(blocks);
165 
166     /* Instead of just squeezing 1 byte at a time, we grab a whole block */
167     if (!shake_xof(h_ctx, md, seed, seed_len, blocks, sizeof(blocks)))
168         return 0;
169 
170     while (1) {
171         for (b = blocks; b < end; b++) {
172             z0 = *b & 0x0F; /* lower nibble of byte */
173             z1 = *b >> 4;   /* high nibble of byte */
174 
175             if (coef_from_nibble(z0, &out->coeff[j])
176                     && ++j >= ML_DSA_NUM_POLY_COEFFICIENTS)
177                 return 1;
178             if (coef_from_nibble(z1, &out->coeff[j])
179                     && ++j >= ML_DSA_NUM_POLY_COEFFICIENTS)
180                 return 1;
181         }
182         if (!EVP_DigestSqueeze(h_ctx, blocks, sizeof(blocks)))
183             return 0;
184     }
185 }
186 
187 /**
188  * @brief Generate a k * l matrix that has uniformly distributed polynomial
189  *        elements using rejection sampling.
190  * See FIPS 204, Algorithm 32, ExpandA()
191  *
192  * @param g_ctx A EVP_MD_CTX context used for rejection sampling
193  *              seed values generated from the seed rho.
194  * @param md A pre-fetched SHAKE128 object
195  * @param rho A 32 byte seed to generated the matrix from.
196  * @param out The generated k * l matrix of polynomials with coefficients
197  *            in the range of 0..q-1.
198  * @returns 1 if the matrix was generated, or 0 on error.
199  */
ossl_ml_dsa_matrix_expand_A(EVP_MD_CTX * g_ctx,const EVP_MD * md,const uint8_t * rho,MATRIX * out)200 int ossl_ml_dsa_matrix_expand_A(EVP_MD_CTX *g_ctx, const EVP_MD *md,
201                                 const uint8_t *rho, MATRIX *out)
202 {
203     int ret = 0;
204     size_t i, j;
205     uint8_t derived_seed[ML_DSA_RHO_BYTES + 2];
206     POLY *poly = out->m_poly;
207 
208     /* The seed used for each matrix element is rho + column_index + row_index */
209     memcpy(derived_seed, rho, ML_DSA_RHO_BYTES);
210 
211     for (i = 0; i < out->k; i++) {
212         for (j = 0; j < out->l; j++) {
213             derived_seed[ML_DSA_RHO_BYTES + 1] = (uint8_t)i;
214             derived_seed[ML_DSA_RHO_BYTES] = (uint8_t)j;
215             /* Generate the polynomial for each matrix element using a unique seed */
216             if (!rej_ntt_poly(g_ctx, md, derived_seed, sizeof(derived_seed), poly++))
217                 goto err;
218         }
219     }
220     ret = 1;
221 err:
222     return ret;
223 }
224 
225 /**
226  * @brief Generates 2 vectors using rejection sampling whose polynomial
227  * coefficients are in the interval [q-eta..0..eta]
228  *
229  * See FIPS 204, Algorithm 33, ExpandS().
230  * Note that in FIPS 204 the range -eta..eta is used.
231  *
232  * @param h_ctx A EVP_MD_CTX context to use to sample the seed.
233  * @param md A pre-fetched SHAKE256 object.
234  * @param eta Is either 2 or 4, and determines the range of the coefficients for
235  *            s1 and s2.
236  * @param seed A 64 byte seed to use for sampling.
237  * @param s1 A 1 * l column vector containing polynomials with coefficients in
238  *           the range (q-eta)..0..eta
239  * @param s2 A 1 * k column vector containing polynomials with coefficients in
240  *           the range (q-eta)..0..eta
241  * @returns 1 if s1 and s2 were successfully generated, or 0 otherwise.
242  */
ossl_ml_dsa_vector_expand_S(EVP_MD_CTX * h_ctx,const EVP_MD * md,int eta,const uint8_t * seed,VECTOR * s1,VECTOR * s2)243 int ossl_ml_dsa_vector_expand_S(EVP_MD_CTX *h_ctx, const EVP_MD *md, int eta,
244                                 const uint8_t *seed, VECTOR *s1, VECTOR *s2)
245 {
246     int ret = 0;
247     size_t i;
248     size_t l = s1->num_poly;
249     size_t k = s2->num_poly;
250     uint8_t derived_seed[ML_DSA_PRIV_SEED_BYTES + 2];
251     COEFF_FROM_NIBBLE_FUNC *coef_from_nibble_fn;
252 
253     coef_from_nibble_fn = (eta == ML_DSA_ETA_4) ? coeff_from_nibble_4 : coeff_from_nibble_2;
254 
255     /*
256      * Each polynomial generated uses a unique seed that consists of
257      * seed + counter (where the counter is 2 bytes starting at 0)
258      */
259     memcpy(derived_seed, seed, ML_DSA_PRIV_SEED_BYTES);
260     derived_seed[ML_DSA_PRIV_SEED_BYTES] = 0;
261     derived_seed[ML_DSA_PRIV_SEED_BYTES + 1] = 0;
262 
263     for (i = 0; i < l; i++) {
264         if (!rej_bounded_poly(h_ctx, md, coef_from_nibble_fn,
265                               derived_seed, sizeof(derived_seed), &s1->poly[i]))
266             goto err;
267         ++derived_seed[ML_DSA_PRIV_SEED_BYTES];
268     }
269     for (i = 0; i < k; i++) {
270         if (!rej_bounded_poly(h_ctx, md, coef_from_nibble_fn,
271                               derived_seed, sizeof(derived_seed), &s2->poly[i]))
272             goto err;
273         ++derived_seed[ML_DSA_PRIV_SEED_BYTES];
274     }
275     ret = 1;
276 err:
277     return ret;
278 }
279 
280 /* See FIPS 204, Algorithm 34, ExpandMask(), Step 4 & 5 */
ossl_ml_dsa_poly_expand_mask(POLY * out,const uint8_t * seed,size_t seed_len,uint32_t gamma1,EVP_MD_CTX * h_ctx,const EVP_MD * md)281 int ossl_ml_dsa_poly_expand_mask(POLY *out, const uint8_t *seed, size_t seed_len,
282                                  uint32_t gamma1,
283                                  EVP_MD_CTX *h_ctx, const EVP_MD *md)
284 {
285     uint8_t buf[32 * 20];
286     size_t buf_len = 32 * (gamma1 == ML_DSA_GAMMA1_TWO_POWER_19 ? 20 : 18);
287 
288     return shake_xof(h_ctx, md, seed, seed_len, buf, buf_len)
289         && ossl_ml_dsa_poly_decode_expand_mask(out, buf, buf_len, gamma1);
290 }
291 
292 /*
293  * @brief Sample a polynomial with coefficients in the range {-1..1}.
294  * The number of non zero values (hamming weight) is given by tau
295  *
296  * See FIPS 204, Algorithm 29, SampleInBall()
297  * This function is assumed to not be constant time.
298  * The algorithm is based on Durstenfeld's version of the Fisher-Yates shuffle.
299  *
300  * Note that the coefficients returned by this implementation are positive
301  * i.e one of q-1, 0, or 1.
302  *
303  * @param tau is the number of +1 or -1's in the polynomial 'out_c' (39, 49 or 60)
304  *            that is less than or equal to 64
305  */
ossl_ml_dsa_poly_sample_in_ball(POLY * out_c,const uint8_t * seed,int seed_len,EVP_MD_CTX * h_ctx,const EVP_MD * md,uint32_t tau)306 int ossl_ml_dsa_poly_sample_in_ball(POLY *out_c, const uint8_t *seed, int seed_len,
307                                     EVP_MD_CTX *h_ctx, const EVP_MD *md,
308                                     uint32_t tau)
309 {
310     uint8_t block[SHAKE256_BLOCKSIZE];
311     uint64_t signs;
312     int offset = 8;
313     size_t end;
314 
315     /*
316      * Rather than squeeze 8 bytes followed by lots of 1 byte squeezes
317      * the SHAKE blocksize is squeezed each time and buffered into 'block'.
318      */
319     if (!shake_xof(h_ctx, md, seed, seed_len, block, sizeof(block)))
320         return 0;
321 
322     /*
323      * grab the first 64 bits - since tau < 64
324      * Each bit gives a +1 or -1 value.
325      */
326     OPENSSL_load_u64_le(&signs, block);
327 
328     poly_zero(out_c);
329 
330     /* Loop tau times */
331     for (end = 256 - tau; end < 256; end++) {
332         size_t index; /* index is a random offset to write +1 or -1 */
333 
334         /* rejection sample in {0..end} to choose an index to place -1 or 1 into */
335         for (;;) {
336             if (offset == sizeof(block)) {
337                 /* squeeze another block if the bytes from block have been used */
338                 if (!EVP_DigestSqueeze(h_ctx, block, sizeof(block)))
339                     return 0;
340                 offset = 0;
341             }
342 
343             index = block[offset++];
344             if (index <= end)
345                 break;
346         }
347 
348         /*
349          * In-place swap the coefficient we are about to replace to the end so
350          * we don't lose any values that have been already written.
351          */
352         out_c->coeff[end] = out_c->coeff[index];
353         /* set the random coefficient value to either 1 or q-1 */
354         out_c->coeff[index] = mod_sub(1, 2 * (signs & 1));
355         signs >>= 1; /* grab the next random bit */
356     }
357     return 1;
358 }
359