xref: /freebsd/sys/dev/random/fenestrasX/fx_main.c (revision 51015e6d0f570239b0c2088dc6cf2b018928375d)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2019 Conrad Meyer <cem@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 /*
29  * This random algorithm is derived in part from the "Windows 10 random number
30  * generation infrastructure" whitepaper published by Niels Ferguson and
31  * Microsoft: https://aka.ms/win10rng
32  *
33  * It is also inspired by DJB's writing on buffered key-erasure PRNGs:
34  * https://blog.cr.yp.to/20170723-random.html
35  *
36  * The Windows 10 RNG bears some similarity to Fortuna, which Ferguson was also
37  * involved with.  Notable differences include:
38  *  - Extended to multi-CPU design
39  *  - Extended to pre-buffer some PRNG output
40  *  - Pool-based reseeding is solely time-based (rather than on-access w/
41  *    pacing)
42  *  - Extended to specify efficient userspace design
43  *  - Always-available design (requires the equivalent of loader(8) for all
44  *    boots; probably relatively easy given the limited platforms Windows 10
45  *    supports)
46  *
47  * Some aspects of the design document I found confusing and may have
48  * misinterpreted:
49  *  - Relationship between root PRNG seed version and periodic reseed pool use.
50  *    I interpreted these as separate sequences.  The root PRNG seed version is
51  *    bumped both by the periodic pool based reseed, and also special
52  *    conditions such as the first time an entropy source provides entropy.  I
53  *    don't think first-time entropy sources should cause us to skip an entropy
54  *    pool reseed.
55  *  - Initial seeding.  The paper is pretty terse on the subject.  My
56  *    interpretation of the document is that the Windows RNG infrastructure
57  *    relies on the loader(8)-provided material for initial seeding and either
58  *    ignores or doesn't start entropy sources until after that time.  So when
59  *    the paper says that first-time entropy source material "bypasses the
60  *    pools," the root PRNG state already has been keyed for the first time and
61  *    can generate 256 bits, mix it with the first-time entropy, and reseed
62  *    immediately.
63  *
64  * Some notable design choices in this implementation divergent from that
65  * specified in the document above:
66  *  - Blake2b instead of SHA-2 512 for entropy pooling
67  *  - Chacha20 instead of AES-CTR DRBG for PRF
68  *  - Initial seeding.  We treat the 0->1 seed version (brng_generation) edge
69  *    as the transition from blocked to unblocked.  That edge is also the first
70  *    time the key of the root BRNG's PRF is set.  We perform initial seeding
71  *    when the first request for entropy arrives.
72  *    • As a result: Entropy callbacks prior to this edge do not have a keyed
73  *      root PRNG, so bypassing the pools is kind of meaningless.  Instead,
74  *      they feed into pool0.  (They also do not set the root PRNG key or bump
75  *      the root PRNG seed version.)
76  *    • Entropy callbacks after the edge behave like the specification.
77  *    • All one-off sources are fed into pool0 and the result used to seed the
78  *      root BRNG during the initial seed step.
79  *    • All memory needed for initial seeding must be preallocated or static or
80  *      fit on the stack; random reads can occur in nonsleepable contexts and
81  *      we cannot allocate M_WAITOK.  (We also cannot fail to incorporate any
82  *      present one-off source, to the extent it is in the control of
83  *      software.)
84  * - Timer interval reseeding.  We also start the timer-based reseeding at
85  *   initial seed, but unlike the design, our initial seed is some time after
86  *   load (usually within the order of micro- or milliseconds due to
87  *   stack_guard on x86, but conceivably later if nothing reads from random for
88  *   a while).
89  *
90  * Not yet implemented, not in scope, or todo:
91  *  - Various initial seeding sources we don't have yet
92  *  - In particular, VM migration/copy detection
93  */
94 
95 #include <sys/cdefs.h>
96 __FBSDID("$FreeBSD$");
97 
98 #include <sys/param.h>
99 #include <sys/domainset.h>
100 #include <sys/fail.h>
101 #include <sys/limits.h>
102 #include <sys/lock.h>
103 #include <sys/kernel.h>
104 #include <sys/malloc.h>
105 #include <sys/mutex.h>
106 #include <sys/random.h>
107 #include <sys/sdt.h>
108 #include <sys/smp.h>
109 #include <sys/sysctl.h>
110 #include <sys/systm.h>
111 
112 #include <machine/cpu.h>
113 
114 #include <vm/vm.h>
115 #include <vm/vm_param.h>
116 #include <vm/vm_page.h>
117 #include <vm/vm_phys.h>
118 #include <vm/vm_pagequeue.h>
119 
120 #include <dev/random/randomdev.h>
121 #include <dev/random/random_harvestq.h>
122 #include <dev/random/uint128.h>
123 
124 #include <dev/random/fenestrasX/fx_brng.h>
125 #include <dev/random/fenestrasX/fx_hash.h>
126 #include <dev/random/fenestrasX/fx_pool.h>
127 #include <dev/random/fenestrasX/fx_priv.h>
128 #include <dev/random/fenestrasX/fx_pub.h>
129 #include <dev/random/fenestrasX/fx_rng.h>
130 
131 struct fxrng_buffered_rng fxrng_root;
132 uint64_t __read_mostly fxrng_root_generation;
133 DPCPU_DEFINE_STATIC(struct fxrng_buffered_rng *, fxrng_brng);
134 
135 /*
136  * Top-level read API from randomdev.  Responsible for NOWAIT-allocating
137  * per-cpu NUMA-local BRNGs, if needed and satisfiable; subroutines handle
138  * reseeding if the local BRNG is stale and rekeying when necessary.  In
139  * low-memory conditions when a local BRNG cannot be allocated, the request is
140  * simply forwarded to the root BRNG.
141  *
142  * It is a precondition is that the root BRNG initial seeding has completed and
143  * the root generation number >0.
144  */
145 static void
146 _fxrng_alg_read(uint8_t *output, size_t nbytes, uint64_t *seed_version_out)
147 {
148 	struct fxrng_buffered_rng **pcpu_brng_p, *rng, *tmp;
149 	struct pcpu *pcpu;
150 
151 	pcpu = get_pcpu();
152 
153 	/*
154 	 * The following statement directly accesses an implementation detail
155 	 * of DPCPU, but the macros cater only to pinned threads; we want to
156 	 * operate on our initial CPU, without pinning, *even if* we migrate.
157 	 */
158 	pcpu_brng_p = _DPCPU_PTR(pcpu->pc_dynamic, fxrng_brng);
159 
160 	rng = (void *)atomic_load_acq_ptr((uintptr_t *)pcpu_brng_p);
161 
162 	/*
163 	 * Usually the pcpu BRNG has already been allocated, but we do it
164 	 * on-demand and need to check first.  BRNGs are never deallocated and
165 	 * are valid as soon as the pointer is initialized.
166 	 */
167 	if (__predict_false(rng == NULL)) {
168 		uint8_t newkey[FX_CHACHA20_KEYSIZE];
169 		struct domainset *ds;
170 		int domain;
171 
172 		domain = pcpu->pc_domain;
173 
174 		/*
175 		 * Allocate pcpu BRNGs off-domain on weird NUMA machines like
176 		 * AMD Threadripper 2990WX, which has 2 NUMA nodes without
177 		 * local memory controllers.  The PREF policy is automatically
178 		 * converted to something appropriate when domains are empty.
179 		 * (FIXED is not.)
180 		 *
181 		 * Otherwise, allocate strictly CPU-local memory.  The
182 		 * rationale is this: if there is a memory shortage such that
183 		 * PREF policy would fallback to RR, we have no business
184 		 * wasting memory on a faster BRNG.  So, use a FIXED domainset
185 		 * policy.  If we cannot allocate, that's fine!  We fall back
186 		 * to invoking the root BRNG.
187 		 */
188 		if (VM_DOMAIN_EMPTY(domain))
189 			ds = DOMAINSET_PREF(domain);
190 		else
191 			ds = DOMAINSET_FIXED(domain);
192 
193 		rng = malloc_domainset(sizeof(*rng), M_ENTROPY, ds,
194 		    M_NOWAIT | M_ZERO);
195 		if (rng == NULL) {
196 			/* Relatively easy case: fall back to root BRNG. */
197 			rng = &fxrng_root;
198 			goto have_valid_rng;
199 		}
200 
201 		fxrng_brng_init(rng);
202 
203 		/*
204 		 * The root BRNG is always up and available.  Requests are
205 		 * always satisfiable.  This is a design invariant.
206 		 */
207 		ASSERT_DEBUG(atomic_load_acq_64(&fxrng_root_generation) != 0,
208 		    "%s: attempting to seed child BRNG when root hasn't "
209 		    "been initialized yet.", __func__);
210 
211 		FXRNG_BRNG_LOCK(&fxrng_root);
212 #ifdef WITNESS
213 		/* Establish lock order root->pcpu for WITNESS. */
214 		FXRNG_BRNG_LOCK(rng);
215 		FXRNG_BRNG_UNLOCK(rng);
216 #endif
217 		fxrng_brng_produce_seed_data_internal(&fxrng_root, newkey,
218 		    sizeof(newkey), &rng->brng_generation);
219 		FXRNG_BRNG_ASSERT_NOT(&fxrng_root);
220 
221 		fxrng_rng_setkey(&rng->brng_rng, newkey, sizeof(newkey));
222 		explicit_bzero(newkey, sizeof(newkey));
223 
224 		/*
225 		 * We have a valid RNG.  Try to install it, or grab the other
226 		 * one if we lost the race.
227 		 */
228 		tmp = NULL;
229 		while (tmp == NULL)
230 			if (atomic_fcmpset_ptr((uintptr_t *)pcpu_brng_p,
231 			    (uintptr_t *)&tmp, (uintptr_t)rng))
232 				goto have_valid_rng;
233 
234 		/*
235 		 * We lost the race.  There's nothing sensitive about
236 		 * our BRNG's PRF state, because it will never be used
237 		 * for anything and the key doesn't expose any
238 		 * information about the parent (root) generator's
239 		 * state -- it has already rekeyed.  The generation
240 		 * number is public, and a zero counter isn't sensitive.
241 		 */
242 		free(rng, M_ENTROPY);
243 		/*
244 		 * Use the winner's PCPU BRNG.
245 		 */
246 		rng = tmp;
247 	}
248 
249 have_valid_rng:
250 	/* At this point we have a valid, initialized and seeded rng pointer. */
251 	FXRNG_BRNG_LOCK(rng);
252 	if (seed_version_out != NULL)
253 		*seed_version_out = rng->brng_generation;
254 	fxrng_brng_read(rng, output, nbytes);
255 	FXRNG_BRNG_ASSERT_NOT(rng);
256 }
257 
258 static void
259 fxrng_alg_read(uint8_t *output, size_t nbytes)
260 {
261 	_fxrng_alg_read(output, nbytes, NULL);
262 }
263 
264 /*
265  * External API for arc4random(9) to fetch new key material and associated seed
266  * version in chacha20_randomstir().
267  */
268 void
269 read_random_key(void *output, size_t nbytes, uint64_t *seed_version_out)
270 {
271 	/* Ensure _fxrng_alg_read invariant. */
272 	if (__predict_false(atomic_load_acq_64(&fxrng_root_generation) == 0))
273 		(void)fxrng_alg_seeded();
274 
275 	_fxrng_alg_read(output, nbytes, seed_version_out);
276 }
277 
278 static void
279 fxrng_init_alg(void *dummy __unused)
280 {
281 	DPCPU_ZERO(fxrng_brng);
282 	fxrng_brng_init(&fxrng_root);
283 	fxrng_pools_init();
284 }
285 SYSINIT(random_alg, SI_SUB_RANDOM, SI_ORDER_SECOND, fxrng_init_alg, NULL);
286 
287 /*
288  * Public visibility struct referenced directly by other parts of randomdev.
289  */
290 const struct random_algorithm random_alg_context = {
291 	.ra_ident = "fenestrasX",
292 	.ra_pre_read = (void (*)(void))nullop,
293 	.ra_read = fxrng_alg_read,
294 	.ra_seeded = fxrng_alg_seeded,
295 	.ra_event_processor = fxrng_event_processor,
296 	.ra_poolcount = FXRNG_NPOOLS,
297 };
298