xref: /linux/lib/vdso/getrandom.c (revision 32bd966050486d3fed6980aa3de3e60b9e383589)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
4  */
5 
6 #include <linux/array_size.h>
7 #include <linux/minmax.h>
8 #include <vdso/datapage.h>
9 #include <vdso/getrandom.h>
10 #include <vdso/limits.h>
11 #include <vdso/unaligned.h>
12 #include <asm/barrier.h>
13 #include <asm/vdso/getrandom.h>
14 #include <uapi/linux/errno.h>
15 #include <uapi/linux/mman.h>
16 #include <uapi/linux/random.h>
17 
18 /* Bring in default accessors */
19 #include <vdso/vsyscall.h>
20 
/*
 * Define PAGE_SIZE/PAGE_MASK locally from CONFIG_PAGE_SHIFT, discarding any
 * definitions pulled in by the headers above, so the values here match the
 * configured page size. Used below for the state-straddling check and the
 * MAX_RW_COUNT clamp.
 */
#undef PAGE_SIZE
#undef PAGE_MASK
#define PAGE_SIZE (1UL << CONFIG_PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE - 1))
25 
/*
 * Copy as many whole sizeof(type) chunks as fit in @len from @src to @dst,
 * zeroing each chunk of @src immediately after copying it (forward secrecy:
 * consumed batch bytes must not linger in memory). Advances @dst and @src
 * and decrements @len in place, so successive invocations with narrower
 * types consume the remaining tail. NOTE: arguments are evaluated multiple
 * times — callers must pass simple lvalues, as memcpy_and_zero_src() does.
 */
#define MEMCPY_AND_ZERO_SRC(type, dst, src, len) do {				\
	while (len >= sizeof(type)) {						\
		__put_unaligned_t(type, __get_unaligned_t(type, src), dst);	\
		__put_unaligned_t(type, 0, src);				\
		dst += sizeof(type);						\
		src += sizeof(type);						\
		len -= sizeof(type);						\
	}									\
} while (0)
35 
36 static void memcpy_and_zero_src(void *dst, void *src, size_t len)
37 {
38 	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
39 		if (IS_ENABLED(CONFIG_64BIT))
40 			MEMCPY_AND_ZERO_SRC(u64, dst, src, len);
41 		MEMCPY_AND_ZERO_SRC(u32, dst, src, len);
42 		MEMCPY_AND_ZERO_SRC(u16, dst, src, len);
43 	}
44 	MEMCPY_AND_ZERO_SRC(u8, dst, src, len);
45 }
46 
/**
 * __cvdso_getrandom_data - Generic vDSO implementation of getrandom() syscall.
 * @rng_info:		Describes state of kernel RNG, memory shared with kernel.
 * @buffer:		Destination buffer to fill with random bytes.
 * @len:		Size of @buffer in bytes.
 * @flags:		Zero or more GRND_* flags.
 * @opaque_state:	Pointer to an opaque state area.
 * @opaque_len:		Length of opaque state area.
 *
 * This implements a "fast key erasure" RNG using ChaCha20, in the same way that the kernel's
 * getrandom() syscall does. It periodically reseeds its key from the kernel's RNG, at the same
 * schedule that the kernel's RNG is reseeded. If the kernel's RNG is not ready, then this always
 * calls into the syscall.
 *
 * If @buffer, @len, and @flags are 0, and @opaque_len is ~0UL, then @opaque_state is populated
 * with a struct vgetrandom_opaque_params and the function returns 0; if it does not return 0,
 * this function should not be used.
 *
 * @opaque_state *must* be allocated by calling mmap(2) using the mmap_prot and mmap_flags fields
 * from the struct vgetrandom_opaque_params, and states must not straddle pages. Unless external
 * locking is used, one state must be allocated per thread, as it is not safe to call this function
 * concurrently with the same @opaque_state. However, it is safe to call this using the same
 * @opaque_state that is shared between main code and signal handling code, within the same thread.
 *
 * Returns:	The number of random bytes written to @buffer, or a negative value indicating an error.
 */
static __always_inline ssize_t
__cvdso_getrandom_data(const struct vdso_rng_data *rng_info, void *buffer, size_t len,
		       unsigned int flags, void *opaque_state, size_t opaque_len)
{
	/* Clamp @len to MAX_RW_COUNT, the same limit the kernel applies to read-like syscalls. */
	ssize_t ret = min_t(size_t, INT_MAX & PAGE_MASK /* = MAX_RW_COUNT */, len);
	struct vgetrandom_state *state = opaque_state;
	size_t batch_len, nblocks, orig_len = len;
	bool in_use, have_retried = false;
	void *orig_buffer = buffer;
	u64 current_generation;
	/* 64-bit ChaCha20 block counter, as two 32-bit words, starting from zero. */
	u32 counter[2] = { 0 };

	/*
	 * The parameter-discovery special case described in the kernel-doc above: report the
	 * size and mmap() parameters required for allocating @opaque_state, and touch nothing
	 * else.
	 */
	if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags)) {
		struct vgetrandom_opaque_params *params = opaque_state;
		params->size_of_opaque_state = sizeof(*state);
		params->mmap_prot = PROT_READ | PROT_WRITE;
		params->mmap_flags = MAP_DROPPABLE | MAP_ANONYMOUS;
		/* Zero the reserved words explicitly; no memset is available here. */
		for (size_t i = 0; i < ARRAY_SIZE(params->reserved); ++i)
			params->reserved[i] = 0;
		return 0;
	}

	/* The state must not straddle a page, since pages can be zeroed at any time. */
	if (unlikely(((unsigned long)opaque_state & ~PAGE_MASK) + sizeof(*state) > PAGE_SIZE))
		return -EFAULT;

	/* Handle unexpected flags by falling back to the kernel. */
	if (unlikely(flags & ~(GRND_NONBLOCK | GRND_RANDOM | GRND_INSECURE)))
		goto fallback_syscall;

	/* If the caller passes the wrong size, which might happen due to CRIU, fallback. */
	if (unlikely(opaque_len != sizeof(*state)))
		goto fallback_syscall;

	/*
	 * If the kernel's RNG is not yet ready, then it's not possible to provide random bytes from
	 * userspace, because A) the various @flags require this to block, or not, depending on
	 * various factors unavailable to userspace, and B) the kernel's behavior before the RNG is
	 * ready is to reseed from the entropy pool at every invocation.
	 */
	if (unlikely(!READ_ONCE(rng_info->is_ready)))
		goto fallback_syscall;

	/*
	 * This condition is checked after @rng_info->is_ready, because before the kernel's RNG is
	 * initialized, the @flags parameter may require this to block or return an error, even when
	 * len is zero.
	 */
	if (unlikely(!len))
		return 0;

	/*
	 * @state->in_use is basic reentrancy protection against this running in a signal handler
	 * with the same @opaque_state, but obviously not atomic wrt multiple CPUs or more than one
	 * level of reentrancy. If a signal interrupts this after reading @state->in_use, but before
	 * writing @state->in_use, there is still no race, because the signal handler will run to
	 * its completion before returning execution.
	 */
	in_use = READ_ONCE(state->in_use);
	if (unlikely(in_use))
		/* The syscall simply fills the buffer and does not touch @state, so fallback. */
		goto fallback_syscall;
	WRITE_ONCE(state->in_use, true);

retry_generation:
	/*
	 * @rng_info->generation must always be read here, as it serializes @state->key with the
	 * kernel's RNG reseeding schedule.
	 */
	current_generation = READ_ONCE(rng_info->generation);

	/*
	 * If @state->generation doesn't match the kernel RNG's generation, then it means the
	 * kernel's RNG has reseeded, and so @state->key is reseeded as well.
	 */
	if (unlikely(state->generation != current_generation)) {
		/*
		 * Write the generation before filling the key, in case of fork. If there is a fork
		 * just after this line, the parent and child will get different random bytes from
		 * the syscall, which is good. However, were this line to occur after the getrandom
		 * syscall, then both child and parent could have the same bytes and the same
		 * generation counter, so the fork would not be detected. Therefore, write
		 * @state->generation before the call to the getrandom syscall.
		 */
		WRITE_ONCE(state->generation, current_generation);

		/*
		 * Prevent the syscall from being reordered wrt current_generation. Pairs with the
		 * smp_store_release(&vdso_k_rng_data->generation) in random.c.
		 */
		smp_rmb();

		/* Reseed @state->key using fresh bytes from the kernel. */
		if (getrandom_syscall(state->key, sizeof(state->key), 0) != sizeof(state->key)) {
			/*
			 * If the syscall failed to refresh the key, then @state->key is now
			 * invalid, so invalidate the generation so that it is not used again, and
			 * fallback to using the syscall entirely.
			 */
			WRITE_ONCE(state->generation, 0);

			/*
			 * Set @state->in_use to false only after the last write to @state in the
			 * line above.
			 */
			WRITE_ONCE(state->in_use, false);

			goto fallback_syscall;
		}

		/*
		 * Set @state->pos to beyond the end of the batch, so that the batch is refilled
		 * using the new key.
		 */
		state->pos = sizeof(state->batch);
	}

	/* Set len to the total amount of bytes that this function is allowed to read, ret. */
	len = ret;
more_batch:
	/*
	 * First use bytes out of @state->batch, which may have been filled by the last call to this
	 * function.
	 */
	batch_len = min_t(size_t, sizeof(state->batch) - state->pos, len);
	if (batch_len) {
		/* Zeroing at the same time as memcpying helps preserve forward secrecy. */
		memcpy_and_zero_src(buffer, state->batch + state->pos, batch_len);
		state->pos += batch_len;
		buffer += batch_len;
		len -= batch_len;
	}

	if (!len) {
		/* Prevent the loop from being reordered wrt ->generation. */
		barrier();

		/*
		 * Since @rng_info->generation will never be 0, re-read @state->generation, rather
		 * than using the local current_generation variable, to learn whether a fork
		 * occurred or if @state was zeroed due to memory pressure. Primarily, though, this
		 * indicates whether the kernel's RNG has reseeded, in which case generate a new key
		 * and start over.
		 */
		if (unlikely(READ_ONCE(state->generation) != READ_ONCE(rng_info->generation))) {
			/*
			 * Prevent this from looping forever in case of low memory or racing with a
			 * user force-reseeding the kernel's RNG using the ioctl.
			 */
			if (have_retried) {
				WRITE_ONCE(state->in_use, false);
				goto fallback_syscall;
			}

			/* Restart output from the beginning of @buffer with a fresh key. */
			have_retried = true;
			buffer = orig_buffer;
			goto retry_generation;
		}

		/*
		 * Set @state->in_use to false only when there will be no more reads or writes of
		 * @state.
		 */
		WRITE_ONCE(state->in_use, false);
		return ret;
	}

	/* Generate blocks of RNG output directly into @buffer while there's enough room left. */
	nblocks = len / CHACHA_BLOCK_SIZE;
	if (nblocks) {
		__arch_chacha20_blocks_nostack(buffer, state->key, counter, nblocks);
		buffer += nblocks * CHACHA_BLOCK_SIZE;
		len -= nblocks * CHACHA_BLOCK_SIZE;
	}

	/* The refill below generates whole blocks, so the batch+key area must be block-aligned. */
	BUILD_BUG_ON(sizeof(state->batch_key) % CHACHA_BLOCK_SIZE != 0);

	/* Refill the batch and overwrite the key, in order to preserve forward secrecy. */
	__arch_chacha20_blocks_nostack(state->batch_key, state->key, counter,
				       sizeof(state->batch_key) / CHACHA_BLOCK_SIZE);

	/* Since the batch was just refilled, set the position back to 0 to indicate a full batch. */
	state->pos = 0;
	goto more_batch;

fallback_syscall:
	/* Let the kernel fill the caller's original request directly; @state is untouched here. */
	return getrandom_syscall(orig_buffer, orig_len, flags);
}
261 
262 static __always_inline ssize_t
263 __cvdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len)
264 {
265 	return __cvdso_getrandom_data(__arch_get_vdso_u_rng_data(), buffer, len, flags,
266 				      opaque_state, opaque_len);
267 }
268