// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock - Cross-thread ruleset enforcement
 *
 * Copyright © 2025 Google LLC
 */

#include <linux/atomic.h>
#include <linux/cleanup.h>
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/overflow.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/task_work.h>

#include "cred.h"
#include "tsync.h"

/*
 * Shared state between multiple threads which are enforcing Landlock rulesets
 * in lockstep with each other.
 */
struct tsync_shared_context {
	/* The old and tentative new creds of the calling thread. */
	const struct cred *old_cred;
	const struct cred *new_cred;

	/* True if sibling tasks need to set the no_new_privs flag. */
	bool set_no_new_privs;

	/* An error encountered in the preparation step, or 0. */
	atomic_t preparation_error;

	/*
	 * Barrier after the preparation step in restrict_one_thread().
	 * The calling thread waits for completion.
	 *
	 * Re-initialized on every round of looking for newly spawned threads.
	 */
	atomic_t num_preparing;
	struct completion all_prepared;

	/* Sibling threads wait for completion. */
	struct completion ready_to_commit;

	/*
	 * Barrier after the commit step (used by the syscall implementation to
	 * wait for completion).
	 */
	atomic_t num_unfinished;
	struct completion all_finished;
};

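/*
 * Per-thread work item: ties one sibling thread's task_work to the shared
 * context of the ongoing enforcement.  The task pointer holds a reference
 * taken in tsync_works_provide().
 */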
struct tsync_work {
	struct callback_head work;
	struct task_struct *task;
	struct tsync_shared_context *shared_ctx;
};

/*
 * restrict_one_thread - update a thread's Landlock domain in lockstep with the
 * other threads in the same process
 *
 * When this is run, the same function runs in all other threads of the same
 * process (except for the calling thread, which called landlock_restrict_self).
 * The concurrently running invocations of restrict_one_thread() coordinate
 * through the shared ctx object to do their work in lockstep and implement
 * all-or-nothing semantics for enforcing the new Landlock domain.
 *
 * Afterwards, depending on the presence of an error, all threads either commit
 * or abort the prepared credentials.  The commit operation cannot fail
 * anymore.
 */
static void restrict_one_thread(struct tsync_shared_context *ctx)
{
	int err;
	struct cred *cred = NULL;

	if (current_cred() == ctx->old_cred) {
		/*
		 * Switch out old_cred with new_cred, if possible.
		 *
		 * In the common case, where all threads initially point to the same
		 * struct cred, this optimization avoids creating separate redundant
		 * credentials objects for each thread, which would all have the same
		 * contents.
		 *
		 * Note: We intentionally drop the const qualifier here, because
		 * commit_creds() and abort_creds() require a non-const pointer.
		 */
		cred = (struct cred *)get_cred(ctx->new_cred);
	} else {
		/* Else, prepare new creds and populate them. */
		cred = prepare_creds();

		if (!cred) {
			atomic_set(&ctx->preparation_error, -ENOMEM);

			/*
			 * Even on error, we need to adhere to the protocol and coordinate
			 * with concurrently running invocations.
			 */
			if (atomic_dec_return(&ctx->num_preparing) == 0)
				complete_all(&ctx->all_prepared);

			goto out;
		}

		landlock_cred_copy(landlock_cred(cred),
				   landlock_cred(ctx->new_cred));
	}

	/*
	 * Barrier: Wait until all threads are done preparing.
	 * After this point, we can have no more failures.
	 */
	if (atomic_dec_return(&ctx->num_preparing) == 0)
		complete_all(&ctx->all_prepared);

	/*
	 * Wait for the signal from the calling thread that it is now safe to
	 * read the preparation error and we are ready to commit (or abort).
	 */
	wait_for_completion(&ctx->ready_to_commit);

	/* Abort the commit if any of the other threads had an error. */
	err = atomic_read(&ctx->preparation_error);
	if (err) {
		abort_creds(cred);
		goto out;
	}

	/*
	 * Make sure that all sibling tasks fulfill the no_new_privs prerequisite.
	 * (This is in line with seccomp's SECCOMP_FILTER_FLAG_TSYNC logic in
	 * kernel/seccomp.c.)
	 */
	if (ctx->set_no_new_privs)
		task_set_no_new_privs(current);

	commit_creds(cred);

out:
	/* Notify the calling thread once all threads are done. */
	if (atomic_dec_return(&ctx->num_unfinished) == 0)
		complete_all(&ctx->all_finished);
}

/*
 * restrict_one_thread_callback - task_work callback for restricting a thread
 *
 * Calls restrict_one_thread() with the struct tsync_shared_context from the
 * enclosing struct tsync_work.
 */
static void restrict_one_thread_callback(struct callback_head *work)
{
	struct tsync_work *ctx = container_of(work, struct tsync_work, work);

	restrict_one_thread(ctx->shared_ctx);
}

/*
 * struct tsync_works - a growable array of per-task contexts
 *
 * The zero-initialized struct represents the empty array.
 */
struct tsync_works {
	struct tsync_work **works;
	size_t size;
	size_t capacity;
};

/*
 * tsync_works_provide - provides a preallocated tsync_work for the given task
 *
 * This also stores a task pointer in the context and increments the reference
 * count of the task.
 *
 * This function may fail in the case where we did not preallocate sufficient
 * capacity.  This can legitimately happen if new threads get started after we
 * grew the capacity.
 *
 * Returns:
 *   A pointer to the preallocated context struct, with task filled in.
 *
 *   NULL, if we ran out of preallocated context structs.
 */
static struct tsync_work *tsync_works_provide(struct tsync_works *s,
					      struct task_struct *task)
{
	struct tsync_work *ctx;

	if (s->size >= s->capacity)
		return NULL;

	ctx = s->works[s->size];
	s->size++;

	ctx->task = get_task_struct(task);
	return ctx;
}

/*
 * tsync_works_grow_by - preallocates space for n more contexts in s
 *
 * On a successful return, the subsequent n calls to tsync_works_provide() are
 * guaranteed to succeed.  (size + n <= capacity)
 *
 * Returns:
 *   -EOVERFLOW if the required capacity calculation overflows
 *
 *   -ENOMEM    if a (re)allocation fails; the array may still have grown
 *              partially, and the object stays in a consistent state
 *
 *   0          if the allocation succeeds or no reallocation was needed
 */
static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
{
	size_t i;
	size_t new_capacity;
	struct tsync_work **works;
	struct tsync_work *work;

	if (check_add_overflow(s->size, n, &new_capacity))
		return -EOVERFLOW;

	/* No need to reallocate if s already has sufficient capacity. */
	if (new_capacity <= s->capacity)
		return 0;

	works = krealloc_array(s->works, new_capacity, sizeof(s->works[0]),
			       flags);
	if (!works)
		return -ENOMEM;

	s->works = works;

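	/* Allocate work items for the newly added slots. */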
	for (i = s->capacity; i < new_capacity; i++) {
		work = kzalloc(sizeof(*work), flags);
		if (!work) {
			/*
			 * Leave the object in a consistent state,
			 * but return an error.
			 */
			s->capacity = i;
			return -ENOMEM;
		}
		s->works[i] = work;
	}
	s->capacity = new_capacity;
	return 0;
}

/*
 * tsync_works_contains_task - checks for presence of task in s
 */
static bool tsync_works_contains_task(const struct tsync_works *s,
				      struct task_struct *task)
{
	size_t i;

	for (i = 0; i < s->size; i++)
		if (s->works[i]->task == task)
			return true;
	return false;
}

/*
 * tsync_works_release - frees memory held by s and drops all task references
 *
 * This does not free s itself, only the data structures held by it.
 */
static void tsync_works_release(struct tsync_works *s)
{
	size_t i;

	for (i = 0; i < s->size; i++) {
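		/*
		 * The task pointer is NULL if task_work_add() failed and the
		 * reference was already dropped in schedule_task_work().
		 */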
		if (!s->works[i]->task)
			continue;

		put_task_struct(s->works[i]->task);
	}

	for (i = 0; i < s->capacity; i++)
		kfree(s->works[i]);
	kfree(s->works);
	s->works = NULL;
	s->size = 0;
	s->capacity = 0;
}

/*
 * count_additional_threads - counts the sibling threads that are not in works
 */
static size_t count_additional_threads(const struct tsync_works *works)
{
	struct task_struct *thread, *caller;
	size_t n = 0;

	caller = current;

	guard(rcu)();

	for_each_thread(caller, thread) {
		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		/* Skip exited threads. */
		if (thread->flags & PF_EXITING)
			continue;

		/* Skip threads that we have already seen. */
		if (tsync_works_contains_task(works, thread))
			continue;

		n++;
	}
	return n;
}

/*
 * schedule_task_work - adds task_work for all eligible sibling threads
 *                      which have not been scheduled yet
 *
 * For each added task_work, atomically increments shared_ctx->num_preparing and
 * shared_ctx->num_unfinished.
 *
 * Returns:
 *     true, if at least one eligible sibling thread was found
 */
static bool schedule_task_work(struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
{
	int err;
	struct task_struct *thread, *caller;
	struct tsync_work *ctx;
	bool found_more_threads = false;

	caller = current;

	guard(rcu)();

	for_each_thread(caller, thread) {
		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		/* Skip exited threads. */
		if (thread->flags & PF_EXITING)
			continue;

		/* Skip threads that we already looked at. */
		if (tsync_works_contains_task(works, thread))
			continue;

		/*
		 * We found a sibling thread that is not doing its task_work yet, and
		 * which might spawn new threads before our task work runs, so we need
		 * at least one more round in the outer loop.
		 */
		found_more_threads = true;

		ctx = tsync_works_provide(works, thread);
		if (!ctx) {
			/*
			 * We ran out of preallocated contexts -- we need to try again with
			 * this thread at a later time!
			 * found_more_threads is already true at this point.
			 */
			break;
		}

		ctx->shared_ctx = shared_ctx;

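		/*
		 * Account for this thread in both barriers before queueing
		 * its task_work.
		 */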
		atomic_inc(&shared_ctx->num_preparing);
		atomic_inc(&shared_ctx->num_unfinished);

		init_task_work(&ctx->work, restrict_one_thread_callback);
		err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
		if (err) {
			/*
			 * task_work_add() only fails if the task is about to exit.  We
			 * checked that earlier, but it can happen as a race.  Resume
			 * without setting an error, as the task is probably gone in the
			 * next loop iteration.  For consistency, remove the task from ctx
			 * so that it does not look like we handed it a task_work.
			 */
			put_task_struct(ctx->task);
			ctx->task = NULL;

			atomic_dec(&shared_ctx->num_preparing);
			atomic_dec(&shared_ctx->num_unfinished);
		}
	}

	return found_more_threads;
}

/*
 * cancel_tsync_works - cancel all task works where possible
 *
 * Task works can be canceled as long as they are still queued and have not
 * started running.  If they get canceled, we decrement
 * shared_ctx->num_preparing and shared_ctx->num_unfinished and mark the two
 * completions if needed, as if the task was never scheduled.
 */
static void cancel_tsync_works(struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
{
	size_t i;

	for (i = 0; i < works->size; i++) {
		/* Skip entries whose task_work was never successfully added. */
		if (!works->works[i]->task)
			continue;

		if (!task_work_cancel(works->works[i]->task,
				      &works->works[i]->work))
			continue;

		/* After dequeueing, act as if the task work had executed. */

		if (atomic_dec_return(&shared_ctx->num_preparing) == 0)
			complete_all(&shared_ctx->all_prepared);

		if (atomic_dec_return(&shared_ctx->num_unfinished) == 0)
			complete_all(&shared_ctx->all_finished);
	}
}

/*
 * landlock_restrict_sibling_threads - enables a Landlock policy for all
 *                                     sibling threads
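 *
 * Returns: 0 on success, or a negative error code if preparing the new
 * credentials failed in any thread (e.g. -ENOMEM), or -ERESTARTNOINTR if the
 * wait was interrupted and the system call needs to be restarted.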
 */
int landlock_restrict_sibling_threads(const struct cred *old_cred,
				      const struct cred *new_cred)
{
	int err;
	struct tsync_shared_context shared_ctx;
	struct tsync_works works = {};
	size_t newly_discovered_threads;
	bool found_more_threads;

	atomic_set(&shared_ctx.preparation_error, 0);
	init_completion(&shared_ctx.all_prepared);
	init_completion(&shared_ctx.ready_to_commit);
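	/*
	 * num_unfinished starts at 1 to account for the calling thread itself;
	 * it is decremented again before waiting for "all_finished" below.
	 */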
	atomic_set(&shared_ctx.num_unfinished, 1);
	init_completion(&shared_ctx.all_finished);
	shared_ctx.old_cred = old_cred;
	shared_ctx.new_cred = new_cred;
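	/* Siblings must also set no_new_privs if the caller has it set. */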
	shared_ctx.set_no_new_privs = task_no_new_privs(current);

	/*
	 * We schedule a pseudo-signal task_work for each of the calling task's
	 * sibling threads.  In the task work, each thread:
	 *
	 * 1) runs prepare_creds() and writes back the error to
	 *    shared_ctx.preparation_error, if needed.
	 *
	 * 2) signals to the calling task that it is done with prepare_creds()
	 *    (completion "all_prepared").
	 *
	 * 3) waits for the completion "ready_to_commit".  This is sent by the
	 *    calling task after it has ensured that all sibling threads are
	 *    done with the "preparation" stage.
	 *
	 *    After this barrier is reached, it's safe to read
	 *    shared_ctx.preparation_error.
	 *
	 * 4) reads shared_ctx.preparation_error and then either does commit_creds()
	 *    or abort_creds().
	 *
	 * 5) signals that it's done altogether (barrier synchronization
	 *    "all_finished").
	 *
	 * Unlike seccomp, which modifies sibling tasks directly, we do not need to
	 * acquire the cred_guard_mutex and sighand->siglock:
	 *
	 * - Since, in our case, each thread exchanges its own struct cred itself
	 *   through the credentials API, no locks are needed for that.
	 * - Our for_each_thread() loops are protected by RCU.
	 * - We do not acquire a lock to keep the list of sibling threads stable
	 *   between our for_each_thread() loops.  If the list of available sibling
	 *   threads changes between these loops, we make up for that by continuing
	 *   to look for threads until they are all discovered and have entered
	 *   their task_work, where they are unable to spawn new threads.
	 */
	do {
		/* In RCU read-lock, count the threads we need. */
		newly_discovered_threads = count_additional_threads(&works);

		if (newly_discovered_threads == 0)
			break; /* done */

		err = tsync_works_grow_by(&works, newly_discovered_threads,
					  GFP_KERNEL_ACCOUNT);
		if (err) {
			atomic_set(&shared_ctx.preparation_error, err);
			break;
		}

		/*
		 * The "all_prepared" barrier is local to this loop body, i.e. to this
		 * round of for_each_thread().  We can reset it on each loop iteration
		 * because all previous loop iterations are already done with it.
		 *
		 * num_preparing is initialized to 1 so that the counter cannot go to 0
		 * and mark the completion as done before all task works are registered.
		 * We decrement it at the end of the loop body.
		 */
		atomic_set(&shared_ctx.num_preparing, 1);
		reinit_completion(&shared_ctx.all_prepared);

		/*
		 * In RCU read-lock, schedule task work on newly discovered sibling
		 * tasks.
		 */
		found_more_threads = schedule_task_work(&works, &shared_ctx);

		/*
		 * Decrement num_preparing for current, undoing the initialization
		 * to 1 a few lines above.
		 */
		if (atomic_dec_return(&shared_ctx.num_preparing) > 0) {
			if (wait_for_completion_interruptible(
				    &shared_ctx.all_prepared)) {
				/* In case of interruption, we need to retry the system call. */
				atomic_set(&shared_ctx.preparation_error,
					   -ERESTARTNOINTR);

				/*
				 * Cancel task works for tasks that did not start running yet,
				 * and decrement num_preparing and num_unfinished accordingly.
				 */
				cancel_tsync_works(&works, &shared_ctx);

				/*
				 * The remaining task works have started running, so waiting for
				 * their completion will finish.
				 */
				wait_for_completion(&shared_ctx.all_prepared);
			}
		}
	} while (found_more_threads &&
		 !atomic_read(&shared_ctx.preparation_error));

	/*
	 * We now have all sibling threads blocking and in "prepared" state in the
	 * task work.  Ask all threads to commit.
	 */
	complete_all(&shared_ctx.ready_to_commit);

	/*
	 * Decrement num_unfinished for current, undoing the initialization to 1
	 * at the beginning.
	 */
	if (atomic_dec_return(&shared_ctx.num_unfinished) > 0)
		wait_for_completion(&shared_ctx.all_finished);

	tsync_works_release(&works);

	return atomic_read(&shared_ctx.preparation_error);
}