xref: /linux/security/landlock/tsync.c (revision aba1de96e80a26648a8e3b593a106041e3e1e2a1)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Landlock - Cross-thread ruleset enforcement
4  *
5  * Copyright © 2025 Google LLC
6  */
7 
8 #include <linux/atomic.h>
9 #include <linux/cleanup.h>
10 #include <linux/completion.h>
11 #include <linux/cred.h>
12 #include <linux/errno.h>
13 #include <linux/overflow.h>
14 #include <linux/rcupdate.h>
15 #include <linux/sched.h>
16 #include <linux/sched/signal.h>
17 #include <linux/sched/task.h>
18 #include <linux/slab.h>
19 #include <linux/task_work.h>
20 
21 #include "cred.h"
22 #include "tsync.h"
23 
/*
 * Shared state between multiple threads which are enforcing Landlock rulesets
 * in lockstep with each other.
 *
 * One instance lives on the stack of the thread calling
 * landlock_restrict_self() and is shared with all sibling task_works.
 */
struct tsync_shared_context {
	/* The old and tentative new creds of the calling thread. */
	const struct cred *old_cred;
	const struct cred *new_cred;

	/* True if sibling tasks need to set the no_new_privs flag. */
	bool set_no_new_privs;

	/* An error encountered in preparation step, or 0. */
	atomic_t preparation_error;

	/*
	 * Barrier after preparation step in restrict_one_thread.
	 * The calling thread waits for completion.
	 *
	 * Re-initialized on every round of looking for newly spawned threads.
	 */
	atomic_t num_preparing;
	struct completion all_prepared;

	/* Sibling threads wait for completion. */
	struct completion ready_to_commit;

	/*
	 * Barrier after commit step (used by syscall impl to wait for
	 * completion).
	 */
	atomic_t num_unfinished;
	struct completion all_finished;
};
58 
/*
 * Per-thread task_work payload: links one sibling thread to the shared
 * synchronization context of a landlock_restrict_self() call.
 */
struct tsync_work {
	/* Callback head queued on the sibling thread with task_work_add(). */
	struct callback_head work;
	/* Sibling thread this work was handed to; holds a task reference. */
	struct task_struct *task;
	/* Lockstep state shared with the calling thread; not owned here. */
	struct tsync_shared_context *shared_ctx;
};
64 
/*
 * restrict_one_thread - update a thread's Landlock domain in lockstep with the
 * other threads in the same process
 *
 * When this is run, the same function gets run in all other threads in the same
 * process (except for the calling thread which called landlock_restrict_self).
 * The concurrently running invocations of restrict_one_thread coordinate
 * through the shared ctx object to do their work in lockstep to implement
 * all-or-nothing semantics for enforcing the new Landlock domain.
 *
 * Afterwards, depending on the presence of an error, all threads either commit
 * or abort the prepared credentials.  The commit operation can not fail any
 * more.
 *
 * Runs in task_work context of each sibling thread.  The step ordering
 * (prepare -> dec num_preparing -> wait ready_to_commit -> read error ->
 * commit/abort -> dec num_unfinished) must not be rearranged; the calling
 * thread relies on it in landlock_restrict_sibling_threads().
 */
static void restrict_one_thread(struct tsync_shared_context *ctx)
{
	int err;
	struct cred *cred = NULL;

	if (current_cred() == ctx->old_cred) {
		/*
		 * Switch out old_cred with new_cred, if possible.
		 *
		 * In the common case, where all threads initially point to the
		 * same struct cred, this optimization avoids creating separate
		 * redundant credentials objects for each, which would all have
		 * the same contents.
		 *
		 * Note: We are intentionally dropping the const qualifier
		 * here, because it is required by commit_creds() and
		 * abort_creds().
		 */
		cred = (struct cred *)get_cred(ctx->new_cred);
	} else {
		/* Else, prepare new creds and populate them. */
		cred = prepare_creds();

		if (!cred) {
			atomic_set(&ctx->preparation_error, -ENOMEM);

			/*
			 * Even on error, we need to adhere to the protocol and
			 * coordinate with concurrently running invocations.
			 */
			if (atomic_dec_return(&ctx->num_preparing) == 0)
				complete_all(&ctx->all_prepared);

			goto out;
		}

		landlock_cred_copy(landlock_cred(cred),
				   landlock_cred(ctx->new_cred));
	}

	/*
	 * Barrier: Wait until all threads are done preparing.
	 * After this point, we can have no more failures.
	 */
	if (atomic_dec_return(&ctx->num_preparing) == 0)
		complete_all(&ctx->all_prepared);

	/*
	 * Wait for signal from calling thread that it's safe to read the
	 * preparation error now and we are ready to commit (or abort).
	 */
	wait_for_completion(&ctx->ready_to_commit);

	/* Abort the commit if any of the other threads had an error. */
	err = atomic_read(&ctx->preparation_error);
	if (err) {
		abort_creds(cred);
		goto out;
	}

	/*
	 * Make sure that all sibling tasks fulfill the no_new_privs
	 * prerequisite.  (This is in line with Seccomp's
	 * SECCOMP_FILTER_FLAG_TSYNC logic in kernel/seccomp.c)
	 */
	if (ctx->set_no_new_privs)
		task_set_no_new_privs(current);

	commit_creds(cred);

out:
	/* Notify the calling thread once all threads are done */
	if (atomic_dec_return(&ctx->num_unfinished) == 0)
		complete_all(&ctx->all_finished);
}
154 
/*
 * restrict_one_thread_callback - task_work callback for restricting a thread
 *
 * Runs in the context of a sibling thread and calls restrict_one_thread()
 * with the struct tsync_shared_context shared by all participating threads.
 */
static void restrict_one_thread_callback(struct callback_head *work)
{
	struct tsync_work *ctx = container_of(work, struct tsync_work, work);

	restrict_one_thread(ctx->shared_ctx);
}
166 
/*
 * struct tsync_works - a growable array of per-task contexts
 *
 * The zero-initialized struct represents the empty array.
 */
struct tsync_works {
	/* Array of @capacity individually allocated tsync_work elements. */
	struct tsync_work **works;
	/* Number of elements handed out via tsync_works_provide(). */
	size_t size;
	/* Number of preallocated elements in @works (size <= capacity). */
	size_t capacity;
};
177 
178 /*
179  * tsync_works_provide - provides a preallocated tsync_work for the given task
180  *
181  * This also stores a task pointer in the context and increments the reference
182  * count of the task.
183  *
184  * This function may fail in the case where we did not preallocate sufficient
185  * capacity.  This can legitimately happen if new threads get started after we
186  * grew the capacity.
187  *
188  * Return: A pointer to the preallocated context struct with task filled in, or
189  * NULL if preallocated context structs ran out.
190  */
191 static struct tsync_work *tsync_works_provide(struct tsync_works *s,
192 					      struct task_struct *task)
193 {
194 	struct tsync_work *ctx;
195 
196 	if (s->size >= s->capacity)
197 		return NULL;
198 
199 	ctx = s->works[s->size];
200 	s->size++;
201 
202 	ctx->task = get_task_struct(task);
203 	return ctx;
204 }
205 
206 /**
207  * tsync_works_trim - Put the last tsync_work element
208  *
209  * @s: TSYNC works to trim.
210  *
211  * Put the last task and decrement the size of @s.
212  *
213  * This helper does not cancel a running task, but just reset the last element
214  * to zero.
215  */
216 static void tsync_works_trim(struct tsync_works *s)
217 {
218 	struct tsync_work *ctx;
219 
220 	if (WARN_ON_ONCE(s->size <= 0))
221 		return;
222 
223 	ctx = s->works[s->size - 1];
224 
225 	/*
226 	 * For consistency, remove the task from ctx so that it does not look
227 	 * like we handed it a task_work.
228 	 */
229 	put_task_struct(ctx->task);
230 	*ctx = (typeof(*ctx)){};
231 
232 	/*
233 	 * Cancel the tsync_works_provide() change to recycle the reserved
234 	 * memory for the next thread, if any.  This also ensures that
235 	 * cancel_tsync_works() and tsync_works_release() do not see any NULL
236 	 * task pointers.
237 	 */
238 	s->size--;
239 }
240 
241 /*
242  * tsync_works_grow_by - preallocates space for n more contexts in s
243  *
244  * On a successful return, the subsequent n calls to tsync_works_provide() are
245  * guaranteed to succeed.  (size + n <= capacity)
246  *
247  * Return: 0 if sufficient space for n more elements could be provided, -ENOMEM
248  * on allocation errors, -EOVERFLOW in case of integer overflow.
249  */
250 static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
251 {
252 	size_t i;
253 	size_t new_capacity;
254 	struct tsync_work **works;
255 	struct tsync_work *work;
256 
257 	if (check_add_overflow(s->size, n, &new_capacity))
258 		return -EOVERFLOW;
259 
260 	/* No need to reallocate if s already has sufficient capacity. */
261 	if (new_capacity <= s->capacity)
262 		return 0;
263 
264 	works = krealloc_array(s->works, new_capacity, sizeof(s->works[0]),
265 			       flags);
266 	if (!works)
267 		return -ENOMEM;
268 
269 	s->works = works;
270 
271 	for (i = s->capacity; i < new_capacity; i++) {
272 		work = kzalloc_obj(*work, flags);
273 		if (!work) {
274 			/*
275 			 * Leave the object in a consistent state,
276 			 * but return an error.
277 			 */
278 			s->capacity = i;
279 			return -ENOMEM;
280 		}
281 		s->works[i] = work;
282 	}
283 	s->capacity = new_capacity;
284 	return 0;
285 }
286 
287 /*
288  * tsync_works_contains - checks for presence of task in s
289  */
290 static bool tsync_works_contains_task(const struct tsync_works *s,
291 				      const struct task_struct *task)
292 {
293 	size_t i;
294 
295 	for (i = 0; i < s->size; i++)
296 		if (s->works[i]->task == task)
297 			return true;
298 
299 	return false;
300 }
301 
302 /*
303  * tsync_works_release - frees memory held by s and drops all task references
304  *
305  * This does not free s itself, only the data structures held by it.
306  */
307 static void tsync_works_release(struct tsync_works *s)
308 {
309 	size_t i;
310 
311 	for (i = 0; i < s->size; i++) {
312 		if (WARN_ON_ONCE(!s->works[i]->task))
313 			continue;
314 
315 		put_task_struct(s->works[i]->task);
316 	}
317 
318 	for (i = 0; i < s->capacity; i++)
319 		kfree(s->works[i]);
320 
321 	kfree(s->works);
322 	s->works = NULL;
323 	s->size = 0;
324 	s->capacity = 0;
325 }
326 
327 /*
328  * count_additional_threads - counts the sibling threads that are not in works
329  */
330 static size_t count_additional_threads(const struct tsync_works *works)
331 {
332 	const struct task_struct *caller, *thread;
333 	size_t n = 0;
334 
335 	caller = current;
336 
337 	guard(rcu)();
338 
339 	for_each_thread(caller, thread) {
340 		/* Skip current, since it is initiating the sync. */
341 		if (thread == caller)
342 			continue;
343 
344 		/* Skip exited threads. */
345 		if (thread->flags & PF_EXITING)
346 			continue;
347 
348 		/* Skip threads that we have already seen. */
349 		if (tsync_works_contains_task(works, thread))
350 			continue;
351 
352 		n++;
353 	}
354 	return n;
355 }
356 
/*
 * schedule_task_work - adds task_work for all eligible sibling threads
 *                      which have not been scheduled yet
 *
 * For each added task_work, atomically increments shared_ctx->num_preparing and
 * shared_ctx->num_unfinished.
 *
 * The counters are incremented *before* task_work_add() so that a sibling
 * whose work runs immediately cannot drive a counter to zero and fire a
 * completion while other works are still being registered; on a failed add,
 * both increments are rolled back.
 *
 * Return: True if at least one eligible sibling thread was found, false
 * otherwise.
 */
static bool schedule_task_work(struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
{
	int err;
	const struct task_struct *caller;
	struct task_struct *thread;
	struct tsync_work *ctx;
	bool found_more_threads = false;

	caller = current;

	guard(rcu)();

	for_each_thread(caller, thread) {
		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		/* Skip exited threads. */
		if (thread->flags & PF_EXITING)
			continue;

		/* Skip threads that we already looked at. */
		if (tsync_works_contains_task(works, thread))
			continue;

		/*
		 * We found a sibling thread that is not doing its task_work
		 * yet, and which might spawn new threads before our task work
		 * runs, so we need at least one more round in the outer loop.
		 */
		found_more_threads = true;

		ctx = tsync_works_provide(works, thread);
		if (!ctx) {
			/*
			 * We ran out of preallocated contexts -- we need to
			 * try again with this thread at a later time!
			 * found_more_threads is already true at this point.
			 */
			break;
		}

		ctx->shared_ctx = shared_ctx;

		atomic_inc(&shared_ctx->num_preparing);
		atomic_inc(&shared_ctx->num_unfinished);

		init_task_work(&ctx->work, restrict_one_thread_callback);
		err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
		if (unlikely(err)) {
			/*
			 * task_work_add() only fails if the task is about to
			 * exit.  We checked that earlier, but it can happen as
			 * a race.  Resume without setting an error, as the
			 * task is probably gone in the next loop iteration.
			 */
			tsync_works_trim(works);

			atomic_dec(&shared_ctx->num_preparing);
			atomic_dec(&shared_ctx->num_unfinished);
		}
	}

	return found_more_threads;
}
433 
434 /*
435  * cancel_tsync_works - cancel all task works where it is possible
436  *
437  * Task works can be canceled as long as they are still queued and have not
438  * started running.  If they get canceled, we decrement
439  * shared_ctx->num_preparing and shared_ctx->num_unfished and mark the two
440  * completions if needed, as if the task was never scheduled.
441  */
442 static void cancel_tsync_works(const struct tsync_works *works,
443 			       struct tsync_shared_context *shared_ctx)
444 {
445 	size_t i;
446 
447 	for (i = 0; i < works->size; i++) {
448 		if (WARN_ON_ONCE(!works->works[i]->task))
449 			continue;
450 
451 		if (!task_work_cancel(works->works[i]->task,
452 				      &works->works[i]->work))
453 			continue;
454 
455 		/* After dequeueing, act as if the task work had executed. */
456 
457 		if (atomic_dec_return(&shared_ctx->num_preparing) == 0)
458 			complete_all(&shared_ctx->all_prepared);
459 
460 		if (atomic_dec_return(&shared_ctx->num_unfinished) == 0)
461 			complete_all(&shared_ctx->all_finished);
462 	}
463 }
464 
/*
 * landlock_restrict_sibling_threads - enables a Landlock policy for all
 * sibling threads
 *
 * Orchestrates the lockstep protocol: schedules restrict_one_thread() as a
 * task_work on every sibling thread, waits for all of them to prepare their
 * credentials, then releases them to commit (or abort) together.
 *
 * Return: 0 on success, or the negative error recorded in
 * shared_ctx.preparation_error (e.g. -ENOMEM, -ERESTARTNOINTR).
 */
int landlock_restrict_sibling_threads(const struct cred *old_cred,
				      const struct cred *new_cred)
{
	int err;
	struct tsync_shared_context shared_ctx;
	struct tsync_works works = {};
	size_t newly_discovered_threads;
	bool found_more_threads;

	atomic_set(&shared_ctx.preparation_error, 0);
	init_completion(&shared_ctx.all_prepared);
	init_completion(&shared_ctx.ready_to_commit);
	/* Starts at 1 for the calling thread; decremented after the loop. */
	atomic_set(&shared_ctx.num_unfinished, 1);
	init_completion(&shared_ctx.all_finished);
	shared_ctx.old_cred = old_cred;
	shared_ctx.new_cred = new_cred;
	shared_ctx.set_no_new_privs = task_no_new_privs(current);

	/*
	 * Serialize concurrent TSYNC operations to prevent deadlocks when
	 * multiple threads call landlock_restrict_self() simultaneously.
	 * If the lock is already held, we gracefully yield by restarting the
	 * syscall. This allows the current thread to process pending
	 * task_works before retrying.
	 */
	if (!down_write_trylock(&current->signal->exec_update_lock))
		return restart_syscall();

	/*
	 * We schedule a pseudo-signal task_work for each of the calling task's
	 * sibling threads.  In the task work, each thread:
	 *
	 * 1) runs prepare_creds() and writes back the error to
	 *    shared_ctx.preparation_error, if needed.
	 *
	 * 2) signals that it's done with prepare_creds() to the calling task.
	 *    (completion "all_prepared").
	 *
	 * 3) waits for the completion "ready_to_commit".  This is sent by the
	 *    calling task after ensuring that all sibling threads have done
	 *    with the "preparation" stage.
	 *
	 *    After this barrier is reached, it's safe to read
	 *    shared_ctx.preparation_error.
	 *
	 * 4) reads shared_ctx.preparation_error and then either does
	 *    commit_creds() or abort_creds().
	 *
	 * 5) signals that it's done altogether (barrier synchronization
	 *    "all_finished")
	 *
	 * Unlike seccomp, which modifies sibling tasks directly, we do not
	 * need to acquire the cred_guard_mutex and sighand->siglock:
	 *
	 * - As in our case, all threads are themselves exchanging their own
	 *   struct cred through the credentials API, no locks are needed for
	 *   that.
	 * - Our for_each_thread() loops are protected by RCU.
	 * - We do not acquire a lock to keep the list of sibling threads
	 *   stable between our for_each_thread loops.  If the list of
	 *   available sibling threads changes between these for_each_thread
	 *   loops, we make up for that by continuing to look for threads until
	 *   they are all discovered and have entered their task_work, where
	 *   they are unable to spawn new threads.
	 */
	do {
		/* In RCU read-lock, count the threads we need. */
		newly_discovered_threads = count_additional_threads(&works);

		if (newly_discovered_threads == 0)
			break; /* done */

		err = tsync_works_grow_by(&works, newly_discovered_threads,
					  GFP_KERNEL_ACCOUNT);
		if (err) {
			atomic_set(&shared_ctx.preparation_error, err);
			break;
		}

		/*
		 * The "all_prepared" barrier is local to this loop iteration
		 * and its use of for_each_thread().  We can reset it on each
		 * loop iteration because all previous loop iterations are
		 * done with it already.
		 *
		 * num_preparing is initialized to 1 so that the counter can
		 * not go to 0 and mark the completion as done before all task
		 * works are registered.  We decrement it at the end of the
		 * loop body.
		 */
		atomic_set(&shared_ctx.num_preparing, 1);
		reinit_completion(&shared_ctx.all_prepared);

		/*
		 * In RCU read-lock, schedule task work on newly discovered
		 * sibling tasks.
		 */
		found_more_threads = schedule_task_work(&works, &shared_ctx);

		/*
		 * Decrement num_preparing for current, to undo that we
		 * initialized it to 1 a few lines above.
		 */
		if (atomic_dec_return(&shared_ctx.num_preparing) > 0) {
			if (wait_for_completion_interruptible(
				    &shared_ctx.all_prepared)) {
				/*
				 * In case of interruption, we need to retry
				 * the system call.
				 */
				atomic_set(&shared_ctx.preparation_error,
					   -ERESTARTNOINTR);

				/*
				 * Opportunistic improvement: try to cancel task
				 * works for tasks that did not start running
				 * yet. We do not have a guarantee that it
				 * cancels any of the enqueued task works
				 * because task_work_run() might already have
				 * dequeued them.
				 */
				cancel_tsync_works(&works, &shared_ctx);

				/*
				 * Break the loop with error. The cleanup code
				 * after the loop unblocks the remaining
				 * task_works.
				 */
				break;
			}
		}
	} while (found_more_threads &&
		 !atomic_read(&shared_ctx.preparation_error));

	/*
	 * We now have either (a) all sibling threads blocking and in "prepared"
	 * state in the task work, or (b) the preparation error is set. Ask all
	 * threads to commit (or abort).
	 */
	complete_all(&shared_ctx.ready_to_commit);

	/*
	 * Decrement num_unfinished for current, to undo that we initialized it
	 * to 1 at the beginning.
	 */
	if (atomic_dec_return(&shared_ctx.num_unfinished) > 0)
		wait_for_completion(&shared_ctx.all_finished);

	tsync_works_release(&works);
	up_write(&current->signal->exec_update_lock);
	return atomic_read(&shared_ctx.preparation_error);
}
620