// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock - Cross-thread ruleset enforcement
 *
 * Copyright © 2025 Google LLC
 */

#include <linux/atomic.h>
#include <linux/cleanup.h>
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/overflow.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/task_work.h>

#include "cred.h"
#include "tsync.h"

/*
 * Shared state between multiple threads which are enforcing Landlock rulesets
 * in lockstep with each other.
 */
struct tsync_shared_context {
	/* The old and tentative new creds of the calling thread. */
	const struct cred *old_cred;
	const struct cred *new_cred;

	/* True if sibling tasks need to set the no_new_privs flag. */
	bool set_no_new_privs;

	/* An error encountered in the preparation step, or 0. */
	atomic_t preparation_error;

	/*
	 * Barrier after the preparation step in restrict_one_thread().
	 * The calling thread waits for completion.
	 *
	 * Re-initialized on every round of looking for newly spawned threads.
	 */
	atomic_t num_preparing;
	struct completion all_prepared;

	/* Sibling threads wait for completion. */
	struct completion ready_to_commit;

	/*
	 * Barrier after the commit step (used by the syscall implementation
	 * to wait for completion).
	 */
	atomic_t num_unfinished;
	struct completion all_finished;
};

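/*
 * Per-thread work item: the task_work queued on one sibling thread, the
 * thread itself (with a reference held), and the shared coordination state.
 */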
struct tsync_work {
	struct callback_head work;
	struct task_struct *task;
	struct tsync_shared_context *shared_ctx;
};

/*
 * restrict_one_thread - update a thread's Landlock domain in lockstep with the
 * other threads in the same process
 *
 * When this runs, the same function runs concurrently in all other threads of
 * the same process (that is, in every thread except the one that called
 * landlock_restrict_self()). The concurrently running invocations of
 * restrict_one_thread() coordinate through the shared ctx object to do their
 * work in lockstep and implement all-or-nothing semantics for enforcing the
 * new Landlock domain.
 *
 * Afterwards, depending on the presence of an error, all threads either commit
 * or abort the prepared credentials. The commit operation cannot fail any
 * more.
 */
static void restrict_one_thread(struct tsync_shared_context *ctx)
{
	int err;
	struct cred *cred = NULL;

	if (current_cred() == ctx->old_cred) {
		/*
		 * Switch out old_cred with new_cred, if possible.
		 *
		 * In the common case, where all threads initially point to the
		 * same struct cred, this optimization avoids creating separate
		 * redundant credentials objects for each, which would all have
		 * the same contents.
		 *
		 * Note: We are intentionally dropping the const qualifier
		 * here, because it is required by commit_creds() and
		 * abort_creds().
		 */
		cred = (struct cred *)get_cred(ctx->new_cred);
	} else {
		/* Else, prepare new creds and populate them. */
		cred = prepare_creds();

		if (!cred) {
			atomic_set(&ctx->preparation_error, -ENOMEM);

			/*
			 * Even on error, we need to adhere to the protocol and
			 * coordinate with concurrently running invocations.
			 */
			if (atomic_dec_return(&ctx->num_preparing) == 0)
				complete_all(&ctx->all_prepared);

			goto out;
		}

		landlock_cred_copy(landlock_cred(cred),
				   landlock_cred(ctx->new_cred));
	}


	/*
	 * Barrier: Wait until all threads are done preparing.
	 * After this point, we can have no more failures.
	 */
	if (atomic_dec_return(&ctx->num_preparing) == 0)
		complete_all(&ctx->all_prepared);

	/*
	 * Wait for the signal from the calling thread that it is now safe to
	 * read the preparation error and we are ready to commit (or abort).
	 */
	wait_for_completion(&ctx->ready_to_commit);

	/* Abort the commit if any of the other threads had an error. */
	err = atomic_read(&ctx->preparation_error);
	if (err) {
		abort_creds(cred);
		goto out;
	}

	/*
	 * Make sure that all sibling tasks fulfill the no_new_privs
	 * prerequisite. (This is in line with Seccomp's
	 * SECCOMP_FILTER_FLAG_TSYNC logic in kernel/seccomp.c)
	 */
	if (ctx->set_no_new_privs)
		task_set_no_new_privs(current);

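	/* Commit the prepared credentials; this cannot fail any more. */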
	commit_creds(cred);

out:
	/* Notify the calling thread once all threads are done */
	if (atomic_dec_return(&ctx->num_unfinished) == 0)
		complete_all(&ctx->all_finished);
}

/*
 * restrict_one_thread_callback - task_work callback for restricting a thread
 *
 * Calls restrict_one_thread() with the shared struct tsync_shared_context
 * from the enclosing struct tsync_work.
 */
static void restrict_one_thread_callback(struct callback_head *work)
{
	struct tsync_work *ctx = container_of(work, struct tsync_work, work);

	restrict_one_thread(ctx->shared_ctx);
}

/*
 * struct tsync_works - a growable array of per-task contexts
 *
 * The zero-initialized struct represents the empty array.
 */
struct tsync_works {
	struct tsync_work **works;
	size_t size;
	size_t capacity;
};

/*
 * tsync_works_provide - provides a preallocated tsync_work for the given task
 *
 * This also stores a task pointer in the context and increments the reference
 * count of the task.
 *
 * This function may fail in the case where we did not preallocate sufficient
 * capacity. This can legitimately happen if new threads get started after we
 * grew the capacity.
 *
 * Return: A pointer to the preallocated context struct with the task filled
 * in, or NULL if we ran out of preallocated context structs.
 */
static struct tsync_work *tsync_works_provide(struct tsync_works *s,
					      struct task_struct *task)
{
	struct tsync_work *ctx;

	if (s->size >= s->capacity)
		return NULL;

	ctx = s->works[s->size];
	s->size++;

	ctx->task = get_task_struct(task);
	return ctx;
}

/**
 * tsync_works_trim - Put the last tsync_work element
 *
 * @s: TSYNC works to trim.
 *
 * Put the last task and decrement the size of @s.
 *
 * This helper does not cancel a running task_work; it just resets the last
 * element to zero.
 */
static void tsync_works_trim(struct tsync_works *s)
{
	struct tsync_work *ctx;

	if (WARN_ON_ONCE(s->size <= 0))
		return;

	ctx = s->works[s->size - 1];

	/*
	 * For consistency, remove the task from ctx so that it does not look
	 * like we handed it a task_work.
	 */
	put_task_struct(ctx->task);
	*ctx = (typeof(*ctx)){};

	/*
	 * Cancel the tsync_works_provide() change to recycle the reserved
	 * memory for the next thread, if any. This also ensures that
	 * cancel_tsync_works() and tsync_works_release() do not see any NULL
	 * task pointers.
	 */
	s->size--;
}

/*
 * tsync_works_grow_by - preallocates space for n more contexts in s
 *
 * On a successful return, the subsequent n calls to tsync_works_provide() are
 * guaranteed to succeed. (size + n <= capacity)
 *
 * Return: 0 if sufficient space for n more elements could be provided, -ENOMEM
 * on allocation errors, -EOVERFLOW in case of integer overflow.
 */
static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
{
	size_t i;
	size_t new_capacity;
	struct tsync_work **works;
	struct tsync_work *work;

	if (check_add_overflow(s->size, n, &new_capacity))
		return -EOVERFLOW;

	/* No need to reallocate if s already has sufficient capacity. */
	if (new_capacity <= s->capacity)
		return 0;

	works = krealloc_array(s->works, new_capacity, sizeof(s->works[0]),
			       flags);
	if (!works)
		return -ENOMEM;

	s->works = works;

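	/* Preallocate the additional elements up to the new capacity. */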
	for (i = s->capacity; i < new_capacity; i++) {
		work = kzalloc_obj(*work, flags);
		if (!work) {
			/*
			 * Leave the object in a consistent state,
			 * but return an error.
			 */
			s->capacity = i;
			return -ENOMEM;
		}
		s->works[i] = work;
	}
	s->capacity = new_capacity;
	return 0;
}

/*
 * tsync_works_contains_task - checks for presence of task in s
 */
static bool tsync_works_contains_task(const struct tsync_works *s,
				      const struct task_struct *task)
{
	size_t i;

	for (i = 0; i < s->size; i++)
		if (s->works[i]->task == task)
			return true;

	return false;
}

/*
 * tsync_works_release - frees memory held by s and drops all task references
 *
 * This does not free s itself, only the data structures held by it.
 */
static void tsync_works_release(struct tsync_works *s)
{
	size_t i;

	for (i = 0; i < s->size; i++) {
		if (WARN_ON_ONCE(!s->works[i]->task))
			continue;

		put_task_struct(s->works[i]->task);
	}

	for (i = 0; i < s->capacity; i++)
		kfree(s->works[i]);

	kfree(s->works);
	s->works = NULL;
	s->size = 0;
	s->capacity = 0;
}

/*
 * count_additional_threads - counts the sibling threads that are not in works
 */
static size_t count_additional_threads(const struct tsync_works *works)
{
	const struct task_struct *caller, *thread;
	size_t n = 0;

	caller = current;

	guard(rcu)();

	for_each_thread(caller, thread) {
		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		/* Skip exited threads. */
		if (thread->flags & PF_EXITING)
			continue;

		/* Skip threads that we have already seen. */
		if (tsync_works_contains_task(works, thread))
			continue;

		n++;
	}
	return n;
}

/*
 * schedule_task_work - adds task_work for all eligible sibling threads
 * which have not been scheduled yet
 *
 * For each added task_work, atomically increments shared_ctx->num_preparing
 * and shared_ctx->num_unfinished.
 *
 * Return: True if at least one eligible sibling thread was found, false
 * otherwise.
 */
static bool schedule_task_work(struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
{
	int err;
	const struct task_struct *caller;
	struct task_struct *thread;
	struct tsync_work *ctx;
	bool found_more_threads = false;

	caller = current;

	guard(rcu)();

	for_each_thread(caller, thread) {
		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		/* Skip exited threads. */
		if (thread->flags & PF_EXITING)
			continue;

		/* Skip threads that we already looked at. */
		if (tsync_works_contains_task(works, thread))
			continue;

		/*
		 * We found a sibling thread that is not doing its task_work
		 * yet, and which might spawn new threads before our task work
		 * runs, so we need at least one more round in the outer loop.
		 */
		found_more_threads = true;

		ctx = tsync_works_provide(works, thread);
		if (!ctx) {
			/*
			 * We ran out of preallocated contexts -- we need to
			 * try again with this thread at a later time!
			 * found_more_threads is already true at this point.
			 */
			break;
		}

		ctx->shared_ctx = shared_ctx;

		atomic_inc(&shared_ctx->num_preparing);
		atomic_inc(&shared_ctx->num_unfinished);

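		/*
		 * Queue the work as a pseudo-signal task_work so that the
		 * sibling thread runs restrict_one_thread_callback() promptly.
		 */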
		init_task_work(&ctx->work, restrict_one_thread_callback);
		err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
		if (unlikely(err)) {
			/*
			 * task_work_add() only fails if the task is about to
			 * exit. We checked that earlier, but it can happen as
			 * a race. Resume without setting an error, as the
			 * task is probably gone in the next loop iteration.
			 */
			tsync_works_trim(works);

			atomic_dec(&shared_ctx->num_preparing);
			atomic_dec(&shared_ctx->num_unfinished);
		}
	}

	return found_more_threads;
}

/*
 * cancel_tsync_works - cancel all task works where possible
 *
 * Task works can be canceled as long as they are still queued and have not
 * started running. If they get canceled, we decrement
 * shared_ctx->num_preparing and shared_ctx->num_unfinished and mark the two
 * completions if needed, as if the task was never scheduled.
 */
static void cancel_tsync_works(const struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
{
	size_t i;

	for (i = 0; i < works->size; i++) {
		if (WARN_ON_ONCE(!works->works[i]->task))
			continue;

		if (!task_work_cancel(works->works[i]->task,
				      &works->works[i]->work))
			continue;

		/* After dequeueing, act as if the task work had executed. */

		if (atomic_dec_return(&shared_ctx->num_preparing) == 0)
			complete_all(&shared_ctx->all_prepared);

		if (atomic_dec_return(&shared_ctx->num_unfinished) == 0)
			complete_all(&shared_ctx->all_finished);
	}
}

/*
 * landlock_restrict_sibling_threads - enables a Landlock policy for all
 * sibling threads
 *
 * Called from the landlock_restrict_self() path, with @old_cred and @new_cred
 * being the calling thread's old and tentative new credentials.
 *
 * Return: 0 on success, or a negative error code.
 */
int landlock_restrict_sibling_threads(const struct cred *old_cred,
				      const struct cred *new_cred)
{
	int err;
	struct tsync_shared_context shared_ctx;
	struct tsync_works works = {};
	size_t newly_discovered_threads;
	bool found_more_threads;

	atomic_set(&shared_ctx.preparation_error, 0);
	init_completion(&shared_ctx.all_prepared);
	init_completion(&shared_ctx.ready_to_commit);
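	/* Start at 1 so the calling thread itself counts as unfinished. */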
	atomic_set(&shared_ctx.num_unfinished, 1);
	init_completion(&shared_ctx.all_finished);
	shared_ctx.old_cred = old_cred;
	shared_ctx.new_cred = new_cred;
	shared_ctx.set_no_new_privs = task_no_new_privs(current);

	/*
	 * Serialize concurrent TSYNC operations to prevent deadlocks when
	 * multiple threads call landlock_restrict_self() simultaneously.
	 * If the lock is already held, we gracefully yield by restarting the
	 * syscall. This allows the current thread to process pending
	 * task_works before retrying.
	 */
	if (!down_write_trylock(&current->signal->exec_update_lock))
		return restart_syscall();

	/*
	 * We schedule a pseudo-signal task_work for each of the calling task's
	 * sibling threads. In the task work, each thread:
	 *
	 * 1) runs prepare_creds() and writes back the error to
	 *    shared_ctx.preparation_error, if needed.
	 *
	 * 2) signals that it's done with prepare_creds() to the calling task
	 *    (completion "all_prepared").
	 *
	 * 3) waits for the completion "ready_to_commit". This is sent by the
	 *    calling task after ensuring that all sibling threads are done
	 *    with the "preparation" stage.
	 *
	 *    After this barrier is reached, it's safe to read
	 *    shared_ctx.preparation_error.
	 *
	 * 4) reads shared_ctx.preparation_error and then either does
	 *    commit_creds() or abort_creds().
	 *
	 * 5) signals that it's done altogether (barrier synchronization
	 *    "all_finished").
	 *
	 * Unlike seccomp, which modifies sibling tasks directly, we do not
	 * need to acquire the cred_guard_mutex and sighand->siglock:
	 *
	 * - Since, in our case, all threads exchange their own struct cred
	 *   themselves through the credentials API, no locks are needed for
	 *   that.
	 * - Our for_each_thread() loops are protected by RCU.
	 * - We do not acquire a lock to keep the list of sibling threads
	 *   stable between our for_each_thread loops. If the list of
	 *   available sibling threads changes between these for_each_thread
	 *   loops, we make up for that by continuing to look for threads until
	 *   they are all discovered and have entered their task_work, where
	 *   they are unable to spawn new threads.
	 */
	do {
		/* In RCU read-lock, count the threads we need. */
		newly_discovered_threads = count_additional_threads(&works);

		if (newly_discovered_threads == 0)
			break; /* done */

		err = tsync_works_grow_by(&works, newly_discovered_threads,
					  GFP_KERNEL_ACCOUNT);
		if (err) {
			atomic_set(&shared_ctx.preparation_error, err);
			break;
		}

		/*
		 * The "all_prepared" barrier is local to this loop body and
		 * its use of for_each_thread(). We can reset it on each loop
		 * iteration because all previous loop iterations are done with
		 * it already.
		 *
		 * num_preparing is initialized to 1 so that the counter cannot
		 * drop to 0 and mark the completion as done before all task
		 * works are registered. We decrement it at the end of the
		 * loop body.
		 */
		atomic_set(&shared_ctx.num_preparing, 1);
		reinit_completion(&shared_ctx.all_prepared);

		/*
		 * In RCU read-lock, schedule task work on newly discovered
		 * sibling tasks.
		 */
		found_more_threads = schedule_task_work(&works, &shared_ctx);

		/*
		 * Decrement num_preparing for current, undoing the
		 * initialization to 1 a few lines above.
		 */
		if (atomic_dec_return(&shared_ctx.num_preparing) > 0) {
			if (wait_for_completion_interruptible(
				    &shared_ctx.all_prepared)) {
				/*
				 * In case of interruption, we need to retry
				 * the system call.
				 */
				atomic_set(&shared_ctx.preparation_error,
					   -ERESTARTNOINTR);

				/*
				 * Opportunistic improvement: try to cancel
				 * task works for tasks that did not start
				 * running yet. We do not have a guarantee
				 * that it cancels any of the enqueued task
				 * works because task_work_run() might already
				 * have dequeued them.
				 */
				cancel_tsync_works(&works, &shared_ctx);

				/*
				 * Break the loop with error. The cleanup code
				 * after the loop unblocks the remaining
				 * task_works.
				 */
				break;
			}
		}
	} while (found_more_threads &&
		 !atomic_read(&shared_ctx.preparation_error));

	/*
	 * We now have either (a) all sibling threads blocked in their task
	 * work in the "prepared" state, or (b) the preparation error set.
	 * Ask all threads to commit (or abort).
	 */
	complete_all(&shared_ctx.ready_to_commit);

	/*
	 * Decrement num_unfinished for current, undoing the initialization to
	 * 1 at the beginning.
	 */
	if (atomic_dec_return(&shared_ctx.num_unfinished) > 0)
		wait_for_completion(&shared_ctx.all_finished);

	tsync_works_release(&works);
	up_write(&current->signal->exec_update_lock);
	return atomic_read(&shared_ctx.preparation_error);
}