// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock - Cross-thread ruleset enforcement
 *
 * Copyright © 2025 Google LLC
 */

#include <linux/atomic.h>
#include <linux/cleanup.h>
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/overflow.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/task_work.h>

#include "cred.h"
#include "tsync.h"

/*
 * Shared state between multiple threads which are enforcing Landlock rulesets
 * in lockstep with each other.
 */
struct tsync_shared_context {
	/* The old and tentative new creds of the calling thread. */
	const struct cred *old_cred;
	const struct cred *new_cred;

	/* True if sibling tasks need to set the no_new_privs flag. */
	bool set_no_new_privs;

	/* An error encountered in the preparation step, or 0. */
	atomic_t preparation_error;

	/*
	 * Barrier after the preparation step in restrict_one_thread().
	 * The calling thread waits for this completion.
	 *
	 * Re-initialized on every round of looking for newly spawned threads.
	 */
	atomic_t num_preparing;
	struct completion all_prepared;

	/* Sibling threads wait for this completion. */
	struct completion ready_to_commit;

	/*
	 * Barrier after the commit step (used by the syscall implementation to
	 * wait for completion).
	 */
	atomic_t num_unfinished;
	struct completion all_finished;
};

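/*
 * struct tsync_work - a per-thread work item for one sibling thread
 *
 * @work holds the task_work callback, @task is the targeted sibling thread (a
 * reference to it is held while the entry is in use), and @shared_ctx points
 * to the tsync_shared_context that all participating threads coordinate
 * through.
 */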
struct tsync_work {
	struct callback_head work;
	struct task_struct *task;
	struct tsync_shared_context *shared_ctx;
};

/*
 * restrict_one_thread - update a thread's Landlock domain in lockstep with the
 * other threads in the same process
 *
 * When this runs, the same function gets run in all other threads in the same
 * process (except for the calling thread, which called landlock_restrict_self).
 * The concurrently running invocations of restrict_one_thread() coordinate
 * through the shared context object to do their work in lockstep and implement
 * all-or-nothing semantics for enforcing the new Landlock domain.
 *
 * Afterwards, depending on the presence of an error, all threads either commit
 * or abort the prepared credentials. The commit operation cannot fail anymore.
 */
static void restrict_one_thread(struct tsync_shared_context *ctx)
{
	int err;
	struct cred *cred = NULL;

	if (current_cred() == ctx->old_cred) {
		/*
		 * Switch out old_cred with new_cred, if possible.
		 *
		 * In the common case, where all threads initially point to the same
		 * struct cred, this optimization avoids creating separate redundant
		 * credentials objects for each, which would all have the same contents.
		 *
		 * Note: We are intentionally dropping the const qualifier here, because
		 * it is required by commit_creds() and abort_creds().
		 */
		cred = (struct cred *)get_cred(ctx->new_cred);
	} else {
		/* Else, prepare new creds and populate them. */
		cred = prepare_creds();

		if (!cred) {
			atomic_set(&ctx->preparation_error, -ENOMEM);

			/*
			 * Even on error, we need to adhere to the protocol and coordinate
			 * with concurrently running invocations.
			 */
			if (atomic_dec_return(&ctx->num_preparing) == 0)
				complete_all(&ctx->all_prepared);

			goto out;
		}

		landlock_cred_copy(landlock_cred(cred),
				   landlock_cred(ctx->new_cred));
	}

	/*
	 * Barrier: Wait until all threads are done preparing.
	 * After this point, we can have no more failures.
	 */
	if (atomic_dec_return(&ctx->num_preparing) == 0)
		complete_all(&ctx->all_prepared);

	/*
	 * Wait for the signal from the calling thread that it is safe to read
	 * the preparation error now and we are ready to commit (or abort).
	 */
	wait_for_completion(&ctx->ready_to_commit);

	/* Abort the commit if any of the other threads had an error. */
	err = atomic_read(&ctx->preparation_error);
	if (err) {
		abort_creds(cred);
		goto out;
	}

	/*
	 * Make sure that all sibling tasks fulfill the no_new_privs prerequisite.
	 * (This is in line with seccomp's SECCOMP_FILTER_FLAG_TSYNC logic in
	 * kernel/seccomp.c.)
	 */
	if (ctx->set_no_new_privs)
		task_set_no_new_privs(current);

	commit_creds(cred);

out:
	/* Notify the calling thread once all threads are done. */
	if (atomic_dec_return(&ctx->num_unfinished) == 0)
		complete_all(&ctx->all_finished);
}

/*
 * restrict_one_thread_callback - task_work callback for restricting a thread
 *
 * Calls restrict_one_thread() with the shared struct tsync_shared_context.
 */
static void restrict_one_thread_callback(struct callback_head *work)
{
	struct tsync_work *ctx = container_of(work, struct tsync_work, work);

	restrict_one_thread(ctx->shared_ctx);
}

/*
 * struct tsync_works - a growable array of per-task contexts
 *
 * The zero-initialized struct represents the empty array.
 */
struct tsync_works {
	struct tsync_work **works;
	size_t size;
	size_t capacity;
};
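
/*
 * Usage pattern, as implemented in landlock_restrict_sibling_threads() below:
 * grow the array with tsync_works_grow_by(), hand out one entry per newly
 * discovered thread with tsync_works_provide(), skip already-handled threads
 * with tsync_works_contains_task(), and drop all memory and task references
 * with tsync_works_release() at the end.
 */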

/*
 * tsync_works_provide - provides a preallocated tsync_work for the given task
 *
 * This also stores a task pointer in the context and increments the reference
 * count of the task.
 *
 * This function may fail in the case where we did not preallocate sufficient
 * capacity. This can legitimately happen if new threads get started after we
 * grew the capacity.
 *
 * Returns:
 * A pointer to the preallocated context struct, with task filled in.
 *
 * NULL, if we ran out of preallocated context structs.
 */
static struct tsync_work *tsync_works_provide(struct tsync_works *s,
					      struct task_struct *task)
{
	struct tsync_work *ctx;

	if (s->size >= s->capacity)
		return NULL;

	ctx = s->works[s->size];
	s->size++;

	ctx->task = get_task_struct(task);
	return ctx;
}

/*
 * tsync_works_grow_by - preallocates space for n more contexts in s
 *
 * On a successful return, the subsequent n calls to tsync_works_provide() are
 * guaranteed to succeed. (size + n <= capacity)
 *
 * Returns:
 * -EOVERFLOW if size + n overflows
 *
 * -ENOMEM if an allocation fails; any capacity that was already added is
 * kept and s stays in a consistent state
 *
 * 0 if the allocation succeeds or no reallocation was needed
 */
static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
{
	size_t i;
	size_t new_capacity;
	struct tsync_work **works;
	struct tsync_work *work;

	if (check_add_overflow(s->size, n, &new_capacity))
		return -EOVERFLOW;

	/* No need to reallocate if s already has sufficient capacity. */
	if (new_capacity <= s->capacity)
		return 0;

	works = krealloc_array(s->works, new_capacity, sizeof(s->works[0]),
			       flags);
	if (!works)
		return -ENOMEM;

	s->works = works;

	for (i = s->capacity; i < new_capacity; i++) {
		work = kzalloc(sizeof(*work), flags);
		if (!work) {
			/*
			 * Leave the object in a consistent state,
			 * but return an error.
			 */
			s->capacity = i;
			return -ENOMEM;
		}
		s->works[i] = work;
	}
	s->capacity = new_capacity;
	return 0;
}

/*
 * tsync_works_contains_task - checks for presence of task in s
 */
static bool tsync_works_contains_task(const struct tsync_works *s,
				      struct task_struct *task)
{
	size_t i;

	for (i = 0; i < s->size; i++)
		if (s->works[i]->task == task)
			return true;
	return false;
}

/*
 * tsync_works_release - frees memory held by s and drops all task references
 *
 * This does not free s itself, only the data structures held by it.
 */
static void tsync_works_release(struct tsync_works *s)
{
	size_t i;

	for (i = 0; i < s->size; i++) {
		if (!s->works[i]->task)
			continue;

		put_task_struct(s->works[i]->task);
	}

	for (i = 0; i < s->capacity; i++)
		kfree(s->works[i]);
	kfree(s->works);
	s->works = NULL;
	s->size = 0;
	s->capacity = 0;
}

/*
 * count_additional_threads - counts the sibling threads that are not in works
 */
static size_t count_additional_threads(const struct tsync_works *works)
{
	struct task_struct *thread, *caller;
	size_t n = 0;

	caller = current;

	guard(rcu)();

	for_each_thread(caller, thread) {
		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		/* Skip exited threads. */
		if (thread->flags & PF_EXITING)
			continue;

		/* Skip threads that we have already seen. */
		if (tsync_works_contains_task(works, thread))
			continue;

		n++;
	}
	return n;
}

/*
 * schedule_task_work - adds task_work for all eligible sibling threads
 * which have not been scheduled yet
 *
 * For each added task_work, atomically increments shared_ctx->num_preparing
 * and shared_ctx->num_unfinished.
 *
 * Returns:
 * true, if at least one eligible sibling thread was found
 */
static bool schedule_task_work(struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
{
	int err;
	struct task_struct *thread, *caller;
	struct tsync_work *ctx;
	bool found_more_threads = false;

	caller = current;

	guard(rcu)();

	for_each_thread(caller, thread) {
		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		/* Skip exited threads. */
		if (thread->flags & PF_EXITING)
			continue;

		/* Skip threads that we already looked at. */
		if (tsync_works_contains_task(works, thread))
			continue;

		/*
		 * We found a sibling thread that is not doing its task_work yet, and
		 * which might spawn new threads before our task work runs, so we need
		 * at least one more round in the outer loop.
		 */
		found_more_threads = true;

		ctx = tsync_works_provide(works, thread);
		if (!ctx) {
			/*
			 * We ran out of preallocated contexts -- we need to try again with
			 * this thread at a later time!
			 * found_more_threads is already true at this point.
			 */
			break;
		}

		ctx->shared_ctx = shared_ctx;

		atomic_inc(&shared_ctx->num_preparing);
		atomic_inc(&shared_ctx->num_unfinished);

		init_task_work(&ctx->work, restrict_one_thread_callback);
		err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
		if (err) {
			/*
			 * task_work_add() only fails if the task is about to exit. We
			 * checked that earlier, but it can happen as a race. Resume
			 * without setting an error, as the task is probably gone in the
			 * next loop iteration. For consistency, remove the task from ctx
			 * so that it does not look like we handed it a task_work.
			 */
			put_task_struct(ctx->task);
			ctx->task = NULL;

			atomic_dec(&shared_ctx->num_preparing);
			atomic_dec(&shared_ctx->num_unfinished);
		}
	}

	return found_more_threads;
}

/*
 * cancel_tsync_works - cancels all task works that can still be canceled
 *
 * Task works can be canceled as long as they are still queued and have not
 * started running. If they get canceled, we decrement
 * shared_ctx->num_preparing and shared_ctx->num_unfinished and mark the two
 * completions if needed, as if the task was never scheduled.
 */
static void cancel_tsync_works(struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
{
	size_t i;

	for (i = 0; i < works->size; i++) {
		/* Skip entries whose task_work_add() failed earlier. */
		if (!works->works[i]->task)
			continue;

		if (!task_work_cancel(works->works[i]->task,
				      &works->works[i]->work))
			continue;

		/* After dequeueing, act as if the task work had executed. */

		if (atomic_dec_return(&shared_ctx->num_preparing) == 0)
			complete_all(&shared_ctx->all_prepared);

		if (atomic_dec_return(&shared_ctx->num_unfinished) == 0)
			complete_all(&shared_ctx->all_finished);
	}
}

/*
 * landlock_restrict_sibling_threads - enables a Landlock policy for all
 * sibling threads
 *
 * Returns:
 * 0 on success, or a negative error code if the preparation step failed; in
 * that case, none of the sibling threads commits the new credentials.
 */
int landlock_restrict_sibling_threads(const struct cred *old_cred,
				      const struct cred *new_cred)
{
	int err;
	struct tsync_shared_context shared_ctx;
	struct tsync_works works = {};
	size_t newly_discovered_threads;
	bool found_more_threads;

	atomic_set(&shared_ctx.preparation_error, 0);
	init_completion(&shared_ctx.all_prepared);
	init_completion(&shared_ctx.ready_to_commit);
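	/*
	 * num_unfinished starts at 1 to account for the calling thread, so that
	 * the counter cannot reach zero before all task works have been
	 * scheduled; current's contribution is decremented again further below.
	 */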
	atomic_set(&shared_ctx.num_unfinished, 1);
	init_completion(&shared_ctx.all_finished);
	shared_ctx.old_cred = old_cred;
	shared_ctx.new_cred = new_cred;
	shared_ctx.set_no_new_privs = task_no_new_privs(current);

	/*
	 * We schedule a pseudo-signal task_work for each of the calling task's
	 * sibling threads. In the task work, each thread:
	 *
	 * 1) runs prepare_creds() and writes back the error to
	 *    shared_ctx.preparation_error, if needed.
	 *
	 * 2) signals that it's done with prepare_creds() to the calling task
	 *    (completion "all_prepared").
	 *
	 * 3) waits for the completion "ready_to_commit". This is sent by the
	 *    calling task after ensuring that all sibling threads are done with
	 *    the "preparation" stage.
	 *
	 *    After this barrier is reached, it's safe to read
	 *    shared_ctx.preparation_error.
	 *
	 * 4) reads shared_ctx.preparation_error and then either does commit_creds()
	 *    or abort_creds().
	 *
	 * 5) signals that it's done altogether (barrier synchronization
	 *    "all_finished").
	 *
	 * Unlike seccomp, which modifies sibling tasks directly, we do not need to
	 * acquire the cred_guard_mutex and sighand->siglock:
	 *
	 * - Since, in our case, all threads exchange their own struct cred
	 *   themselves through the credentials API, no locks are needed for that.
	 * - Our for_each_thread() loops are protected by RCU.
	 * - We do not acquire a lock to keep the list of sibling threads stable
	 *   between our for_each_thread() loops. If the list of available sibling
	 *   threads changes between these for_each_thread() loops, we make up for
	 *   that by continuing to look for threads until they are all discovered
	 *   and have entered their task_work, where they are unable to spawn new
	 *   threads.
	 */
	do {
		/* In RCU read-lock, count the threads we need. */
		newly_discovered_threads = count_additional_threads(&works);

		if (newly_discovered_threads == 0)
			break; /* done */

		err = tsync_works_grow_by(&works, newly_discovered_threads,
					  GFP_KERNEL_ACCOUNT);
		if (err) {
			atomic_set(&shared_ctx.preparation_error, err);
			break;
		}

		/*
		 * The "all_prepared" barrier is local to this loop body and its use of
		 * for_each_thread(). We can reset it on each loop iteration because
		 * all previous loop iterations are done with it already.
		 *
		 * num_preparing is initialized to 1 so that the counter can not go to 0
		 * and mark the completion as done before all task works are registered.
		 * We decrement it at the end of the loop body.
		 */
		atomic_set(&shared_ctx.num_preparing, 1);
		reinit_completion(&shared_ctx.all_prepared);

		/*
		 * In RCU read-lock, schedule task work on newly discovered sibling
		 * tasks.
		 */
		found_more_threads = schedule_task_work(&works, &shared_ctx);

		/*
		 * Decrement num_preparing for current, to undo the initialization to 1
		 * a few lines above.
		 */
		if (atomic_dec_return(&shared_ctx.num_preparing) > 0) {
			if (wait_for_completion_interruptible(
				    &shared_ctx.all_prepared)) {
				/* In case of interruption, we need to retry the system call. */
				atomic_set(&shared_ctx.preparation_error,
					   -ERESTARTNOINTR);

				/*
				 * Cancel task works for tasks that have not started running
				 * yet, and decrement num_preparing and num_unfinished
				 * accordingly.
				 */
				cancel_tsync_works(&works, &shared_ctx);

				/*
				 * The remaining task works have started running, so waiting
				 * for their completion will finish.
				 */
				wait_for_completion(&shared_ctx.all_prepared);
			}
		}
	} while (found_more_threads &&
		 !atomic_read(&shared_ctx.preparation_error));

	/*
	 * All sibling threads are now blocking in their task work, in the
	 * "prepared" state. Ask all threads to commit.
	 */
	complete_all(&shared_ctx.ready_to_commit);

	/*
	 * Decrement num_unfinished for current, to undo the initialization to 1
	 * at the beginning.
	 */
	if (atomic_dec_return(&shared_ctx.num_unfinished) > 0)
		wait_for_completion(&shared_ctx.all_finished);

	tsync_works_release(&works);

	return atomic_read(&shared_ctx.preparation_error);
}