// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock - Cross-thread ruleset enforcement
 *
 * Copyright © 2025 Google LLC
 */

#include <linux/atomic.h>
#include <linux/cleanup.h>
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/overflow.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/task_work.h>

#include "cred.h"
#include "tsync.h"

/*
 * Shared state between multiple threads which are enforcing Landlock rulesets
 * in lockstep with each other.
 *
 * One instance lives on the stack of the calling thread for the duration of
 * landlock_restrict_sibling_threads(); sibling task_works only hold pointers
 * to it and must all have finished before it goes out of scope.
 */
struct tsync_shared_context {
	/* The old and tentative new creds of the calling thread. */
	const struct cred *old_cred;
	const struct cred *new_cred;

	/* True if sibling tasks need to set the no_new_privs flag. */
	bool set_no_new_privs;

	/* An error encountered in preparation step, or 0. */
	atomic_t preparation_error;

	/*
	 * Barrier after preparation step in restrict_one_thread.
	 * The calling thread waits for completion.
	 *
	 * Re-initialized on every round of looking for newly spawned threads.
	 */
	atomic_t num_preparing;
	struct completion all_prepared;

	/* Sibling threads wait for completion. */
	struct completion ready_to_commit;

	/*
	 * Barrier after commit step (used by syscall impl to wait for
	 * completion).
	 */
	atomic_t num_unfinished;
	struct completion all_finished;
};

/*
 * Per-sibling-thread work item: the task_work callback head, the task it was
 * (or will be) queued on, and a back-pointer to the shared lockstep state.
 */
struct tsync_work {
	struct callback_head work;
	struct task_struct *task;
	struct tsync_shared_context *shared_ctx;
};

/*
 * restrict_one_thread - update a thread's Landlock domain in lockstep with the
 * other threads in the same process
 *
 * When this is run, the same function gets run in all other threads in the same
 * process (except for the calling thread which called landlock_restrict_self).
 * The concurrently running invocations of restrict_one_thread coordinate
 * through the shared ctx object to do their work in lockstep to implement
 * all-or-nothing semantics for enforcing the new Landlock domain.
 *
 * Afterwards, depending on the presence of an error, all threads either commit
 * or abort the prepared credentials. The commit operation can not fail any
 * more.
 */
static void restrict_one_thread(struct tsync_shared_context *ctx)
{
	int err;
	struct cred *cred = NULL;

	if (current_cred() == ctx->old_cred) {
		/*
		 * Switch out old_cred with new_cred, if possible.
		 *
		 * In the common case, where all threads initially point to the
		 * same struct cred, this optimization avoids creating separate
		 * redundant credentials objects for each, which would all have
		 * the same contents.
		 *
		 * Note: We are intentionally dropping the const qualifier
		 * here, because it is required by commit_creds() and
		 * abort_creds().
		 */
		cred = (struct cred *)get_cred(ctx->new_cred);
	} else {
		/* Else, prepare new creds and populate them. */
		cred = prepare_creds();

		if (!cred) {
			atomic_set(&ctx->preparation_error, -ENOMEM);

			/*
			 * Even on error, we need to adhere to the protocol and
			 * coordinate with concurrently running invocations.
			 */
			if (atomic_dec_return(&ctx->num_preparing) == 0)
				complete_all(&ctx->all_prepared);

			goto out;
		}

		landlock_cred_copy(landlock_cred(cred),
				   landlock_cred(ctx->new_cred));
	}

	/*
	 * Barrier: Wait until all threads are done preparing.
	 * After this point, we can have no more failures.
	 */
	if (atomic_dec_return(&ctx->num_preparing) == 0)
		complete_all(&ctx->all_prepared);

	/*
	 * Wait for signal from calling thread that it's safe to read the
	 * preparation error now and we are ready to commit (or abort).
	 */
	wait_for_completion(&ctx->ready_to_commit);

	/* Abort the commit if any of the other threads had an error. */
	err = atomic_read(&ctx->preparation_error);
	if (err) {
		abort_creds(cred);
		goto out;
	}

	/*
	 * Make sure that all sibling tasks fulfill the no_new_privs
	 * prerequisite. (This is in line with Seccomp's
	 * SECCOMP_FILTER_FLAG_TSYNC logic in kernel/seccomp.c)
	 */
	if (ctx->set_no_new_privs)
		task_set_no_new_privs(current);

	commit_creds(cred);

out:
	/* Notify the calling thread once all threads are done */
	if (atomic_dec_return(&ctx->num_unfinished) == 0)
		complete_all(&ctx->all_finished);
}

/*
 * restrict_one_thread_callback - task_work callback for restricting a thread
 *
 * Calls restrict_one_thread with the struct tsync_shared_context stored in
 * the enclosing struct tsync_work.
 */
static void restrict_one_thread_callback(struct callback_head *work)
{
	struct tsync_work *ctx = container_of(work, struct tsync_work, work);

	restrict_one_thread(ctx->shared_ctx);
}

/*
 * struct tsync_works - a growable array of per-task contexts
 *
 * The zero-initialized struct represents the empty array.
 */
struct tsync_works {
	struct tsync_work **works;
	size_t size;
	size_t capacity;
};

/*
 * tsync_works_provide - provides a preallocated tsync_work for the given task
 *
 * This also stores a task pointer in the context and increments the reference
 * count of the task.
 *
 * This function may fail in the case where we did not preallocate sufficient
 * capacity. This can legitimately happen if new threads get started after we
 * grew the capacity.
 *
 * Return: A pointer to the preallocated context struct with task filled in, or
 * NULL if preallocated context structs ran out.
 */
static struct tsync_work *tsync_works_provide(struct tsync_works *s,
					      struct task_struct *task)
{
	struct tsync_work *ctx;

	if (s->size >= s->capacity)
		return NULL;

	/* Hand out the next preallocated slot; ownership stays with s. */
	ctx = s->works[s->size];
	s->size++;

	ctx->task = get_task_struct(task);
	return ctx;
}

/**
 * tsync_works_trim - Put the last tsync_work element
 *
 * @s: TSYNC works to trim.
 *
 * Put the last task and decrement the size of @s.
 *
 * This helper does not cancel a running task, but just resets the last
 * element to zero.
 */
static void tsync_works_trim(struct tsync_works *s)
{
	struct tsync_work *ctx;

	/* NOTE: s->size is a size_t, so "<= 0" is equivalent to "== 0". */
	if (WARN_ON_ONCE(s->size <= 0))
		return;

	ctx = s->works[s->size - 1];

	/*
	 * For consistency, remove the task from ctx so that it does not look
	 * like we handed it a task_work.
	 */
	put_task_struct(ctx->task);
	*ctx = (typeof(*ctx)){};

	/*
	 * Cancel the tsync_works_provide() change to recycle the reserved
	 * memory for the next thread, if any. This also ensures that
	 * cancel_tsync_works() and tsync_works_release() do not see any NULL
	 * task pointers.
	 */
	s->size--;
}

/*
 * tsync_works_grow_by - preallocates space for n more contexts in s
 *
 * On a successful return, the subsequent n calls to tsync_works_provide() are
 * guaranteed to succeed. (size + n <= capacity)
 *
 * Return: 0 if sufficient space for n more elements could be provided, -ENOMEM
 * on allocation errors, -EOVERFLOW in case of integer overflow.
 */
static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
{
	size_t i;
	size_t new_capacity;
	struct tsync_work **works;
	struct tsync_work *work;

	if (check_add_overflow(s->size, n, &new_capacity))
		return -EOVERFLOW;

	/* No need to reallocate if s already has sufficient capacity.
	 */
	if (new_capacity <= s->capacity)
		return 0;

	works = krealloc_array(s->works, new_capacity, sizeof(s->works[0]),
			       flags);
	if (!works)
		return -ENOMEM;

	s->works = works;

	for (i = s->capacity; i < new_capacity; i++) {
		work = kzalloc_obj(*work, flags);
		if (!work) {
			/*
			 * Leave the object in a consistent state,
			 * but return an error.
			 */
			s->capacity = i;
			return -ENOMEM;
		}
		s->works[i] = work;
	}
	s->capacity = new_capacity;
	return 0;
}

/*
 * tsync_works_contains_task - checks for presence of task in s
 */
static bool tsync_works_contains_task(const struct tsync_works *s,
				      const struct task_struct *task)
{
	size_t i;

	for (i = 0; i < s->size; i++)
		if (s->works[i]->task == task)
			return true;

	return false;
}

/*
 * tsync_works_release - frees memory held by s and drops all task references
 *
 * This does not free s itself, only the data structures held by it.
 */
static void tsync_works_release(struct tsync_works *s)
{
	size_t i;

	/* Drop the task references taken by tsync_works_provide(). */
	for (i = 0; i < s->size; i++) {
		if (WARN_ON_ONCE(!s->works[i]->task))
			continue;

		put_task_struct(s->works[i]->task);
	}

	/* Free all preallocated slots, including the unused ones. */
	for (i = 0; i < s->capacity; i++)
		kfree(s->works[i]);

	kfree(s->works);
	s->works = NULL;
	s->size = 0;
	s->capacity = 0;
}

/*
 * count_additional_threads - counts the sibling threads that are not in works
 */
static size_t count_additional_threads(const struct tsync_works *works)
{
	const struct task_struct *caller, *thread;
	size_t n = 0;

	caller = current;

	guard(rcu)();

	for_each_thread(caller, thread) {
		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		/* Skip exited threads. */
		if (thread->flags & PF_EXITING)
			continue;

		/* Skip threads that we have already seen.
		 */
		if (tsync_works_contains_task(works, thread))
			continue;

		n++;
	}
	return n;
}

/*
 * schedule_task_work - adds task_work for all eligible sibling threads
 * which have not been scheduled yet
 *
 * For each added task_work, atomically increments shared_ctx->num_preparing and
 * shared_ctx->num_unfinished.
 *
 * Return: True if at least one eligible sibling thread was found, false
 * otherwise.
 */
static bool schedule_task_work(struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
{
	int err;
	const struct task_struct *caller;
	struct task_struct *thread;
	struct tsync_work *ctx;
	bool found_more_threads = false;

	caller = current;

	guard(rcu)();

	for_each_thread(caller, thread) {
		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		/* Skip exited threads. */
		if (thread->flags & PF_EXITING)
			continue;

		/* Skip threads that we already looked at. */
		if (tsync_works_contains_task(works, thread))
			continue;

		/*
		 * We found a sibling thread that is not doing its task_work
		 * yet, and which might spawn new threads before our task work
		 * runs, so we need at least one more round in the outer loop.
		 */
		found_more_threads = true;

		ctx = tsync_works_provide(works, thread);
		if (!ctx) {
			/*
			 * We ran out of preallocated contexts -- we need to
			 * try again with this thread at a later time!
			 * found_more_threads is already true at this point.
			 */
			break;
		}

		ctx->shared_ctx = shared_ctx;

		/* Count this sibling in both barriers before queueing. */
		atomic_inc(&shared_ctx->num_preparing);
		atomic_inc(&shared_ctx->num_unfinished);

		init_task_work(&ctx->work, restrict_one_thread_callback);
		err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
		if (unlikely(err)) {
			/*
			 * task_work_add() only fails if the task is about to
			 * exit.
			 * We checked that earlier, but it can happen as
			 * a race. Resume without setting an error, as the
			 * task is probably gone in the next loop iteration.
			 */
			tsync_works_trim(works);

			/* Undo the barrier increments for this thread. */
			atomic_dec(&shared_ctx->num_preparing);
			atomic_dec(&shared_ctx->num_unfinished);
		}
	}

	return found_more_threads;
}

/*
 * cancel_tsync_works - cancel all task works where it is possible
 *
 * Task works can be canceled as long as they are still queued and have not
 * started running. If they get canceled, we decrement
 * shared_ctx->num_preparing and shared_ctx->num_unfinished and mark the two
 * completions if needed, as if the task was never scheduled.
 */
static void cancel_tsync_works(const struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
{
	size_t i;

	for (i = 0; i < works->size; i++) {
		if (WARN_ON_ONCE(!works->works[i]->task))
			continue;

		/* Skip works that already ran or are running. */
		if (!task_work_cancel(works->works[i]->task,
				      &works->works[i]->work))
			continue;

		/* After dequeueing, act as if the task work had executed.
*/ 456 457 if (atomic_dec_return(&shared_ctx->num_preparing) == 0) 458 complete_all(&shared_ctx->all_prepared); 459 460 if (atomic_dec_return(&shared_ctx->num_unfinished) == 0) 461 complete_all(&shared_ctx->all_finished); 462 } 463 } 464 465 /* 466 * restrict_sibling_threads - enables a Landlock policy for all sibling threads 467 */ 468 int landlock_restrict_sibling_threads(const struct cred *old_cred, 469 const struct cred *new_cred) 470 { 471 int err; 472 struct tsync_shared_context shared_ctx; 473 struct tsync_works works = {}; 474 size_t newly_discovered_threads; 475 bool found_more_threads; 476 477 atomic_set(&shared_ctx.preparation_error, 0); 478 init_completion(&shared_ctx.all_prepared); 479 init_completion(&shared_ctx.ready_to_commit); 480 atomic_set(&shared_ctx.num_unfinished, 1); 481 init_completion(&shared_ctx.all_finished); 482 shared_ctx.old_cred = old_cred; 483 shared_ctx.new_cred = new_cred; 484 shared_ctx.set_no_new_privs = task_no_new_privs(current); 485 486 /* 487 * Serialize concurrent TSYNC operations to prevent deadlocks when 488 * multiple threads call landlock_restrict_self() simultaneously. 489 * If the lock is already held, we gracefully yield by restarting the 490 * syscall. This allows the current thread to process pending 491 * task_works before retrying. 492 */ 493 if (!down_write_trylock(¤t->signal->exec_update_lock)) 494 return restart_syscall(); 495 496 /* 497 * We schedule a pseudo-signal task_work for each of the calling task's 498 * sibling threads. In the task work, each thread: 499 * 500 * 1) runs prepare_creds() and writes back the error to 501 * shared_ctx.preparation_error, if needed. 502 * 503 * 2) signals that it's done with prepare_creds() to the calling task. 504 * (completion "all_prepared"). 505 * 506 * 3) waits for the completion "ready_to_commit". This is sent by the 507 * calling task after ensuring that all sibling threads have done 508 * with the "preparation" stage. 
509 * 510 * After this barrier is reached, it's safe to read 511 * shared_ctx.preparation_error. 512 * 513 * 4) reads shared_ctx.preparation_error and then either does 514 * commit_creds() or abort_creds(). 515 * 516 * 5) signals that it's done altogether (barrier synchronization 517 * "all_finished") 518 * 519 * Unlike seccomp, which modifies sibling tasks directly, we do not 520 * need to acquire the cred_guard_mutex and sighand->siglock: 521 * 522 * - As in our case, all threads are themselves exchanging their own 523 * struct cred through the credentials API, no locks are needed for 524 * that. 525 * - Our for_each_thread() loops are protected by RCU. 526 * - We do not acquire a lock to keep the list of sibling threads 527 * stable between our for_each_thread loops. If the list of 528 * available sibling threads changes between these for_each_thread 529 * loops, we make up for that by continuing to look for threads until 530 * they are all discovered and have entered their task_work, where 531 * they are unable to spawn new threads. 532 */ 533 do { 534 /* In RCU read-lock, count the threads we need. */ 535 newly_discovered_threads = count_additional_threads(&works); 536 537 if (newly_discovered_threads == 0) 538 break; /* done */ 539 540 err = tsync_works_grow_by(&works, newly_discovered_threads, 541 GFP_KERNEL_ACCOUNT); 542 if (err) { 543 atomic_set(&shared_ctx.preparation_error, err); 544 break; 545 } 546 547 /* 548 * The "all_prepared" barrier is used locally to the loop body, 549 * this use of for_each_thread(). We can reset it on each loop 550 * iteration because all previous loop iterations are done with 551 * it already. 552 * 553 * num_preparing is initialized to 1 so that the counter can 554 * not go to 0 and mark the completion as done before all task 555 * works are registered. We decrement it at the end of the 556 * loop body. 
557 */ 558 atomic_set(&shared_ctx.num_preparing, 1); 559 reinit_completion(&shared_ctx.all_prepared); 560 561 /* 562 * In RCU read-lock, schedule task work on newly discovered 563 * sibling tasks. 564 */ 565 found_more_threads = schedule_task_work(&works, &shared_ctx); 566 567 /* 568 * Decrement num_preparing for current, to undo that we 569 * initialized it to 1 a few lines above. 570 */ 571 if (atomic_dec_return(&shared_ctx.num_preparing) > 0) { 572 if (wait_for_completion_interruptible( 573 &shared_ctx.all_prepared)) { 574 /* 575 * In case of interruption, we need to retry 576 * the system call. 577 */ 578 atomic_set(&shared_ctx.preparation_error, 579 -ERESTARTNOINTR); 580 581 /* 582 * Opportunistic improvement: try to cancel task 583 * works for tasks that did not start running 584 * yet. We do not have a guarantee that it 585 * cancels any of the enqueued task works 586 * because task_work_run() might already have 587 * dequeued them. 588 */ 589 cancel_tsync_works(&works, &shared_ctx); 590 591 /* 592 * Break the loop with error. The cleanup code 593 * after the loop unblocks the remaining 594 * task_works. 595 */ 596 break; 597 } 598 } 599 } while (found_more_threads && 600 !atomic_read(&shared_ctx.preparation_error)); 601 602 /* 603 * We now have either (a) all sibling threads blocking and in "prepared" 604 * state in the task work, or (b) the preparation error is set. Ask all 605 * threads to commit (or abort). 606 */ 607 complete_all(&shared_ctx.ready_to_commit); 608 609 /* 610 * Decrement num_unfinished for current, to undo that we initialized it 611 * to 1 at the beginning. 612 */ 613 if (atomic_dec_return(&shared_ctx.num_unfinished) > 0) 614 wait_for_completion(&shared_ctx.all_finished); 615 616 tsync_works_release(&works); 617 up_write(¤t->signal->exec_update_lock); 618 return atomic_read(&shared_ctx.preparation_error); 619 } 620