// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock - Cross-thread ruleset enforcement
 *
 * Copyright © 2025 Google LLC
 */

#include <linux/atomic.h>
#include <linux/cleanup.h>
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/overflow.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/task_work.h>

#include "cred.h"
#include "tsync.h"

/*
 * Shared state between multiple threads which are enforcing Landlock rulesets
 * in lockstep with each other.
 */
struct tsync_shared_context {
	/* The old and tentative new creds of the calling thread. */
	const struct cred *old_cred;
	const struct cred *new_cred;

	/* True if sibling tasks need to set the no_new_privs flag. */
	bool set_no_new_privs;

	/* An error encountered in preparation step, or 0. */
	atomic_t preparation_error;

	/*
	 * Barrier after preparation step in restrict_one_thread.
	 * The calling thread waits for completion.
	 *
	 * Re-initialized on every round of looking for newly spawned threads.
	 */
	atomic_t num_preparing;
	struct completion all_prepared;

	/* Sibling threads wait for completion. */
	struct completion ready_to_commit;

	/*
	 * Barrier after commit step (used by syscall impl to wait for
	 * completion).
	 */
	atomic_t num_unfinished;
	struct completion all_finished;
};

/*
 * Per-sibling-thread work item: links the task_work callback back to the task
 * it was queued on and to the shared lockstep context.
 */
struct tsync_work {
	struct callback_head work;
	struct task_struct *task;
	struct tsync_shared_context *shared_ctx;
};

/*
 * restrict_one_thread - update a thread's Landlock domain in lockstep with the
 * other threads in the same process
 *
 * When this is run, the same function gets run in all other threads in the same
 * process (except for the calling thread which called landlock_restrict_self).
 * The concurrently running invocations of restrict_one_thread coordinate
 * through the shared ctx object to do their work in lockstep to implement
 * all-or-nothing semantics for enforcing the new Landlock domain.
 *
 * Afterwards, depending on the presence of an error, all threads either commit
 * or abort the prepared credentials.  The commit operation can not fail any
 * more.
 */
static void restrict_one_thread(struct tsync_shared_context *ctx)
{
	int err;
	struct cred *cred = NULL;

	if (current_cred() == ctx->old_cred) {
		/*
		 * Switch out old_cred with new_cred, if possible.
		 *
		 * In the common case, where all threads initially point to the same
		 * struct cred, this optimization avoids creating separate redundant
		 * credentials objects for each, which would all have the same contents.
		 *
		 * Note: We are intentionally dropping the const qualifier here, because
		 * it is required by commit_creds() and abort_creds().
		 */
		cred = (struct cred *)get_cred(ctx->new_cred);
	} else {
		/* Else, prepare new creds and populate them. */
		cred = prepare_creds();

		if (!cred) {
			atomic_set(&ctx->preparation_error, -ENOMEM);

			/*
			 * Even on error, we need to adhere to the protocol and coordinate
			 * with concurrently running invocations.
			 */
			if (atomic_dec_return(&ctx->num_preparing) == 0)
				complete_all(&ctx->all_prepared);

			goto out;
		}

		landlock_cred_copy(landlock_cred(cred),
				   landlock_cred(ctx->new_cred));
	}

	/*
	 * Barrier: Wait until all threads are done preparing.
	 * After this point, we can have no more failures.
	 */
	if (atomic_dec_return(&ctx->num_preparing) == 0)
		complete_all(&ctx->all_prepared);

	/*
	 * Wait for signal from calling thread that it's safe to read the
	 * preparation error now and we are ready to commit (or abort).
	 */
	wait_for_completion(&ctx->ready_to_commit);

	/* Abort the commit if any of the other threads had an error. */
	err = atomic_read(&ctx->preparation_error);
	if (err) {
		abort_creds(cred);
		goto out;
	}

	/*
	 * Make sure that all sibling tasks fulfill the no_new_privs prerequisite.
	 * (This is in line with Seccomp's SECCOMP_FILTER_FLAG_TSYNC logic in
	 * kernel/seccomp.c)
	 */
	if (ctx->set_no_new_privs)
		task_set_no_new_privs(current);

	commit_creds(cred);

out:
	/* Notify the calling thread once all threads are done */
	if (atomic_dec_return(&ctx->num_unfinished) == 0)
		complete_all(&ctx->all_finished);
}

/*
 * restrict_one_thread_callback - task_work callback for restricting a thread
 *
 * Calls restrict_one_thread with the struct tsync_shared_context that was
 * stored in the surrounding struct tsync_work.
 */
static void restrict_one_thread_callback(struct callback_head *work)
{
	struct tsync_work *ctx = container_of(work, struct tsync_work, work);

	restrict_one_thread(ctx->shared_ctx);
}

/*
 * struct tsync_works - a growable array of per-task contexts
 *
 * The zero-initialized struct represents the empty array.
 */
struct tsync_works {
	struct tsync_work **works;
	size_t size;
	size_t capacity;
};

/*
 * tsync_works_provide - provides a preallocated tsync_work for the given task
 *
 * This also stores a task pointer in the context and increments the reference
 * count of the task.
 *
 * This function may fail in the case where we did not preallocate sufficient
 * capacity.  This can legitimately happen if new threads get started after we
 * grew the capacity.
 *
 * Returns:
 * A pointer to the preallocated context struct, with task filled in.
 *
 * NULL, if we ran out of preallocated context structs.
 */
static struct tsync_work *tsync_works_provide(struct tsync_works *s,
					      struct task_struct *task)
{
	struct tsync_work *ctx;

	if (s->size >= s->capacity)
		return NULL;

	ctx = s->works[s->size];
	s->size++;

	ctx->task = get_task_struct(task);
	return ctx;
}

/**
 * tsync_works_trim - Put the last tsync_work element
 *
 * @s: TSYNC works to trim.
 *
 * Put the last task and decrement the size of @s.
 *
 * This helper does not cancel a running task_work, but just resets the last
 * element to zero.
 */
static void tsync_works_trim(struct tsync_works *s)
{
	struct tsync_work *ctx;

	if (WARN_ON_ONCE(s->size <= 0))
		return;

	ctx = s->works[s->size - 1];

	/*
	 * For consistency, remove the task from ctx so that it does not look like
	 * we handed it a task_work.
	 */
	put_task_struct(ctx->task);
	*ctx = (typeof(*ctx)){};

	/*
	 * Cancel the tsync_works_provide() change to recycle the reserved memory
	 * for the next thread, if any.  This also ensures that cancel_tsync_works()
	 * and tsync_works_release() do not see any NULL task pointers.
	 */
	s->size--;
}

/*
 * tsync_works_grow_by - preallocates space for n more contexts in s
 *
 * On a successful return, the subsequent n calls to tsync_works_provide() are
 * guaranteed to succeed. (size + n <= capacity)
 *
 * Returns:
 * -EOVERFLOW if size + n overflows
 *
 * -ENOMEM if a (re)allocation fails; @s stays consistent, but may have
 * grown by fewer than n slots
 *
 * 0 if the requested capacity is available (after reallocation, or because
 * it already was)
 */
static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
{
	size_t i;
	size_t new_capacity;
	struct tsync_work **works;
	struct tsync_work *work;

	if (check_add_overflow(s->size, n, &new_capacity))
		return -EOVERFLOW;

	/* No need to reallocate if s already has sufficient capacity. */
	if (new_capacity <= s->capacity)
		return 0;

	works = krealloc_array(s->works, new_capacity, sizeof(s->works[0]),
			       flags);
	if (!works)
		return -ENOMEM;

	s->works = works;

	for (i = s->capacity; i < new_capacity; i++) {
		work = kzalloc_obj(*work, flags);
		if (!work) {
			/*
			 * Leave the object in a consistent state,
			 * but return an error.
			 */
			s->capacity = i;
			return -ENOMEM;
		}
		s->works[i] = work;
	}
	s->capacity = new_capacity;
	return 0;
}

/*
 * tsync_works_contains_task - checks for presence of task in s
 */
static bool tsync_works_contains_task(const struct tsync_works *s,
				      const struct task_struct *task)
{
	size_t i;

	for (i = 0; i < s->size; i++)
		if (s->works[i]->task == task)
			return true;

	return false;
}

/*
 * tsync_works_release - frees memory held by s and drops all task references
 *
 * This does not free s itself, only the data structures held by it.
 */
static void tsync_works_release(struct tsync_works *s)
{
	size_t i;

	for (i = 0; i < s->size; i++) {
		if (WARN_ON_ONCE(!s->works[i]->task))
			continue;

		put_task_struct(s->works[i]->task);
	}

	for (i = 0; i < s->capacity; i++)
		kfree(s->works[i]);

	kfree(s->works);
	s->works = NULL;
	s->size = 0;
	s->capacity = 0;
}

/*
 * count_additional_threads - counts the sibling threads that are not in works
 */
static size_t count_additional_threads(const struct tsync_works *works)
{
	const struct task_struct *caller, *thread;
	size_t n = 0;

	caller = current;

	guard(rcu)();

	for_each_thread(caller, thread) {
		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		/* Skip exited threads. */
		if (thread->flags & PF_EXITING)
			continue;

		/* Skip threads that we have already seen. */
		if (tsync_works_contains_task(works, thread))
			continue;

		n++;
	}
	return n;
}

/*
 * schedule_task_work - adds task_work for all eligible sibling threads
 * which have not been scheduled yet
 *
 * For each added task_work, atomically increments shared_ctx->num_preparing and
 * shared_ctx->num_unfinished.
 *
 * Returns:
 * true, if at least one eligible sibling thread was found
 */
static bool schedule_task_work(struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
{
	int err;
	const struct task_struct *caller;
	struct task_struct *thread;
	struct tsync_work *ctx;
	bool found_more_threads = false;

	caller = current;

	guard(rcu)();

	for_each_thread(caller, thread) {
		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		/* Skip exited threads. */
		if (thread->flags & PF_EXITING)
			continue;

		/* Skip threads that we already looked at. */
		if (tsync_works_contains_task(works, thread))
			continue;

		/*
		 * We found a sibling thread that is not doing its task_work yet, and
		 * which might spawn new threads before our task work runs, so we need
		 * at least one more round in the outer loop.
		 */
		found_more_threads = true;

		ctx = tsync_works_provide(works, thread);
		if (!ctx) {
			/*
			 * We ran out of preallocated contexts -- we need to try again with
			 * this thread at a later time!
			 * found_more_threads is already true at this point.
			 */
			break;
		}

		ctx->shared_ctx = shared_ctx;

		atomic_inc(&shared_ctx->num_preparing);
		atomic_inc(&shared_ctx->num_unfinished);

		init_task_work(&ctx->work, restrict_one_thread_callback);
		err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
		if (unlikely(err)) {
			/*
			 * task_work_add() only fails if the task is about to exit.  We
			 * checked that earlier, but it can happen as a race.  Resume
			 * without setting an error, as the task is probably gone in the
			 * next loop iteration.
			 */
			tsync_works_trim(works);

			atomic_dec(&shared_ctx->num_preparing);
			atomic_dec(&shared_ctx->num_unfinished);
		}
	}

	return found_more_threads;
}

/*
 * cancel_tsync_works - cancel all task works where it is possible
 *
 * Task works can be canceled as long as they are still queued and have not
 * started running.  If they get canceled, we decrement
 * shared_ctx->num_preparing and shared_ctx->num_unfinished and mark the two
 * completions if needed, as if the task was never scheduled.
 */
static void cancel_tsync_works(const struct tsync_works *works,
			       struct tsync_shared_context *shared_ctx)
{
	size_t i;

	for (i = 0; i < works->size; i++) {
		if (WARN_ON_ONCE(!works->works[i]->task))
			continue;

		if (!task_work_cancel(works->works[i]->task,
				      &works->works[i]->work))
			continue;

		/* After dequeueing, act as if the task work had executed. */

		if (atomic_dec_return(&shared_ctx->num_preparing) == 0)
			complete_all(&shared_ctx->all_prepared);

		if (atomic_dec_return(&shared_ctx->num_unfinished) == 0)
			complete_all(&shared_ctx->all_finished);
	}
}

/*
 * landlock_restrict_sibling_threads - enables a Landlock policy for all
 * sibling threads
 */
int landlock_restrict_sibling_threads(const struct cred *old_cred,
				      const struct cred *new_cred)
{
	int err;
	struct tsync_shared_context shared_ctx;
	struct tsync_works works = {};
	size_t newly_discovered_threads;
	bool found_more_threads;

	atomic_set(&shared_ctx.preparation_error, 0);
	init_completion(&shared_ctx.all_prepared);
	init_completion(&shared_ctx.ready_to_commit);
	atomic_set(&shared_ctx.num_unfinished, 1);
	init_completion(&shared_ctx.all_finished);
	shared_ctx.old_cred = old_cred;
	shared_ctx.new_cred = new_cred;
	shared_ctx.set_no_new_privs = task_no_new_privs(current);

	/*
	 * Serialize concurrent TSYNC operations to prevent deadlocks when
	 * multiple threads call landlock_restrict_self() simultaneously.
	 * If the lock is already held, we gracefully yield by restarting the
	 * syscall.  This allows the current thread to process pending
	 * task_works before retrying.
	 */
	if (!down_write_trylock(&current->signal->exec_update_lock))
		return restart_syscall();

	/*
	 * We schedule a pseudo-signal task_work for each of the calling task's
	 * sibling threads.  In the task work, each thread:
	 *
	 * 1) runs prepare_creds() and writes back the error to
	 *    shared_ctx.preparation_error, if needed.
	 *
	 * 2) signals that it's done with prepare_creds() to the calling task.
	 *    (completion "all_prepared").
	 *
	 * 3) waits for the completion "ready_to_commit".  This is sent by the
	 *    calling task after ensuring that all sibling threads have done
	 *    with the "preparation" stage.
	 *
	 *    After this barrier is reached, it's safe to read
	 *    shared_ctx.preparation_error.
	 *
	 * 4) reads shared_ctx.preparation_error and then either does commit_creds()
	 *    or abort_creds().
	 *
	 * 5) signals that it's done altogether (barrier synchronization
	 *    "all_finished")
	 *
	 * Unlike seccomp, which modifies sibling tasks directly, we do not need to
	 * acquire the cred_guard_mutex and sighand->siglock:
	 *
	 * - As in our case, all threads are themselves exchanging their own struct
	 *   cred through the credentials API, no locks are needed for that.
	 * - Our for_each_thread() loops are protected by RCU.
	 * - We do not acquire a lock to keep the list of sibling threads stable
	 *   between our for_each_thread loops.  If the list of available sibling
	 *   threads changes between these for_each_thread loops, we make up for
	 *   that by continuing to look for threads until they are all discovered
	 *   and have entered their task_work, where they are unable to spawn new
	 *   threads.
	 */
	do {
		/* In RCU read-lock, count the threads we need. */
		newly_discovered_threads = count_additional_threads(&works);

		if (newly_discovered_threads == 0)
			break; /* done */

		err = tsync_works_grow_by(&works, newly_discovered_threads,
					  GFP_KERNEL_ACCOUNT);
		if (err) {
			atomic_set(&shared_ctx.preparation_error, err);
			break;
		}

		/*
		 * The "all_prepared" barrier is local to this loop body's use of
		 * for_each_thread().  We can reset it on each loop iteration because
		 * all previous loop iterations are done with it already.
		 *
		 * num_preparing is initialized to 1 so that the counter can not go to 0
		 * and mark the completion as done before all task works are registered.
		 * We decrement it at the end of the loop body.
		 */
		atomic_set(&shared_ctx.num_preparing, 1);
		reinit_completion(&shared_ctx.all_prepared);

		/*
		 * In RCU read-lock, schedule task work on newly discovered sibling
		 * tasks.
		 */
		found_more_threads = schedule_task_work(&works, &shared_ctx);

		/*
		 * Decrement num_preparing for current, to undo that we initialized it
		 * to 1 a few lines above.
		 */
		if (atomic_dec_return(&shared_ctx.num_preparing) > 0) {
			if (wait_for_completion_interruptible(
				    &shared_ctx.all_prepared)) {
				/* In case of interruption, we need to retry the system call. */
				atomic_set(&shared_ctx.preparation_error,
					   -ERESTARTNOINTR);

				/*
				 * Opportunistic improvement: try to cancel task
				 * works for tasks that did not start running
				 * yet.  We do not have a guarantee that it
				 * cancels any of the enqueued task works
				 * because task_work_run() might already have
				 * dequeued them.
				 */
				cancel_tsync_works(&works, &shared_ctx);

				/*
				 * Break the loop with error.  The cleanup code
				 * after the loop unblocks the remaining
				 * task_works.
				 */
				break;
			}
		}
	} while (found_more_threads &&
		 !atomic_read(&shared_ctx.preparation_error));

	/*
	 * We now have either (a) all sibling threads blocking and in "prepared"
	 * state in the task work, or (b) the preparation error is set.  Ask all
	 * threads to commit (or abort).
	 */
	complete_all(&shared_ctx.ready_to_commit);

	/*
	 * Decrement num_unfinished for current, to undo that we initialized it to 1
	 * at the beginning.
	 */
	if (atomic_dec_return(&shared_ctx.num_unfinished) > 0)
		wait_for_completion(&shared_ctx.all_finished);

	tsync_works_release(&works);
	up_write(&current->signal->exec_update_lock);
	return atomic_read(&shared_ctx.preparation_error);
}