// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/slab.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * PI code:
 */

/*
 * Make sure current has a cached pi_state object available, allocating
 * one if the cache is empty. Called before operations which may need to
 * create PI state while holding locks that forbid allocation.
 *
 * Returns 0 on success, -ENOMEM if the allocation failed.
 */
int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	/* Already have a cached object - nothing to do. */
	if (likely(current->pi_state_cache))
		return 0;

	pi_state = kzalloc_obj(*pi_state);

	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	refcount_set(&pi_state->refcount, 1);
	pi_state->key = FUTEX_KEY_INIT;

	current->pi_state_cache = pi_state;

	return 0;
}

/*
 * Hand out the pi_state cached by refill_pi_state_cache() and clear the
 * cache slot. Must only be called after a successful refill; the WARN_ON
 * catches a violation of that contract.
 */
static struct futex_pi_state *alloc_pi_state(void)
{
	struct futex_pi_state *pi_state = current->pi_state_cache;

	WARN_ON(!pi_state);
	current->pi_state_cache = NULL;

	return pi_state;
}

/*
 * Move @pi_state from its old owner's pi_state_list to @new_owner's and
 * update pi_state->owner accordingly. Either task pointer may be NULL
 * (detach only / attach only). Caller must hold pi_mutex.wait_lock; the
 * per-task list manipulation is serialized by each task's pi_lock.
 */
static void pi_state_update_owner(struct futex_pi_state *pi_state,
				  struct task_struct *new_owner)
{
	struct task_struct *old_owner = pi_state->owner;

	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

	if (old_owner) {
		raw_spin_lock(&old_owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		raw_spin_unlock(&old_owner->pi_lock);
	}

	if (new_owner) {
		raw_spin_lock(&new_owner->pi_lock);
		WARN_ON(!list_empty(&pi_state->list));
		list_add(&pi_state->list, &new_owner->pi_state_list);
		pi_state->owner = new_owner;
		raw_spin_unlock(&new_owner->pi_lock);
	}
}

/*
 * Take a reference on @pi_state. The refcount must already be non-zero;
 * going 0 -> 1 here would mean we raced against the final put.
 */
void get_pi_state(struct futex_pi_state *pi_state)
{
	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
void put_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	if (!refcount_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		unsigned long flags;

		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
		pi_state_update_owner(pi_state, NULL);
		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
	}

	/* Prefer refilling the per-task cache over freeing. */
	if (current->pi_state_cache) {
		kfree(pi_state);
	} else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		refcount_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}

/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       |         0 | 0/1    | Valid
 * [2]  NULL   | ---      | ---       |        >0 | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        |       Any | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      |         0 | 1      | Valid
 * [5]  Found  | Found    | NULL      |        >0 | 1      | Invalid
 *
 * [6]  Found  | Found    | task      |         0 | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      |       Any | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      |         0 | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]	Indicates that the kernel can acquire the futex atomically. We
 *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]	Valid, if TID does not belong to a kernel thread. If no matching
 *	thread is found then it indicates that the owner TID has died.
 *
 * [3]	Invalid.
 *	The waiter is queued on a non PI futex
 *
 * [4]	Valid state after exit_robust_list(), which sets the user space
 *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]	The user space value got manipulated between exit_robust_list()
 *	and exit_pi_state_list()
 *
 * [6]	Valid state after exit_pi_state_list() which sets the new owner in
 *	the pi_state but cannot access the user space value.
 *
 * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]	Owner and user space value match
 *
 * [9]	There is no transient state which sets the user space TID to 0
 *	except exit_robust_list(), but this is indicated by the
 *	FUTEX_OWNER_DIED bit. See [4]
 *
 * [10] There is no transient state which leaves owner and user space
 *	TID out of sync. Except one error case where the kernel is denied
 *	write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *	hb -> futex_q, relation
 *	futex_q -> pi_state, relation
 *
 *	(cannot be raw because hb can contain arbitrary amount
 *	 of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *	{uval, pi_state}
 *
 *	(and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *	p->pi_state_list -> pi_state->list, relation
 *	pi_mutex->owner -> pi_state->owner, relation
 *
 * pi_state->refcount:
 *
 *	pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */

/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	u32 uval2;
	int ret;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))
		return -EINVAL;

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!refcount_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		goto out_efault;

	if (uval != uval2)
		goto out_eagain;

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->rt_mutex will fixup owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			if (pid)
				goto out_einval;
			/*
			 * Take a ref on the state and return success. [4]
			 */
			goto out_attach;
		}

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */
		if (!pid)
			goto out_attach;
	} else {
		/*
		 * If the owner died bit is not set, then the pi_state
		 * must have an owner. [7]
		 */
		if (!pi_state->owner)
			goto out_einval;
	}

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))
		goto out_einval;

out_attach:
	get_pi_state(pi_state);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	*ps = pi_state;
	return 0;

out_einval:
	ret = -EINVAL;
	goto out_error;

out_eagain:
	ret = -EAGAIN;
	goto out_error;

out_efault:
	ret = -EFAULT;
	goto out_error;

out_error:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

/*
 * Called when the alleged futex owner @tsk (may be NULL if the task is
 * already gone) turned out to be exiting. Decides between telling the
 * caller to wait (-EBUSY), retry (-EAGAIN / -EFAULT) or give up (-ESRCH).
 */
static int handle_exit_race(u32 __user *uaddr, u32 uval,
			    struct task_struct *tsk)
{
	u32 uval2;

	/*
	 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
	 * caller that the alleged owner is busy.
	 */
	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
		return -EBUSY;

	/*
	 * Reread the user space value to handle the following situation:
	 *
	 * CPU0				CPU1
	 *
	 * sys_exit()			sys_futex()
	 *  do_exit()			 futex_lock_pi()
	 *                                futex_lock_pi_atomic()
	 *   exit_signals(tsk)		    No waiters:
	 *    tsk->flags |= PF_EXITING;	    *uaddr == 0x00000PID
	 *  mm_release(tsk)		    Set waiter bit
	 *   exit_robust_list(tsk) {	    *uaddr = 0x80000PID;
	 *      Set owner died		    attach_to_pi_owner() {
	 *    *uaddr = 0xC0000000;	     tsk = get_task(PID);
	 *   }				     if (!tsk->flags & PF_EXITING) {
	 *  ...				       attach();
	 *  tsk->futex_state =               } else {
	 *	FUTEX_STATE_DEAD;              if (tsk->futex_state !=
	 *					  FUTEX_STATE_DEAD)
	 *				         return -EAGAIN;
	 *				       return -ESRCH; <--- FAIL
	 *				     }
	 *
	 * Returning ESRCH unconditionally is wrong here because the
	 * user space value has been changed by the exiting task.
	 *
	 * The same logic applies to the case where the exiting task is
	 * already gone.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		return -EFAULT;

	/* If the user space value has changed, try again. */
	if (uval2 != uval)
		return -EAGAIN;

	/*
	 * The exiting task did not have a robust list, the robust list was
	 * corrupted or the user space value in *uaddr is simply bogus.
	 * Give up and tell user space.
	 */
	return -ESRCH;
}

/*
 * Create fresh PI state for @key with @p as the owner, storing the result
 * in @ps. Caller holds hb->lock and p->pi_lock, so the new object is not
 * observable by anybody else until it is published.
 */
static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
				 struct futex_pi_state **ps)
{
	/*
	 * No existing pi state. First waiter. [2]
	 *
	 * This creates pi_state, we have hb->lock held, this means nothing can
	 * observe this state, wait_lock is irrelevant.
	 */
	struct futex_pi_state *pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make @p
	 * the owner of it:
	 */
	__assume_ctx_lock(&pi_state->pi_mutex.wait_lock);
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	/*
	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
	 * because there is no concurrency as the object is not published yet.
	 */
	pi_state->owner = p;

	*ps = pi_state;
}

/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
			      struct futex_pi_state **ps,
			      struct task_struct **exiting)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	struct task_struct *p;

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0 [1]
	 *
	 * The !pid check is paranoid. None of the call sites should end up
	 * with pid == 0, but better safe than sorry. Let the caller retry
	 */
	if (!pid)
		return -EAGAIN;
	p = find_get_task_by_vpid(pid);
	if (!p)
		return handle_exit_race(uaddr, uval, NULL);

	/* A kernel thread can never legitimately own a PI futex. */
	if (unlikely(p->flags & PF_KTHREAD)) {
		put_task_struct(p);
		return -EPERM;
	}

	/*
	 * We need to look at the task state to figure out, whether the
	 * task is exiting. To protect against the change of the task state
	 * in futex_exit_release(), we do this protected by p->pi_lock:
	 */
	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
		/*
		 * The task is on the way out. When the futex state is
		 * FUTEX_STATE_DEAD, we know that the task has finished
		 * the cleanup:
		 */
		int ret = handle_exit_race(uaddr, uval, p);

		raw_spin_unlock_irq(&p->pi_lock);
		/*
		 * If the owner task is between FUTEX_STATE_EXITING and
		 * FUTEX_STATE_DEAD then store the task pointer and keep
		 * the reference on the task struct. The calling code will
		 * drop all locks, wait for the task to reach
		 * FUTEX_STATE_DEAD and then drop the refcount. This is
		 * required to prevent a live lock when the current task
		 * preempted the exiting task between the two states.
		 */
		if (ret == -EBUSY)
			*exiting = p;
		else
			put_task_struct(p);
		return ret;
	}

	__attach_to_pi_owner(p, key, ps);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	return 0;
}

/*
 * Atomically replace the user space futex value @uval with @newval.
 * Returns 0 on success, -EAGAIN if the value changed under us (caller
 * retries), or a negative fault error from the cmpxchg.
 */
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
	int err;
	u32 curval;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (unlikely(err))
		return err;

	/* If user space value changed, let the caller retry */
	return curval != uval ? -EAGAIN : 0;
}

/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for.  This will
 *			be "current" except in the case of requeue pi.
 * @exiting:		Pointer to store the task pointer of the owner task
 *			which is in the middle of exiting
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
			 union futex_key *key,
			 struct futex_pi_state **ps,
			 struct task_struct *task,
			 struct task_struct **exiting,
			 int set_waiters)
{
	u32 uval, newval, vpid = task_pid_vnr(task);
	struct futex_q *top_waiter;
	int ret;

	/*
	 * Read the user space value first so we can validate a few
	 * things before proceeding further.
	 */
	if (futex_get_value_locked(&uval, uaddr))
		return -EFAULT;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	/*
	 * Detect deadlocks.
	 */
	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
		return -EDEADLK;

	if ((unlikely(should_fail_futex(true))))
		return -EDEADLK;

	/*
	 * Lookup existing state first. If it exists, try to attach to
	 * its pi_state.
	 */
	top_waiter = futex_top_waiter(hb, key);
	if (top_waiter)
		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

	/*
	 * No waiter and user TID is 0. We are here because the
	 * waiters or the owner died bit is set or called from
	 * requeue_cmp_pi or for whatever reason something took the
	 * syscall.
	 */
	if (!(uval & FUTEX_TID_MASK)) {
		/*
		 * We take over the futex. No other waiters and the user space
		 * TID is 0. We preserve the owner died bit.
		 */
		newval = uval & FUTEX_OWNER_DIED;
		newval |= vpid;

		/* The futex requeue_pi code can enforce the waiters bit */
		if (set_waiters)
			newval |= FUTEX_WAITERS;

		ret = lock_pi_update_atomic(uaddr, uval, newval);
		if (ret)
			return ret;

		/*
		 * If the waiter bit was requested the caller also needs PI
		 * state attached to the new owner of the user space futex.
		 *
		 * @task is guaranteed to be alive and it cannot be exiting
		 * because it is either sleeping or waiting in
		 * futex_requeue_pi_wakeup_sync().
		 *
		 * No need to do the full attach_to_pi_owner() exercise
		 * because @task is known and valid.
		 */
		if (set_waiters) {
			raw_spin_lock_irq(&task->pi_lock);
			__attach_to_pi_owner(task, key, ps);
			raw_spin_unlock_irq(&task->pi_lock);
		}
		return 1;
	}

	/*
	 * First waiter. Set the waiters bit before attaching ourself to
	 * the owner. If owner tries to unlock, it will be forced into
	 * the kernel and blocked on hb->lock.
	 */
	newval = uval | FUTEX_WAITERS;
	ret = lock_pi_update_atomic(uaddr, uval, newval);
	if (ret)
		return ret;
	/*
	 * If the update of the user space value succeeded, we try to
	 * attach to the owner. If that fails, no harm done, we only
	 * set the FUTEX_WAITERS bit in the user space variable.
	 */
	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}

/*
 * Hand the futex and the rt_mutex over to the top waiter's task.
 * Caller must hold a reference on @pi_state.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval,
			 struct futex_pi_state *pi_state,
			 struct rt_mutex_waiter *top_waiter)
	__must_hold(&pi_state->pi_mutex.wait_lock)
	__releases(&pi_state->pi_mutex.wait_lock)
{
	struct task_struct *new_owner;
	bool postunlock = false;
	DEFINE_RT_WAKE_Q(wqh);
	u32 curval, newval;
	int ret = 0;

	new_owner = top_waiter->task;

	/*
	 * We pass it to the next owner. The WAITERS bit is always kept
	 * enabled while there is PI state around. We cleanup the owner
	 * died bit, because we are the owner.
	 */
	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

	if (unlikely(should_fail_futex(true))) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (!ret && (curval != uval)) {
		/*
		 * If an unconditional UNLOCK_PI operation (user space did not
		 * try the TID->0 transition) raced with a waiter setting the
		 * FUTEX_WAITERS flag between get_user() and locking the hash
		 * bucket lock, retry the operation.
		 */
		if ((FUTEX_TID_MASK & curval) == uval)
			ret = -EAGAIN;
		else
			ret = -EINVAL;
	}

	if (!ret) {
		/*
		 * This is a point of no return; once we modified the uval
		 * there is no going back and subsequent operations must
		 * not fail.
		 */
		pi_state_update_owner(pi_state, new_owner);
		postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
	}

out_unlock:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	if (postunlock)
		rt_mutex_postunlock(&wqh);

	return ret;
}

/*
 * Bring the user space futex value and the pi_state owner back in sync
 * after a lock steal in either direction. Returns 1 if current ended up
 * owning the lock, 0 otherwise, or a negative error. May temporarily
 * drop both held locks to handle faults / reschedule, see handle_err.
 */
static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				  struct task_struct *argowner)
	__must_hold(&q->pi_state->pi_mutex.wait_lock)
	__must_hold(q->lock_ptr)
{
	struct futex_pi_state *pi_state = q->pi_state;
	struct task_struct *oldowner, *newowner;
	u32 uval, curval, newval, newtid;
	int err = 0;

	oldowner = pi_state->owner;

	/*
	 * We are here because either:
	 *
	 *  - we stole the lock and pi_state->owner needs updating to reflect
	 *    that (@argowner == current),
	 *
	 * or:
	 *
	 *  - someone stole our lock and we need to fix things to point to the
	 *    new owner (@argowner == NULL).
	 *
	 * Either way, we have to replace the TID in the user space variable.
	 * This must be atomic as we have to preserve the owner died bit here.
	 *
	 * Note: We write the user space value _before_ changing the pi_state
	 * because we can fault here. Imagine swapped out pages or a fork
	 * that marked all the anonymous memory readonly for cow.
	 *
	 * Modifying pi_state _before_ the user space value would leave the
	 * pi_state in an inconsistent state when we fault here, because we
	 * need to drop the locks to handle the fault. This might be observed
	 * in the PID checks when attaching to PI state.
	 */
retry:
	if (!argowner) {
		if (oldowner != current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 0;
		}

		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
			/* We got the lock. pi_state is correct. Tell caller. */
			return 1;
		}

		/*
		 * The trylock just failed, so either there is an owner or
		 * there is a higher priority waiter than this one.
		 */
		newowner = rt_mutex_owner(&pi_state->pi_mutex);
		/*
		 * If the higher priority waiter has not yet taken over the
		 * rtmutex then newowner is NULL. We can't return here with
		 * that state because it's inconsistent vs. the user space
		 * state. So drop the locks and try again. It's a valid
		 * situation and not any different from the other retry
		 * conditions.
		 */
		if (unlikely(!newowner)) {
			err = -EAGAIN;
			goto handle_err;
		}
	} else {
		WARN_ON_ONCE(argowner != current);
		if (oldowner == current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 1;
		}
		newowner = argowner;
	}

	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
	/* Owner died? */
	if (!pi_state->owner)
		newtid |= FUTEX_OWNER_DIED;

	err = futex_get_value_locked(&uval, uaddr);
	if (err)
		goto handle_err;

	for (;;) {
		newval = (uval & FUTEX_OWNER_DIED) | newtid;

		err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
		if (err)
			goto handle_err;

		if (curval == uval)
			break;
		uval = curval;
	}

	/*
	 * We fixed up user space. Now we need to fix the pi_state
	 * itself.
	 */
	pi_state_update_owner(pi_state, newowner);

	return argowner == current;

	/*
	 * In order to reschedule or handle a page fault, we need to drop the
	 * locks here. In the case of a fault, this gives the other task
	 * (either the highest priority waiter itself or the task which stole
	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
	 * are back from handling the fault we need to check the pi_state after
	 * reacquiring the locks and before trying to do another fixup. When
	 * the fixup has been done already we simply return.
	 *
	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
	 * drop hb->lock since the caller owns the hb -> futex_q relation.
	 * Dropping the pi_mutex->wait_lock requires the state revalidate.
	 */
handle_err:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	spin_unlock(q->lock_ptr);

	switch (err) {
	case -EFAULT:
		err = fault_in_user_writeable(uaddr);
		break;

	case -EAGAIN:
		cond_resched();
		err = 0;
		break;

	default:
		WARN_ON_ONCE(1);
		break;
	}

	futex_q_lockptr_lock(q);
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Check if someone else fixed it for us:
	 */
	if (pi_state->owner != oldowner)
		return argowner == current;

	/* Retry if err was -EAGAIN or the fault in succeeded */
	if (!err)
		goto retry;

	/*
	 * fault_in_user_writeable() failed so user state is immutable. At
	 * best we can make the kernel state consistent but user state will
	 * be most likely hosed and any subsequent unlock operation will be
	 * rejected due to PI futex rule [10].
	 *
	 * Ensure that the rtmutex owner is also the pi_state owner despite
	 * the user space value claiming something different. There is no
	 * point in unlocking the rtmutex if current is the owner as it
	 * would need to wait until the next waiter has taken the rtmutex
	 * to guarantee consistent state. Keep it simple. Userspace asked
	 * for this wreckaged state.
	 *
	 * The rtmutex has an owner - either current or some other
	 * task. See the EAGAIN loop above.
	 */
	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));

	return err;
}

/*
 * Locked wrapper around __fixup_pi_state_owner(): takes wait_lock for the
 * duration of the fixup. Caller must hold q->lock_ptr.
 */
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	int ret;

	lockdep_assert_held(q->lock_ptr);

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
	ret = __fixup_pi_state_owner(uaddr, q, argowner);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

/**
 * fixup_pi_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to cleanup
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Return:
 *  -  1 - success, lock taken;
 *  -  0 - success, lock not taken;
 *  - <0 - on error (-EFAULT)
 */
int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
	if (locked) {
		/*
		 * Got the lock. We might not be the anticipated owner if we
		 * did a lock-steal - fix up the PI-state in that case:
		 *
		 * Speculative pi_state->owner read (we don't hold wait_lock);
		 * since we own the lock pi_state->owner == current is the
		 * stable state, anything else needs more attention.
		 */
		if (q->pi_state->owner != current)
			return fixup_pi_state_owner(uaddr, q, current);
		return 1;
	}

	/*
	 * If we didn't get the lock; check if anybody stole it from us. In
	 * that case, we need to fix up the uval to point to them instead of
	 * us, otherwise bad things happen. [10]
	 *
	 * Another speculative read; pi_state->owner == current is unstable
	 * but needs our attention.
	 */
	if (q->pi_state->owner == current)
		return fixup_pi_state_owner(uaddr, q, NULL);

	/*
	 * Paranoia check. If we did not take the lock, then we should not be
	 * the owner of the rt_mutex. Warn and establish consistent state.
	 */
	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
		return fixup_pi_state_owner(uaddr, q, current);

	return 0;
}

/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block as a consequence of relying
 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
 * a 0 value of the futex too.).
 *
 * Also serves as futex trylock_pi()'ing, and due semantics.
922 */ 923 int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) 924 { 925 struct hrtimer_sleeper timeout, *to; 926 struct task_struct *exiting; 927 struct rt_mutex_waiter rt_waiter; 928 struct futex_q q = futex_q_init; 929 DEFINE_WAKE_Q(wake_q); 930 int res, ret; 931 932 if (!IS_ENABLED(CONFIG_FUTEX_PI)) 933 return -ENOSYS; 934 935 if (refill_pi_state_cache()) 936 return -ENOMEM; 937 938 to = futex_setup_timer(time, &timeout, flags, 0); 939 940 retry: 941 exiting = NULL; 942 ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE); 943 if (unlikely(ret != 0)) 944 goto out; 945 946 retry_private: 947 if (1) { 948 CLASS(hb, hb)(&q.key); 949 950 futex_q_lock(&q, hb); 951 952 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 953 &exiting, 0); 954 if (unlikely(ret)) { 955 /* 956 * Atomic work succeeded and we got the lock, 957 * or failed. Either way, we do _not_ block. 958 */ 959 switch (ret) { 960 case 1: 961 /* We got the lock. */ 962 ret = 0; 963 goto out_unlock_put_key; 964 case -EFAULT: 965 goto uaddr_faulted; 966 case -EBUSY: 967 case -EAGAIN: 968 /* 969 * Two reasons for this: 970 * - EBUSY: Task is exiting and we just wait for the 971 * exit to complete. 972 * - EAGAIN: The user space value changed. 973 */ 974 futex_q_unlock(hb); 975 __release(q.lock_ptr); 976 /* 977 * Handle the case where the owner is in the middle of 978 * exiting. Wait for the exit to complete otherwise 979 * this task might loop forever, aka. live lock. 980 */ 981 wait_for_owner_exiting(ret, exiting); 982 cond_resched(); 983 goto retry; 984 default: 985 goto out_unlock_put_key; 986 } 987 } 988 989 WARN_ON(!q.pi_state); 990 991 /* 992 * Only actually queue now that the atomic ops are done: 993 */ 994 __futex_queue(&q, hb, current); 995 996 if (trylock) { 997 ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); 998 /* Fixup the trylock return value: */ 999 ret = ret ? 
0 : -EWOULDBLOCK; 1000 goto no_block; 1001 } 1002 1003 /* 1004 * Caution; releasing @hb in-scope. The hb->lock is still locked 1005 * while the reference is dropped. The reference can not be dropped 1006 * after the unlock because if a user initiated resize is in progress 1007 * then we might need to wake him. This can not be done after the 1008 * rt_mutex_pre_schedule() invocation. The hb will remain valid because 1009 * the thread, performing resize, will block on hb->lock during 1010 * the requeue. 1011 */ 1012 futex_hash_put(no_free_ptr(hb)); 1013 /* 1014 * Must be done before we enqueue the waiter, here is unfortunately 1015 * under the hb lock, but that *should* work because it does nothing. 1016 */ 1017 rt_mutex_pre_schedule(); 1018 1019 rt_mutex_init_waiter(&rt_waiter); 1020 1021 /* 1022 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not 1023 * hold it while doing rt_mutex_start_proxy(), because then it will 1024 * include hb->lock in the blocking chain, even through we'll not in 1025 * fact hold it while blocking. This will lead it to report -EDEADLK 1026 * and BUG when futex_unlock_pi() interleaves with this. 1027 * 1028 * Therefore acquire wait_lock while holding hb->lock, but drop the 1029 * latter before calling __rt_mutex_start_proxy_lock(). This 1030 * interleaves with futex_unlock_pi() -- which does a similar lock 1031 * handoff -- such that the latter can observe the futex_q::pi_state 1032 * before __rt_mutex_start_proxy_lock() is done. 1033 */ 1034 raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); 1035 spin_unlock(q.lock_ptr); 1036 /* 1037 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter 1038 * such that futex_unlock_pi() is guaranteed to observe the waiter when 1039 * it sees the futex_q::pi_state. 
1040 */ 1041 ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); 1042 raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q); 1043 1044 if (ret) { 1045 if (ret == 1) 1046 ret = 0; 1047 goto cleanup; 1048 } 1049 1050 if (unlikely(to)) 1051 hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); 1052 1053 ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); 1054 1055 cleanup: 1056 /* 1057 * If we failed to acquire the lock (deadlock/signal/timeout), we must 1058 * unwind the above, however we canont lock hb->lock because 1059 * rt_mutex already has a waiter enqueued and hb->lock can itself try 1060 * and enqueue an rt_waiter through rtlock. 1061 * 1062 * Doing the cleanup without holding hb->lock can cause inconsistent 1063 * state between hb and pi_state, but only in the direction of not 1064 * seeing a waiter that is leaving. 1065 * 1066 * See futex_unlock_pi(), it deals with this inconsistency. 1067 * 1068 * There be dragons here, since we must deal with the inconsistency on 1069 * the way out (here), it is impossible to detect/warn about the race 1070 * the other way around (missing an incoming waiter). 1071 * 1072 * What could possibly go wrong... 1073 */ 1074 if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) 1075 ret = 0; 1076 1077 /* 1078 * Now that the rt_waiter has been dequeued, it is safe to use 1079 * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up 1080 * the 1081 */ 1082 futex_q_lockptr_lock(&q); 1083 /* 1084 * Waiter is unqueued. 1085 */ 1086 rt_mutex_post_schedule(); 1087 no_block: 1088 /* 1089 * Fixup the pi_state owner and possibly acquire the lock if we 1090 * haven't already. 1091 */ 1092 res = fixup_pi_owner(uaddr, &q, !ret); 1093 /* 1094 * If fixup_pi_owner() returned an error, propagate that. If it acquired 1095 * the lock, clear our -ETIMEDOUT or -EINTR. 1096 */ 1097 if (res) 1098 ret = (res < 0) ? 
			res : 0;

	/*
	 * NOTE(review): __release() looks like the static-checker (sparse)
	 * annotation matching hb->lock; presumably q.lock_ptr aliases
	 * &hb->lock here, and the real unlock is the spin_unlock() below --
	 * confirm against futex_q_lockptr_lock().
	 */
	__release(&hb->lock);
	futex_unqueue_pi(&q);
	spin_unlock(q.lock_ptr);
	if (q.drop_hb_ref) {
		CLASS(hb, hb)(&q.key);
		/* Additional reference from futex_unlock_pi() */
		futex_hash_put(hb);
	}
	goto out;

out_unlock_put_key:
	futex_q_unlock(hb);
	__release(q.lock_ptr);
	goto out;

uaddr_faulted:
	futex_q_unlock(hb);
	__release(q.lock_ptr);

	/* Fault in the futex word; on a hard fault give up and bail out. */
	ret = fault_in_user_writeable(uaddr);
	if (ret)
		goto out;

	if (!(flags & FLAGS_SHARED))
		goto retry_private;

	goto retry;
	}

out:
	if (to) {
		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
	}
	/* A pending signal restarts the syscall rather than returning -EINTR. */
	return ret != -EINTR ? ret : -ERESTARTNOINTR;
}

/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 *
 * Returns 0 on success, otherwise a negative error code
 * (-ENOSYS, -EFAULT, -EPERM, -EAGAIN, -EINVAL).
 */
int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
	u32 curval, uval, vpid = task_pid_vnr(current);
	union futex_key key = FUTEX_KEY_INIT;
	struct futex_q *top_waiter;
	int ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

retry:
	if (get_user(uval, uaddr))
		return -EFAULT;
	/*
	 * We release only a lock we actually own:
	 */
	if ((uval & FUTEX_TID_MASK) != vpid)
		return -EPERM;

	ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
	if (ret)
		return ret;

	CLASS(hb, hb)(&key);
	spin_lock(&hb->lock);
retry_hb:
	/* Re-evaluation point: hb->lock is (still) held when we get here. */

	/*
	 * Check waiters first. We do not trust user space values at
	 * all and we at least want to know if user space fiddled
	 * with the futex value instead of blindly unlocking.
	 */
	top_waiter = futex_top_waiter(hb, &key);
	if (top_waiter) {
		struct futex_pi_state *pi_state = top_waiter->pi_state;
		struct rt_mutex_waiter *rt_waiter;

		ret = -EINVAL;
		if (!pi_state)
			goto out_unlock;

		/*
		 * If current does not own the pi_state then the futex is
		 * inconsistent and user space fiddled with the futex value.
		 */
		if (pi_state->owner != current)
			goto out_unlock;

		/*
		 * By taking wait_lock while still holding hb->lock, we ensure
		 * there is no point where we hold neither; and thereby
		 * wake_futex_pi() must observe any new waiters.
		 *
		 * Since the cleanup: case in futex_lock_pi() removes the
		 * rt_waiter without holding hb->lock, it is possible for
		 * wake_futex_pi() to not find a waiter while the above does,
		 * in this case the waiter is on the way out and it can be
		 * ignored.
		 *
		 * In particular; this forces __rt_mutex_start_proxy() to
		 * complete such that we're guaranteed to observe the
		 * rt_waiter.
		 */
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

		/*
		 * Futex vs rt_mutex waiter state -- if there are no rt_mutex
		 * waiters even though futex thinks there are, then the waiter
		 * is leaving. The entry needs to be removed from the list so a
		 * new futex_lock_pi() is not using this stale PI-state while
		 * the futex is available in user space again.
		 * There can be more than one task on its way out so it needs
		 * to retry.
		 */
		rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
		if (!rt_waiter) {
			/*
			 * Acquire a reference for the leaving waiter to ensure
			 * valid futex_q::lock_ptr.
			 */
			futex_hash_get(hb);
			top_waiter->drop_hb_ref = true;
			__futex_unqueue(top_waiter);
			raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
			goto retry_hb;
		}

		get_pi_state(pi_state);
		spin_unlock(&hb->lock);

		/* drops pi_state->pi_mutex.wait_lock */
		ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);

		put_pi_state(pi_state);

		/*
		 * Success, we're done! No tricky corner cases.
		 */
		if (!ret)
			return ret;
		/*
		 * The atomic access to the futex value generated a
		 * pagefault, so retry the user-access and the wakeup:
		 */
		if (ret == -EFAULT)
			goto pi_faulted;
		/*
		 * An unconditional UNLOCK_PI op raced against a waiter
		 * setting the FUTEX_WAITERS bit. Try again.
		 */
		if (ret == -EAGAIN)
			goto pi_retry;
		/*
		 * wake_futex_pi() has detected invalid state. Tell user
		 * space.
		 */
		return ret;
	}

	/*
	 * We have no kernel internal state, i.e. no waiters in the
	 * kernel. Waiters which are about to queue themselves are stuck
	 * on hb->lock. So we can safely ignore them. We preserve neither
	 * the WAITERS bit nor the OWNER_DIED one. We are the owner.
	 */
	if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
		spin_unlock(&hb->lock);
		switch (ret) {
		case -EFAULT:
			goto pi_faulted;

		case -EAGAIN:
			goto pi_retry;

		default:
			WARN_ON_ONCE(1);
			return ret;
		}
	}

	/*
	 * If uval has changed, let user space handle it.
	 */
	ret = (curval == uval) ? 0 : -EAGAIN;

out_unlock:
	spin_unlock(&hb->lock);
	return ret;

pi_retry:
	cond_resched();
	goto retry;

pi_faulted:
	/* Fault in the futex word, then retry the whole operation. */
	ret = fault_in_user_writeable(uaddr);
	if (!ret)
		goto retry;

	return ret;
}