// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/slab.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * PI code:
 */
int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	if (likely(current->pi_state_cache))
		return 0;

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	refcount_set(&pi_state->refcount, 1);
	pi_state->key = FUTEX_KEY_INIT;

	current->pi_state_cache = pi_state;

	return 0;
}

static struct futex_pi_state *alloc_pi_state(void)
{
	struct futex_pi_state *pi_state = current->pi_state_cache;

	WARN_ON(!pi_state);
	current->pi_state_cache = NULL;

	return pi_state;
}

static void pi_state_update_owner(struct futex_pi_state *pi_state,
				  struct task_struct *new_owner)
{
	struct task_struct *old_owner = pi_state->owner;

	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

	if (old_owner) {
		raw_spin_lock(&old_owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		raw_spin_unlock(&old_owner->pi_lock);
	}

	if (new_owner) {
		raw_spin_lock(&new_owner->pi_lock);
		WARN_ON(!list_empty(&pi_state->list));
		list_add(&pi_state->list, &new_owner->pi_state_list);
		pi_state->owner = new_owner;
		raw_spin_unlock(&new_owner->pi_lock);
	}
}

void get_pi_state(struct futex_pi_state *pi_state)
{
	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
void put_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	if (!refcount_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		unsigned long flags;

		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
		pi_state_update_owner(pi_state, NULL);
		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
	}

	if (current->pi_state_cache) {
		kfree(pi_state);
	} else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		refcount_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}

/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]	Indicates that the kernel can acquire the futex atomically. We
 *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]	Valid, if TID does not belong to a kernel thread. If no matching
 *	thread is found then it indicates that the owner TID has died.
 *
 * [3]	Invalid. The waiter is queued on a non PI futex
 *
 * [4]	Valid state after exit_robust_list(), which sets the user space
 *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]	The user space value got manipulated between exit_robust_list()
 *	and exit_pi_state_list()
 *
 * [6]	Valid state after exit_pi_state_list() which sets the new owner in
 *	the pi_state but cannot access the user space value.
 *
 * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]	Owner and user space value match
 *
 * [9]	There is no transient state which sets the user space TID to 0
 *	except exit_robust_list(), but this is indicated by the
 *	FUTEX_OWNER_DIED bit. See [4]
 *
 * [10] There is no transient state which leaves owner and user space
 *	TID out of sync. Except one error case where the kernel is denied
 *	write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *	hb -> futex_q, relation
 *	futex_q -> pi_state, relation
 *
 *	(cannot be raw because hb can contain arbitrary amount
 *	 of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *	{uval, pi_state}
 *
 *	(and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *	p->pi_state_list -> pi_state->list, relation
 *	pi_mutex->owner -> pi_state->owner, relation
 *
 * pi_state->refcount:
 *
 *	pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */

/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	u32 uval2;
	int ret;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))
		return -EINVAL;

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!refcount_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		goto out_efault;

	if (uval != uval2)
		goto out_eagain;

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->rt_mutex will fixup owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			if (pid)
				goto out_einval;
			/*
			 * Take a ref on the state and return success. [4]
			 */
			goto out_attach;
		}

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */
		if (!pid)
			goto out_attach;
	} else {
		/*
		 * If the owner died bit is not set, then the pi_state
		 * must have an owner. [7]
		 */
		if (!pi_state->owner)
			goto out_einval;
	}

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))
		goto out_einval;

out_attach:
	get_pi_state(pi_state);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	*ps = pi_state;
	return 0;

out_einval:
	ret = -EINVAL;
	goto out_error;

out_eagain:
	ret = -EAGAIN;
	goto out_error;

out_efault:
	ret = -EFAULT;
	goto out_error;

out_error:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

static int handle_exit_race(u32 __user *uaddr, u32 uval,
			    struct task_struct *tsk)
{
	u32 uval2;

	/*
	 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
	 * caller that the alleged owner is busy.
	 */
	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
		return -EBUSY;

	/*
	 * Reread the user space value to handle the following situation:
	 *
	 * CPU0				CPU1
	 *
	 * sys_exit()			sys_futex()
	 *  do_exit()			 futex_lock_pi()
	 *				  futex_lock_pi_atomic()
	 *   exit_signals(tsk)		    No waiters:
	 *    tsk->flags |= PF_EXITING;	    *uaddr == 0x00000PID
	 *  mm_release(tsk)		    Set waiter bit
	 *   exit_robust_list(tsk) {	    *uaddr = 0x80000PID;
	 *      Set owner died		    attach_to_pi_owner() {
	 *    *uaddr = 0xC0000000;	     tsk = get_task(PID);
	 *   }				     if (!tsk->flags & PF_EXITING) {
	 *   ...			       attach();
	 *   tsk->futex_state =		     } else {
	 *	FUTEX_STATE_DEAD;	       if (tsk->futex_state !=
	 *					  FUTEX_STATE_DEAD)
	 *				         return -EAGAIN;
	 *				       return -ESRCH; <--- FAIL
	 *				     }
	 *
	 * Returning ESRCH unconditionally is wrong here because the
	 * user space value has been changed by the exiting task.
	 *
	 * The same logic applies to the case where the exiting task is
	 * already gone.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		return -EFAULT;

	/* If the user space value has changed, try again. */
	if (uval2 != uval)
		return -EAGAIN;

	/*
	 * The exiting task did not have a robust list, the robust list was
	 * corrupted or the user space value in *uaddr is simply bogus.
	 * Give up and tell user space.
	 */
	return -ESRCH;
}

static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
				 struct futex_pi_state **ps)
{
	/*
	 * No existing pi state. First waiter. [2]
	 *
	 * This creates pi_state, we have hb->lock held, this means nothing can
	 * observe this state, wait_lock is irrelevant.
	 */
	struct futex_pi_state *pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make @p
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	/*
	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
	 * because there is no concurrency as the object is not published yet.
	 */
	pi_state->owner = p;

	*ps = pi_state;
}
/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
			      struct futex_pi_state **ps,
			      struct task_struct **exiting)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	struct task_struct *p;

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0 [1]
	 *
	 * The !pid check is paranoid. None of the call sites should end up
	 * with pid == 0, but better safe than sorry. Let the caller retry
	 */
	if (!pid)
		return -EAGAIN;
	p = find_get_task_by_vpid(pid);
	if (!p)
		return handle_exit_race(uaddr, uval, NULL);

	if (unlikely(p->flags & PF_KTHREAD)) {
		put_task_struct(p);
		return -EPERM;
	}

	/*
	 * We need to look at the task state to figure out whether the
	 * task is exiting. To protect against the change of the task state
	 * in futex_exit_release(), we do this protected by p->pi_lock:
	 */
	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
		/*
		 * The task is on the way out. When the futex state is
		 * FUTEX_STATE_DEAD, we know that the task has finished
		 * the cleanup:
		 */
		int ret = handle_exit_race(uaddr, uval, p);

		raw_spin_unlock_irq(&p->pi_lock);
		/*
		 * If the owner task is between FUTEX_STATE_EXITING and
		 * FUTEX_STATE_DEAD then store the task pointer and keep
		 * the reference on the task struct. The calling code will
		 * drop all locks, wait for the task to reach
		 * FUTEX_STATE_DEAD and then drop the refcount. This is
		 * required to prevent a live lock when the current task
		 * preempted the exiting task between the two states.
		 */
		if (ret == -EBUSY)
			*exiting = p;
		else
			put_task_struct(p);
		return ret;
	}

	__attach_to_pi_owner(p, key, ps);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	return 0;
}

static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
	int err;
	u32 curval;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (unlikely(err))
		return err;

	/* If user space value changed, let the caller retry */
	return curval != uval ? -EAGAIN : 0;
}

/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for. This will
 *			be "current" except in the case of requeue pi.
 * @exiting:		Pointer to store the task pointer of the owner task
 *			which is in the middle of exiting
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
			 union futex_key *key,
			 struct futex_pi_state **ps,
			 struct task_struct *task,
			 struct task_struct **exiting,
			 int set_waiters)
{
	u32 uval, newval, vpid = task_pid_vnr(task);
	struct futex_q *top_waiter;
	int ret;

	/*
	 * Read the user space value first so we can validate a few
	 * things before proceeding further.
	 */
	if (futex_get_value_locked(&uval, uaddr))
		return -EFAULT;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	/*
	 * Detect deadlocks.
	 */
	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
		return -EDEADLK;

	if ((unlikely(should_fail_futex(true))))
		return -EDEADLK;

	/*
	 * Lookup existing state first. If it exists, try to attach to
	 * its pi_state.
	 */
	top_waiter = futex_top_waiter(hb, key);
	if (top_waiter)
		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

	/*
	 * No waiter and user TID is 0. We are here because the
	 * waiters or the owner died bit is set, we were called from
	 * requeue_cmp_pi, or for whatever other reason something took
	 * the syscall.
	 */
	if (!(uval & FUTEX_TID_MASK)) {
		/*
		 * We take over the futex. No other waiters and the user space
		 * TID is 0. We preserve the owner died bit.
		 */
		newval = uval & FUTEX_OWNER_DIED;
		newval |= vpid;

		/* The futex requeue_pi code can enforce the waiters bit */
		if (set_waiters)
			newval |= FUTEX_WAITERS;

		ret = lock_pi_update_atomic(uaddr, uval, newval);
		if (ret)
			return ret;

		/*
		 * If the waiter bit was requested the caller also needs PI
		 * state attached to the new owner of the user space futex.
		 *
		 * @task is guaranteed to be alive and it cannot be exiting
		 * because it is either sleeping or waiting in
		 * futex_requeue_pi_wakeup_sync().
		 *
		 * No need to do the full attach_to_pi_owner() exercise
		 * because @task is known and valid.
		 */
		if (set_waiters) {
			raw_spin_lock_irq(&task->pi_lock);
			__attach_to_pi_owner(task, key, ps);
			raw_spin_unlock_irq(&task->pi_lock);
		}
		return 1;
	}

	/*
	 * First waiter. Set the waiters bit before attaching ourselves to
	 * the owner. If owner tries to unlock, it will be forced into
	 * the kernel and blocked on hb->lock.
	 */
	newval = uval | FUTEX_WAITERS;
	ret = lock_pi_update_atomic(uaddr, uval, newval);
	if (ret)
		return ret;
	/*
	 * If the update of the user space value succeeded, we try to
	 * attach to the owner. If that fails, no harm done, we only
	 * set the FUTEX_WAITERS bit in the user space variable.
	 */
	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}

/*
 * Caller must hold a reference on @pi_state.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval,
			 struct futex_pi_state *pi_state,
			 struct rt_mutex_waiter *top_waiter)
{
	struct task_struct *new_owner;
	bool postunlock = false;
	DEFINE_RT_WAKE_Q(wqh);
	u32 curval, newval;
	int ret = 0;

	new_owner = top_waiter->task;

	/*
	 * We pass it to the next owner. The WAITERS bit is always kept
	 * enabled while there is PI state around. We cleanup the owner
	 * died bit, because we are the owner.
	 */
	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

	if (unlikely(should_fail_futex(true))) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (!ret && (curval != uval)) {
		/*
		 * If an unconditional UNLOCK_PI operation (user space did not
		 * try the TID->0 transition) raced with a waiter setting the
		 * FUTEX_WAITERS flag between get_user() and locking the hash
		 * bucket lock, retry the operation.
		 */
		if ((FUTEX_TID_MASK & curval) == uval)
			ret = -EAGAIN;
		else
			ret = -EINVAL;
	}

	if (!ret) {
		/*
		 * This is a point of no return; once we modified the uval
		 * there is no going back and subsequent operations must
		 * not fail.
		 */
		pi_state_update_owner(pi_state, new_owner);
		postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
	}

out_unlock:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	if (postunlock)
		rt_mutex_postunlock(&wqh);

	return ret;
}

static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				  struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	struct task_struct *oldowner, *newowner;
	u32 uval, curval, newval, newtid;
	int err = 0;

	oldowner = pi_state->owner;

	/*
	 * We are here because either:
	 *
	 *  - we stole the lock and pi_state->owner needs updating to reflect
	 *    that (@argowner == current),
	 *
	 * or:
	 *
	 *  - someone stole our lock and we need to fix things to point to the
	 *    new owner (@argowner == NULL).
	 *
	 * Either way, we have to replace the TID in the user space variable.
	 * This must be atomic as we have to preserve the owner died bit here.
	 *
	 * Note: We write the user space value _before_ changing the pi_state
	 * because we can fault here. Imagine swapped out pages or a fork
	 * that marked all the anonymous memory readonly for cow.
	 *
	 * Modifying pi_state _before_ the user space value would leave the
	 * pi_state in an inconsistent state when we fault here, because we
	 * need to drop the locks to handle the fault. This might be observed
	 * in the PID checks when attaching to PI state.
	 */
retry:
	if (!argowner) {
		if (oldowner != current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 0;
		}

		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
			/* We got the lock. pi_state is correct. Tell caller. */
			return 1;
		}

		/*
		 * The trylock just failed, so either there is an owner or
		 * there is a higher priority waiter than this one.
		 */
		newowner = rt_mutex_owner(&pi_state->pi_mutex);
		/*
		 * If the higher priority waiter has not yet taken over the
		 * rtmutex then newowner is NULL. We can't return here with
		 * that state because it's inconsistent vs. the user space
		 * state. So drop the locks and try again. It's a valid
		 * situation and not any different from the other retry
		 * conditions.
		 */
		if (unlikely(!newowner)) {
			err = -EAGAIN;
			goto handle_err;
		}
	} else {
		WARN_ON_ONCE(argowner != current);
		if (oldowner == current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 1;
		}
		newowner = argowner;
	}

	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
	/* Owner died? */
	if (!pi_state->owner)
		newtid |= FUTEX_OWNER_DIED;

	err = futex_get_value_locked(&uval, uaddr);
	if (err)
		goto handle_err;

	for (;;) {
		newval = (uval & FUTEX_OWNER_DIED) | newtid;

		err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
		if (err)
			goto handle_err;

		if (curval == uval)
			break;
		uval = curval;
	}

	/*
	 * We fixed up user space. Now we need to fix the pi_state
	 * itself.
	 */
	pi_state_update_owner(pi_state, newowner);

	return argowner == current;

	/*
	 * In order to reschedule or handle a page fault, we need to drop the
	 * locks here. In the case of a fault, this gives the other task
	 * (either the highest priority waiter itself or the task which stole
	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
	 * are back from handling the fault we need to check the pi_state after
	 * reacquiring the locks and before trying to do another fixup. When
	 * the fixup has been done already we simply return.
	 *
	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
	 * drop hb->lock since the caller owns the hb -> futex_q relation.
	 * Dropping the pi_mutex->wait_lock requires the state revalidate.
	 */
handle_err:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	spin_unlock(q->lock_ptr);

	switch (err) {
	case -EFAULT:
		err = fault_in_user_writeable(uaddr);
		break;

	case -EAGAIN:
		cond_resched();
		err = 0;
		break;

	default:
		WARN_ON_ONCE(1);
		break;
	}

	spin_lock(q->lock_ptr);
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Check if someone else fixed it for us:
	 */
	if (pi_state->owner != oldowner)
		return argowner == current;

	/* Retry if err was -EAGAIN or the fault-in succeeded */
	if (!err)
		goto retry;

	/*
	 * fault_in_user_writeable() failed so user state is immutable. At
	 * best we can make the kernel state consistent but user state will
	 * be most likely hosed and any subsequent unlock operation will be
	 * rejected due to PI futex rule [10].
	 *
	 * Ensure that the rtmutex owner is also the pi_state owner despite
	 * the user space value claiming something different. There is no
	 * point in unlocking the rtmutex if current is the owner as it
	 * would need to wait until the next waiter has taken the rtmutex
	 * to guarantee consistent state. Keep it simple. Userspace asked
	 * for this wrecked state.
	 *
	 * The rtmutex has an owner - either current or some other
	 * task. See the EAGAIN loop above.
	 */
	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));

	return err;
}

static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	int ret;

	lockdep_assert_held(q->lock_ptr);

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
	ret = __fixup_pi_state_owner(uaddr, q, argowner);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

/**
 * fixup_pi_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to cleanup
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Return:
 *  -  1 - success, lock taken;
 *  -  0 - success, lock not taken;
 *  - <0 - on error (-EFAULT)
 */
int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
	if (locked) {
		/*
		 * Got the lock. We might not be the anticipated owner if we
		 * did a lock-steal - fix up the PI-state in that case:
		 *
		 * Speculative pi_state->owner read (we don't hold wait_lock);
		 * since we own the lock pi_state->owner == current is the
		 * stable state, anything else needs more attention.
		 */
		if (q->pi_state->owner != current)
			return fixup_pi_state_owner(uaddr, q, current);
		return 1;
	}

	/*
	 * If we didn't get the lock; check if anybody stole it from us. In
	 * that case, we need to fix up the uval to point to them instead of
	 * us, otherwise bad things happen. [10]
	 *
	 * Another speculative read; pi_state->owner == current is unstable
	 * but needs our attention.
	 */
	if (q->pi_state->owner == current)
		return fixup_pi_state_owner(uaddr, q, NULL);

	/*
	 * Paranoia check. If we did not take the lock, then we should not be
	 * the owner of the rt_mutex. Warn and establish consistent state.
	 */
	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
		return fixup_pi_state_owner(uaddr, q, current);

	return 0;
}

/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block as a consequence of relying
 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
 * a 0 value of the futex too.)
 *
 * Also serves as the futex trylock_pi() operation, with the
 * corresponding semantics.
 */
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
	struct hrtimer_sleeper timeout, *to;
	struct task_struct *exiting = NULL;
	struct rt_mutex_waiter rt_waiter;
	struct futex_hash_bucket *hb;
	struct futex_q q = futex_q_init;
	int res, ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

	if (refill_pi_state_cache())
		return -ENOMEM;

	to = futex_setup_timer(time, &timeout, flags, 0);

retry:
	ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
	if (unlikely(ret != 0))
		goto out;

retry_private:
	hb = futex_q_lock(&q);

	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
				   &exiting, 0);
	if (unlikely(ret)) {
		/*
		 * Atomic work succeeded and we got the lock,
		 * or failed. Either way, we do _not_ block.
		 */
		switch (ret) {
		case 1:
			/* We got the lock. */
			ret = 0;
			goto out_unlock_put_key;
		case -EFAULT:
			goto uaddr_faulted;
		case -EBUSY:
		case -EAGAIN:
			/*
			 * Two reasons for this:
			 * - EBUSY: Task is exiting and we just wait for the
			 *   exit to complete.
			 * - EAGAIN: The user space value changed.
			 */
			futex_q_unlock(hb);
			/*
			 * Handle the case where the owner is in the middle of
			 * exiting. Wait for the exit to complete otherwise
			 * this task might loop forever, aka. live lock.
			 */
			wait_for_owner_exiting(ret, exiting);
			cond_resched();
			goto retry;
		default:
			goto out_unlock_put_key;
		}
	}

	WARN_ON(!q.pi_state);

	/*
	 * Only actually queue now that the atomic ops are done:
	 */
	__futex_queue(&q, hb);

	if (trylock) {
		ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
		/* Fixup the trylock return value: */
		ret = ret ? 0 : -EWOULDBLOCK;
		goto no_block;
	}

	/*
	 * Must be done before we enqueue the waiter; here it is unfortunately
	 * done under the hb lock, but that *should* work because it does
	 * nothing.
	 */
	rt_mutex_pre_schedule();

	rt_mutex_init_waiter(&rt_waiter);

	/*
	 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
	 * hold it while doing rt_mutex_start_proxy(), because then it will
	 * include hb->lock in the blocking chain, even though we'll not in
	 * fact hold it while blocking. This will lead it to report -EDEADLK
	 * and BUG when futex_unlock_pi() interleaves with this.
	 *
	 * Therefore acquire wait_lock while holding hb->lock, but drop the
	 * latter before calling __rt_mutex_start_proxy_lock(). This
	 * interleaves with futex_unlock_pi() -- which does a similar lock
	 * handoff -- such that the latter can observe the futex_q::pi_state
	 * before __rt_mutex_start_proxy_lock() is done.
	 */
	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
	spin_unlock(q.lock_ptr);
	/*
	 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
	 * such that futex_unlock_pi() is guaranteed to observe the waiter when
	 * it sees the futex_q::pi_state.
	 */
	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);

	if (ret) {
		if (ret == 1)
			ret = 0;
		goto cleanup;
	}

	if (unlikely(to))
		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);

cleanup:
	/*
	 * If we failed to acquire the lock (deadlock/signal/timeout), we must
	 * unwind the above; however, we cannot lock hb->lock because
	 * rt_mutex already has a waiter enqueued and hb->lock can itself try
	 * and enqueue an rt_waiter through rtlock.
	 *
	 * Doing the cleanup without holding hb->lock can cause inconsistent
	 * state between hb and pi_state, but only in the direction of not
	 * seeing a waiter that is leaving.
	 *
	 * See futex_unlock_pi(), it deals with this inconsistency.
	 *
	 * There be dragons here, since we must deal with the inconsistency on
	 * the way out (here), it is impossible to detect/warn about the race
	 * the other way around (missing an incoming waiter).
	 *
	 * What could possibly go wrong...
	 */
	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
		ret = 0;

	/*
	 * Now that the rt_waiter has been dequeued, it is safe to use
	 * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
	 * the pi_state.
	 */
	spin_lock(q.lock_ptr);
	/*
	 * Waiter is unqueued.
	 */
	rt_mutex_post_schedule();
no_block:
	/*
	 * Fixup the pi_state owner and possibly acquire the lock if we
	 * haven't already.
	 */
	res = fixup_pi_owner(uaddr, &q, !ret);
	/*
	 * If fixup_pi_owner() returned an error, propagate that. If it acquired
	 * the lock, clear our -ETIMEDOUT or -EINTR.
	 */
	if (res)
		ret = (res < 0) ? res : 0;

	futex_unqueue_pi(&q);
	spin_unlock(q.lock_ptr);
	goto out;

out_unlock_put_key:
	futex_q_unlock(hb);

out:
	if (to) {
		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
	}
	return ret != -EINTR ? ret : -ERESTARTNOINTR;

uaddr_faulted:
	futex_q_unlock(hb);

	ret = fault_in_user_writeable(uaddr);
	if (ret)
		goto out;

	if (!(flags & FLAGS_SHARED))
		goto retry_private;

	goto retry;
}

/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
	u32 curval, uval, vpid = task_pid_vnr(current);
	union futex_key key = FUTEX_KEY_INIT;
	struct futex_hash_bucket *hb;
	struct futex_q *top_waiter;
	int ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

retry:
	if (get_user(uval, uaddr))
		return -EFAULT;
	/*
	 * We release only a lock we actually own:
	 */
	if ((uval & FUTEX_TID_MASK) != vpid)
		return -EPERM;

	ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
	if (ret)
		return ret;

	hb = futex_hash(&key);
	spin_lock(&hb->lock);

	/*
	 * Check waiters first. We do not trust user space values at
	 * all and we at least want to know if user space fiddled
	 * with the futex value instead of blindly unlocking.
	 */
	top_waiter = futex_top_waiter(hb, &key);
	if (top_waiter) {
		struct futex_pi_state *pi_state = top_waiter->pi_state;
		struct rt_mutex_waiter *rt_waiter;

		ret = -EINVAL;
		if (!pi_state)
			goto out_unlock;

		/*
		 * If current does not own the pi_state then the futex is
		 * inconsistent and user space fiddled with the futex value.
		 */
		if (pi_state->owner != current)
			goto out_unlock;

		/*
		 * By taking wait_lock while still holding hb->lock, we ensure
		 * there is no point where we hold neither; and thereby
		 * wake_futex_pi() must observe any new waiters.
		 *
		 * Since the cleanup: case in futex_lock_pi() removes the
		 * rt_waiter without holding hb->lock, it is possible for
		 * wake_futex_pi() to not find a waiter while the above does,
		 * in this case the waiter is on the way out and it can be
		 * ignored.
		 *
		 * In particular, this forces __rt_mutex_start_proxy() to
		 * complete such that we're guaranteed to observe the
		 * rt_waiter.
		 */
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

		/*
		 * Futex vs rt_mutex waiter state -- if there are no rt_mutex
		 * waiters even though futex thinks there are, then the waiter
		 * is leaving and the uncontended path is safe to take.
		 */
		rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
		if (!rt_waiter) {
			raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
			goto do_uncontended;
		}

		get_pi_state(pi_state);
		spin_unlock(&hb->lock);

		/* drops pi_state->pi_mutex.wait_lock */
		ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);

		put_pi_state(pi_state);

		/*
		 * Success, we're done! No tricky corner cases.
		 */
		if (!ret)
			return ret;
		/*
		 * The atomic access to the futex value generated a
		 * pagefault, so retry the user-access and the wakeup:
		 */
		if (ret == -EFAULT)
			goto pi_faulted;
		/*
		 * An unconditional UNLOCK_PI op raced against a waiter
		 * setting the FUTEX_WAITERS bit. Try again.
		 */
		if (ret == -EAGAIN)
			goto pi_retry;
		/*
		 * wake_futex_pi has detected invalid state. Tell user
		 * space.
		 */
		return ret;
	}

do_uncontended:
	/*
	 * We have no kernel internal state, i.e. no waiters in the
	 * kernel. Waiters which are about to queue themselves are stuck
	 * on hb->lock. So we can safely ignore them. We do neither
	 * preserve the WAITERS bit nor the OWNER_DIED one. We are the
	 * owner.
	 */
	if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
		spin_unlock(&hb->lock);
		switch (ret) {
		case -EFAULT:
			goto pi_faulted;

		case -EAGAIN:
			goto pi_retry;

		default:
			WARN_ON_ONCE(1);
			return ret;
		}
	}

	/*
	 * If uval has changed, let user space handle it.
	 */
	ret = (curval == uval) ? 0 : -EAGAIN;

out_unlock:
	spin_unlock(&hb->lock);
	return ret;

pi_retry:
	cond_resched();
	goto retry;

pi_faulted:

	ret = fault_in_user_writeable(uaddr);
	if (!ret)
		goto retry;

	return ret;
}
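
/*
 * Illustrative user-space counterpart (hedged sketch, not part of the
 * kernel build): futex_lock_pi() and futex_unlock_pi() above back up a
 * user-space fast path which attempts the 0 -> TID transition on lock
 * and the TID -> 0 transition on unlock, entering the kernel only when
 * that cmpxchg fails. The helper names (pi_lock/pi_unlock), the use of
 * gettid() and the raw syscall(2) invocation are illustrative
 * assumptions; a real implementation, e.g. glibc's PTHREAD_PRIO_INHERIT
 * mutexes, additionally deals with FUTEX_OWNER_DIED, robust lists and
 * EINTR/EAGAIN retries.
 *
 *	#define _GNU_SOURCE
 *	#include <stdatomic.h>
 *	#include <stdint.h>
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <linux/futex.h>
 *
 *	static int pi_lock(_Atomic uint32_t *uaddr)
 *	{
 *		uint32_t zero = 0;
 *
 *		// Fast path: uncontended 0 -> TID transition, no syscall.
 *		if (atomic_compare_exchange_strong(uaddr, &zero, gettid()))
 *			return 0;
 *		// Contended: the kernel queues us on the rt_mutex and
 *		// priority-boosts the owner (futex_lock_pi()).
 *		return syscall(SYS_futex, uaddr, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
 *	}
 *
 *	static int pi_unlock(_Atomic uint32_t *uaddr)
 *	{
 *		uint32_t tid = gettid();
 *
 *		// Fast path: TID -> 0, only succeeds while FUTEX_WAITERS
 *		// (and FUTEX_OWNER_DIED) are clear.
 *		if (atomic_compare_exchange_strong(uaddr, &tid, 0))
 *			return 0;
 *		// Waiters are queued in the kernel: hand the lock to the
 *		// top waiter (futex_unlock_pi() -> wake_futex_pi()).
 *		return syscall(SYS_futex, uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
 *	}
 */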