1 // SPDX-License-Identifier: GPL-2.0-or-later 2 3 #include <linux/slab.h> 4 #include <linux/sched/rt.h> 5 #include <linux/sched/task.h> 6 7 #include "futex.h" 8 #include "../locking/rtmutex_common.h" 9 10 /* 11 * PI code: 12 */ 13 int refill_pi_state_cache(void) 14 { 15 struct futex_pi_state *pi_state; 16 17 if (likely(current->futex.pi_state_cache)) 18 return 0; 19 20 pi_state = kzalloc_obj(*pi_state); 21 22 if (!pi_state) 23 return -ENOMEM; 24 25 INIT_LIST_HEAD(&pi_state->list); 26 /* pi_mutex gets initialized later */ 27 pi_state->owner = NULL; 28 refcount_set(&pi_state->refcount, 1); 29 pi_state->key = FUTEX_KEY_INIT; 30 31 current->futex.pi_state_cache = pi_state; 32 33 return 0; 34 } 35 36 static struct futex_pi_state *alloc_pi_state(void) 37 { 38 struct futex_pi_state *pi_state = current->futex.pi_state_cache; 39 40 WARN_ON(!pi_state); 41 current->futex.pi_state_cache = NULL; 42 43 return pi_state; 44 } 45 46 static void pi_state_update_owner(struct futex_pi_state *pi_state, 47 struct task_struct *new_owner) 48 { 49 struct task_struct *old_owner = pi_state->owner; 50 51 lockdep_assert_held(&pi_state->pi_mutex.wait_lock); 52 53 if (old_owner) { 54 raw_spin_lock(&old_owner->pi_lock); 55 WARN_ON(list_empty(&pi_state->list)); 56 list_del_init(&pi_state->list); 57 raw_spin_unlock(&old_owner->pi_lock); 58 } 59 60 if (new_owner) { 61 raw_spin_lock(&new_owner->pi_lock); 62 WARN_ON(!list_empty(&pi_state->list)); 63 list_add(&pi_state->list, &new_owner->futex.pi_state_list); 64 pi_state->owner = new_owner; 65 raw_spin_unlock(&new_owner->pi_lock); 66 } 67 } 68 69 void get_pi_state(struct futex_pi_state *pi_state) 70 { 71 WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount)); 72 } 73 74 /* 75 * Drops a reference to the pi_state object and frees or caches it 76 * when the last reference is gone. 
77 */ 78 void put_pi_state(struct futex_pi_state *pi_state) 79 { 80 if (!pi_state) 81 return; 82 83 if (!refcount_dec_and_test(&pi_state->refcount)) 84 return; 85 86 /* 87 * If pi_state->owner is NULL, the owner is most probably dying 88 * and has cleaned up the pi_state already 89 */ 90 if (pi_state->owner) { 91 unsigned long flags; 92 93 raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags); 94 pi_state_update_owner(pi_state, NULL); 95 rt_mutex_proxy_unlock(&pi_state->pi_mutex); 96 raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags); 97 } 98 99 if (current->futex.pi_state_cache) { 100 kfree(pi_state); 101 } else { 102 /* 103 * pi_state->list is already empty. 104 * clear pi_state->owner. 105 * refcount is at 0 - put it back to 1. 106 */ 107 pi_state->owner = NULL; 108 refcount_set(&pi_state->refcount, 1); 109 current->futex.pi_state_cache = pi_state; 110 } 111 } 112 113 /* 114 * We need to check the following states: 115 * 116 * Waiter | pi_state | pi->owner | uTID | uODIED | ? 117 * 118 * [1] NULL | --- | --- | 0 | 0/1 | Valid 119 * [2] NULL | --- | --- | >0 | 0/1 | Valid 120 * 121 * [3] Found | NULL | -- | Any | 0/1 | Invalid 122 * 123 * [4] Found | Found | NULL | 0 | 1 | Valid 124 * [5] Found | Found | NULL | >0 | 1 | Invalid 125 * 126 * [6] Found | Found | task | 0 | 1 | Valid 127 * 128 * [7] Found | Found | NULL | Any | 0 | Invalid 129 * 130 * [8] Found | Found | task | ==taskTID | 0/1 | Valid 131 * [9] Found | Found | task | 0 | 0 | Invalid 132 * [10] Found | Found | task | !=taskTID | 0/1 | Invalid 133 * 134 * [1] Indicates that the kernel can acquire the futex atomically. We 135 * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. 136 * 137 * [2] Valid, if TID does not belong to a kernel thread. If no matching 138 * thread is found then it indicates that the owner TID has died. 139 * 140 * [3] Invalid. 
 *	The waiter is queued on a non PI futex
 *
 * [4]	Valid state after exit_robust_list(), which sets the user space
 *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]	The user space value got manipulated between exit_robust_list()
 *	and exit_pi_state_list()
 *
 * [6]	Valid state after exit_pi_state_list() which sets the new owner in
 *	the pi_state but cannot access the user space value.
 *
 * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]	Owner and user space value match
 *
 * [9]	There is no transient state which sets the user space TID to 0
 *	except exit_robust_list(), but this is indicated by the
 *	FUTEX_OWNER_DIED bit. See [4]
 *
 * [10] There is no transient state which leaves owner and user space
 *	TID out of sync. Except one error case where the kernel is denied
 *	write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *	hb -> futex_q, relation
 *	futex_q -> pi_state, relation
 *
 *	(cannot be raw because hb can contain arbitrary amount
 *	 of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *	{uval, pi_state}
 *
 *	(and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *	p->futex.pi_state_list -> pi_state->list, relation
 *	pi_mutex->owner -> pi_state->owner, relation
 *
 * pi_state->refcount:
 *
 *	pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */

/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
202 */ 203 static int attach_to_pi_state(u32 __user *uaddr, u32 uval, 204 struct futex_pi_state *pi_state, 205 struct futex_pi_state **ps) 206 { 207 pid_t pid = uval & FUTEX_TID_MASK; 208 u32 uval2; 209 int ret; 210 211 /* 212 * Userspace might have messed up non-PI and PI futexes [3] 213 */ 214 if (unlikely(!pi_state)) 215 return -EINVAL; 216 217 /* 218 * We get here with hb->lock held, and having found a 219 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q 220 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(), 221 * which in turn means that futex_lock_pi() still has a reference on 222 * our pi_state. 223 * 224 * The waiter holding a reference on @pi_state also protects against 225 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() 226 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently 227 * free pi_state before we can take a reference ourselves. 228 */ 229 WARN_ON(!refcount_read(&pi_state->refcount)); 230 231 /* 232 * Now that we have a pi_state, we can acquire wait_lock 233 * and do the state validation. 234 */ 235 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 236 237 /* 238 * Since {uval, pi_state} is serialized by wait_lock, and our current 239 * uval was read without holding it, it can have changed. Verify it 240 * still is what we expect it to be, otherwise retry the entire 241 * operation. 242 */ 243 if (futex_get_value_locked(&uval2, uaddr)) 244 goto out_efault; 245 246 if (uval != uval2) 247 goto out_eagain; 248 249 /* 250 * Handle the owner died case: 251 */ 252 if (uval & FUTEX_OWNER_DIED) { 253 /* 254 * exit_pi_state_list sets owner to NULL and wakes the 255 * topmost waiter. The task which acquires the 256 * pi_state->rt_mutex will fixup owner. 257 */ 258 if (!pi_state->owner) { 259 /* 260 * No pi state owner, but the user space TID 261 * is not 0. Inconsistent state. [5] 262 */ 263 if (pid) 264 goto out_einval; 265 /* 266 * Take a ref on the state and return success. 
[4] 267 */ 268 goto out_attach; 269 } 270 271 /* 272 * If TID is 0, then either the dying owner has not 273 * yet executed exit_pi_state_list() or some waiter 274 * acquired the rtmutex in the pi state, but did not 275 * yet fixup the TID in user space. 276 * 277 * Take a ref on the state and return success. [6] 278 */ 279 if (!pid) 280 goto out_attach; 281 } else { 282 /* 283 * If the owner died bit is not set, then the pi_state 284 * must have an owner. [7] 285 */ 286 if (!pi_state->owner) 287 goto out_einval; 288 } 289 290 /* 291 * Bail out if user space manipulated the futex value. If pi 292 * state exists then the owner TID must be the same as the 293 * user space TID. [9/10] 294 */ 295 if (pid != task_pid_vnr(pi_state->owner)) 296 goto out_einval; 297 298 out_attach: 299 get_pi_state(pi_state); 300 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 301 *ps = pi_state; 302 return 0; 303 304 out_einval: 305 ret = -EINVAL; 306 goto out_error; 307 308 out_eagain: 309 ret = -EAGAIN; 310 goto out_error; 311 312 out_efault: 313 ret = -EFAULT; 314 goto out_error; 315 316 out_error: 317 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 318 return ret; 319 } 320 321 static int handle_exit_race(u32 __user *uaddr, u32 uval, 322 struct task_struct *tsk) 323 { 324 u32 uval2; 325 326 /* 327 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the 328 * caller that the alleged owner is busy. 
329 */ 330 if (tsk && tsk->futex.state != FUTEX_STATE_DEAD) 331 return -EBUSY; 332 333 /* 334 * Reread the user space value to handle the following situation: 335 * 336 * CPU0 CPU1 337 * 338 * sys_exit() sys_futex() 339 * do_exit() futex_lock_pi() 340 * futex_lock_pi_atomic() 341 * exit_signals(tsk) No waiters: 342 * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID 343 * mm_release(tsk) Set waiter bit 344 * exit_robust_list(tsk) { *uaddr = 0x80000PID; 345 * Set owner died attach_to_pi_owner() { 346 * *uaddr = 0xC0000000; tsk = get_task(PID); 347 * } if (!tsk->flags & PF_EXITING) { 348 * ... attach(); 349 * tsk->futex.state = } else { 350 * FUTEX_STATE_DEAD; if (tsk->futex.state != 351 * FUTEX_STATE_DEAD) 352 * return -EAGAIN; 353 * return -ESRCH; <--- FAIL 354 * } 355 * 356 * Returning ESRCH unconditionally is wrong here because the 357 * user space value has been changed by the exiting task. 358 * 359 * The same logic applies to the case where the exiting task is 360 * already gone. 361 */ 362 if (futex_get_value_locked(&uval2, uaddr)) 363 return -EFAULT; 364 365 /* If the user space value has changed, try again. */ 366 if (uval2 != uval) 367 return -EAGAIN; 368 369 /* 370 * The exiting task did not have a robust list, the robust list was 371 * corrupted or the user space value in *uaddr is simply bogus. 372 * Give up and tell user space. 373 */ 374 return -ESRCH; 375 } 376 377 static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key, 378 struct futex_pi_state **ps) 379 { 380 /* 381 * No existing pi state. First waiter. [2] 382 * 383 * This creates pi_state, we have hb->lock held, this means nothing can 384 * observe this state, wait_lock is irrelevant. 
385 */ 386 struct futex_pi_state *pi_state = alloc_pi_state(); 387 388 /* 389 * Initialize the pi_mutex in locked state and make @p 390 * the owner of it: 391 */ 392 __assume_ctx_lock(&pi_state->pi_mutex.wait_lock); 393 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); 394 395 /* Store the key for possible exit cleanups: */ 396 pi_state->key = *key; 397 398 WARN_ON(!list_empty(&pi_state->list)); 399 list_add(&pi_state->list, &p->futex.pi_state_list); 400 /* 401 * Assignment without holding pi_state->pi_mutex.wait_lock is safe 402 * because there is no concurrency as the object is not published yet. 403 */ 404 pi_state->owner = p; 405 406 *ps = pi_state; 407 } 408 /* 409 * Lookup the task for the TID provided from user space and attach to 410 * it after doing proper sanity checks. 411 */ 412 static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, 413 struct futex_pi_state **ps, 414 struct task_struct **exiting) 415 { 416 pid_t pid = uval & FUTEX_TID_MASK; 417 struct task_struct *p; 418 419 /* 420 * We are the first waiter - try to look up the real owner and attach 421 * the new pi_state to it, but bail out when TID = 0 [1] 422 * 423 * The !pid check is paranoid. None of the call sites should end up 424 * with pid == 0, but better safe than sorry. Let the caller retry 425 */ 426 if (!pid) 427 return -EAGAIN; 428 p = find_get_task_by_vpid(pid); 429 if (!p) 430 return handle_exit_race(uaddr, uval, NULL); 431 432 if (unlikely(p->flags & PF_KTHREAD)) { 433 put_task_struct(p); 434 return -EPERM; 435 } 436 437 /* 438 * We need to look at the task state to figure out, whether the 439 * task is exiting. To protect against the change of the task state 440 * in futex_exit_release(), we do this protected by p->pi_lock: 441 */ 442 raw_spin_lock_irq(&p->pi_lock); 443 if (unlikely(p->futex.state != FUTEX_STATE_OK)) { 444 /* 445 * The task is on the way out. 
When the futex state is 446 * FUTEX_STATE_DEAD, we know that the task has finished 447 * the cleanup: 448 */ 449 int ret = handle_exit_race(uaddr, uval, p); 450 451 raw_spin_unlock_irq(&p->pi_lock); 452 /* 453 * If the owner task is between FUTEX_STATE_EXITING and 454 * FUTEX_STATE_DEAD then store the task pointer and keep 455 * the reference on the task struct. The calling code will 456 * drop all locks, wait for the task to reach 457 * FUTEX_STATE_DEAD and then drop the refcount. This is 458 * required to prevent a live lock when the current task 459 * preempted the exiting task between the two states. 460 */ 461 if (ret == -EBUSY) 462 *exiting = p; 463 else 464 put_task_struct(p); 465 return ret; 466 } 467 468 __attach_to_pi_owner(p, key, ps); 469 raw_spin_unlock_irq(&p->pi_lock); 470 471 put_task_struct(p); 472 473 return 0; 474 } 475 476 static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) 477 { 478 int err; 479 u32 curval; 480 481 if (unlikely(should_fail_futex(true))) 482 return -EFAULT; 483 484 err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval); 485 if (unlikely(err)) 486 return err; 487 488 /* If user space value changed, let the caller retry */ 489 return curval != uval ? -EAGAIN : 0; 490 } 491 492 /** 493 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex 494 * @uaddr: the pi futex user address 495 * @hb: the pi futex hash bucket 496 * @key: the futex key associated with uaddr and hb 497 * @ps: the pi_state pointer where we store the result of the 498 * lookup 499 * @task: the task to perform the atomic lock work for. This will 500 * be "current" except in the case of requeue pi. 
501 * @exiting: Pointer to store the task pointer of the owner task 502 * which is in the middle of exiting 503 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) 504 * 505 * Return: 506 * - 0 - ready to wait; 507 * - 1 - acquired the lock; 508 * - <0 - error 509 * 510 * The hb->lock must be held by the caller. 511 * 512 * @exiting is only set when the return value is -EBUSY. If so, this holds 513 * a refcount on the exiting task on return and the caller needs to drop it 514 * after waiting for the exit to complete. 515 */ 516 int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, 517 union futex_key *key, 518 struct futex_pi_state **ps, 519 struct task_struct *task, 520 struct task_struct **exiting, 521 int set_waiters) 522 { 523 u32 uval, newval, vpid = task_pid_vnr(task); 524 struct futex_q *top_waiter; 525 int ret; 526 527 /* 528 * Read the user space value first so we can validate a few 529 * things before proceeding further. 530 */ 531 if (futex_get_value_locked(&uval, uaddr)) 532 return -EFAULT; 533 534 if (unlikely(should_fail_futex(true))) 535 return -EFAULT; 536 537 /* 538 * Detect deadlocks. 539 */ 540 if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) 541 return -EDEADLK; 542 543 if ((unlikely(should_fail_futex(true)))) 544 return -EDEADLK; 545 546 /* 547 * Lookup existing state first. If it exists, try to attach to 548 * its pi_state. 549 */ 550 top_waiter = futex_top_waiter(hb, key); 551 if (top_waiter) 552 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); 553 554 /* 555 * No waiter and user TID is 0. We are here because the 556 * waiters or the owner died bit is set or called from 557 * requeue_cmp_pi or for whatever reason something took the 558 * syscall. 559 */ 560 if (!(uval & FUTEX_TID_MASK)) { 561 /* 562 * We take over the futex. No other waiters and the user space 563 * TID is 0. We preserve the owner died bit. 
564 */ 565 newval = uval & FUTEX_OWNER_DIED; 566 newval |= vpid; 567 568 /* The futex requeue_pi code can enforce the waiters bit */ 569 if (set_waiters) 570 newval |= FUTEX_WAITERS; 571 572 ret = lock_pi_update_atomic(uaddr, uval, newval); 573 if (ret) 574 return ret; 575 576 /* 577 * If the waiter bit was requested the caller also needs PI 578 * state attached to the new owner of the user space futex. 579 * 580 * @task is guaranteed to be alive and it cannot be exiting 581 * because it is either sleeping or waiting in 582 * futex_requeue_pi_wakeup_sync(). 583 * 584 * No need to do the full attach_to_pi_owner() exercise 585 * because @task is known and valid. 586 */ 587 if (set_waiters) { 588 raw_spin_lock_irq(&task->pi_lock); 589 __attach_to_pi_owner(task, key, ps); 590 raw_spin_unlock_irq(&task->pi_lock); 591 } 592 return 1; 593 } 594 595 /* 596 * First waiter. Set the waiters bit before attaching ourself to 597 * the owner. If owner tries to unlock, it will be forced into 598 * the kernel and blocked on hb->lock. 599 */ 600 newval = uval | FUTEX_WAITERS; 601 ret = lock_pi_update_atomic(uaddr, uval, newval); 602 if (ret) 603 return ret; 604 /* 605 * If the update of the user space value succeeded, we try to 606 * attach to the owner. If that fails, no harm done, we only 607 * set the FUTEX_WAITERS bit in the user space variable. 608 */ 609 return attach_to_pi_owner(uaddr, newval, key, ps, exiting); 610 } 611 612 /* 613 * Caller must hold a reference on @pi_state. 614 */ 615 static int wake_futex_pi(u32 __user *uaddr, u32 uval, 616 struct futex_pi_state *pi_state, 617 struct rt_mutex_waiter *top_waiter) 618 __must_hold(&pi_state->pi_mutex.wait_lock) 619 __releases(&pi_state->pi_mutex.wait_lock) 620 { 621 struct task_struct *new_owner; 622 bool postunlock = false; 623 DEFINE_RT_WAKE_Q(wqh); 624 u32 curval, newval; 625 int ret = 0; 626 627 new_owner = top_waiter->task; 628 629 /* 630 * We pass it to the next owner. 
The WAITERS bit is always kept 631 * enabled while there is PI state around. We cleanup the owner 632 * died bit, because we are the owner. 633 */ 634 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 635 636 if (unlikely(should_fail_futex(true))) { 637 ret = -EFAULT; 638 goto out_unlock; 639 } 640 641 ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval); 642 if (!ret && (curval != uval)) { 643 /* 644 * If a unconditional UNLOCK_PI operation (user space did not 645 * try the TID->0 transition) raced with a waiter setting the 646 * FUTEX_WAITERS flag between get_user() and locking the hash 647 * bucket lock, retry the operation. 648 */ 649 if ((FUTEX_TID_MASK & curval) == uval) 650 ret = -EAGAIN; 651 else 652 ret = -EINVAL; 653 } 654 655 if (!ret) { 656 /* 657 * This is a point of no return; once we modified the uval 658 * there is no going back and subsequent operations must 659 * not fail. 660 */ 661 pi_state_update_owner(pi_state, new_owner); 662 postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh); 663 } 664 665 out_unlock: 666 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 667 668 if (postunlock) 669 rt_mutex_postunlock(&wqh); 670 671 return ret; 672 } 673 674 static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 675 struct task_struct *argowner) 676 __must_hold(&q->pi_state->pi_mutex.wait_lock) 677 __must_hold(q->lock_ptr) 678 { 679 struct futex_pi_state *pi_state = q->pi_state; 680 struct task_struct *oldowner, *newowner; 681 u32 uval, curval, newval, newtid; 682 int err = 0; 683 684 oldowner = pi_state->owner; 685 686 /* 687 * We are here because either: 688 * 689 * - we stole the lock and pi_state->owner needs updating to reflect 690 * that (@argowner == current), 691 * 692 * or: 693 * 694 * - someone stole our lock and we need to fix things to point to the 695 * new owner (@argowner == NULL). 696 * 697 * Either way, we have to replace the TID in the user space variable. 
698 * This must be atomic as we have to preserve the owner died bit here. 699 * 700 * Note: We write the user space value _before_ changing the pi_state 701 * because we can fault here. Imagine swapped out pages or a fork 702 * that marked all the anonymous memory readonly for cow. 703 * 704 * Modifying pi_state _before_ the user space value would leave the 705 * pi_state in an inconsistent state when we fault here, because we 706 * need to drop the locks to handle the fault. This might be observed 707 * in the PID checks when attaching to PI state . 708 */ 709 retry: 710 if (!argowner) { 711 if (oldowner != current) { 712 /* 713 * We raced against a concurrent self; things are 714 * already fixed up. Nothing to do. 715 */ 716 return 0; 717 } 718 719 if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { 720 /* We got the lock. pi_state is correct. Tell caller. */ 721 return 1; 722 } 723 724 /* 725 * The trylock just failed, so either there is an owner or 726 * there is a higher priority waiter than this one. 727 */ 728 newowner = rt_mutex_owner(&pi_state->pi_mutex); 729 /* 730 * If the higher priority waiter has not yet taken over the 731 * rtmutex then newowner is NULL. We can't return here with 732 * that state because it's inconsistent vs. the user space 733 * state. So drop the locks and try again. It's a valid 734 * situation and not any different from the other retry 735 * conditions. 736 */ 737 if (unlikely(!newowner)) { 738 err = -EAGAIN; 739 goto handle_err; 740 } 741 } else { 742 WARN_ON_ONCE(argowner != current); 743 if (oldowner == current) { 744 /* 745 * We raced against a concurrent self; things are 746 * already fixed up. Nothing to do. 747 */ 748 return 1; 749 } 750 newowner = argowner; 751 } 752 753 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 754 /* Owner died? 
*/ 755 if (!pi_state->owner) 756 newtid |= FUTEX_OWNER_DIED; 757 758 err = futex_get_value_locked(&uval, uaddr); 759 if (err) 760 goto handle_err; 761 762 for (;;) { 763 newval = (uval & FUTEX_OWNER_DIED) | newtid; 764 765 err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval); 766 if (err) 767 goto handle_err; 768 769 if (curval == uval) 770 break; 771 uval = curval; 772 } 773 774 /* 775 * We fixed up user space. Now we need to fix the pi_state 776 * itself. 777 */ 778 pi_state_update_owner(pi_state, newowner); 779 780 return argowner == current; 781 782 /* 783 * In order to reschedule or handle a page fault, we need to drop the 784 * locks here. In the case of a fault, this gives the other task 785 * (either the highest priority waiter itself or the task which stole 786 * the rtmutex) the chance to try the fixup of the pi_state. So once we 787 * are back from handling the fault we need to check the pi_state after 788 * reacquiring the locks and before trying to do another fixup. When 789 * the fixup has been done already we simply return. 790 * 791 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely 792 * drop hb->lock since the caller owns the hb -> futex_q relation. 793 * Dropping the pi_mutex->wait_lock requires the state revalidate. 794 */ 795 handle_err: 796 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 797 spin_unlock(q->lock_ptr); 798 799 switch (err) { 800 case -EFAULT: 801 err = fault_in_user_writeable(uaddr); 802 break; 803 804 case -EAGAIN: 805 cond_resched(); 806 err = 0; 807 break; 808 809 default: 810 WARN_ON_ONCE(1); 811 break; 812 } 813 814 futex_q_lockptr_lock(q); 815 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 816 817 /* 818 * Check if someone else fixed it for us: 819 */ 820 if (pi_state->owner != oldowner) 821 return argowner == current; 822 823 /* Retry if err was -EAGAIN or the fault in succeeded */ 824 if (!err) 825 goto retry; 826 827 /* 828 * fault_in_user_writeable() failed so user state is immutable. 
At 829 * best we can make the kernel state consistent but user state will 830 * be most likely hosed and any subsequent unlock operation will be 831 * rejected due to PI futex rule [10]. 832 * 833 * Ensure that the rtmutex owner is also the pi_state owner despite 834 * the user space value claiming something different. There is no 835 * point in unlocking the rtmutex if current is the owner as it 836 * would need to wait until the next waiter has taken the rtmutex 837 * to guarantee consistent state. Keep it simple. Userspace asked 838 * for this wreckaged state. 839 * 840 * The rtmutex has an owner - either current or some other 841 * task. See the EAGAIN loop above. 842 */ 843 pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex)); 844 845 return err; 846 } 847 848 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 849 struct task_struct *argowner) 850 { 851 struct futex_pi_state *pi_state = q->pi_state; 852 int ret; 853 854 lockdep_assert_held(q->lock_ptr); 855 856 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 857 ret = __fixup_pi_state_owner(uaddr, q, argowner); 858 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 859 return ret; 860 } 861 862 /** 863 * fixup_pi_owner() - Post lock pi_state and corner case management 864 * @uaddr: user address of the futex 865 * @q: futex_q (contains pi_state and access to the rt_mutex) 866 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) 867 * 868 * After attempting to lock an rt_mutex, this function is called to cleanup 869 * the pi_state owner as well as handle race conditions that may allow us to 870 * acquire the lock. Must be called with the hb lock held. 871 * 872 * Return: 873 * - 1 - success, lock taken; 874 * - 0 - success, lock not taken; 875 * - <0 - on error (-EFAULT) 876 */ 877 int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked) 878 { 879 if (locked) { 880 /* 881 * Got the lock. 
We might not be the anticipated owner if we 882 * did a lock-steal - fix up the PI-state in that case: 883 * 884 * Speculative pi_state->owner read (we don't hold wait_lock); 885 * since we own the lock pi_state->owner == current is the 886 * stable state, anything else needs more attention. 887 */ 888 if (q->pi_state->owner != current) 889 return fixup_pi_state_owner(uaddr, q, current); 890 return 1; 891 } 892 893 /* 894 * If we didn't get the lock; check if anybody stole it from us. In 895 * that case, we need to fix up the uval to point to them instead of 896 * us, otherwise bad things happen. [10] 897 * 898 * Another speculative read; pi_state->owner == current is unstable 899 * but needs our attention. 900 */ 901 if (q->pi_state->owner == current) 902 return fixup_pi_state_owner(uaddr, q, NULL); 903 904 /* 905 * Paranoia check. If we did not take the lock, then we should not be 906 * the owner of the rt_mutex. Warn and establish consistent state. 907 */ 908 if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current)) 909 return fixup_pi_state_owner(uaddr, q, current); 910 911 return 0; 912 } 913 914 /* 915 * Userspace tried a 0 -> TID atomic transition of the futex value 916 * and failed. The kernel side here does the whole locking operation: 917 * if there are waiters then it will block as a consequence of relying 918 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see 919 * a 0 value of the futex too.). 920 * 921 * Also serves as futex trylock_pi()'ing, and due semantics. 
922 */ 923 int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) 924 { 925 struct hrtimer_sleeper timeout, *to; 926 struct task_struct *exiting; 927 struct rt_mutex_waiter rt_waiter; 928 struct futex_q q = futex_q_init; 929 DEFINE_WAKE_Q(wake_q); 930 int res, ret; 931 932 if (!IS_ENABLED(CONFIG_FUTEX_PI)) 933 return -ENOSYS; 934 935 if (refill_pi_state_cache()) 936 return -ENOMEM; 937 938 to = futex_setup_timer(time, &timeout, flags, 0); 939 940 retry: 941 exiting = NULL; 942 ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE); 943 if (unlikely(ret != 0)) 944 goto out; 945 946 retry_private: 947 if (1) { 948 CLASS(hbr, hbr)(&q.key); 949 auto hb = hbr.hb; 950 951 futex_q_lock(&q, hb); 952 953 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 954 &exiting, 0); 955 if (unlikely(ret)) { 956 /* 957 * Atomic work succeeded and we got the lock, 958 * or failed. Either way, we do _not_ block. 959 */ 960 switch (ret) { 961 case 1: 962 /* We got the lock. */ 963 ret = 0; 964 goto out_unlock_put_key; 965 case -EFAULT: 966 goto uaddr_faulted; 967 case -EBUSY: 968 case -EAGAIN: 969 /* 970 * Two reasons for this: 971 * - EBUSY: Task is exiting and we just wait for the 972 * exit to complete. 973 * - EAGAIN: The user space value changed. 974 */ 975 futex_q_unlock(hb); 976 __release(q.lock_ptr); 977 /* 978 * Handle the case where the owner is in the middle of 979 * exiting. Wait for the exit to complete otherwise 980 * this task might loop forever, aka. live lock. 981 */ 982 wait_for_owner_exiting(ret, exiting); 983 cond_resched(); 984 goto retry; 985 default: 986 goto out_unlock_put_key; 987 } 988 } 989 990 WARN_ON(!q.pi_state); 991 992 /* 993 * Only actually queue now that the atomic ops are done: 994 */ 995 __futex_queue(&q, hb, current); 996 997 if (trylock) { 998 ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); 999 /* Fixup the trylock return value: */ 1000 ret = ret ? 
0 : -EWOULDBLOCK; 1001 goto no_block; 1002 } 1003 1004 /* 1005 * Caution; releasing @hb in-scope. The hb->lock is still locked 1006 * while the reference is dropped. The reference can not be dropped 1007 * after the unlock because if a user initiated resize is in progress 1008 * then we might need to wake him. This can not be done after the 1009 * rt_mutex_pre_schedule() invocation. The hb will remain valid because 1010 * the thread, performing resize, will block on hb->lock during 1011 * the requeue. 1012 */ 1013 futex_private_hash_put(no_free_ptr(hbr.fph)); 1014 /* 1015 * Must be done before we enqueue the waiter, here is unfortunately 1016 * under the hb lock, but that *should* work because it does nothing. 1017 */ 1018 rt_mutex_pre_schedule(); 1019 1020 rt_mutex_init_waiter(&rt_waiter); 1021 1022 /* 1023 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not 1024 * hold it while doing rt_mutex_start_proxy(), because then it will 1025 * include hb->lock in the blocking chain, even through we'll not in 1026 * fact hold it while blocking. This will lead it to report -EDEADLK 1027 * and BUG when futex_unlock_pi() interleaves with this. 1028 * 1029 * Therefore acquire wait_lock while holding hb->lock, but drop the 1030 * latter before calling __rt_mutex_start_proxy_lock(). This 1031 * interleaves with futex_unlock_pi() -- which does a similar lock 1032 * handoff -- such that the latter can observe the futex_q::pi_state 1033 * before __rt_mutex_start_proxy_lock() is done. 1034 */ 1035 raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); 1036 spin_unlock(q.lock_ptr); 1037 /* 1038 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter 1039 * such that futex_unlock_pi() is guaranteed to observe the waiter when 1040 * it sees the futex_q::pi_state. 
1041 */ 1042 ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); 1043 raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q); 1044 1045 if (ret) { 1046 if (ret == 1) 1047 ret = 0; 1048 goto cleanup; 1049 } 1050 1051 if (unlikely(to)) 1052 hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); 1053 1054 ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); 1055 1056 cleanup: 1057 /* 1058 * If we failed to acquire the lock (deadlock/signal/timeout), we must 1059 * unwind the above, however we canont lock hb->lock because 1060 * rt_mutex already has a waiter enqueued and hb->lock can itself try 1061 * and enqueue an rt_waiter through rtlock. 1062 * 1063 * Doing the cleanup without holding hb->lock can cause inconsistent 1064 * state between hb and pi_state, but only in the direction of not 1065 * seeing a waiter that is leaving. 1066 * 1067 * See futex_unlock_pi(), it deals with this inconsistency. 1068 * 1069 * There be dragons here, since we must deal with the inconsistency on 1070 * the way out (here), it is impossible to detect/warn about the race 1071 * the other way around (missing an incoming waiter). 1072 * 1073 * What could possibly go wrong... 1074 */ 1075 if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) 1076 ret = 0; 1077 1078 /* 1079 * Now that the rt_waiter has been dequeued, it is safe to use 1080 * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up 1081 * the 1082 */ 1083 futex_q_lockptr_lock(&q); 1084 /* 1085 * Waiter is unqueued. 1086 */ 1087 rt_mutex_post_schedule(); 1088 no_block: 1089 /* 1090 * Fixup the pi_state owner and possibly acquire the lock if we 1091 * haven't already. 1092 */ 1093 res = fixup_pi_owner(uaddr, &q, !ret); 1094 /* 1095 * If fixup_pi_owner() returned an error, propagate that. If it acquired 1096 * the lock, clear our -ETIMEDOUT or -EINTR. 1097 */ 1098 if (res) 1099 ret = (res < 0) ? 
res : 0; 1100 1101 __release(&hb->lock); 1102 futex_unqueue_pi(&q); 1103 spin_unlock(q.lock_ptr); 1104 1105 /* Additional reference from futex_unlock_pi() */ 1106 futex_private_hash_put(q.drop_fph); 1107 goto out; 1108 1109 out_unlock_put_key: 1110 futex_q_unlock(hb); 1111 __release(q.lock_ptr); 1112 goto out; 1113 1114 uaddr_faulted: 1115 futex_q_unlock(hb); 1116 __release(q.lock_ptr); 1117 1118 ret = fault_in_user_writeable(uaddr); 1119 if (ret) 1120 goto out; 1121 1122 if (!(flags & FLAGS_SHARED)) 1123 goto retry_private; 1124 1125 goto retry; 1126 } 1127 1128 out: 1129 if (to) { 1130 hrtimer_cancel(&to->timer); 1131 destroy_hrtimer_on_stack(&to->timer); 1132 } 1133 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1134 } 1135 1136 /* 1137 * Userspace attempted a TID -> 0 atomic transition, and failed. 1138 * This is the in-kernel slowpath: we look up the PI state (if any), 1139 * and do the rt-mutex unlock. 1140 */ 1141 static int __futex_unlock_pi(u32 __user *uaddr, unsigned int flags) 1142 { 1143 u32 curval, uval, vpid = task_pid_vnr(current); 1144 union futex_key key = FUTEX_KEY_INIT; 1145 struct futex_q *top_waiter; 1146 int ret; 1147 1148 if (!IS_ENABLED(CONFIG_FUTEX_PI)) 1149 return -ENOSYS; 1150 retry: 1151 if (get_user(uval, uaddr)) 1152 return -EFAULT; 1153 /* 1154 * We release only a lock we actually own: 1155 */ 1156 if ((uval & FUTEX_TID_MASK) != vpid) 1157 return -EPERM; 1158 1159 ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE); 1160 if (ret) 1161 return ret; 1162 1163 CLASS(hbr, hbr)(&key); 1164 auto hb = hbr.hb; 1165 spin_lock(&hb->lock); 1166 retry_hb: 1167 1168 /* 1169 * Check waiters first. We do not trust user space values at 1170 * all and we at least want to know if user space fiddled 1171 * with the futex value instead of blindly unlocking. 
1172 */ 1173 top_waiter = futex_top_waiter(hb, &key); 1174 if (top_waiter) { 1175 struct futex_pi_state *pi_state = top_waiter->pi_state; 1176 struct rt_mutex_waiter *rt_waiter; 1177 1178 ret = -EINVAL; 1179 if (!pi_state) 1180 goto out_unlock; 1181 1182 /* 1183 * If current does not own the pi_state then the futex is 1184 * inconsistent and user space fiddled with the futex value. 1185 */ 1186 if (pi_state->owner != current) 1187 goto out_unlock; 1188 1189 /* 1190 * By taking wait_lock while still holding hb->lock, we ensure 1191 * there is no point where we hold neither; and thereby 1192 * wake_futex_pi() must observe any new waiters. 1193 * 1194 * Since the cleanup: case in futex_lock_pi() removes the 1195 * rt_waiter without holding hb->lock, it is possible for 1196 * wake_futex_pi() to not find a waiter while the above does, 1197 * in this case the waiter is on the way out and it can be 1198 * ignored. 1199 * 1200 * In particular; this forces __rt_mutex_start_proxy() to 1201 * complete such that we're guaranteed to observe the 1202 * rt_waiter. 1203 */ 1204 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 1205 1206 /* 1207 * Futex vs rt_mutex waiter state -- if there are no rt_mutex 1208 * waiters even though futex thinks there are, then the waiter 1209 * is leaving. The entry needs to be removed from the list so a 1210 * new futex_lock_pi() is not using this stale PI-state while 1211 * the futex is available in user space again. 1212 * There can be more than one task on its way out so it needs 1213 * to retry. 1214 */ 1215 rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); 1216 if (!rt_waiter) { 1217 /* 1218 * Acquire a reference for the leaving waiter to ensure 1219 * valid futex_q::lock_ptr. 
1220 */ 1221 if (futex_key_is_private(&key)) 1222 top_waiter->drop_fph = futex_private_hash(key.private.mm); 1223 1224 __futex_unqueue(top_waiter); 1225 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1226 goto retry_hb; 1227 } 1228 1229 get_pi_state(pi_state); 1230 spin_unlock(&hb->lock); 1231 1232 /* drops pi_state->pi_mutex.wait_lock */ 1233 ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter); 1234 1235 put_pi_state(pi_state); 1236 1237 /* 1238 * Success, we're done! No tricky corner cases. 1239 */ 1240 if (!ret) 1241 return ret; 1242 /* 1243 * The atomic access to the futex value generated a 1244 * pagefault, so retry the user-access and the wakeup: 1245 */ 1246 if (ret == -EFAULT) 1247 goto pi_faulted; 1248 /* 1249 * A unconditional UNLOCK_PI op raced against a waiter 1250 * setting the FUTEX_WAITERS bit. Try again. 1251 */ 1252 if (ret == -EAGAIN) 1253 goto pi_retry; 1254 /* 1255 * wake_futex_pi has detected invalid state. Tell user 1256 * space. 1257 */ 1258 return ret; 1259 } 1260 1261 /* 1262 * We have no kernel internal state, i.e. no waiters in the 1263 * kernel. Waiters which are about to queue themselves are stuck 1264 * on hb->lock. So we can safely ignore them. We do neither 1265 * preserve the WAITERS bit not the OWNER_DIED one. We are the 1266 * owner. 1267 */ 1268 if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) { 1269 spin_unlock(&hb->lock); 1270 switch (ret) { 1271 case -EFAULT: 1272 goto pi_faulted; 1273 1274 case -EAGAIN: 1275 goto pi_retry; 1276 1277 default: 1278 WARN_ON_ONCE(1); 1279 return ret; 1280 } 1281 } 1282 1283 /* 1284 * If uval has changed, let user space handle it. 1285 */ 1286 ret = (curval == uval) ? 
0 : -EAGAIN; 1287 1288 out_unlock: 1289 spin_unlock(&hb->lock); 1290 return ret; 1291 1292 pi_retry: 1293 cond_resched(); 1294 goto retry; 1295 1296 pi_faulted: 1297 1298 ret = fault_in_user_writeable(uaddr); 1299 if (!ret) 1300 goto retry; 1301 1302 return ret; 1303 } 1304 1305 int futex_unlock_pi(u32 __user *uaddr, unsigned int flags, void __user *pop) 1306 { 1307 int ret = __futex_unlock_pi(uaddr, flags); 1308 1309 if (ret || !(flags & FLAGS_ROBUST_UNLOCK)) 1310 return ret; 1311 1312 if (!futex_robust_list_clear_pending(pop, flags)) 1313 return -EFAULT; 1314 1315 return 0; 1316 } 1317