// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Fast Userspace Mutexes (which I call "Futexes!").
 *  (C) Rusty Russell, IBM 2002
 *
 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
 *
 *  Removed page pinning, fix privately mapped COW pages and other cleanups
 *  (C) Copyright 2003, 2004 Jamie Lokier
 *
 *  Robust futex support started by Ingo Molnar
 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
 *
 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
 *
 *  PRIVATE futexes by Eric Dumazet
 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
 *
 *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
 *  Copyright (C) IBM Corporation, 2009
 *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
 *
 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
 *  enough at me, Linus for the original (flawed) idea, Matthew
 *  Kirkwood for proof-of-concept implementation.
 *
 *  "The futexes are also cursed."
 *  "But they come in a choice of three flavours!"
 */
#include <linux/compat.h>
#include <linux/jhash.h>
#include <linux/pagemap.h>
#include <linux/plist.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/slab.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * The base of the bucket array and its size are always used together
 * (after initialization only in futex_hash()), so ensure that they
 * reside in the same cacheline.
 */
static struct {
	struct futex_hash_bucket *queues;
	unsigned long            hashsize;
} __futex_data __read_mostly __aligned(2*sizeof(long));
#define futex_queues   (__futex_data.queues)
#define futex_hashsize (__futex_data.hashsize)


/*
 * Fault injections for futexes.
 */
#ifdef CONFIG_FAIL_FUTEX

static struct {
	struct fault_attr attr;

	bool ignore_private;
} fail_futex = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_private = false,
};

static int __init setup_fail_futex(char *str)
{
	return setup_fault_attr(&fail_futex.attr, str);
}
__setup("fail_futex=", setup_fail_futex);

bool should_fail_futex(bool fshared)
{
	if (fail_futex.ignore_private && !fshared)
		return false;

	return should_fail(&fail_futex.attr, 1);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

static int __init fail_futex_debugfs(void)
{
	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_futex", NULL,
					&fail_futex.attr);
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	debugfs_create_bool("ignore-private", mode, dir,
			    &fail_futex.ignore_private);
	return 0;
}

late_initcall(fail_futex_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

#endif /* CONFIG_FAIL_FUTEX */
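
/*
 * Usage note (illustrative only, not part of the build): with
 * CONFIG_FAIL_FUTEX enabled, the fault attributes above are driven through
 * the common fault-injection interfaces. Assuming the standard debugfs
 * layout created by fault_create_debugfs_attr(), something along these
 * lines restricts injected failures to shared futex operations:
 *
 *	# boot parameter: fail_futex=<interval>,<probability>,<space>,<times>
 *	echo 10 > /sys/kernel/debug/fail_futex/probability
 *	echo -1 > /sys/kernel/debug/fail_futex/times
 *	echo 1  > /sys/kernel/debug/fail_futex/ignore-private
 *
 * See Documentation/fault-injection/fault-injection.rst for the
 * authoritative list of knobs.
 */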

/**
 * futex_hash - Return the hash bucket in the global hash
 * @key:	Pointer to the futex key for which the hash is calculated
 *
 * We hash on the keys returned from get_futex_key (see below) and return the
 * corresponding hash bucket in the global hash.
 */
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
			  key->both.offset);

	return &futex_queues[hash & (futex_hashsize - 1)];
}


/**
 * futex_setup_timer - set up the sleeping hrtimer.
 * @time:	ptr to the given timeout value
 * @timeout:	the hrtimer_sleeper structure to be set up
 * @flags:	futex flags
 * @range_ns:	optional range in ns
 *
 * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
 *	   value given
 */
struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
		  int flags, u64 range_ns)
{
	if (!time)
		return NULL;

	hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
				      CLOCK_REALTIME : CLOCK_MONOTONIC,
				      HRTIMER_MODE_ABS);
	/*
	 * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
	 * effectively the same as calling hrtimer_set_expires().
	 */
	hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);

	return timeout;
}
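
/*
 * Illustrative sketch (describes callers elsewhere, e.g. the futex_wait()
 * path, and is not code added here): a waiter typically uses the helper
 * above roughly like this -- start the sleeper once queued, and tear it
 * down again on the way out:
 *
 *	struct hrtimer_sleeper timeout, *to;
 *
 *	to = futex_setup_timer(abs_time, &timeout, flags,
 *			       current->timer_slack_ns);
 *	if (to)
 *		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
 *	...
 *	// to->task is NULL once the timer fired, i.e. the wait timed out
 *	if (to) {
 *		hrtimer_cancel(&to->timer);
 *		destroy_hrtimer_on_stack(&to->timer);
 *	}
 */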

/*
 * Generate a machine wide unique identifier for this inode.
 *
 * This relies on u64 not wrapping in the life-time of the machine; which with
 * 1ns resolution means almost 585 years.
 *
 * This further relies on the fact that a well formed program will not unmap
 * the file while it has a (shared) futex waiting on it. This mapping will have
 * a file reference which pins the mount and inode.
 *
 * If for some reason an inode gets evicted and read back in again, it will get
 * a new sequence number and will _NOT_ match, even though it is the exact same
 * file.
 *
 * It is important that futex_match() will never have a false-positive, esp.
 * for PI futexes that can mess up the state. The above argues that
 * false-negatives are only possible for malformed programs.
 */
static u64 get_inode_sequence_number(struct inode *inode)
{
	static atomic64_t i_seq;
	u64 old;

	/* Does the inode already have a sequence number? */
	old = atomic64_read(&inode->i_sequence);
	if (likely(old))
		return old;

	for (;;) {
		u64 new = atomic64_add_return(1, &i_seq);
		if (WARN_ON_ONCE(!new))
			continue;

		old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
		if (old)
			return old;
		return new;
	}
}

/**
 * get_futex_key() - Get parameters which are the keys for a futex
 * @uaddr:	virtual address of the futex
 * @flags:	FLAGS_*
 * @key:	address where result is stored.
 * @rw:		mapping needs to be read/write (values: FUTEX_READ,
 *		FUTEX_WRITE)
 *
 * Return: a negative error code or 0
 *
 * The key words are stored in @key on success.
 *
 * For shared mappings (when @flags has FLAGS_SHARED set), the key is:
 *
 *   ( inode->i_sequence, page->index, offset_within_page )
 *
 * [ also see get_inode_sequence_number() ]
 *
 * For private mappings (or when FLAGS_SHARED is not set), the key is:
 *
 *   ( current->mm, address, 0 )
 *
 * This allows (cross process, where applicable) identification of the futex
 * without keeping the page pinned for the duration of the FUTEX_WAIT.
 *
 * lock_page() might sleep, the caller should not hold a spinlock.
 */
int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
		  enum futex_access rw)
{
	unsigned long address = (unsigned long)uaddr;
	struct mm_struct *mm = current->mm;
	struct page *page;
	struct folio *folio;
	struct address_space *mapping;
	int err, ro = 0;
	bool fshared;

	fshared = flags & FLAGS_SHARED;

	/*
	 * The futex address must be "naturally" aligned.
	 */
	key->both.offset = address % PAGE_SIZE;
	if (unlikely((address % sizeof(u32)) != 0))
		return -EINVAL;
	address -= key->both.offset;

	if (unlikely(!access_ok(uaddr, sizeof(u32))))
		return -EFAULT;

	if (unlikely(should_fail_futex(fshared)))
		return -EFAULT;

	/*
	 * PROCESS_PRIVATE futexes are fast.
	 * As the mm cannot disappear under us and the 'key' only needs
	 * virtual address, we don't even have to find the underlying vma.
	 * Note : We do have to check 'uaddr' is a valid user address,
	 *        but access_ok() should be faster than find_vma()
	 */
	if (!fshared) {
		/*
		 * On no-MMU, shared futexes are treated as private, therefore
		 * we must not include the current process in the key. Since
		 * there is only one address space, the address is a unique key
		 * on its own.
		 */
		if (IS_ENABLED(CONFIG_MMU))
			key->private.mm = mm;
		else
			key->private.mm = NULL;

		key->private.address = address;
		return 0;
	}

again:
	/* Ignore any VERIFY_READ mapping (futex common case) */
	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
	/*
	 * If write access is not required (eg. FUTEX_WAIT), try
	 * and get read-only access.
	 */
	if (err == -EFAULT && rw == FUTEX_READ) {
		err = get_user_pages_fast(address, 1, 0, &page);
		ro = 1;
	}
	if (err < 0)
		return err;
	else
		err = 0;

	/*
	 * The treatment of mapping from this point on is critical. The folio
	 * lock protects many things but in this context the folio lock
	 * stabilizes mapping, prevents inode freeing in the shared
	 * file-backed region case and guards against movement to swap cache.
	 *
	 * Strictly speaking the folio lock is not needed in all cases being
	 * considered here and the folio lock forces unnecessary serialization.
	 * From this point on, mapping will be re-verified if necessary and
	 * the folio lock will be acquired only if it is unavoidable.
	 *
	 * Mapping checks require the folio so it is looked up now. For
	 * anonymous pages, it does not matter if the folio is split
	 * in the future as the key is based on the address. For
	 * filesystem-backed pages, the precise page is required as the
	 * index of the page determines the key.
	 */
	folio = page_folio(page);
	mapping = READ_ONCE(folio->mapping);

	/*
	 * If folio->mapping is NULL, then it cannot be an anonymous
	 * page; but it might be the ZERO_PAGE or in the gate area or
	 * in a special mapping (all cases which we are happy to fail);
	 * or it may have been a good file page when get_user_pages_fast
	 * found it, but truncated or holepunched or subjected to
	 * invalidate_complete_page2 before we got the folio lock (also
	 * cases which we are happy to fail). And we hold a reference,
	 * so refcount care in invalidate_inode_page's remove_mapping
	 * prevents drop_caches from setting mapping to NULL beneath us.
	 *
	 * The case we do have to guard against is when memory pressure made
	 * shmem_writepage move it from filecache to swapcache beneath us:
	 * an unlikely race, but we do need to retry for folio->mapping.
	 */
	if (unlikely(!mapping)) {
		int shmem_swizzled;

		/*
		 * Folio lock is required to identify which special case above
		 * applies. If this is really a shmem page then the folio lock
		 * will prevent unexpected transitions.
		 */
		folio_lock(folio);
		shmem_swizzled = folio_test_swapcache(folio) || folio->mapping;
		folio_unlock(folio);
		folio_put(folio);

		if (shmem_swizzled)
			goto again;

		return -EFAULT;
	}

	/*
	 * Private mappings are handled in a simple way.
	 *
	 * If the futex key is stored in anonymous memory, then the associated
	 * object is the mm which is implicitly pinned by the calling process.
	 *
	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
	 * it's a read-only handle, it's expected that futexes attach to
	 * the object not the particular process.
	 */
	if (folio_test_anon(folio)) {
		/*
		 * A RO anonymous page will never change and thus doesn't make
		 * sense for futex operations.
		 */
		if (unlikely(should_fail_futex(true)) || ro) {
			err = -EFAULT;
			goto out;
		}

		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
		key->private.mm = mm;
		key->private.address = address;

	} else {
		struct inode *inode;

		/*
		 * The associated futex object in this case is the inode and
		 * the folio->mapping must be traversed. Ordinarily this should
		 * be stabilised under folio lock but it's not strictly
		 * necessary in this case as we just want to pin the inode, not
		 * update i_pages or anything like that.
		 *
		 * The RCU read lock is taken as the inode is finally freed
		 * under RCU. If the mapping still matches expectations then the
		 * mapping->host can be safely accessed as being a valid inode.
		 */
		rcu_read_lock();

		if (READ_ONCE(folio->mapping) != mapping) {
			rcu_read_unlock();
			folio_put(folio);

			goto again;
		}

		inode = READ_ONCE(mapping->host);
		if (!inode) {
			rcu_read_unlock();
			folio_put(folio);

			goto again;
		}

		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
		key->shared.i_seq = get_inode_sequence_number(inode);
		key->shared.pgoff = folio->index + folio_page_idx(folio, page);
		rcu_read_unlock();
	}

out:
	folio_put(folio);
	return err;
}
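
/*
 * Illustrative userspace sketch (assumes the futex(2) ABI from
 * <linux/futex.h>; not part of this file's build): the inode-based key
 * above is what lets two processes that map the same object wait for and
 * wake each other. A shared (non-FUTEX_PRIVATE_FLAG) futex in a MAP_SHARED
 * mapping works roughly like this:
 *
 *	#include <linux/futex.h>
 *	#include <stdint.h>
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	uint32_t *f = mmap(NULL, sizeof(*f), PROT_READ | PROT_WRITE,
 *			   MAP_SHARED | MAP_ANONYMOUS, -1, 0);
 *
 *	// child (after fork()): sleep while *f is still 0
 *	syscall(SYS_futex, f, FUTEX_WAIT, 0, NULL, NULL, 0);
 *
 *	// parent: flip the value, then wake one waiter
 *	__atomic_store_n(f, 1, __ATOMIC_SEQ_CST);
 *	syscall(SYS_futex, f, FUTEX_WAKE, 1, NULL, NULL, 0);
 *
 * With FUTEX_PRIVATE_FLAG the kernel would instead use the (mm, address)
 * key, which only ever matches within a single process.
 */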

/**
 * fault_in_user_writeable() - Fault in user address and verify RW access
 * @uaddr:	pointer to faulting user space address
 *
 * Slow path to fixup the fault we just took in the atomic write
 * access to @uaddr.
 *
 * We have no generic implementation of a non-destructive write to the
 * user address. We know that we faulted in the atomic pagefault
 * disabled section so we can as well avoid the #PF overhead by
 * calling get_user_pages() right away.
 */
int fault_in_user_writeable(u32 __user *uaddr)
{
	struct mm_struct *mm = current->mm;
	int ret;

	mmap_read_lock(mm);
	ret = fixup_user_fault(mm, (unsigned long)uaddr,
			       FAULT_FLAG_WRITE, NULL);
	mmap_read_unlock(mm);

	return ret < 0 ? ret : 0;
}

/**
 * futex_top_waiter() - Return the highest priority waiter on a futex
 * @hb:		the hash bucket the futex_q's reside in
 * @key:	the futex key (to distinguish it from other futex futex_q's)
 *
 * Must be called with the hb lock held.
 */
struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
{
	struct futex_q *this;

	plist_for_each_entry(this, &hb->chain, list) {
		if (futex_match(&this->key, key))
			return this;
	}
	return NULL;
}

int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
{
	int ret;

	pagefault_disable();
	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
	pagefault_enable();

	return ret;
}

int futex_get_value_locked(u32 *dest, u32 __user *from)
{
	int ret;

	pagefault_disable();
	ret = __get_user(*dest, from);
	pagefault_enable();

	return ret ? -EFAULT : 0;
}

/**
 * wait_for_owner_exiting - Block until the owner has exited
 * @ret:	owner's current futex lock status
 * @exiting:	Pointer to the exiting task
 *
 * Caller must hold a refcount on @exiting.
 */
void wait_for_owner_exiting(int ret, struct task_struct *exiting)
{
	if (ret != -EBUSY) {
		WARN_ON_ONCE(exiting);
		return;
	}

	if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
		return;

	mutex_lock(&exiting->futex_exit_mutex);
	/*
	 * No point in doing state checking here. If the waiter got here
	 * while the task was in exec()->exec_futex_release() then it can
	 * have any FUTEX_STATE_* value when the waiter has acquired the
	 * mutex. OK, if running, EXITING or DEAD if it reached exit()
	 * already. Highly unlikely and not a problem. Just one more round
	 * through the futex maze.
	 */
	mutex_unlock(&exiting->futex_exit_mutex);

	put_task_struct(exiting);
}

/**
 * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be NULL and must be held by the caller.
 */
void __futex_unqueue(struct futex_q *q)
{
	struct futex_hash_bucket *hb;

	if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
		return;
	lockdep_assert_held(q->lock_ptr);

	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
	plist_del(&q->list, &hb->chain);
	futex_hb_waiters_dec(hb);
}

/* The key must be already stored in q->key. */
struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
	__acquires(&hb->lock)
{
	struct futex_hash_bucket *hb;

	hb = futex_hash(&q->key);

	/*
	 * Increment the counter before taking the lock so that
	 * a potential waker won't miss a to-be-slept task that is
	 * waiting for the spinlock. This is safe as all futex_q_lock()
	 * users end up calling futex_queue(). Similarly, for housekeeping,
	 * decrement the counter at futex_q_unlock() when some error has
	 * occurred and we don't end up adding the task to the list.
	 */
	futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */

	q->lock_ptr = &hb->lock;

	spin_lock(&hb->lock);
	return hb;
}

void futex_q_unlock(struct futex_hash_bucket *hb)
	__releases(&hb->lock)
{
	spin_unlock(&hb->lock);
	futex_hb_waiters_dec(hb);
}
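
/*
 * Illustrative sketch (an assumption about callers elsewhere, e.g. the
 * futex_wait_setup() path; not code added to this file): the counter
 * ordering documented in futex_q_lock() is what makes the canonical wait
 * pattern race-free against a concurrent futex_wake():
 *
 *	hb = futex_q_lock(q);			// waiters++ before the lock
 *	ret = futex_get_value_locked(&uval, uaddr);
 *	if (ret || uval != expected_val) {
 *		futex_q_unlock(hb);		// drops the lock, waiters--
 *		return ret ? ret : -EWOULDBLOCK;
 *	}
 *	futex_queue(q, hb);			// publish the waiter, then sleep
 *
 * A waker that changes *uaddr after the value check is guaranteed to either
 * observe the incremented waiter count or find the futex_q already queued.
 */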

void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
{
	int prio;

	/*
	 * The priority used to register this element is
	 * - either the real thread-priority for the real-time threads
	 * (i.e. threads with a priority lower than MAX_RT_PRIO)
	 * - or MAX_RT_PRIO for non-RT threads.
	 * Thus, all RT-threads are woken first in priority order, and
	 * the others are woken last, in FIFO order.
	 */
	prio = min(current->normal_prio, MAX_RT_PRIO);

	plist_node_init(&q->list, prio);
	plist_add(&q->list, &hb->chain);
	q->task = current;
}
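
/*
 * Worked example (priority math assumed from the scheduler core): a
 * SCHED_FIFO thread at userspace priority 30 has normal_prio
 * MAX_RT_PRIO-1 - 30 = 69 and is queued at plist priority 69, while a
 * SCHED_OTHER thread (normal_prio 120 at nice 0) is clamped to
 * MAX_RT_PRIO (100). Plists order by ascending priority value, so RT
 * waiters are woken before all non-RT waiters, which remain FIFO among
 * themselves at the shared priority 100.
 */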

/**
 * futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must
 * be paired with exactly one earlier call to futex_queue().
 *
 * Return:
 *  - 1 - if the futex_q was still queued (and we removed it);
 *  - 0 - if the futex_q was already removed by the waking thread
 */
int futex_unqueue(struct futex_q *q)
{
	spinlock_t *lock_ptr;
	int ret = 0;

	/* In the common case we don't take the spinlock, which is nice. */
retry:
	/*
	 * q->lock_ptr can change between this read and the following spin_lock.
	 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
	 * optimizing lock_ptr out of the logic below.
	 */
	lock_ptr = READ_ONCE(q->lock_ptr);
	if (lock_ptr != NULL) {
		spin_lock(lock_ptr);
		/*
		 * q->lock_ptr can change between reading it and
		 * spin_lock(), causing us to take the wrong lock. This
		 * corrects the race condition.
		 *
		 * Reasoning goes like this: if we have the wrong lock,
		 * q->lock_ptr must have changed (maybe several times)
		 * between reading it and the spin_lock(). It can
		 * change again after the spin_lock() but only if it was
		 * already changed before the spin_lock(). It cannot,
		 * however, change back to the original value. Therefore
		 * we can detect whether we acquired the correct lock.
		 */
		if (unlikely(lock_ptr != q->lock_ptr)) {
			spin_unlock(lock_ptr);
			goto retry;
		}
		__futex_unqueue(q);

		BUG_ON(q->pi_state);

		spin_unlock(lock_ptr);
		ret = 1;
	}

	return ret;
}

/*
 * PI futexes can not be requeued and must remove themselves from the hash
 * bucket. The hash bucket lock (i.e. lock_ptr) is held.
 */
void futex_unqueue_pi(struct futex_q *q)
{
	/*
	 * If the lock was not acquired (due to timeout or signal) then the
	 * rt_waiter is removed before futex_q is. If this is observed by
	 * an unlocker after dropping the rtmutex wait lock and before
	 * acquiring the hash bucket lock, then the unlocker dequeues the
	 * futex_q from the hash bucket list to guarantee consistent state
	 * vs. userspace. Therefore the dequeue here must be conditional.
	 */
	if (!plist_node_empty(&q->list))
		__futex_unqueue(q);

	BUG_ON(!q->pi_state);
	put_pi_state(q->pi_state);
	q->pi_state = NULL;
}

/* Constants for the pending_op argument of handle_futex_death */
#define HANDLE_DEATH_PENDING	true
#define HANDLE_DEATH_LIST	false

/*
 * Process a futex-list entry, check whether it's owned by the
 * dying task, and do notification if so:
 */
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
			      bool pi, bool pending_op)
{
	u32 uval, nval, mval;
	pid_t owner;
	int err;

	/* Futex address must be 32bit aligned */
	if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
		return -1;

retry:
	if (get_user(uval, uaddr))
		return -1;

	/*
	 * Special case for regular (non PI) futexes. The unlock path in
	 * user space has two race scenarios:
	 *
	 * 1. The unlock path releases the user space futex value and
	 *    before it can execute the futex() syscall to wake up
	 *    waiters it is killed.
	 *
	 * 2. A woken up waiter is killed before it can acquire the
	 *    futex in user space.
	 *
	 * In the second case, the wake up notification could be generated
	 * by the unlock path in user space after setting the futex value
	 * to zero or by the kernel after setting the OWNER_DIED bit below.
	 *
	 * In both cases the TID validation below prevents a wakeup of
	 * potential waiters which can cause these waiters to block
	 * forever.
	 *
	 * In both cases the following conditions are met:
	 *
	 *	1) task->robust_list->list_op_pending != NULL
	 *	   @pending_op == true
	 *	2) The owner part of user space futex value == 0
	 *	3) Regular futex: @pi == false
	 *
	 * If these conditions are met, it is safe to attempt waking up a
	 * potential waiter without touching the user space futex value and
	 * trying to set the OWNER_DIED bit. If the futex value is zero,
	 * the rest of the user space mutex state is consistent, so a woken
	 * waiter will just take over the uncontended futex. Setting the
	 * OWNER_DIED bit would create inconsistent state and malfunction
	 * of the user space owner died handling. Otherwise, the OWNER_DIED
	 * bit is already set, and the woken waiter is expected to deal with
	 * this.
	 */
	owner = uval & FUTEX_TID_MASK;

	if (pending_op && !pi && !owner) {
		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
			   FUTEX_BITSET_MATCH_ANY);
		return 0;
	}

	if (owner != task_pid_vnr(curr))
		return 0;

	/*
	 * Ok, this dying thread is truly holding a futex
	 * of interest. Set the OWNER_DIED bit atomically
	 * via cmpxchg, and if the value had FUTEX_WAITERS
	 * set, wake up a waiter (if any). (We have to do a
	 * futex_wake() even if OWNER_DIED is already set -
	 * to handle the rare but possible case of recursive
	 * thread-death.) The rest of the cleanup is done in
	 * userspace.
	 */
	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;

	/*
	 * We are not holding a lock here, but we want to have
	 * the pagefault_disable/enable() protection because
	 * we want to handle the fault gracefully. If the
	 * access fails we try to fault in the futex with R/W
	 * verification via get_user_pages. get_user() above
	 * does not guarantee R/W access. If that fails we
	 * give up and leave the futex locked.
	 */
	if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
		switch (err) {
		case -EFAULT:
			if (fault_in_user_writeable(uaddr))
				return -1;
			goto retry;

		case -EAGAIN:
			cond_resched();
			goto retry;

		default:
			WARN_ON_ONCE(1);
			return err;
		}
	}

	if (nval != uval)
		goto retry;

	/*
	 * Wake robust non-PI futexes here. The wakeup of
	 * PI futexes happens in exit_pi_state_list():
	 */
	if (!pi && (uval & FUTEX_WAITERS)) {
		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
			   FUTEX_BITSET_MATCH_ANY);
	}

	return 0;
}
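
/*
 * Illustrative sketch (userspace view, using the uapi bit layout from
 * <linux/futex.h>; not code added to this file): a robust-futex word packs
 * the owner TID plus two flag bits, which is what the cmpxchg in
 * handle_futex_death() manipulates:
 *
 *	FUTEX_WAITERS		0x80000000  - somebody is blocked on the futex
 *	FUTEX_OWNER_DIED	0x40000000  - previous owner exited without unlocking
 *	FUTEX_TID_MASK		0x3fffffff  - TID of the current owner
 *
 * A waiter woken with FUTEX_OWNER_DIED set is expected to take the lock
 * over, e.g. by atomically installing its own TID while keeping the flag
 * bits, and then running recovery the way pthread robust mutexes report it
 * to the application via EOWNERDEAD.
 */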

/*
 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 */
static inline int fetch_robust_entry(struct robust_list __user **entry,
				     struct robust_list __user * __user *head,
				     unsigned int *pi)
{
	unsigned long uentry;

	if (get_user(uentry, (unsigned long __user *)head))
		return -EFAULT;

	*entry = (void __user *)(uentry & ~1UL);
	*pi = uentry & 1;

	return 0;
}

/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
static void exit_robust_list(struct task_struct *curr)
{
	struct robust_list_head __user *head = curr->robust_list;
	struct robust_list __user *entry, *next_entry, *pending;
	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
	unsigned int next_pi;
	unsigned long futex_offset;
	int rc;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (fetch_robust_entry(&entry, &head->list.next, &pi))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
		return;

	next_entry = NULL;	/* avoid warning with gcc */
	while (entry != &head->list) {
		/*
		 * Fetch the next entry in the list before calling
		 * handle_futex_death:
		 */
		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice:
		 */
		if (entry != pending) {
			if (handle_futex_death((void __user *)entry + futex_offset,
					       curr, pi, HANDLE_DEATH_LIST))
				return;
		}
		if (rc)
			return;
		entry = next_entry;
		pi = next_pi;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		cond_resched();
	}

	if (pending) {
		handle_futex_death((void __user *)pending + futex_offset,
				   curr, pip, HANDLE_DEATH_PENDING);
	}
}
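
/*
 * Illustrative userspace sketch (assumes the uapi structures from
 * <linux/futex.h> and the set_robust_list(2) syscall; not part of this
 * file): the list walked above is built entirely by userspace, typically
 * the C library. Per thread, roughly:
 *
 *	struct my_mutex {			// hypothetical userspace lock
 *		uint32_t		futex_word;
 *		struct robust_list	robust_node;
 *	};
 *
 *	static __thread struct robust_list_head head;
 *
 *	head.list.next = &head.list;		// empty list points to itself
 *	head.futex_offset = offsetof(struct my_mutex, futex_word) -
 *			    offsetof(struct my_mutex, robust_node);
 *	head.list_op_pending = NULL;
 *	syscall(SYS_set_robust_list, &head, sizeof(head));
 *
 * Before acquiring a robust lock, userspace stores the entry into
 * head.list_op_pending, links the entry into head.list once the lock is
 * held, and clears list_op_pending again; bit 0 of each ->next pointer
 * marks a PI futex. futex_offset tells the kernel how to get from a list
 * entry to the futex word it protects.
 */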

#ifdef CONFIG_COMPAT
static void __user *futex_uaddr(struct robust_list __user *entry,
				compat_long_t futex_offset)
{
	compat_uptr_t base = ptr_to_compat(entry);
	void __user *uaddr = compat_ptr(base + futex_offset);

	return uaddr;
}

/*
 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 */
static inline int
compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
			  compat_uptr_t __user *head, unsigned int *pi)
{
	if (get_user(*uentry, head))
		return -EFAULT;

	*entry = compat_ptr((*uentry) & ~1);
	*pi = (unsigned int)(*uentry) & 1;

	return 0;
}

/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
static void compat_exit_robust_list(struct task_struct *curr)
{
	struct compat_robust_list_head __user *head = curr->compat_robust_list;
	struct robust_list __user *entry, *next_entry, *pending;
	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
	unsigned int next_pi;
	compat_uptr_t uentry, next_uentry, upending;
	compat_long_t futex_offset;
	int rc;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (compat_fetch_robust_entry(&upending, &pending,
				      &head->list_op_pending, &pip))
		return;

	next_entry = NULL;	/* avoid warning with gcc */
	while (entry != (struct robust_list __user *) &head->list) {
		/*
		 * Fetch the next entry in the list before calling
		 * handle_futex_death:
		 */
		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
					       (compat_uptr_t __user *)&entry->next, &next_pi);
		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice:
		 */
		if (entry != pending) {
			void __user *uaddr = futex_uaddr(entry, futex_offset);

			if (handle_futex_death(uaddr, curr, pi,
					       HANDLE_DEATH_LIST))
				return;
		}
		if (rc)
			return;
		uentry = next_uentry;
		entry = next_entry;
		pi = next_pi;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		cond_resched();
	}
	if (pending) {
		void __user *uaddr = futex_uaddr(pending, futex_offset);

		handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
	}
}
#endif

#ifdef CONFIG_FUTEX_PI

/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 */
static void exit_pi_state_list(struct task_struct *curr)
{
	struct list_head *next, *head = &curr->pi_state_list;
	struct futex_pi_state *pi_state;
	struct futex_hash_bucket *hb;
	union futex_key key = FUTEX_KEY_INIT;

	/*
	 * We are a ZOMBIE and nobody can enqueue itself on
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
	raw_spin_lock_irq(&curr->pi_lock);
	while (!list_empty(head)) {
		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		key = pi_state->key;
		hb = futex_hash(&key);

		/*
		 * We can race against put_pi_state() removing itself from the
		 * list (a waiter going away). put_pi_state() will first
		 * decrement the reference count and then modify the list, so
		 * it's possible to see the list entry but fail this reference
		 * acquire.
		 *
		 * In that case, drop the locks to let put_pi_state() make
		 * progress and retry the loop.
		 */
		if (!refcount_inc_not_zero(&pi_state->refcount)) {
			raw_spin_unlock_irq(&curr->pi_lock);
			cpu_relax();
			raw_spin_lock_irq(&curr->pi_lock);
			continue;
		}
		raw_spin_unlock_irq(&curr->pi_lock);

		spin_lock(&hb->lock);
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
		raw_spin_lock(&curr->pi_lock);
		/*
		 * We dropped the pi-lock, so re-check whether this
		 * task still owns the PI-state:
		 */
		if (head->next != next) {
			/* retain curr->pi_lock for the loop invariant */
			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
			spin_unlock(&hb->lock);
			put_pi_state(pi_state);
			continue;
		}

		WARN_ON(pi_state->owner != curr);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		pi_state->owner = NULL;

		raw_spin_unlock(&curr->pi_lock);
		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
		spin_unlock(&hb->lock);

		rt_mutex_futex_unlock(&pi_state->pi_mutex);
		put_pi_state(pi_state);

		raw_spin_lock_irq(&curr->pi_lock);
	}
	raw_spin_unlock_irq(&curr->pi_lock);
}
#else
static inline void exit_pi_state_list(struct task_struct *curr) { }
#endif

static void futex_cleanup(struct task_struct *tsk)
{
	if (unlikely(tsk->robust_list)) {
		exit_robust_list(tsk);
		tsk->robust_list = NULL;
	}

#ifdef CONFIG_COMPAT
	if (unlikely(tsk->compat_robust_list)) {
		compat_exit_robust_list(tsk);
		tsk->compat_robust_list = NULL;
	}
#endif

	if (unlikely(!list_empty(&tsk->pi_state_list)))
		exit_pi_state_list(tsk);
}

/**
 * futex_exit_recursive - Set the task's futex state to FUTEX_STATE_DEAD
 * @tsk:	task to set the state on
 *
 * Set the futex exit state of the task locklessly. The futex waiter code
 * observes that state when a task is exiting and loops until the task has
 * actually finished the futex cleanup. The worst case for this is that the
 * waiter runs through the wait loop until the state becomes visible.
 *
 * This is called from the recursive fault handling path in make_task_dead().
 *
 * This is best effort. Either the futex exit code has run already or
 * not. If the OWNER_DIED bit has been set on the futex then the waiter can
 * take it over. If not, the problem is pushed back to user space. If the
 * futex exit code did not run yet, then an already queued waiter might
 * block forever, but there is nothing which can be done about that.
 */
void futex_exit_recursive(struct task_struct *tsk)
{
	/* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
	if (tsk->futex_state == FUTEX_STATE_EXITING)
		mutex_unlock(&tsk->futex_exit_mutex);
	tsk->futex_state = FUTEX_STATE_DEAD;
}
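
/*
 * Overview of the exit handshake implemented by the helpers below (a
 * summary of this file's own comments, not new mechanism): a PI waiter
 * that observes FUTEX_STATE_EXITING in attach_to_pi_owner() gets -EBUSY
 * and ends up in wait_for_owner_exiting() above, where blocking on
 * tsk->futex_exit_mutex serializes it against the exiting owner:
 *
 *	exiting task				waiter
 *	futex_cleanup_begin()
 *	  mutex_lock(futex_exit_mutex)
 *	  futex_state = EXITING			attach_to_pi_owner() -> -EBUSY
 *	futex_cleanup()				wait_for_owner_exiting()
 *	  exit_robust_list()			  mutex_lock(...) blocks
 *	  exit_pi_state_list()
 *	futex_cleanup_end(DEAD or OK)
 *	  mutex_unlock(futex_exit_mutex)	  ...unblocks, retries
 */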

static void futex_cleanup_begin(struct task_struct *tsk)
{
	/*
	 * Prevent various race issues against a concurrent incoming waiter
	 * including live locks by forcing the waiter to block on
	 * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
	 * attach_to_pi_owner().
	 */
	mutex_lock(&tsk->futex_exit_mutex);

	/*
	 * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
	 *
	 * This ensures that all subsequent checks of tsk->futex_state in
	 * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
	 * tsk->pi_lock held.
	 *
	 * It guarantees also that a pi_state which was queued right before
	 * the state change under tsk->pi_lock by a concurrent waiter must
	 * be observed in exit_pi_state_list().
	 */
	raw_spin_lock_irq(&tsk->pi_lock);
	tsk->futex_state = FUTEX_STATE_EXITING;
	raw_spin_unlock_irq(&tsk->pi_lock);
}

static void futex_cleanup_end(struct task_struct *tsk, int state)
{
	/*
	 * Lockless store. The only side effect is that an observer might
	 * take another loop until it becomes visible.
	 */
	tsk->futex_state = state;
	/*
	 * Drop the exit protection. This unblocks waiters which observed
	 * FUTEX_STATE_EXITING to reevaluate the state.
	 */
	mutex_unlock(&tsk->futex_exit_mutex);
}

void futex_exec_release(struct task_struct *tsk)
{
	/*
	 * The state handling is done for consistency, but in the case of
	 * exec() there is no way to prevent further damage as the PID stays
	 * the same. But for the unlikely and arguably buggy case that a
	 * futex is held on exec(), this provides at least as much state
	 * consistency protection as is possible.
	 */
	futex_cleanup_begin(tsk);
	futex_cleanup(tsk);
	/*
	 * Reset the state to FUTEX_STATE_OK. The task is alive and about to
	 * exec a new binary.
	 */
	futex_cleanup_end(tsk, FUTEX_STATE_OK);
}

void futex_exit_release(struct task_struct *tsk)
{
	futex_cleanup_begin(tsk);
	futex_cleanup(tsk);
	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}

static int __init futex_init(void)
{
	unsigned int futex_shift;
	unsigned long i;

#if CONFIG_BASE_SMALL
	futex_hashsize = 16;
#else
	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
#endif

	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
					       futex_hashsize, 0, 0,
					       &futex_shift, NULL,
					       futex_hashsize, futex_hashsize);
	futex_hashsize = 1UL << futex_shift;

	for (i = 0; i < futex_hashsize; i++) {
		atomic_set(&futex_queues[i].waiters, 0);
		plist_head_init(&futex_queues[i].chain);
		spin_lock_init(&futex_queues[i].lock);
	}

	return 0;
}
core_initcall(futex_init);
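
/*
 * Sizing example (arithmetic only; the final bucket count is whatever
 * alloc_large_system_hash() settles on via futex_shift): with
 * CONFIG_BASE_SMALL the table has 16 buckets; otherwise a machine with 16
 * possible CPUs requests roundup_pow_of_two(256 * 16) = 4096 buckets.
 * futex_hash() indexes the array with "hash & (futex_hashsize - 1)", which
 * is why the size must remain a power of two.
 */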