// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Fast Userspace Mutexes (which I call "Futexes!").
 * (C) Rusty Russell, IBM 2002
 *
 * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
 * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
 *
 * Removed page pinning, fix privately mapped COW pages and other cleanups
 * (C) Copyright 2003, 2004 Jamie Lokier
 *
 * Robust futex support started by Ingo Molnar
 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
 *
 * PI-futex support started by Ingo Molnar and Thomas Gleixner
 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
 *
 * PRIVATE futexes by Eric Dumazet
 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
 *
 * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
 * Copyright (C) IBM Corporation, 2009
 * Thanks to Thomas Gleixner for conceptual design and careful reviews.
 *
 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
 * enough at me, Linus for the original (flawed) idea, Matthew
 * Kirkwood for proof-of-concept implementation.
 *
 * "The futexes are also cursed."
 * "But they come in a choice of three flavours!"
 */
#include <linux/compat.h>
#include <linux/jhash.h>
#include <linux/pagemap.h>
#include <linux/plist.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/slab.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * The base of the bucket array and its size are always used together
 * (after initialization only in futex_hash()), so ensure that they
 * reside in the same cacheline.
 */
static struct {
	struct futex_hash_bucket *queues;
	unsigned long            hashsize;
} __futex_data __read_mostly __aligned(2*sizeof(long));
#define futex_queues   (__futex_data.queues)
#define futex_hashsize (__futex_data.hashsize)


/*
 * Fault injections for futexes.
 */
#ifdef CONFIG_FAIL_FUTEX

static struct {
	struct fault_attr attr;

	bool ignore_private;
} fail_futex = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_private = false,
};

static int __init setup_fail_futex(char *str)
{
	return setup_fault_attr(&fail_futex.attr, str);
}
__setup("fail_futex=", setup_fail_futex);

bool should_fail_futex(bool fshared)
{
	if (fail_futex.ignore_private && !fshared)
		return false;

	return should_fail(&fail_futex.attr, 1);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

static int __init fail_futex_debugfs(void)
{
	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_futex", NULL,
					&fail_futex.attr);
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	debugfs_create_bool("ignore-private", mode, dir,
			    &fail_futex.ignore_private);
	return 0;
}

late_initcall(fail_futex_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

#endif /* CONFIG_FAIL_FUTEX */
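/*
 * Illustrative note (a sketch, not a normative reference): with
 * CONFIG_FAIL_FUTEX enabled, the fault injection above is configured
 * through the generic fault-injection framework, e.g. via the boot
 * parameter
 *
 *	fail_futex=<interval>,<probability>,<space>,<times>
 *
 * and, with CONFIG_FAULT_INJECTION_DEBUG_FS, at runtime through the
 * attributes under /sys/kernel/debug/fail_futex/. The "ignore-private"
 * knob created above restricts injected failures to shared futexes.
 */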
/**
 * futex_hash - Return the hash bucket in the global hash
 * @key:	Pointer to the futex key for which the hash is calculated
 *
 * We hash on the keys returned from get_futex_key (see below) and return the
 * corresponding hash bucket in the global hash.
 */
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
			  key->both.offset);

	return &futex_queues[hash & (futex_hashsize - 1)];
}


/**
 * futex_setup_timer - set up the sleeping hrtimer.
 * @time:	ptr to the given timeout value
 * @timeout:	the hrtimer_sleeper structure to be set up
 * @flags:	futex flags
 * @range_ns:	optional range in ns
 *
 * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
 *	   value given
 */
struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
		  int flags, u64 range_ns)
{
	if (!time)
		return NULL;

	hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
				      CLOCK_REALTIME : CLOCK_MONOTONIC,
				      HRTIMER_MODE_ABS);
	/*
	 * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
	 * effectively the same as calling hrtimer_set_expires().
	 */
	hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);

	return timeout;
}
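/*
 * Typical caller pattern (a hedged sketch of how the wait paths are
 * expected to use this helper; the actual callers may differ in detail):
 *
 *	struct hrtimer_sleeper timeout, *to;
 *
 *	to = futex_setup_timer(abs_time, &timeout, flags,
 *			       current->timer_slack_ns);
 *	...
 *	if (to) {
 *		hrtimer_cancel(&to->timer);
 *		destroy_hrtimer_on_stack(&to->timer);
 *	}
 */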
/*
 * Generate a machine wide unique identifier for this inode.
 *
 * This relies on u64 not wrapping in the life-time of the machine; which with
 * 1ns resolution means almost 585 years.
 *
 * This further relies on the fact that a well formed program will not unmap
 * the file while it has a (shared) futex waiting on it. This mapping will have
 * a file reference which pins the mount and inode.
 *
 * If for some reason an inode gets evicted and read back in again, it will get
 * a new sequence number and will _NOT_ match, even though it is the exact same
 * file.
 *
 * It is important that futex_match() will never have a false-positive, esp.
 * for PI futexes that can mess up the state. The above argues that false-negatives
 * are only possible for malformed programs.
 */
static u64 get_inode_sequence_number(struct inode *inode)
{
	static atomic64_t i_seq;
	u64 old;

	/* Does the inode already have a sequence number? */
	old = atomic64_read(&inode->i_sequence);
	if (likely(old))
		return old;

	for (;;) {
		u64 new = atomic64_add_return(1, &i_seq);
		if (WARN_ON_ONCE(!new))
			continue;

		old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
		if (old)
			return old;
		return new;
	}
}

/**
 * get_futex_key() - Get parameters which are the keys for a futex
 * @uaddr:	virtual address of the futex
 * @flags:	FLAGS_*
 * @key:	address where result is stored.
 * @rw:		mapping needs to be read/write (values: FUTEX_READ,
 *		FUTEX_WRITE)
 *
 * Return: a negative error code or 0
 *
 * The key words are stored in @key on success.
 *
 * For shared mappings (when @fshared), the key is:
 *
 *   ( inode->i_sequence, page->index, offset_within_page )
 *
 * [ also see get_inode_sequence_number() ]
 *
 * For private mappings (or when !@fshared), the key is:
 *
 *   ( current->mm, address, 0 )
 *
 * This allows (cross process, where applicable) identification of the futex
 * without keeping the page pinned for the duration of the FUTEX_WAIT.
 *
 * lock_page() might sleep, the caller should not hold a spinlock.
 */
int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
		  enum futex_access rw)
{
	unsigned long address = (unsigned long)uaddr;
	struct mm_struct *mm = current->mm;
	struct page *page;
	struct folio *folio;
	struct address_space *mapping;
	int err, ro = 0;
	bool fshared;

	fshared = flags & FLAGS_SHARED;

	/*
	 * The futex address must be "naturally" aligned.
	 */
	key->both.offset = address % PAGE_SIZE;
	if (unlikely((address % sizeof(u32)) != 0))
		return -EINVAL;
	address -= key->both.offset;

	if (unlikely(!access_ok(uaddr, sizeof(u32))))
		return -EFAULT;

	if (unlikely(should_fail_futex(fshared)))
		return -EFAULT;

	/*
	 * PROCESS_PRIVATE futexes are fast.
	 * As the mm cannot disappear under us and the 'key' only needs
	 * the virtual address, we don't even have to find the underlying vma.
	 * Note: we do have to check that 'uaddr' is a valid user address,
	 * but access_ok() should be faster than find_vma().
	 */
	if (!fshared) {
		/*
		 * On no-MMU, shared futexes are treated as private, therefore
		 * we must not include the current process in the key. Since
		 * there is only one address space, the address is a unique key
		 * on its own.
		 */
		if (IS_ENABLED(CONFIG_MMU))
			key->private.mm = mm;
		else
			key->private.mm = NULL;

		key->private.address = address;
		return 0;
	}

again:
	/* Ignore any VERIFY_READ mapping (futex common case) */
	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
	/*
	 * If write access is not required (eg. FUTEX_WAIT), try
	 * and get read-only access.
	 */
	if (err == -EFAULT && rw == FUTEX_READ) {
		err = get_user_pages_fast(address, 1, 0, &page);
		ro = 1;
	}
	if (err < 0)
		return err;
	else
		err = 0;

	/*
	 * The treatment of mapping from this point on is critical. The folio
	 * lock protects many things but in this context the folio lock
	 * stabilizes mapping, prevents inode freeing in the shared
	 * file-backed region case and guards against movement to swap cache.
	 *
	 * Strictly speaking the folio lock is not needed in all cases being
	 * considered here and the folio lock forces unnecessary serialization.
	 * From this point on, mapping will be re-verified if necessary and
	 * the folio lock will be acquired only if it is unavoidable.
	 *
	 * Mapping checks require the folio so it is looked up now. For
	 * anonymous pages, it does not matter if the folio is split
	 * in the future as the key is based on the address. For
	 * filesystem-backed pages, the precise page is required as the
	 * index of the page determines the key.
	 */
	folio = page_folio(page);
	mapping = READ_ONCE(folio->mapping);

	/*
	 * If folio->mapping is NULL, then it cannot be an anonymous
	 * page; but it might be the ZERO_PAGE or in the gate area or
	 * in a special mapping (all cases which we are happy to fail);
	 * or it may have been a good file page when get_user_pages_fast
	 * found it, but truncated or holepunched or subjected to
	 * invalidate_complete_page2 before we got the folio lock (also
	 * cases which we are happy to fail). And we hold a reference,
	 * so refcount care in invalidate_inode_page's remove_mapping
	 * prevents drop_caches from setting mapping to NULL beneath us.
	 *
	 * The case we do have to guard against is when memory pressure made
	 * shmem_writepage move it from filecache to swapcache beneath us:
	 * an unlikely race, but we do need to retry for folio->mapping.
	 */
	if (unlikely(!mapping)) {
		int shmem_swizzled;

		/*
		 * Folio lock is required to identify which special case above
		 * applies. If this is really a shmem page then the folio lock
		 * will prevent unexpected transitions.
		 */
		folio_lock(folio);
		shmem_swizzled = folio_test_swapcache(folio) || folio->mapping;
		folio_unlock(folio);
		folio_put(folio);

		if (shmem_swizzled)
			goto again;

		return -EFAULT;
	}

	/*
	 * Private mappings are handled in a simple way.
	 *
	 * If the futex key is stored in anonymous memory, then the associated
	 * object is the mm which is implicitly pinned by the calling process.
	 *
	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if it's a
	 * read-only handle, it's expected that futexes attach to the object,
	 * not the particular process.
	 */
	if (folio_test_anon(folio)) {
		/*
		 * A RO anonymous page will never change and thus doesn't make
		 * sense for futex operations.
		 */
		if (unlikely(should_fail_futex(true)) || ro) {
			err = -EFAULT;
			goto out;
		}

		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
		key->private.mm = mm;
		key->private.address = address;

	} else {
		struct inode *inode;

		/*
		 * The associated futex object in this case is the inode and
		 * the folio->mapping must be traversed. Ordinarily this should
		 * be stabilised under folio lock but it's not strictly
		 * necessary in this case as we just want to pin the inode, not
		 * update i_pages or anything like that.
		 *
		 * The RCU read lock is taken as the inode is finally freed
		 * under RCU. If the mapping still matches expectations then the
		 * mapping->host can be safely accessed as being a valid inode.
		 */
		rcu_read_lock();

		if (READ_ONCE(folio->mapping) != mapping) {
			rcu_read_unlock();
			folio_put(folio);

			goto again;
		}

		inode = READ_ONCE(mapping->host);
		if (!inode) {
			rcu_read_unlock();
			folio_put(folio);

			goto again;
		}

		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
		key->shared.i_seq = get_inode_sequence_number(inode);
		key->shared.pgoff = folio->index + folio_page_idx(folio, page);
		rcu_read_unlock();
	}

out:
	folio_put(folio);
	return err;
}

/**
 * fault_in_user_writeable() - Fault in user address and verify RW access
 * @uaddr:	pointer to faulting user space address
 *
 * Slow path to fixup the fault we just took in the atomic write
 * access to @uaddr.
 *
 * We have no generic implementation of a non-destructive write to the
 * user address. We know that we faulted in the atomic pagefault
 * disabled section so we can as well avoid the #PF overhead by
 * calling get_user_pages() right away.
 */
int fault_in_user_writeable(u32 __user *uaddr)
{
	struct mm_struct *mm = current->mm;
	int ret;

	mmap_read_lock(mm);
	ret = fixup_user_fault(mm, (unsigned long)uaddr,
			       FAULT_FLAG_WRITE, NULL);
	mmap_read_unlock(mm);

	return ret < 0 ? ret : 0;
}
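/*
 * Usage sketch (hedged; the real callers differ in detail): when an atomic
 * access with pagefaults disabled fails with -EFAULT, the caller is
 * expected to drop the hash bucket lock, fault the page in writable and
 * retry, roughly like:
 *
 *	ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
 *	if (ret == -EFAULT) {
 *		futex_q_unlock(hb);
 *		if (fault_in_user_writeable(uaddr))
 *			return -EFAULT;
 *		goto retry;
 *	}
 */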
/**
 * futex_top_waiter() - Return the highest priority waiter on a futex
 * @hb:		the hash bucket the futex_q's reside in
 * @key:	the futex key (to distinguish it from other futex futex_q's)
 *
 * Must be called with the hb lock held.
 */
struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
{
	struct futex_q *this;

	plist_for_each_entry(this, &hb->chain, list) {
		if (futex_match(&this->key, key))
			return this;
	}
	return NULL;
}

int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
{
	int ret;

	pagefault_disable();
	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
	pagefault_enable();

	return ret;
}

int futex_get_value_locked(u32 *dest, u32 __user *from)
{
	int ret;

	pagefault_disable();
	ret = __get_user(*dest, from);
	pagefault_enable();

	return ret ? -EFAULT : 0;
}

/**
 * wait_for_owner_exiting - Block until the owner has exited
 * @ret:	owner's current futex lock status
 * @exiting:	Pointer to the exiting task
 *
 * Caller must hold a refcount on @exiting.
 */
void wait_for_owner_exiting(int ret, struct task_struct *exiting)
{
	if (ret != -EBUSY) {
		WARN_ON_ONCE(exiting);
		return;
	}

	if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
		return;

	mutex_lock(&exiting->futex_exit_mutex);
	/*
	 * No point in doing state checking here. If the waiter got here
	 * while the task was in exec()->exec_futex_release() then it can
	 * have any FUTEX_STATE_* value when the waiter has acquired the
	 * mutex. OK, if running, EXITING or DEAD if it reached exit()
	 * already. Highly unlikely and not a problem. Just one more round
	 * through the futex maze.
	 */
	mutex_unlock(&exiting->futex_exit_mutex);

	put_task_struct(exiting);
}
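/*
 * Caller-side sketch (hedged; the PI lock/requeue paths are expected to
 * follow roughly this shape): when attaching to a PI owner fails with
 * -EBUSY because that owner is exiting, the waiter drops the hash bucket
 * lock and blocks on the owner's futex_exit_mutex before retrying:
 *
 *	if (ret == -EBUSY) {
 *		futex_q_unlock(hb);
 *		wait_for_owner_exiting(ret, exiting);
 *		cond_resched();
 *		goto retry;
 *	}
 */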
/**
 * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be NULL and must be held by the caller.
 */
void __futex_unqueue(struct futex_q *q)
{
	struct futex_hash_bucket *hb;

	if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
		return;
	lockdep_assert_held(q->lock_ptr);

	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
	plist_del(&q->list, &hb->chain);
	futex_hb_waiters_dec(hb);
}

/* The key must be already stored in q->key. */
struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
	__acquires(&hb->lock)
{
	struct futex_hash_bucket *hb;

	hb = futex_hash(&q->key);

	/*
	 * Increment the counter before taking the lock so that
	 * a potential waker won't miss a to-be-slept task that is
	 * waiting for the spinlock. This is safe as all futex_q_lock()
	 * users end up calling futex_queue(). Similarly, for housekeeping,
	 * decrement the counter at futex_q_unlock() when some error has
	 * occurred and we don't end up adding the task to the list.
	 */
	futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */

	q->lock_ptr = &hb->lock;

	spin_lock(&hb->lock);
	return hb;
}

void futex_q_unlock(struct futex_hash_bucket *hb)
	__releases(&hb->lock)
{
	spin_unlock(&hb->lock);
	futex_hb_waiters_dec(hb);
}

void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
{
	int prio;

	/*
	 * The priority used to register this element is
	 * - either the real thread-priority for the real-time threads
	 *   (i.e. threads with a priority lower than MAX_RT_PRIO)
	 * - or MAX_RT_PRIO for non-RT threads.
	 * Thus, all RT-threads are woken first in priority order, and
	 * the others are woken last, in FIFO order.
	 */
	prio = min(current->normal_prio, MAX_RT_PRIO);

	plist_node_init(&q->list, prio);
	plist_add(&q->list, &hb->chain);
	q->task = current;
}

/**
 * futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must
 * be paired with exactly one earlier call to futex_queue().
 *
 * Return:
 *  - 1 - if the futex_q was still queued (and we removed it);
 *  - 0 - if the futex_q was already removed by the waking thread
 */
int futex_unqueue(struct futex_q *q)
{
	spinlock_t *lock_ptr;
	int ret = 0;

	/* In the common case we don't take the spinlock, which is nice. */
retry:
	/*
	 * q->lock_ptr can change between this read and the following spin_lock.
	 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
	 * optimizing lock_ptr out of the logic below.
	 */
	lock_ptr = READ_ONCE(q->lock_ptr);
	if (lock_ptr != NULL) {
		spin_lock(lock_ptr);
		/*
		 * q->lock_ptr can change between reading it and
		 * spin_lock(), causing us to take the wrong lock. This
		 * corrects the race condition.
		 *
		 * Reasoning goes like this: if we have the wrong lock,
		 * q->lock_ptr must have changed (maybe several times)
		 * between reading it and the spin_lock(). It can
		 * change again after the spin_lock() but only if it was
		 * already changed before the spin_lock(). It cannot,
		 * however, change back to the original value. Therefore
		 * we can detect whether we acquired the correct lock.
		 */
		if (unlikely(lock_ptr != q->lock_ptr)) {
			spin_unlock(lock_ptr);
			goto retry;
		}
		__futex_unqueue(q);

		BUG_ON(q->pi_state);

		spin_unlock(lock_ptr);
		ret = 1;
	}

	return ret;
}
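/*
 * Lifecycle sketch (hedged; a simplified view of how a wait path is
 * expected to pair the helpers above, not a definitive reference):
 *
 *	hb = futex_q_lock(&q);			// hb waiters++, hb->lock taken
 *	ret = futex_get_value_locked(&uval, uaddr);
 *	if (ret || uval != val) {
 *		futex_q_unlock(hb);		// error/mismatch: undo the count
 *		return ...;
 *	}
 *	__futex_queue(&q, hb);			// enqueue on the plist
 *	spin_unlock(&hb->lock);
 *	... sleep until woken, signalled or timed out ...
 *	if (!futex_unqueue(&q))
 *		...;				// already woken and removed
 */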
/*
 * PI futexes can not be requeued and must remove themselves from the
 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
 */
void futex_unqueue_pi(struct futex_q *q)
{
	__futex_unqueue(q);

	BUG_ON(!q->pi_state);
	put_pi_state(q->pi_state);
	q->pi_state = NULL;
}

/* Constants for the pending_op argument of handle_futex_death */
#define HANDLE_DEATH_PENDING	true
#define HANDLE_DEATH_LIST	false

/*
 * Process a futex-list entry, check whether it's owned by the
 * dying task, and do notification if so:
 */
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
			      bool pi, bool pending_op)
{
	u32 uval, nval, mval;
	pid_t owner;
	int err;

	/* Futex address must be 32bit aligned */
	if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
		return -1;

retry:
	if (get_user(uval, uaddr))
		return -1;

	/*
	 * Special case for regular (non PI) futexes. The unlock path in
	 * user space has two race scenarios:
	 *
	 * 1. The unlock path releases the user space futex value and
	 *    before it can execute the futex() syscall to wake up
	 *    waiters it is killed.
	 *
	 * 2. A woken up waiter is killed before it can acquire the
	 *    futex in user space.
	 *
	 * In the second case, the wake up notification could be generated
	 * by the unlock path in user space after setting the futex value
	 * to zero or by the kernel after setting the OWNER_DIED bit below.
	 *
	 * In both cases the TID validation below prevents a wakeup of
	 * potential waiters which can cause these waiters to block
	 * forever.
	 *
	 * In both cases the following conditions are met:
	 *
	 *	1) task->robust_list->list_op_pending != NULL
	 *	   @pending_op == true
	 *	2) The owner part of user space futex value == 0
	 *	3) Regular futex: @pi == false
	 *
	 * If these conditions are met, it is safe to attempt waking up a
	 * potential waiter without touching the user space futex value and
	 * trying to set the OWNER_DIED bit. If the futex value is zero,
	 * the rest of the user space mutex state is consistent, so a woken
	 * waiter will just take over the uncontended futex. Setting the
	 * OWNER_DIED bit would create inconsistent state and malfunction
	 * of the user space owner died handling. Otherwise, the OWNER_DIED
	 * bit is already set, and the woken waiter is expected to deal with
	 * this.
	 */
	owner = uval & FUTEX_TID_MASK;

	if (pending_op && !pi && !owner) {
		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
			   FUTEX_BITSET_MATCH_ANY);
		return 0;
	}

	if (owner != task_pid_vnr(curr))
		return 0;

	/*
	 * Ok, this dying thread is truly holding a futex
	 * of interest. Set the OWNER_DIED bit atomically
	 * via cmpxchg, and if the value had FUTEX_WAITERS
	 * set, wake up a waiter (if any). (We have to do a
	 * futex_wake() even if OWNER_DIED is already set -
	 * to handle the rare but possible case of recursive
	 * thread-death.) The rest of the cleanup is done in
	 * userspace.
	 */
	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;

	/*
	 * We are not holding a lock here, but we want to have
	 * the pagefault_disable/enable() protection because
	 * we want to handle the fault gracefully. If the
	 * access fails we try to fault in the futex with R/W
	 * verification via get_user_pages. get_user() above
	 * does not guarantee R/W access. If that fails we
	 * give up and leave the futex locked.
	 */
	if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
		switch (err) {
		case -EFAULT:
			if (fault_in_user_writeable(uaddr))
				return -1;
			goto retry;

		case -EAGAIN:
			cond_resched();
			goto retry;

		default:
			WARN_ON_ONCE(1);
			return err;
		}
	}

	if (nval != uval)
		goto retry;

	/*
	 * Wake robust non-PI futexes here. The wakeup of
	 * PI futexes happens in exit_pi_state():
	 */
	if (!pi && (uval & FUTEX_WAITERS)) {
		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
			   FUTEX_BITSET_MATCH_ANY);
	}

	return 0;
}

/*
 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 */
static inline int fetch_robust_entry(struct robust_list __user **entry,
				     struct robust_list __user * __user *head,
				     unsigned int *pi)
{
	unsigned long uentry;

	if (get_user(uentry, (unsigned long __user *)head))
		return -EFAULT;

	*entry = (void __user *)(uentry & ~1UL);
	*pi = uentry & 1;

	return 0;
}

/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
static void exit_robust_list(struct task_struct *curr)
{
	struct robust_list_head __user *head = curr->robust_list;
	struct robust_list __user *entry, *next_entry, *pending;
	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
	unsigned int next_pi;
	unsigned long futex_offset;
	int rc;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (fetch_robust_entry(&entry, &head->list.next, &pi))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
		return;

	next_entry = NULL;	/* avoid warning with gcc */
	while (entry != &head->list) {
		/*
		 * Fetch the next entry in the list before calling
		 * handle_futex_death:
		 */
		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice:
		 */
		if (entry != pending) {
			if (handle_futex_death((void __user *)entry + futex_offset,
					       curr, pi, HANDLE_DEATH_LIST))
				return;
		}
		if (rc)
			return;
		entry = next_entry;
		pi = next_pi;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		cond_resched();
	}

	if (pending) {
		handle_futex_death((void __user *)pending + futex_offset,
				   curr, pip, HANDLE_DEATH_PENDING);
	}
}
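/*
 * For reference, a sketch of the userspace ABI this walker consumes (see
 * include/uapi/linux/futex.h and sys_set_robust_list(); a hedged summary,
 * not a copy of the uapi header):
 *
 *	struct robust_list { struct robust_list *next; };
 *
 *	struct robust_list_head {
 *		struct robust_list list;	// linked through per-lock nodes
 *		long futex_offset;		// node address + offset = futex word
 *		struct robust_list *list_op_pending; // lock being acquired/released
 *	};
 *
 * Each entry pointer may have bit 0 set to flag a PI futex, which is why
 * fetch_robust_entry() masks it off and reports it separately.
 */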
#ifdef CONFIG_COMPAT
static void __user *futex_uaddr(struct robust_list __user *entry,
				compat_long_t futex_offset)
{
	compat_uptr_t base = ptr_to_compat(entry);
	void __user *uaddr = compat_ptr(base + futex_offset);

	return uaddr;
}

/*
 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 */
static inline int
compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
			  compat_uptr_t __user *head, unsigned int *pi)
{
	if (get_user(*uentry, head))
		return -EFAULT;

	*entry = compat_ptr((*uentry) & ~1);
	*pi = (unsigned int)(*uentry) & 1;

	return 0;
}

/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
static void compat_exit_robust_list(struct task_struct *curr)
{
	struct compat_robust_list_head __user *head = curr->compat_robust_list;
	struct robust_list __user *entry, *next_entry, *pending;
	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
	unsigned int next_pi;
	compat_uptr_t uentry, next_uentry, upending;
	compat_long_t futex_offset;
	int rc;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (compat_fetch_robust_entry(&upending, &pending,
				      &head->list_op_pending, &pip))
		return;

	next_entry = NULL;	/* avoid warning with gcc */
	while (entry != (struct robust_list __user *) &head->list) {
		/*
		 * Fetch the next entry in the list before calling
		 * handle_futex_death:
		 */
		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
					       (compat_uptr_t __user *)&entry->next, &next_pi);
		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice:
		 */
		if (entry != pending) {
			void __user *uaddr = futex_uaddr(entry, futex_offset);

			if (handle_futex_death(uaddr, curr, pi,
					       HANDLE_DEATH_LIST))
				return;
		}
		if (rc)
			return;
		uentry = next_uentry;
		entry = next_entry;
		pi = next_pi;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		cond_resched();
	}
	if (pending) {
		void __user *uaddr = futex_uaddr(pending, futex_offset);

		handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
	}
}
#endif

#ifdef CONFIG_FUTEX_PI

/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 */
static void exit_pi_state_list(struct task_struct *curr)
{
	struct list_head *next, *head = &curr->pi_state_list;
	struct futex_pi_state *pi_state;
	struct futex_hash_bucket *hb;
	union futex_key key = FUTEX_KEY_INIT;

	/*
	 * We are a ZOMBIE and nobody can enqueue itself on
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
	raw_spin_lock_irq(&curr->pi_lock);
	while (!list_empty(head)) {
		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		key = pi_state->key;
		hb = futex_hash(&key);

		/*
		 * We can race against put_pi_state() removing itself from the
		 * list (a waiter going away). put_pi_state() will first
		 * decrement the reference count and then modify the list, so
		 * it's possible to see the list entry but fail this reference
		 * acquire.
		 *
		 * In that case, drop the locks to let put_pi_state() make
		 * progress and retry the loop.
		 */
		if (!refcount_inc_not_zero(&pi_state->refcount)) {
			raw_spin_unlock_irq(&curr->pi_lock);
			cpu_relax();
			raw_spin_lock_irq(&curr->pi_lock);
			continue;
		}
		raw_spin_unlock_irq(&curr->pi_lock);

		spin_lock(&hb->lock);
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
		raw_spin_lock(&curr->pi_lock);
		/*
		 * We dropped the pi-lock, so re-check whether this
		 * task still owns the PI-state:
		 */
		if (head->next != next) {
			/* retain curr->pi_lock for the loop invariant */
			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
			spin_unlock(&hb->lock);
			put_pi_state(pi_state);
			continue;
		}

		WARN_ON(pi_state->owner != curr);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		pi_state->owner = NULL;

		raw_spin_unlock(&curr->pi_lock);
		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
		spin_unlock(&hb->lock);

		rt_mutex_futex_unlock(&pi_state->pi_mutex);
		put_pi_state(pi_state);

		raw_spin_lock_irq(&curr->pi_lock);
	}
	raw_spin_unlock_irq(&curr->pi_lock);
}
#else
static inline void exit_pi_state_list(struct task_struct *curr) { }
#endif

static void futex_cleanup(struct task_struct *tsk)
{
	if (unlikely(tsk->robust_list)) {
		exit_robust_list(tsk);
		tsk->robust_list = NULL;
	}

#ifdef CONFIG_COMPAT
	if (unlikely(tsk->compat_robust_list)) {
		compat_exit_robust_list(tsk);
		tsk->compat_robust_list = NULL;
	}
#endif

	if (unlikely(!list_empty(&tsk->pi_state_list)))
		exit_pi_state_list(tsk);
}

/**
 * futex_exit_recursive - Set the task's futex state to FUTEX_STATE_DEAD
 * @tsk:	task to set the state on
 *
 * Set the futex exit state of the task locklessly. The futex waiter code
 * observes that state when a task is exiting and loops until the task has
 * actually finished the futex cleanup. The worst case for this is that the
 * waiter runs through the wait loop until the state becomes visible.
 *
 * This is called from the recursive fault handling path in make_task_dead().
 *
 * This is best effort. Either the futex exit code has run already or
 * not. If the OWNER_DIED bit has been set on the futex then the waiter can
 * take it over. If not, the problem is pushed back to user space. If the
 * futex exit code did not run yet, then an already queued waiter might
 * block forever, but there is nothing which can be done about that.
 */
void futex_exit_recursive(struct task_struct *tsk)
{
	/* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
	if (tsk->futex_state == FUTEX_STATE_EXITING)
		mutex_unlock(&tsk->futex_exit_mutex);
	tsk->futex_state = FUTEX_STATE_DEAD;
}
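/*
 * Exit-state overview (a hedged summary of the handshake implemented by
 * the helpers below; see also futex_exit_recursive() above):
 *
 *	FUTEX_STATE_OK
 *	      | futex_cleanup_begin(): take futex_exit_mutex,
 *	      |                        set EXITING under pi_lock
 *	      v
 *	FUTEX_STATE_EXITING   -- waiters observing this block on
 *	      |                  futex_exit_mutex and retry later
 *	      | futex_cleanup(): robust list + PI state cleanup
 *	      v
 *	futex_cleanup_end(): FUTEX_STATE_DEAD (exit) or back to
 *	FUTEX_STATE_OK (exec), then release futex_exit_mutex.
 */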
static void futex_cleanup_begin(struct task_struct *tsk)
{
	/*
	 * Prevent various race issues against a concurrent incoming waiter
	 * including live locks by forcing the waiter to block on
	 * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
	 * attach_to_pi_owner().
	 */
	mutex_lock(&tsk->futex_exit_mutex);

	/*
	 * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
	 *
	 * This ensures that all subsequent checks of tsk->futex_state in
	 * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
	 * tsk->pi_lock held.
	 *
	 * It guarantees also that a pi_state which was queued right before
	 * the state change under tsk->pi_lock by a concurrent waiter must
	 * be observed in exit_pi_state_list().
	 */
	raw_spin_lock_irq(&tsk->pi_lock);
	tsk->futex_state = FUTEX_STATE_EXITING;
	raw_spin_unlock_irq(&tsk->pi_lock);
}

static void futex_cleanup_end(struct task_struct *tsk, int state)
{
	/*
	 * Lockless store. The only side effect is that an observer might
	 * take another loop until it becomes visible.
	 */
	tsk->futex_state = state;
	/*
	 * Drop the exit protection. This unblocks waiters which observed
	 * FUTEX_STATE_EXITING to reevaluate the state.
	 */
	mutex_unlock(&tsk->futex_exit_mutex);
}

void futex_exec_release(struct task_struct *tsk)
{
	/*
	 * The state handling is done for consistency, but in the case of
	 * exec() there is no way to prevent further damage as the PID stays
	 * the same. But for the unlikely and arguably buggy case that a
	 * futex is held on exec(), this provides at least as much state
	 * consistency protection as is possible.
	 */
	futex_cleanup_begin(tsk);
	futex_cleanup(tsk);
	/*
	 * Reset the state to FUTEX_STATE_OK. The task is alive and about to
	 * exec a new binary.
	 */
	futex_cleanup_end(tsk, FUTEX_STATE_OK);
}

void futex_exit_release(struct task_struct *tsk)
{
	futex_cleanup_begin(tsk);
	futex_cleanup(tsk);
	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}

static int __init futex_init(void)
{
	unsigned int futex_shift;
	unsigned long i;

#if CONFIG_BASE_SMALL
	futex_hashsize = 16;
#else
	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
#endif

	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
					       futex_hashsize, 0, 0,
					       &futex_shift, NULL,
					       futex_hashsize, futex_hashsize);
	futex_hashsize = 1UL << futex_shift;

	for (i = 0; i < futex_hashsize; i++) {
		atomic_set(&futex_queues[i].waiters, 0);
		plist_head_init(&futex_queues[i].chain);
		spin_lock_init(&futex_queues[i].lock);
	}

	return 0;
}
core_initcall(futex_init);