// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Fast Userspace Mutexes (which I call "Futexes!").
 *  (C) Rusty Russell, IBM 2002
 *
 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
 *
 *  Removed page pinning, fix privately mapped COW pages and other cleanups
 *  (C) Copyright 2003, 2004 Jamie Lokier
 *
 *  Robust futex support started by Ingo Molnar
 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
 *
 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
 *
 *  PRIVATE futexes by Eric Dumazet
 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
 *
 *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
 *  Copyright (C) IBM Corporation, 2009
 *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
 *
 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
 *  enough at me, Linus for the original (flawed) idea, Matthew
 *  Kirkwood for proof-of-concept implementation.
 *
 *  "The futexes are also cursed."
 *  "But they come in a choice of three flavours!"
 */
#include <linux/compat.h>
#include <linux/jhash.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/plist.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/slab.h>
#include <linux/prctl.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * The base of the bucket array and its size are always used together
 * (after initialization only in futex_hash()), so ensure that they
 * reside in the same cacheline.
 */
static struct {
	struct futex_hash_bucket *queues;
	unsigned long		hashmask;
} __futex_data __read_mostly __aligned(2*sizeof(long));
#define futex_queues	(__futex_data.queues)
#define futex_hashmask	(__futex_data.hashmask)

struct futex_private_hash {
	unsigned int	hash_mask;
	void		*mm;
	bool		custom;
	struct futex_hash_bucket queues[];
};

/*
 * Fault injections for futexes.
 */
#ifdef CONFIG_FAIL_FUTEX

static struct {
	struct fault_attr attr;

	bool ignore_private;
} fail_futex = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_private = false,
};

static int __init setup_fail_futex(char *str)
{
	return setup_fault_attr(&fail_futex.attr, str);
}
__setup("fail_futex=", setup_fail_futex);

bool should_fail_futex(bool fshared)
{
	if (fail_futex.ignore_private && !fshared)
		return false;

	return should_fail(&fail_futex.attr, 1);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

static int __init fail_futex_debugfs(void)
{
	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_futex", NULL,
					&fail_futex.attr);
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	debugfs_create_bool("ignore-private", mode, dir,
			    &fail_futex.ignore_private);
	return 0;
}

late_initcall(fail_futex_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

#endif /* CONFIG_FAIL_FUTEX */

static struct futex_hash_bucket *
__futex_hash(union futex_key *key, struct futex_private_hash *fph);

#ifdef CONFIG_FUTEX_PRIVATE_HASH
static inline bool futex_key_is_private(union futex_key *key)
{
	/*
	 * Relies on get_futex_key() to set either bit for shared
	 * futexes -- see comment with union futex_key.
	 */
	return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
}

bool futex_private_hash_get(struct futex_private_hash *fph)
{
	return false;
}

void futex_private_hash_put(struct futex_private_hash *fph)
{
}

/**
 * futex_hash_get - Get an additional reference for the local hash.
 * @hb:	ptr to the private local hash.
 *
 * Obtain an additional reference for the already obtained hash bucket. The
 * caller must already own a reference.
 */
void futex_hash_get(struct futex_hash_bucket *hb) { }
void futex_hash_put(struct futex_hash_bucket *hb) { }

static struct futex_hash_bucket *
__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
{
	u32 hash;

	if (!futex_key_is_private(key))
		return NULL;

	if (!fph)
		fph = key->private.mm->futex_phash;
	if (!fph || !fph->hash_mask)
		return NULL;

	hash = jhash2((void *)&key->private.address,
		      sizeof(key->private.address) / 4,
		      key->both.offset);
	return &fph->queues[hash & fph->hash_mask];
}

struct futex_private_hash *futex_private_hash(void)
{
	struct mm_struct *mm = current->mm;
	struct futex_private_hash *fph;

	fph = mm->futex_phash;
	return fph;
}

struct futex_hash_bucket *futex_hash(union futex_key *key)
{
	struct futex_hash_bucket *hb;

	hb = __futex_hash(key, NULL);
	return hb;
}

#else /* !CONFIG_FUTEX_PRIVATE_HASH */

static struct futex_hash_bucket *
__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
{
	return NULL;
}

struct futex_hash_bucket *futex_hash(union futex_key *key)
{
	return __futex_hash(key, NULL);
}

#endif /* CONFIG_FUTEX_PRIVATE_HASH */

/**
 * __futex_hash - Return the hash bucket
 * @key:	Pointer to the futex key for which the hash is calculated
 * @fph:	Pointer to private hash if known
 *
 * We hash on the keys returned from get_futex_key (see below) and return the
 * corresponding hash bucket.
 * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the
 * private hash) is returned if it exists. Otherwise a hash bucket from the
 * global hash is returned.
 */
static struct futex_hash_bucket *
__futex_hash(union futex_key *key, struct futex_private_hash *fph)
{
	struct futex_hash_bucket *hb;
	u32 hash;

	hb = __futex_hash_private(key, fph);
	if (hb)
		return hb;

	hash = jhash2((u32 *)key,
		      offsetof(typeof(*key), both.offset) / 4,
		      key->both.offset);
	return &futex_queues[hash & futex_hashmask];
}

/**
 * futex_setup_timer - set up the sleeping hrtimer.
 * @time:	ptr to the given timeout value
 * @timeout:	the hrtimer_sleeper structure to be set up
 * @flags:	futex flags
 * @range_ns:	optional range in ns
 *
 * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
 *	   value given
 */
struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
		  int flags, u64 range_ns)
{
	if (!time)
		return NULL;

	hrtimer_setup_sleeper_on_stack(timeout,
				       (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC,
				       HRTIMER_MODE_ABS);
	/*
	 * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
	 * effectively the same as calling hrtimer_set_expires().
	 */
	hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);

	return timeout;
}

/*
 * Generate a machine wide unique identifier for this inode.
 *
 * This relies on u64 not wrapping in the life-time of the machine; which with
 * 1ns resolution means almost 585 years.
 *
 * This further relies on the fact that a well formed program will not unmap
 * the file while it has a (shared) futex waiting on it. This mapping will have
 * a file reference which pins the mount and inode.
 *
 * If for some reason an inode gets evicted and read back in again, it will get
 * a new sequence number and will _NOT_ match, even though it is the exact same
 * file.
 *
 * It is important that futex_match() will never have a false-positive, esp.
 * for PI futexes that can mess up the state. The above argues that false-negatives
 * are only possible for malformed programs.
 */
static u64 get_inode_sequence_number(struct inode *inode)
{
	static atomic64_t i_seq;
	u64 old;

	/* Does the inode already have a sequence number? */
	old = atomic64_read(&inode->i_sequence);
	if (likely(old))
		return old;

	for (;;) {
		u64 new = atomic64_inc_return(&i_seq);
		if (WARN_ON_ONCE(!new))
			continue;

		old = 0;
		if (!atomic64_try_cmpxchg_relaxed(&inode->i_sequence, &old, new))
			return old;
		return new;
	}
}

/**
 * get_futex_key() - Get parameters which are the keys for a futex
 * @uaddr:	virtual address of the futex
 * @flags:	FLAGS_*
 * @key:	address where result is stored.
 * @rw:		mapping needs to be read/write (values: FUTEX_READ,
 *		FUTEX_WRITE)
 *
 * Return: a negative error code or 0
 *
 * The key words are stored in @key on success.
 *
 * For shared mappings (when @fshared), the key is:
 *
 *   ( inode->i_sequence, page->index, offset_within_page )
 *
 * [ also see get_inode_sequence_number() ]
 *
 * For private mappings (or when !@fshared), the key is:
 *
 *   ( current->mm, address, 0 )
 *
 * This allows (cross process, where applicable) identification of the futex
 * without keeping the page pinned for the duration of the FUTEX_WAIT.
 *
 * lock_page() might sleep, the caller should not hold a spinlock.
 */
int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
		  enum futex_access rw)
{
	unsigned long address = (unsigned long)uaddr;
	struct mm_struct *mm = current->mm;
	struct page *page;
	struct folio *folio;
	struct address_space *mapping;
	int err, ro = 0;
	bool fshared;

	fshared = flags & FLAGS_SHARED;

	/*
	 * The futex address must be "naturally" aligned.
	 */
	key->both.offset = address % PAGE_SIZE;
	if (unlikely((address % sizeof(u32)) != 0))
		return -EINVAL;
	address -= key->both.offset;

	if (unlikely(!access_ok(uaddr, sizeof(u32))))
		return -EFAULT;

	if (unlikely(should_fail_futex(fshared)))
		return -EFAULT;

	/*
	 * PROCESS_PRIVATE futexes are fast.
	 * As the mm cannot disappear under us and the 'key' only needs
	 * virtual address, we don't even have to find the underlying vma.
	 * Note : We do have to check 'uaddr' is a valid user address,
	 * but access_ok() should be faster than find_vma()
	 */
	if (!fshared) {
		/*
		 * On no-MMU, shared futexes are treated as private, therefore
		 * we must not include the current process in the key. Since
		 * there is only one address space, the address is a unique key
		 * on its own.
		 */
		if (IS_ENABLED(CONFIG_MMU))
			key->private.mm = mm;
		else
			key->private.mm = NULL;

		key->private.address = address;
		return 0;
	}

again:
	/* Ignore any VERIFY_READ mapping (futex common case) */
	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
	/*
	 * If write access is not required (eg. FUTEX_WAIT), try
	 * and get read-only access.
	 */
	if (err == -EFAULT && rw == FUTEX_READ) {
		err = get_user_pages_fast(address, 1, 0, &page);
		ro = 1;
	}
	if (err < 0)
		return err;
	else
		err = 0;

	/*
	 * The treatment of mapping from this point on is critical. The folio
	 * lock protects many things but in this context the folio lock
	 * stabilizes mapping, prevents inode freeing in the shared
	 * file-backed region case and guards against movement to swap cache.
	 *
	 * Strictly speaking the folio lock is not needed in all cases being
	 * considered here and the folio lock forces unnecessary serialization.
	 * From this point on, mapping will be re-verified if necessary and
	 * folio lock will be acquired only if it is unavoidable.
	 *
	 * Mapping checks require the folio so it is looked up now. For
	 * anonymous pages, it does not matter if the folio is split
	 * in the future as the key is based on the address. For
	 * filesystem-backed pages, the precise page is required as the
	 * index of the page determines the key.
	 */
	folio = page_folio(page);
	mapping = READ_ONCE(folio->mapping);

	/*
	 * If folio->mapping is NULL, then it cannot be an anonymous
	 * page; but it might be the ZERO_PAGE or in the gate area or
	 * in a special mapping (all cases which we are happy to fail);
	 * or it may have been a good file page when get_user_pages_fast
	 * found it, but truncated or holepunched or subjected to
	 * invalidate_complete_page2 before we got the folio lock (also
	 * cases which we are happy to fail). And we hold a reference,
	 * so refcount care in invalidate_inode_page's remove_mapping
	 * prevents drop_caches from setting mapping to NULL beneath us.
	 *
	 * The case we do have to guard against is when memory pressure made
	 * shmem_writepage move it from filecache to swapcache beneath us:
	 * an unlikely race, but we do need to retry for folio->mapping.
	 */
	if (unlikely(!mapping)) {
		int shmem_swizzled;

		/*
		 * Folio lock is required to identify which special case above
		 * applies. If this is really a shmem page then the folio lock
		 * will prevent unexpected transitions.
		 */
		folio_lock(folio);
		shmem_swizzled = folio_test_swapcache(folio) || folio->mapping;
		folio_unlock(folio);
		folio_put(folio);

		if (shmem_swizzled)
			goto again;

		return -EFAULT;
	}

	/*
	 * Private mappings are handled in a simple way.
	 *
	 * If the futex key is stored in anonymous memory, then the associated
	 * object is the mm which is implicitly pinned by the calling process.
	 *
	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
	 * it's a read-only handle, it's expected that futexes attach to
	 * the object not the particular process.
	 */
	if (folio_test_anon(folio)) {
		/*
		 * A RO anonymous page will never change and thus doesn't make
		 * sense for futex operations.
		 */
		if (unlikely(should_fail_futex(true)) || ro) {
			err = -EFAULT;
			goto out;
		}

		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
		key->private.mm = mm;
		key->private.address = address;

	} else {
		struct inode *inode;

		/*
		 * The associated futex object in this case is the inode and
		 * the folio->mapping must be traversed. Ordinarily this should
		 * be stabilised under folio lock but it's not strictly
		 * necessary in this case as we just want to pin the inode, not
		 * update i_pages or anything like that.
		 *
		 * The RCU read lock is taken as the inode is finally freed
		 * under RCU. If the mapping still matches expectations then the
		 * mapping->host can be safely accessed as being a valid inode.
		 */
		rcu_read_lock();

		if (READ_ONCE(folio->mapping) != mapping) {
			rcu_read_unlock();
			folio_put(folio);

			goto again;
		}

		inode = READ_ONCE(mapping->host);
		if (!inode) {
			rcu_read_unlock();
			folio_put(folio);

			goto again;
		}

		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
		key->shared.i_seq = get_inode_sequence_number(inode);
		key->shared.pgoff = page_pgoff(folio, page);
		rcu_read_unlock();
	}

out:
	folio_put(folio);
	return err;
}

/**
 * fault_in_user_writeable() - Fault in user address and verify RW access
 * @uaddr:	pointer to faulting user space address
 *
 * Slow path to fixup the fault we just took in the atomic write
 * access to @uaddr.
 *
 * We have no generic implementation of a non-destructive write to the
 * user address. We know that we faulted in the atomic pagefault
 * disabled section so we can as well avoid the #PF overhead by
 * calling get_user_pages() right away.
 */
int fault_in_user_writeable(u32 __user *uaddr)
{
	struct mm_struct *mm = current->mm;
	int ret;

	mmap_read_lock(mm);
	ret = fixup_user_fault(mm, (unsigned long)uaddr,
			       FAULT_FLAG_WRITE, NULL);
	mmap_read_unlock(mm);

	return ret < 0 ? ret : 0;
}

/**
 * futex_top_waiter() - Return the highest priority waiter on a futex
 * @hb:		the hash bucket the futex_q's reside in
 * @key:	the futex key (to distinguish it from other futex futex_q's)
 *
 * Must be called with the hb lock held.
 */
struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
{
	struct futex_q *this;

	plist_for_each_entry(this, &hb->chain, list) {
		if (futex_match(&this->key, key))
			return this;
	}
	return NULL;
}

/**
 * wait_for_owner_exiting - Block until the owner has exited
 * @ret:	owner's current futex lock status
 * @exiting:	Pointer to the exiting task
 *
 * Caller must hold a refcount on @exiting.
 */
void wait_for_owner_exiting(int ret, struct task_struct *exiting)
{
	if (ret != -EBUSY) {
		WARN_ON_ONCE(exiting);
		return;
	}

	if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
		return;

	mutex_lock(&exiting->futex_exit_mutex);
	/*
	 * No point in doing state checking here. If the waiter got here
	 * while the task was in exec()->exec_futex_release() then it can
	 * have any FUTEX_STATE_* value when the waiter has acquired the
	 * mutex. OK, if running, EXITING or DEAD if it reached exit()
	 * already. Highly unlikely and not a problem. Just one more round
	 * through the futex maze.
	 */
	mutex_unlock(&exiting->futex_exit_mutex);

	put_task_struct(exiting);
}

/**
 * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be NULL and must be held by the caller.
 */
void __futex_unqueue(struct futex_q *q)
{
	struct futex_hash_bucket *hb;

	if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
		return;
	lockdep_assert_held(q->lock_ptr);

	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
	plist_del(&q->list, &hb->chain);
	futex_hb_waiters_dec(hb);
}

/* The key must be already stored in q->key. */
void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb)
	__acquires(&hb->lock)
{
	/*
	 * Increment the counter before taking the lock so that
	 * a potential waker won't miss a to-be-slept task that is
	 * waiting for the spinlock. This is safe as all futex_q_lock()
	 * users end up calling futex_queue(). Similarly, for housekeeping,
	 * decrement the counter at futex_q_unlock() when some error has
	 * occurred and we don't end up adding the task to the list.
	 */
	futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */

	q->lock_ptr = &hb->lock;

	spin_lock(&hb->lock);
}

void futex_q_unlock(struct futex_hash_bucket *hb)
	__releases(&hb->lock)
{
	futex_hb_waiters_dec(hb);
	spin_unlock(&hb->lock);
}

void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
		   struct task_struct *task)
{
	int prio;

	/*
	 * The priority used to register this element is
	 * - either the real thread-priority for the real-time threads
	 *   (i.e. threads with a priority lower than MAX_RT_PRIO)
	 * - or MAX_RT_PRIO for non-RT threads.
	 * Thus, all RT-threads are woken first in priority order, and
	 * the others are woken last, in FIFO order.
	 */
	prio = min(current->normal_prio, MAX_RT_PRIO);

	plist_node_init(&q->list, prio);
	plist_add(&q->list, &hb->chain);
	q->task = task;
}

/**
 * futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must
 * be paired with exactly one earlier call to futex_queue().
 *
 * Return:
 *  - 1 - if the futex_q was still queued (and we removed it);
 *  - 0 - if the futex_q was already removed by the waking thread
 */
int futex_unqueue(struct futex_q *q)
{
	spinlock_t *lock_ptr;
	int ret = 0;

	/* In the common case we don't take the spinlock, which is nice. */
retry:
	/*
	 * q->lock_ptr can change between this read and the following spin_lock.
	 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
	 * optimizing lock_ptr out of the logic below.
	 */
	lock_ptr = READ_ONCE(q->lock_ptr);
	if (lock_ptr != NULL) {
		spin_lock(lock_ptr);
		/*
		 * q->lock_ptr can change between reading it and
		 * spin_lock(), causing us to take the wrong lock. This
		 * corrects the race condition.
		 *
		 * Reasoning goes like this: if we have the wrong lock,
		 * q->lock_ptr must have changed (maybe several times)
		 * between reading it and the spin_lock(). It can
		 * change again after the spin_lock() but only if it was
		 * already changed before the spin_lock(). It cannot,
		 * however, change back to the original value. Therefore
		 * we can detect whether we acquired the correct lock.
		 */
		if (unlikely(lock_ptr != q->lock_ptr)) {
			spin_unlock(lock_ptr);
			goto retry;
		}
		__futex_unqueue(q);

		BUG_ON(q->pi_state);

		spin_unlock(lock_ptr);
		ret = 1;
	}

	return ret;
}

void futex_q_lockptr_lock(struct futex_q *q)
{
	spinlock_t *lock_ptr;

	/*
	 * See futex_unqueue() for why lock_ptr can change.
	 */
	guard(rcu)();
retry:
	lock_ptr = READ_ONCE(q->lock_ptr);
	spin_lock(lock_ptr);

	if (unlikely(lock_ptr != q->lock_ptr)) {
		spin_unlock(lock_ptr);
		goto retry;
	}
}

/*
 * PI futexes can not be requeued and must remove themselves from the hash
 * bucket. The hash bucket lock (i.e. lock_ptr) is held.
 */
void futex_unqueue_pi(struct futex_q *q)
{
	/*
	 * If the lock was not acquired (due to timeout or signal) then the
	 * rt_waiter is removed before futex_q is. If this is observed by
	 * an unlocker after dropping the rtmutex wait lock and before
	 * acquiring the hash bucket lock, then the unlocker dequeues the
	 * futex_q from the hash bucket list to guarantee consistent state
	 * vs. userspace. Therefore the dequeue here must be conditional.
	 */
	if (!plist_node_empty(&q->list))
		__futex_unqueue(q);

	BUG_ON(!q->pi_state);
	put_pi_state(q->pi_state);
	q->pi_state = NULL;
}

/* Constants for the pending_op argument of handle_futex_death */
#define HANDLE_DEATH_PENDING	true
#define HANDLE_DEATH_LIST	false

/*
 * Process a futex-list entry, check whether it's owned by the
 * dying task, and do notification if so:
 */
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
			      bool pi, bool pending_op)
{
	u32 uval, nval, mval;
	pid_t owner;
	int err;

	/* Futex address must be 32bit aligned */
	if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
		return -1;

retry:
	if (get_user(uval, uaddr))
		return -1;

	/*
	 * Special case for regular (non PI) futexes. The unlock path in
	 * user space has two race scenarios:
	 *
	 * 1. The unlock path releases the user space futex value and
	 *    before it can execute the futex() syscall to wake up
	 *    waiters it is killed.
	 *
	 * 2. A woken up waiter is killed before it can acquire the
	 *    futex in user space.
	 *
	 * In the second case, the wake up notification could be generated
	 * by the unlock path in user space after setting the futex value
	 * to zero or by the kernel after setting the OWNER_DIED bit below.
	 *
	 * In both cases the TID validation below prevents a wakeup of
	 * potential waiters which can cause these waiters to block
	 * forever.
	 *
	 * In both cases the following conditions are met:
	 *
	 *	1) task->robust_list->list_op_pending != NULL
	 *	   @pending_op == true
	 *	2) The owner part of user space futex value == 0
	 *	3) Regular futex: @pi == false
	 *
	 * If these conditions are met, it is safe to attempt waking up a
	 * potential waiter without touching the user space futex value and
	 * trying to set the OWNER_DIED bit. If the futex value is zero,
	 * the rest of the user space mutex state is consistent, so a woken
	 * waiter will just take over the uncontended futex. Setting the
	 * OWNER_DIED bit would create inconsistent state and malfunction
	 * of the user space owner died handling. Otherwise, the OWNER_DIED
	 * bit is already set, and the woken waiter is expected to deal with
	 * this.
	 */
	owner = uval & FUTEX_TID_MASK;

	if (pending_op && !pi && !owner) {
		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
			   FUTEX_BITSET_MATCH_ANY);
		return 0;
	}

	if (owner != task_pid_vnr(curr))
		return 0;

	/*
	 * Ok, this dying thread is truly holding a futex
	 * of interest. Set the OWNER_DIED bit atomically
	 * via cmpxchg, and if the value had FUTEX_WAITERS
	 * set, wake up a waiter (if any). (We have to do a
	 * futex_wake() even if OWNER_DIED is already set -
	 * to handle the rare but possible case of recursive
	 * thread-death.) The rest of the cleanup is done in
	 * userspace.
	 */
	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;

	/*
	 * We are not holding a lock here, but we want to have
	 * the pagefault_disable/enable() protection because
	 * we want to handle the fault gracefully. If the
	 * access fails we try to fault in the futex with R/W
	 * verification via get_user_pages. get_user() above
	 * does not guarantee R/W access. If that fails we
	 * give up and leave the futex locked.
	 */
	if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
		switch (err) {
		case -EFAULT:
			if (fault_in_user_writeable(uaddr))
				return -1;
			goto retry;

		case -EAGAIN:
			cond_resched();
			goto retry;

		default:
			WARN_ON_ONCE(1);
			return err;
		}
	}

	if (nval != uval)
		goto retry;

	/*
	 * Wake robust non-PI futexes here. The wakeup of
	 * PI futexes happens in exit_pi_state():
	 */
	if (!pi && (uval & FUTEX_WAITERS)) {
		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
			   FUTEX_BITSET_MATCH_ANY);
	}

	return 0;
}

/*
 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 */
static inline int fetch_robust_entry(struct robust_list __user **entry,
				     struct robust_list __user * __user *head,
				     unsigned int *pi)
{
	unsigned long uentry;

	if (get_user(uentry, (unsigned long __user *)head))
		return -EFAULT;

	*entry = (void __user *)(uentry & ~1UL);
	*pi = uentry & 1;

	return 0;
}

/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
static void exit_robust_list(struct task_struct *curr)
{
	struct robust_list_head __user *head = curr->robust_list;
	struct robust_list __user *entry, *next_entry, *pending;
	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
	unsigned int next_pi;
	unsigned long futex_offset;
	int rc;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (fetch_robust_entry(&entry, &head->list.next, &pi))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
		return;

	next_entry = NULL;	/* avoid warning with gcc */
	while (entry != &head->list) {
		/*
		 * Fetch the next entry in the list before calling
		 * handle_futex_death:
		 */
		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice:
		 */
		if (entry != pending) {
			if (handle_futex_death((void __user *)entry + futex_offset,
					       curr, pi, HANDLE_DEATH_LIST))
				return;
		}
		if (rc)
			return;
		entry = next_entry;
		pi = next_pi;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		cond_resched();
	}

	if (pending) {
		handle_futex_death((void __user *)pending + futex_offset,
				   curr, pip, HANDLE_DEATH_PENDING);
	}
}

#ifdef CONFIG_COMPAT
static void __user *futex_uaddr(struct robust_list __user *entry,
				compat_long_t futex_offset)
{
	compat_uptr_t base = ptr_to_compat(entry);
	void __user *uaddr = compat_ptr(base + futex_offset);

	return uaddr;
}

/*
 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 */
static inline int
compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
			  compat_uptr_t __user *head, unsigned int *pi)
{
	if (get_user(*uentry, head))
		return -EFAULT;

	*entry = compat_ptr((*uentry) & ~1);
	*pi = (unsigned int)(*uentry) & 1;

	return 0;
}

/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
static void compat_exit_robust_list(struct task_struct *curr)
{
	struct compat_robust_list_head __user *head = curr->compat_robust_list;
	struct robust_list __user *entry, *next_entry, *pending;
	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
	unsigned int next_pi;
	compat_uptr_t uentry, next_uentry, upending;
	compat_long_t futex_offset;
	int rc;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (compat_fetch_robust_entry(&upending, &pending,
				      &head->list_op_pending, &pip))
		return;

	next_entry = NULL;	/* avoid warning with gcc */
	while (entry != (struct robust_list __user *) &head->list) {
		/*
		 * Fetch the next entry in the list before calling
		 * handle_futex_death:
		 */
		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
					       (compat_uptr_t __user *)&entry->next, &next_pi);
		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice:
		 */
		if (entry != pending) {
			void __user *uaddr = futex_uaddr(entry, futex_offset);

			if (handle_futex_death(uaddr, curr, pi,
					       HANDLE_DEATH_LIST))
				return;
		}
		if (rc)
			return;
		uentry = next_uentry;
		entry = next_entry;
		pi = next_pi;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		cond_resched();
	}
	if (pending) {
		void __user *uaddr = futex_uaddr(pending, futex_offset);

		handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
	}
}
#endif

#ifdef CONFIG_FUTEX_PI

/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 */
static void exit_pi_state_list(struct task_struct *curr)
{
	struct list_head *next, *head = &curr->pi_state_list;
	struct futex_pi_state *pi_state;
	union futex_key key = FUTEX_KEY_INIT;

	/*
	 * Ensure the hash remains stable (no resize) during the while loop
	 * below. The hb pointer is acquired under the pi_lock so we can't block
	 * on the mutex.
	 */
	WARN_ON(curr != current);
	guard(private_hash)();
	/*
	 * We are a ZOMBIE and nobody can enqueue itself on
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
	raw_spin_lock_irq(&curr->pi_lock);
	while (!list_empty(head)) {
		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		key = pi_state->key;
		if (1) {
			CLASS(hb, hb)(&key);

			/*
			 * We can race against put_pi_state() removing itself from the
			 * list (a waiter going away). put_pi_state() will first
			 * decrement the reference count and then modify the list, so
			 * it's possible to see the list entry but fail this reference
			 * acquire.
			 *
			 * In that case; drop the locks to let put_pi_state() make
			 * progress and retry the loop.
			 */
			if (!refcount_inc_not_zero(&pi_state->refcount)) {
				raw_spin_unlock_irq(&curr->pi_lock);
				cpu_relax();
				raw_spin_lock_irq(&curr->pi_lock);
				continue;
			}
			raw_spin_unlock_irq(&curr->pi_lock);

			spin_lock(&hb->lock);
			raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
			raw_spin_lock(&curr->pi_lock);
			/*
			 * We dropped the pi-lock, so re-check whether this
			 * task still owns the PI-state:
			 */
			if (head->next != next) {
				/* retain curr->pi_lock for the loop invariant */
				raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
				spin_unlock(&hb->lock);
				put_pi_state(pi_state);
				continue;
			}

			WARN_ON(pi_state->owner != curr);
			WARN_ON(list_empty(&pi_state->list));
			list_del_init(&pi_state->list);
			pi_state->owner = NULL;

			raw_spin_unlock(&curr->pi_lock);
			raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
			spin_unlock(&hb->lock);
		}

		rt_mutex_futex_unlock(&pi_state->pi_mutex);
		put_pi_state(pi_state);

		raw_spin_lock_irq(&curr->pi_lock);
	}
	raw_spin_unlock_irq(&curr->pi_lock);
}
#else
static inline void exit_pi_state_list(struct task_struct *curr) { }
#endif

static void futex_cleanup(struct task_struct *tsk)
{
	if (unlikely(tsk->robust_list)) {
		exit_robust_list(tsk);
		tsk->robust_list = NULL;
	}

#ifdef CONFIG_COMPAT
	if (unlikely(tsk->compat_robust_list)) {
		compat_exit_robust_list(tsk);
		tsk->compat_robust_list = NULL;
	}
#endif

	if (unlikely(!list_empty(&tsk->pi_state_list)))
		exit_pi_state_list(tsk);
}

/**
 * futex_exit_recursive - Set the task's futex state to FUTEX_STATE_DEAD
 * @tsk:	task to set the state on
 *
 * Set the futex exit state of the task lockless. The futex waiter code
 * observes that state when a task is exiting and loops until the task has
 * actually finished the futex cleanup. The worst case for this is that the
 * waiter runs through the wait loop until the state becomes visible.
 *
 * This is called from the recursive fault handling path in make_task_dead().
 *
 * This is best effort. Either the futex exit code has run already or
 * not. If the OWNER_DIED bit has been set on the futex then the waiter can
 * take it over. If not, the problem is pushed back to user space. If the
 * futex exit code did not run yet, then an already queued waiter might
 * block forever, but there is nothing which can be done about that.
 */
void futex_exit_recursive(struct task_struct *tsk)
{
	/* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
	if (tsk->futex_state == FUTEX_STATE_EXITING)
		mutex_unlock(&tsk->futex_exit_mutex);
	tsk->futex_state = FUTEX_STATE_DEAD;
}

static void futex_cleanup_begin(struct task_struct *tsk)
{
	/*
	 * Prevent various race issues against a concurrent incoming waiter
	 * including live locks by forcing the waiter to block on
	 * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
	 * attach_to_pi_owner().
	 */
	mutex_lock(&tsk->futex_exit_mutex);

	/*
	 * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
	 *
	 * This ensures that all subsequent checks of tsk->futex_state in
	 * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
	 * tsk->pi_lock held.
	 *
	 * It guarantees also that a pi_state which was queued right before
	 * the state change under tsk->pi_lock by a concurrent waiter must
	 * be observed in exit_pi_state_list().
	 */
	raw_spin_lock_irq(&tsk->pi_lock);
	tsk->futex_state = FUTEX_STATE_EXITING;
	raw_spin_unlock_irq(&tsk->pi_lock);
}

static void futex_cleanup_end(struct task_struct *tsk, int state)
{
	/*
	 * Lockless store. The only side effect is that an observer might
	 * take another loop until it becomes visible.
	 */
	tsk->futex_state = state;
	/*
	 * Drop the exit protection. This unblocks waiters which observed
	 * FUTEX_STATE_EXITING to reevaluate the state.
	 */
	mutex_unlock(&tsk->futex_exit_mutex);
}

void futex_exec_release(struct task_struct *tsk)
{
	/*
	 * The state handling is done for consistency, but in the case of
	 * exec() there is no way to prevent further damage as the PID stays
	 * the same. But for the unlikely and arguably buggy case that a
	 * futex is held on exec(), this provides at least as much state
	 * consistency protection as is possible.
	 */
	futex_cleanup_begin(tsk);
	futex_cleanup(tsk);
	/*
	 * Reset the state to FUTEX_STATE_OK. The task is alive and about to
	 * exec a new binary.
	 */
	futex_cleanup_end(tsk, FUTEX_STATE_OK);
}

void futex_exit_release(struct task_struct *tsk)
{
	futex_cleanup_begin(tsk);
	futex_cleanup(tsk);
	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}

static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
				   struct futex_private_hash *fph)
{
#ifdef CONFIG_FUTEX_PRIVATE_HASH
	fhb->priv = fph;
#endif
	atomic_set(&fhb->waiters, 0);
	plist_head_init(&fhb->chain);
	spin_lock_init(&fhb->lock);
}

#ifdef CONFIG_FUTEX_PRIVATE_HASH
void futex_hash_free(struct mm_struct *mm)
{
	kvfree(mm->futex_phash);
}

static int futex_hash_allocate(unsigned int hash_slots, bool custom)
{
	struct mm_struct *mm = current->mm;
	struct futex_private_hash *fph;
	int i;

	if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots)))
		return -EINVAL;

	if (mm->futex_phash)
		return -EALREADY;

	if (!thread_group_empty(current))
		return -EINVAL;

	fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
	if (!fph)
		return -ENOMEM;

	fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
	fph->custom = custom;
	fph->mm = mm;

	for (i = 0; i < hash_slots; i++)
		futex_hash_bucket_init(&fph->queues[i], fph);

	mm->futex_phash = fph;
	return 0;
}

int futex_hash_allocate_default(void)
{
	if (!current->mm)
		return 0;

	if (current->mm->futex_phash)
		return 0;

	return futex_hash_allocate(16, false);
}

static int futex_hash_get_slots(void)
{
	struct futex_private_hash *fph;

	fph = current->mm->futex_phash;
	if (fph && fph->hash_mask)
		return fph->hash_mask + 1;
	return 0;
}

#else

static int futex_hash_allocate(unsigned int hash_slots, bool custom)
{
	return -EINVAL;
}

static int futex_hash_get_slots(void)
{
	return 0;
}
#endif

int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
{
	int ret;

	switch (arg2) {
	case PR_FUTEX_HASH_SET_SLOTS:
		if (arg4 != 0)
			return -EINVAL;
		ret = futex_hash_allocate(arg3, true);
		break;

	case PR_FUTEX_HASH_GET_SLOTS:
		ret = futex_hash_get_slots();
		break;

	default:
		ret = -EINVAL;
		break;
	}
	return ret;
}

static int __init futex_init(void)
{
	unsigned long hashsize, i;
	unsigned int futex_shift;

#ifdef CONFIG_BASE_SMALL
	hashsize = 16;
#else
	hashsize = roundup_pow_of_two(256 * num_possible_cpus());
#endif

	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
					       hashsize, 0, 0,
					       &futex_shift, NULL,
					       hashsize, hashsize);
	hashsize = 1UL << futex_shift;

	for (i = 0; i < hashsize; i++)
		futex_hash_bucket_init(&futex_queues[i], NULL);

	futex_hashmask = hashsize - 1;
	return 0;
}
core_initcall(futex_init);
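
/*
 * Purely illustrative userspace sketches follow. They are not part of the
 * kernel build and are kept inside this comment so the file still compiles;
 * the exact headers and error handling are assumptions, not a reference.
 *
 * 1) Resizing the per-process futex hash via the prctl() interface that
 *    futex_hash_prctl() above implements (assuming the PR_FUTEX_HASH option
 *    and the PR_FUTEX_HASH_* sub-commands from <linux/prctl.h>):
 *
 *	#include <stdio.h>
 *	#include <sys/prctl.h>
 *	#include <linux/prctl.h>
 *
 *	// Request a private futex hash with 16 slots; the last argument
 *	// must be 0, mirroring the arg4 check in futex_hash_prctl().
 *	if (prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 16, 0))
 *		perror("PR_FUTEX_HASH_SET_SLOTS");
 *
 *	// Query the current number of slots (0 means no private hash).
 *	printf("slots: %d\n",
 *	       prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS, 0, 0));
 *
 * 2) Registering a robust-list head, which is what exit_robust_list() walks
 *    when the task dies. In practice the C library registers one per thread
 *    automatically; the raw form is sketched here only to show the data
 *    structure the kernel expects:
 *
 *	#include <linux/futex.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static struct robust_list_head head = {
 *		.list		 = { &head.list },	// empty circular list
 *		.futex_offset	 = 0,			// offset of the lock word in each entry
 *		.list_op_pending = NULL,
 *	};
 *
 *	// Tell the kernel where this thread's robust list lives.
 *	syscall(SYS_set_robust_list, &head, sizeof(head));
 */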