// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Fast Userspace Mutexes (which I call "Futexes!").
 * (C) Rusty Russell, IBM 2002
 *
 * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
 * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
 *
 * Removed page pinning, fix privately mapped COW pages and other cleanups
 * (C) Copyright 2003, 2004 Jamie Lokier
 *
 * Robust futex support started by Ingo Molnar
 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
 *
 * PI-futex support started by Ingo Molnar and Thomas Gleixner
 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
 *
 * PRIVATE futexes by Eric Dumazet
 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
 *
 * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
 * Copyright (C) IBM Corporation, 2009
 * Thanks to Thomas Gleixner for conceptual design and careful reviews.
 *
 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
 * enough at me, Linus for the original (flawed) idea, Matthew
 * Kirkwood for proof-of-concept implementation.
 *
 * "The futexes are also cursed."
 * "But they come in a choice of three flavours!"
 */
#include <linux/compat.h>
#include <linux/jhash.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/plist.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/slab.h>
#include <linux/prctl.h>
#include <linux/rcuref.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * The base of the bucket array and its size are always used together
 * (after initialization only in futex_hash()), so ensure that they
 * reside in the same cacheline.
 */
static struct {
	struct futex_hash_bucket *queues;
	unsigned long		hashmask;
} __futex_data __read_mostly __aligned(2*sizeof(long));

#define futex_queues	(__futex_data.queues)
#define futex_hashmask	(__futex_data.hashmask)

struct futex_private_hash {
	rcuref_t	users;
	unsigned int	hash_mask;
	struct rcu_head	rcu;
	void		*mm;
	bool		custom;
	bool		immutable;
	struct futex_hash_bucket queues[];
};

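/*
 * Illustrative sketch (added for orientation, not taken from the original
 * sources): for a private hash created with 16 slots, hash_mask is 15 and a
 * bucket is picked as
 *
 *	hash = jhash2((void *)&key->private.address,
 *		      sizeof(key->private.address) / 4, key->both.offset);
 *	hb   = &fph->queues[hash & fph->hash_mask];	// index 0..15
 *
 * mirroring __futex_hash_private() below. A hash_mask of 0 means "no usable
 * private hash", in which case lookups fall back to the global futex_queues
 * array indexed with futex_hashmask.
 */
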
/*
 * Fault injections for futexes.
 */
#ifdef CONFIG_FAIL_FUTEX

static struct {
	struct fault_attr attr;

	bool ignore_private;
} fail_futex = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_private = false,
};

static int __init setup_fail_futex(char *str)
{
	return setup_fault_attr(&fail_futex.attr, str);
}
__setup("fail_futex=", setup_fail_futex);

bool should_fail_futex(bool fshared)
{
	if (fail_futex.ignore_private && !fshared)
		return false;

	return should_fail(&fail_futex.attr, 1);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

static int __init fail_futex_debugfs(void)
{
	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_futex", NULL,
					&fail_futex.attr);
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	debugfs_create_bool("ignore-private", mode, dir,
			    &fail_futex.ignore_private);
	return 0;
}

late_initcall(fail_futex_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

#endif /* CONFIG_FAIL_FUTEX */

static struct futex_hash_bucket *
__futex_hash(union futex_key *key, struct futex_private_hash *fph);

#ifdef CONFIG_FUTEX_PRIVATE_HASH
static inline bool futex_key_is_private(union futex_key *key)
{
	/*
	 * Relies on get_futex_key() to set either bit for shared
	 * futexes -- see comment with union futex_key.
	 */
	return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
}

bool futex_private_hash_get(struct futex_private_hash *fph)
{
	if (fph->immutable)
		return true;
	return rcuref_get(&fph->users);
}

void futex_private_hash_put(struct futex_private_hash *fph)
{
	/* Ignore return value, last put is verified via rcuref_is_dead() */
	if (fph->immutable)
		return;
	if (rcuref_put(&fph->users))
		wake_up_var(fph->mm);
}

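/*
 * Usage sketch (illustration only, not part of the original code): every
 * successful futex_private_hash_get() must be paired with a
 * futex_private_hash_put(). Dropping the last reference wakes a waiter in
 * futex_hash_allocate() via wake_up_var(mm) so a prepared replacement hash
 * can be installed:
 *
 *	if (futex_private_hash_get(fph)) {
 *		// ... use fph->queues[] ...
 *		futex_private_hash_put(fph);
 *	}
 *
 * Immutable hashes are never replaced, so get/put degenerate to no-ops.
 */
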
/**
 * futex_hash_get - Get an additional reference for the local hash.
 * @hb: ptr to the private local hash.
 *
 * Obtain an additional reference for the already obtained hash bucket. The
 * caller must already own a reference.
 */
void futex_hash_get(struct futex_hash_bucket *hb)
{
	struct futex_private_hash *fph = hb->priv;

	if (!fph)
		return;
	WARN_ON_ONCE(!futex_private_hash_get(fph));
}

void futex_hash_put(struct futex_hash_bucket *hb)
{
	struct futex_private_hash *fph = hb->priv;

	if (!fph)
		return;
	futex_private_hash_put(fph);
}

static struct futex_hash_bucket *
__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
{
	u32 hash;

	if (!futex_key_is_private(key))
		return NULL;

	if (!fph)
		fph = rcu_dereference(key->private.mm->futex_phash);
	if (!fph || !fph->hash_mask)
		return NULL;

	hash = jhash2((void *)&key->private.address,
		      sizeof(key->private.address) / 4,
		      key->both.offset);
	return &fph->queues[hash & fph->hash_mask];
}

static void futex_rehash_private(struct futex_private_hash *old,
				 struct futex_private_hash *new)
{
	struct futex_hash_bucket *hb_old, *hb_new;
	unsigned int slots = old->hash_mask + 1;
	unsigned int i;

	for (i = 0; i < slots; i++) {
		struct futex_q *this, *tmp;

		hb_old = &old->queues[i];

		spin_lock(&hb_old->lock);
		plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {

			plist_del(&this->list, &hb_old->chain);
			futex_hb_waiters_dec(hb_old);

			WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);

			hb_new = __futex_hash(&this->key, new);
			futex_hb_waiters_inc(hb_new);
			/*
			 * The new pointer isn't published yet but an already
			 * moved user can be unqueued due to timeout or signal.
			 */
			spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING);
			plist_add(&this->list, &hb_new->chain);
			this->lock_ptr = &hb_new->lock;
			spin_unlock(&hb_new->lock);
		}
		spin_unlock(&hb_old->lock);
	}
}

static bool __futex_pivot_hash(struct mm_struct *mm,
			       struct futex_private_hash *new)
{
	struct futex_private_hash *fph;

	WARN_ON_ONCE(mm->futex_phash_new);

	fph = rcu_dereference_protected(mm->futex_phash,
					lockdep_is_held(&mm->futex_hash_lock));
	if (fph) {
		if (!rcuref_is_dead(&fph->users)) {
			mm->futex_phash_new = new;
			return false;
		}

		futex_rehash_private(fph, new);
	}
	rcu_assign_pointer(mm->futex_phash, new);
	kvfree_rcu(fph, rcu);
	return true;
}

static void futex_pivot_hash(struct mm_struct *mm)
{
	scoped_guard(mutex, &mm->futex_hash_lock) {
		struct futex_private_hash *fph;

		fph = mm->futex_phash_new;
		if (fph) {
			mm->futex_phash_new = NULL;
			__futex_pivot_hash(mm, fph);
		}
	}
}

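/*
 * Resize flow, summarized for orientation (illustration only): a replacement
 * hash is either installed directly by __futex_pivot_hash() when the old
 * hash has no users left, or parked in mm->futex_phash_new. Once the last
 * futex_private_hash_put() drops the old hash, a later futex_private_hash()
 * or futex_hash() call performs the pivot via futex_pivot_hash(), which
 * rehashes all queued waiters under mm->futex_hash_lock, publishes the new
 * pointer with rcu_assign_pointer() and frees the old one with kvfree_rcu().
 */
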
struct futex_private_hash *futex_private_hash(void)
{
	struct mm_struct *mm = current->mm;
	/*
	 * Ideally we don't loop. If there is a replacement in progress
	 * then a new private hash is already prepared and a reference can't be
	 * obtained once the last user dropped its reference.
	 * In that case we block on mm_struct::futex_hash_lock and either have
	 * to perform the replacement or wait while someone else is doing the
	 * job. Either way, on the second iteration we acquire a reference on the
	 * new private hash or loop again because a new replacement has been
	 * requested.
	 */
again:
	scoped_guard(rcu) {
		struct futex_private_hash *fph;

		fph = rcu_dereference(mm->futex_phash);
		if (!fph)
			return NULL;

		if (fph->immutable)
			return fph;
		if (rcuref_get(&fph->users))
			return fph;
	}
	futex_pivot_hash(mm);
	goto again;
}

struct futex_hash_bucket *futex_hash(union futex_key *key)
{
	struct futex_private_hash *fph;
	struct futex_hash_bucket *hb;

again:
	scoped_guard(rcu) {
		hb = __futex_hash(key, NULL);
		fph = hb->priv;

		if (!fph || futex_private_hash_get(fph))
			return hb;
	}
	futex_pivot_hash(key->private.mm);
	goto again;
}

#else /* !CONFIG_FUTEX_PRIVATE_HASH */

static struct futex_hash_bucket *
__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
{
	return NULL;
}

struct futex_hash_bucket *futex_hash(union futex_key *key)
{
	return __futex_hash(key, NULL);
}

#endif /* CONFIG_FUTEX_PRIVATE_HASH */

/**
 * __futex_hash - Return the hash bucket
 * @key: Pointer to the futex key for which the hash is calculated
 * @fph: Pointer to private hash if known
 *
 * We hash on the keys returned from get_futex_key (see below) and return the
 * corresponding hash bucket.
 * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the
 * private hash) is returned if existing. Otherwise a hash bucket from the
 * global hash is returned.
 */
static struct futex_hash_bucket *
__futex_hash(union futex_key *key, struct futex_private_hash *fph)
{
	struct futex_hash_bucket *hb;
	u32 hash;

	hb = __futex_hash_private(key, fph);
	if (hb)
		return hb;

	hash = jhash2((u32 *)key,
		      offsetof(typeof(*key), both.offset) / 4,
		      key->both.offset);
	return &futex_queues[hash & futex_hashmask];
}

/**
 * futex_setup_timer - set up the sleeping hrtimer.
 * @time:	ptr to the given timeout value
 * @timeout:	the hrtimer_sleeper structure to be set up
 * @flags:	futex flags
 * @range_ns:	optional range in ns
 *
 * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
 *	   value given
 */
struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
		  int flags, u64 range_ns)
{
	if (!time)
		return NULL;

	hrtimer_setup_sleeper_on_stack(timeout,
				       (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC,
				       HRTIMER_MODE_ABS);
	/*
	 * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
	 * effectively the same as calling hrtimer_set_expires().
	 */
	hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);

	return timeout;
}

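/*
 * Typical caller pattern, sketched for illustration (hedged, mirroring the
 * futex wait paths rather than prescribing an API):
 *
 *	struct hrtimer_sleeper timeout, *to;
 *
 *	to = futex_setup_timer(abs_time, &timeout, flags,
 *			       current->timer_slack_ns);
 *	if (to)
 *		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
 *	// ... queue and sleep ...
 *	if (to) {
 *		hrtimer_cancel(&to->timer);
 *		destroy_hrtimer_on_stack(&to->timer);
 *	}
 */
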
/*
 * Generate a machine wide unique identifier for this inode.
 *
 * This relies on u64 not wrapping in the life-time of the machine; which with
 * 1ns resolution means almost 585 years.
 *
 * This further relies on the fact that a well formed program will not unmap
 * the file while it has a (shared) futex waiting on it. This mapping will have
 * a file reference which pins the mount and inode.
 *
 * If for some reason an inode gets evicted and read back in again, it will get
 * a new sequence number and will _NOT_ match, even though it is the exact same
 * file.
 *
 * It is important that futex_match() will never have a false-positive, esp.
 * for PI futexes that can mess up the state. The above argues that
 * false-negatives are only possible for malformed programs.
 */
static u64 get_inode_sequence_number(struct inode *inode)
{
	static atomic64_t i_seq;
	u64 old;

	/* Does the inode already have a sequence number? */
	old = atomic64_read(&inode->i_sequence);
	if (likely(old))
		return old;

	for (;;) {
		u64 new = atomic64_inc_return(&i_seq);
		if (WARN_ON_ONCE(!new))
			continue;

		old = 0;
		if (!atomic64_try_cmpxchg_relaxed(&inode->i_sequence, &old, new))
			return old;
		return new;
	}
}

/**
 * get_futex_key() - Get parameters which are the keys for a futex
 * @uaddr:	virtual address of the futex
 * @flags:	FLAGS_*
 * @key:	address where result is stored.
 * @rw:		mapping needs to be read/write (values: FUTEX_READ,
 *		FUTEX_WRITE)
 *
 * Return: a negative error code or 0
 *
 * The key words are stored in @key on success.
 *
 * For shared mappings (when @fshared), the key is:
 *
 *   ( inode->i_sequence, page->index, offset_within_page )
 *
 * [ also see get_inode_sequence_number() ]
 *
 * For private mappings (or when !@fshared), the key is:
 *
 *   ( current->mm, address, 0 )
 *
 * This allows (cross process, where applicable) identification of the futex
 * without keeping the page pinned for the duration of the FUTEX_WAIT.
 *
 * lock_page() might sleep, the caller should not hold a spinlock.
 */
int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
		  enum futex_access rw)
{
	unsigned long address = (unsigned long)uaddr;
	struct mm_struct *mm = current->mm;
	struct page *page;
	struct folio *folio;
	struct address_space *mapping;
	int err, ro = 0;
	bool fshared;

	fshared = flags & FLAGS_SHARED;

	/*
	 * The futex address must be "naturally" aligned.
	 */
	key->both.offset = address % PAGE_SIZE;
	if (unlikely((address % sizeof(u32)) != 0))
		return -EINVAL;
	address -= key->both.offset;

	if (unlikely(!access_ok(uaddr, sizeof(u32))))
		return -EFAULT;

	if (unlikely(should_fail_futex(fshared)))
		return -EFAULT;

	/*
	 * PROCESS_PRIVATE futexes are fast.
	 * As the mm cannot disappear under us and the 'key' only needs
	 * virtual address, we don't even have to find the underlying vma.
	 * Note: We do have to check 'uaddr' is a valid user address,
	 * but access_ok() should be faster than find_vma().
	 */
	if (!fshared) {
		/*
		 * On no-MMU, shared futexes are treated as private, therefore
		 * we must not include the current process in the key. Since
		 * there is only one address space, the address is a unique key
		 * on its own.
		 */
		if (IS_ENABLED(CONFIG_MMU))
			key->private.mm = mm;
		else
			key->private.mm = NULL;

		key->private.address = address;
		return 0;
	}

again:
	/* Ignore any VERIFY_READ mapping (futex common case) */
	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
	/*
	 * If write access is not required (eg. FUTEX_WAIT), try
	 * and get read-only access.
	 */
	if (err == -EFAULT && rw == FUTEX_READ) {
		err = get_user_pages_fast(address, 1, 0, &page);
		ro = 1;
	}
	if (err < 0)
		return err;
	else
		err = 0;

	/*
	 * The treatment of mapping from this point on is critical. The folio
	 * lock protects many things but in this context the folio lock
	 * stabilizes mapping, prevents inode freeing in the shared
	 * file-backed region case and guards against movement to swap cache.
	 *
	 * Strictly speaking the folio lock is not needed in all cases being
	 * considered here and the folio lock forces unnecessary serialization.
	 * From this point on, mapping will be re-verified if necessary and
	 * the folio lock will be acquired only if it is unavoidable.
	 *
	 * Mapping checks require the folio so it is looked up now. For
	 * anonymous pages, it does not matter if the folio is split
	 * in the future as the key is based on the address. For
	 * filesystem-backed pages, the precise page is required as the
	 * index of the page determines the key.
	 */
	folio = page_folio(page);
	mapping = READ_ONCE(folio->mapping);

	/*
	 * If folio->mapping is NULL, then it cannot be an anonymous
	 * page; but it might be the ZERO_PAGE or in the gate area or
	 * in a special mapping (all cases which we are happy to fail);
	 * or it may have been a good file page when get_user_pages_fast
	 * found it, but truncated or holepunched or subjected to
	 * invalidate_complete_page2 before we got the folio lock (also
	 * cases which we are happy to fail). And we hold a reference,
	 * so refcount care in invalidate_inode_page's remove_mapping
	 * prevents drop_caches from setting mapping to NULL beneath us.
	 *
	 * The case we do have to guard against is when memory pressure made
	 * shmem_writepage move it from filecache to swapcache beneath us:
	 * an unlikely race, but we do need to retry for folio->mapping.
	 */
	if (unlikely(!mapping)) {
		int shmem_swizzled;

		/*
		 * Folio lock is required to identify which special case above
		 * applies. If this is really a shmem page then the folio lock
		 * will prevent unexpected transitions.
		 */
		folio_lock(folio);
		shmem_swizzled = folio_test_swapcache(folio) || folio->mapping;
		folio_unlock(folio);
		folio_put(folio);

		if (shmem_swizzled)
			goto again;

		return -EFAULT;
	}

	/*
	 * Private mappings are handled in a simple way.
	 *
	 * If the futex key is stored in anonymous memory, then the associated
	 * object is the mm which is implicitly pinned by the calling process.
	 *
	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
	 * it's a read-only handle, it's expected that futexes attach to
	 * the object not the particular process.
	 */
	if (folio_test_anon(folio)) {
		/*
		 * A RO anonymous page will never change and thus doesn't make
		 * sense for futex operations.
		 */
		if (unlikely(should_fail_futex(true)) || ro) {
			err = -EFAULT;
			goto out;
		}

		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
		key->private.mm = mm;
		key->private.address = address;

	} else {
		struct inode *inode;

		/*
		 * The associated futex object in this case is the inode and
		 * the folio->mapping must be traversed. Ordinarily this should
		 * be stabilised under folio lock but it's not strictly
		 * necessary in this case as we just want to pin the inode, not
		 * update i_pages or anything like that.
		 *
		 * The RCU read lock is taken as the inode is finally freed
		 * under RCU. If the mapping still matches expectations then the
		 * mapping->host can be safely accessed as being a valid inode.
		 */
		rcu_read_lock();

		if (READ_ONCE(folio->mapping) != mapping) {
			rcu_read_unlock();
			folio_put(folio);

			goto again;
		}

		inode = READ_ONCE(mapping->host);
		if (!inode) {
			rcu_read_unlock();
			folio_put(folio);

			goto again;
		}

		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
		key->shared.i_seq = get_inode_sequence_number(inode);
		key->shared.pgoff = page_pgoff(folio, page);
		rcu_read_unlock();
	}

out:
	folio_put(folio);
	return err;
}

/**
 * fault_in_user_writeable() - Fault in user address and verify RW access
 * @uaddr:	pointer to faulting user space address
 *
 * Slow path to fixup the fault we just took in the atomic write
 * access to @uaddr.
 *
 * We have no generic implementation of a non-destructive write to the
 * user address. We know that we faulted in the atomic pagefault
 * disabled section so we can as well avoid the #PF overhead by
 * calling get_user_pages() right away.
 */
int fault_in_user_writeable(u32 __user *uaddr)
{
	struct mm_struct *mm = current->mm;
	int ret;

	mmap_read_lock(mm);
	ret = fixup_user_fault(mm, (unsigned long)uaddr,
			       FAULT_FLAG_WRITE, NULL);
	mmap_read_unlock(mm);

	return ret < 0 ? ret : 0;
}

/**
 * futex_top_waiter() - Return the highest priority waiter on a futex
 * @hb:		the hash bucket the futex_q's reside in
 * @key:	the futex key (to distinguish it from other futex futex_q's)
 *
 * Must be called with the hb lock held.
 */
struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
{
	struct futex_q *this;

	plist_for_each_entry(this, &hb->chain, list) {
		if (futex_match(&this->key, key))
			return this;
	}
	return NULL;
}

/**
 * wait_for_owner_exiting - Block until the owner has exited
 * @ret: owner's current futex lock status
 * @exiting:	Pointer to the exiting task
 *
 * Caller must hold a refcount on @exiting.
 */
void wait_for_owner_exiting(int ret, struct task_struct *exiting)
{
	if (ret != -EBUSY) {
		WARN_ON_ONCE(exiting);
		return;
	}

	if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
		return;

	mutex_lock(&exiting->futex_exit_mutex);
	/*
	 * No point in doing state checking here. If the waiter got here
	 * while the task was in exec()->exec_futex_release() then it can
	 * have any FUTEX_STATE_* value when the waiter has acquired the
	 * mutex. OK, if running, EXITING or DEAD if it reached exit()
	 * already. Highly unlikely and not a problem. Just one more round
	 * through the futex maze.
	 */
	mutex_unlock(&exiting->futex_exit_mutex);

	put_task_struct(exiting);
}

/**
 * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be NULL and must be held by the caller.
 */
void __futex_unqueue(struct futex_q *q)
{
	struct futex_hash_bucket *hb;

	if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
		return;
	lockdep_assert_held(q->lock_ptr);

	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
	plist_del(&q->list, &hb->chain);
	futex_hb_waiters_dec(hb);
}

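/*
 * Waiter life cycle, sketched for orientation (illustration only): a waiter
 * looks up its bucket, takes the bucket lock via futex_q_lock(), enqueues
 * itself with futex_queue()/__futex_queue(), sleeps, and finally removes
 * itself with futex_unqueue(). futex_q_unlock() is the error path for a task
 * that was never queued, so the waiter count bumped in futex_q_lock() is
 * dropped again.
 */
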
/* The key must be already stored in q->key. */
void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb)
	__acquires(&hb->lock)
{
	/*
	 * Increment the counter before taking the lock so that
	 * a potential waker won't miss a to-be-slept task that is
	 * waiting for the spinlock. This is safe as all futex_q_lock()
	 * users end up calling futex_queue(). Similarly, for housekeeping,
	 * decrement the counter at futex_q_unlock() when some error has
	 * occurred and we don't end up adding the task to the list.
	 */
	futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */

	q->lock_ptr = &hb->lock;

	spin_lock(&hb->lock);
}

void futex_q_unlock(struct futex_hash_bucket *hb)
	__releases(&hb->lock)
{
	futex_hb_waiters_dec(hb);
	spin_unlock(&hb->lock);
}

void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
		   struct task_struct *task)
{
	int prio;

	/*
	 * The priority used to register this element is
	 * - either the real thread-priority for the real-time threads
	 * (i.e. threads with a priority lower than MAX_RT_PRIO)
	 * - or MAX_RT_PRIO for non-RT threads.
	 * Thus, all RT-threads are woken first in priority order, and
	 * the others are woken last, in FIFO order.
	 */
	prio = min(current->normal_prio, MAX_RT_PRIO);

	plist_node_init(&q->list, prio);
	plist_add(&q->list, &hb->chain);
	q->task = task;
}

/**
 * futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must
 * be paired with exactly one earlier call to futex_queue().
 *
 * Return:
 *  - 1 - if the futex_q was still queued (and we removed it);
 *  - 0 - if the futex_q was already removed by the waking thread
 */
int futex_unqueue(struct futex_q *q)
{
	spinlock_t *lock_ptr;
	int ret = 0;

	/* RCU so lock_ptr is not going away during locking. */
	guard(rcu)();
	/* In the common case we don't take the spinlock, which is nice. */
retry:
	/*
	 * q->lock_ptr can change between this read and the following spin_lock.
	 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
	 * optimizing lock_ptr out of the logic below.
	 */
	lock_ptr = READ_ONCE(q->lock_ptr);
	if (lock_ptr != NULL) {
		spin_lock(lock_ptr);
		/*
		 * q->lock_ptr can change between reading it and
		 * spin_lock(), causing us to take the wrong lock. This
		 * corrects the race condition.
		 *
		 * Reasoning goes like this: if we have the wrong lock,
		 * q->lock_ptr must have changed (maybe several times)
		 * between reading it and the spin_lock(). It can
		 * change again after the spin_lock() but only if it was
		 * already changed before the spin_lock(). It cannot,
		 * however, change back to the original value. Therefore
		 * we can detect whether we acquired the correct lock.
		 */
		if (unlikely(lock_ptr != q->lock_ptr)) {
			spin_unlock(lock_ptr);
			goto retry;
		}
		__futex_unqueue(q);

		BUG_ON(q->pi_state);

		spin_unlock(lock_ptr);
		ret = 1;
	}

	return ret;
}

void futex_q_lockptr_lock(struct futex_q *q)
{
	spinlock_t *lock_ptr;

	/*
	 * See futex_unqueue() why lock_ptr can change.
	 */
	guard(rcu)();
retry:
	lock_ptr = READ_ONCE(q->lock_ptr);
	spin_lock(lock_ptr);

	if (unlikely(lock_ptr != q->lock_ptr)) {
		spin_unlock(lock_ptr);
		goto retry;
	}
}

/*
 * PI futexes can not be requeued and must remove themselves from the hash
 * bucket. The hash bucket lock (i.e. lock_ptr) is held.
 */
void futex_unqueue_pi(struct futex_q *q)
{
	/*
	 * If the lock was not acquired (due to timeout or signal) then the
	 * rt_waiter is removed before futex_q is. If this is observed by
	 * an unlocker after dropping the rtmutex wait lock and before
	 * acquiring the hash bucket lock, then the unlocker dequeues the
	 * futex_q from the hash bucket list to guarantee consistent state
	 * vs. userspace. Therefore the dequeue here must be conditional.
	 */
	if (!plist_node_empty(&q->list))
		__futex_unqueue(q);

	BUG_ON(!q->pi_state);
	put_pi_state(q->pi_state);
	q->pi_state = NULL;
}

/* Constants for the pending_op argument of handle_futex_death */
#define HANDLE_DEATH_PENDING	true
#define HANDLE_DEATH_LIST	false

/*
 * Process a futex-list entry, check whether it's owned by the
 * dying task, and do notification if so:
 */
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
			      bool pi, bool pending_op)
{
	u32 uval, nval, mval;
	pid_t owner;
	int err;

	/* Futex address must be 32bit aligned */
	if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
		return -1;

retry:
	if (get_user(uval, uaddr))
		return -1;

	/*
	 * Special case for regular (non PI) futexes. The unlock path in
	 * user space has two race scenarios:
	 *
	 * 1. The unlock path releases the user space futex value and
	 *    before it can execute the futex() syscall to wake up
	 *    waiters it is killed.
	 *
	 * 2. A woken up waiter is killed before it can acquire the
	 *    futex in user space.
	 *
	 * In the second case, the wake up notification could be generated
	 * by the unlock path in user space after setting the futex value
	 * to zero or by the kernel after setting the OWNER_DIED bit below.
	 *
	 * In both cases the TID validation below prevents a wakeup of
	 * potential waiters which can cause these waiters to block
	 * forever.
	 *
	 * In both cases the following conditions are met:
	 *
	 *	1) task->robust_list->list_op_pending != NULL
	 *	   @pending_op == true
	 *	2) The owner part of user space futex value == 0
	 *	3) Regular futex: @pi == false
	 *
	 * If these conditions are met, it is safe to attempt waking up a
	 * potential waiter without touching the user space futex value and
	 * trying to set the OWNER_DIED bit. If the futex value is zero,
	 * the rest of the user space mutex state is consistent, so a woken
	 * waiter will just take over the uncontended futex. Setting the
	 * OWNER_DIED bit would create inconsistent state and malfunction
	 * of the user space owner died handling. Otherwise, the OWNER_DIED
	 * bit is already set, and the woken waiter is expected to deal with
	 * this.
	 */
	owner = uval & FUTEX_TID_MASK;

	if (pending_op && !pi && !owner) {
		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
			   FUTEX_BITSET_MATCH_ANY);
		return 0;
	}

	if (owner != task_pid_vnr(curr))
		return 0;

	/*
	 * Ok, this dying thread is truly holding a futex
	 * of interest. Set the OWNER_DIED bit atomically
	 * via cmpxchg, and if the value had FUTEX_WAITERS
	 * set, wake up a waiter (if any). (We have to do a
	 * futex_wake() even if OWNER_DIED is already set -
	 * to handle the rare but possible case of recursive
	 * thread-death.) The rest of the cleanup is done in
	 * userspace.
	 */
	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;

	/*
	 * We are not holding a lock here, but we want to have
	 * the pagefault_disable/enable() protection because
	 * we want to handle the fault gracefully. If the
	 * access fails we try to fault in the futex with R/W
	 * verification via get_user_pages. get_user() above
	 * does not guarantee R/W access. If that fails we
	 * give up and leave the futex locked.
	 */
	if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
		switch (err) {
		case -EFAULT:
			if (fault_in_user_writeable(uaddr))
				return -1;
			goto retry;

		case -EAGAIN:
			cond_resched();
			goto retry;

		default:
			WARN_ON_ONCE(1);
			return err;
		}
	}

	if (nval != uval)
		goto retry;

	/*
	 * Wake robust non-PI futexes here. The wakeup of
	 * PI futexes happens in exit_pi_state():
	 */
	if (!pi && (uval & FUTEX_WAITERS)) {
		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
			   FUTEX_BITSET_MATCH_ANY);
	}

	return 0;
}

/*
 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 */
static inline int fetch_robust_entry(struct robust_list __user **entry,
				     struct robust_list __user * __user *head,
				     unsigned int *pi)
{
	unsigned long uentry;

	if (get_user(uentry, (unsigned long __user *)head))
		return -EFAULT;

	*entry = (void __user *)(uentry & ~1UL);
	*pi = uentry & 1;

	return 0;
}

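/*
 * For reference, a hedged userspace sketch of the robust list ABI consumed
 * below ("struct my_mutex", "link" and "futex_word" are made-up names used
 * only for illustration):
 *
 *	struct my_mutex {
 *		struct robust_list	link;		// list node
 *		unsigned int		futex_word;	// the futex itself
 *	};
 *	struct robust_list_head head = {
 *		.list		= { &head.list },	// empty circular list
 *		.futex_offset	= offsetof(struct my_mutex, futex_word) -
 *				  offsetof(struct my_mutex, link),
 *	};
 *	syscall(SYS_set_robust_list, &head, sizeof(head));
 *
 * The walk below only relies on "entry + futex_offset" pointing at the futex
 * word and on bit 0 of each pointer marking a PI futex.
 */
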
/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
static void exit_robust_list(struct task_struct *curr)
{
	struct robust_list_head __user *head = curr->robust_list;
	struct robust_list __user *entry, *next_entry, *pending;
	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
	unsigned int next_pi;
	unsigned long futex_offset;
	int rc;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (fetch_robust_entry(&entry, &head->list.next, &pi))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
		return;

	next_entry = NULL;	/* avoid warning with gcc */
	while (entry != &head->list) {
		/*
		 * Fetch the next entry in the list before calling
		 * handle_futex_death:
		 */
		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice:
		 */
		if (entry != pending) {
			if (handle_futex_death((void __user *)entry + futex_offset,
					       curr, pi, HANDLE_DEATH_LIST))
				return;
		}
		if (rc)
			return;
		entry = next_entry;
		pi = next_pi;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		cond_resched();
	}

	if (pending) {
		handle_futex_death((void __user *)pending + futex_offset,
				   curr, pip, HANDLE_DEATH_PENDING);
	}
}

#ifdef CONFIG_COMPAT
static void __user *futex_uaddr(struct robust_list __user *entry,
				compat_long_t futex_offset)
{
	compat_uptr_t base = ptr_to_compat(entry);
	void __user *uaddr = compat_ptr(base + futex_offset);

	return uaddr;
}

/*
 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 */
static inline int
compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
			  compat_uptr_t __user *head, unsigned int *pi)
{
	if (get_user(*uentry, head))
		return -EFAULT;

	*entry = compat_ptr((*uentry) & ~1);
	*pi = (unsigned int)(*uentry) & 1;

	return 0;
}

/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
static void compat_exit_robust_list(struct task_struct *curr)
{
	struct compat_robust_list_head __user *head = curr->compat_robust_list;
	struct robust_list __user *entry, *next_entry, *pending;
	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
	unsigned int next_pi;
	compat_uptr_t uentry, next_uentry, upending;
	compat_long_t futex_offset;
	int rc;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (compat_fetch_robust_entry(&upending, &pending,
				      &head->list_op_pending, &pip))
		return;

	next_entry = NULL;	/* avoid warning with gcc */
	while (entry != (struct robust_list __user *) &head->list) {
		/*
		 * Fetch the next entry in the list before calling
		 * handle_futex_death:
		 */
		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
					       (compat_uptr_t __user *)&entry->next, &next_pi);
		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice:
		 */
		if (entry != pending) {
			void __user *uaddr = futex_uaddr(entry, futex_offset);

			if (handle_futex_death(uaddr, curr, pi,
					       HANDLE_DEATH_LIST))
				return;
		}
		if (rc)
			return;
		uentry = next_uentry;
		entry = next_entry;
		pi = next_pi;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		cond_resched();
	}
	if (pending) {
		void __user *uaddr = futex_uaddr(pending, futex_offset);

		handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
	}
}
#endif

#ifdef CONFIG_FUTEX_PI

/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 */
static void exit_pi_state_list(struct task_struct *curr)
{
	struct list_head *next, *head = &curr->pi_state_list;
	struct futex_pi_state *pi_state;
	union futex_key key = FUTEX_KEY_INIT;

	/*
	 * The mutex mm_struct::futex_hash_lock might be acquired.
	 */
	might_sleep();
	/*
	 * Ensure the hash remains stable (no resize) during the while loop
	 * below. The hb pointer is acquired under the pi_lock so we can't block
	 * on the mutex.
	 */
	WARN_ON(curr != current);
	guard(private_hash)();
	/*
	 * We are a ZOMBIE and nobody can enqueue itself on
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
	raw_spin_lock_irq(&curr->pi_lock);
	while (!list_empty(head)) {
		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		key = pi_state->key;
		if (1) {
			CLASS(hb, hb)(&key);

			/*
			 * We can race against put_pi_state() removing itself from the
			 * list (a waiter going away). put_pi_state() will first
			 * decrement the reference count and then modify the list, so
			 * it's possible to see the list entry but fail this reference
			 * acquire.
1226 * 1227 * In that case; drop the locks to let put_pi_state() make 1228 * progress and retry the loop. 1229 */ 1230 if (!refcount_inc_not_zero(&pi_state->refcount)) { 1231 raw_spin_unlock_irq(&curr->pi_lock); 1232 cpu_relax(); 1233 raw_spin_lock_irq(&curr->pi_lock); 1234 continue; 1235 } 1236 raw_spin_unlock_irq(&curr->pi_lock); 1237 1238 spin_lock(&hb->lock); 1239 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 1240 raw_spin_lock(&curr->pi_lock); 1241 /* 1242 * We dropped the pi-lock, so re-check whether this 1243 * task still owns the PI-state: 1244 */ 1245 if (head->next != next) { 1246 /* retain curr->pi_lock for the loop invariant */ 1247 raw_spin_unlock(&pi_state->pi_mutex.wait_lock); 1248 spin_unlock(&hb->lock); 1249 put_pi_state(pi_state); 1250 continue; 1251 } 1252 1253 WARN_ON(pi_state->owner != curr); 1254 WARN_ON(list_empty(&pi_state->list)); 1255 list_del_init(&pi_state->list); 1256 pi_state->owner = NULL; 1257 1258 raw_spin_unlock(&curr->pi_lock); 1259 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1260 spin_unlock(&hb->lock); 1261 } 1262 1263 rt_mutex_futex_unlock(&pi_state->pi_mutex); 1264 put_pi_state(pi_state); 1265 1266 raw_spin_lock_irq(&curr->pi_lock); 1267 } 1268 raw_spin_unlock_irq(&curr->pi_lock); 1269 } 1270 #else 1271 static inline void exit_pi_state_list(struct task_struct *curr) { } 1272 #endif 1273 1274 static void futex_cleanup(struct task_struct *tsk) 1275 { 1276 if (unlikely(tsk->robust_list)) { 1277 exit_robust_list(tsk); 1278 tsk->robust_list = NULL; 1279 } 1280 1281 #ifdef CONFIG_COMPAT 1282 if (unlikely(tsk->compat_robust_list)) { 1283 compat_exit_robust_list(tsk); 1284 tsk->compat_robust_list = NULL; 1285 } 1286 #endif 1287 1288 if (unlikely(!list_empty(&tsk->pi_state_list))) 1289 exit_pi_state_list(tsk); 1290 } 1291 1292 /** 1293 * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD 1294 * @tsk: task to set the state on 1295 * 1296 * Set the futex exit state of the task lockless. The futex waiter code 1297 * observes that state when a task is exiting and loops until the task has 1298 * actually finished the futex cleanup. The worst case for this is that the 1299 * waiter runs through the wait loop until the state becomes visible. 1300 * 1301 * This is called from the recursive fault handling path in make_task_dead(). 1302 * 1303 * This is best effort. Either the futex exit code has run already or 1304 * not. If the OWNER_DIED bit has been set on the futex then the waiter can 1305 * take it over. If not, the problem is pushed back to user space. If the 1306 * futex exit code did not run yet, then an already queued waiter might 1307 * block forever, but there is nothing which can be done about that. 1308 */ 1309 void futex_exit_recursive(struct task_struct *tsk) 1310 { 1311 /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ 1312 if (tsk->futex_state == FUTEX_STATE_EXITING) 1313 mutex_unlock(&tsk->futex_exit_mutex); 1314 tsk->futex_state = FUTEX_STATE_DEAD; 1315 } 1316 1317 static void futex_cleanup_begin(struct task_struct *tsk) 1318 { 1319 /* 1320 * Prevent various race issues against a concurrent incoming waiter 1321 * including live locks by forcing the waiter to block on 1322 * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in 1323 * attach_to_pi_owner(). 1324 */ 1325 mutex_lock(&tsk->futex_exit_mutex); 1326 1327 /* 1328 * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. 
1329 * 1330 * This ensures that all subsequent checks of tsk->futex_state in 1331 * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with 1332 * tsk->pi_lock held. 1333 * 1334 * It guarantees also that a pi_state which was queued right before 1335 * the state change under tsk->pi_lock by a concurrent waiter must 1336 * be observed in exit_pi_state_list(). 1337 */ 1338 raw_spin_lock_irq(&tsk->pi_lock); 1339 tsk->futex_state = FUTEX_STATE_EXITING; 1340 raw_spin_unlock_irq(&tsk->pi_lock); 1341 } 1342 1343 static void futex_cleanup_end(struct task_struct *tsk, int state) 1344 { 1345 /* 1346 * Lockless store. The only side effect is that an observer might 1347 * take another loop until it becomes visible. 1348 */ 1349 tsk->futex_state = state; 1350 /* 1351 * Drop the exit protection. This unblocks waiters which observed 1352 * FUTEX_STATE_EXITING to reevaluate the state. 1353 */ 1354 mutex_unlock(&tsk->futex_exit_mutex); 1355 } 1356 1357 void futex_exec_release(struct task_struct *tsk) 1358 { 1359 /* 1360 * The state handling is done for consistency, but in the case of 1361 * exec() there is no way to prevent further damage as the PID stays 1362 * the same. But for the unlikely and arguably buggy case that a 1363 * futex is held on exec(), this provides at least as much state 1364 * consistency protection which is possible. 1365 */ 1366 futex_cleanup_begin(tsk); 1367 futex_cleanup(tsk); 1368 /* 1369 * Reset the state to FUTEX_STATE_OK. The task is alive and about 1370 * exec a new binary. 1371 */ 1372 futex_cleanup_end(tsk, FUTEX_STATE_OK); 1373 } 1374 1375 void futex_exit_release(struct task_struct *tsk) 1376 { 1377 futex_cleanup_begin(tsk); 1378 futex_cleanup(tsk); 1379 futex_cleanup_end(tsk, FUTEX_STATE_DEAD); 1380 } 1381 1382 static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, 1383 struct futex_private_hash *fph) 1384 { 1385 #ifdef CONFIG_FUTEX_PRIVATE_HASH 1386 fhb->priv = fph; 1387 #endif 1388 atomic_set(&fhb->waiters, 0); 1389 plist_head_init(&fhb->chain); 1390 spin_lock_init(&fhb->lock); 1391 } 1392 1393 #define FH_CUSTOM 0x01 1394 #define FH_IMMUTABLE 0x02 1395 1396 #ifdef CONFIG_FUTEX_PRIVATE_HASH 1397 void futex_hash_free(struct mm_struct *mm) 1398 { 1399 struct futex_private_hash *fph; 1400 1401 kvfree(mm->futex_phash_new); 1402 fph = rcu_dereference_raw(mm->futex_phash); 1403 if (fph) { 1404 WARN_ON_ONCE(rcuref_read(&fph->users) > 1); 1405 kvfree(fph); 1406 } 1407 } 1408 1409 static bool futex_pivot_pending(struct mm_struct *mm) 1410 { 1411 struct futex_private_hash *fph; 1412 1413 guard(rcu)(); 1414 1415 if (!mm->futex_phash_new) 1416 return true; 1417 1418 fph = rcu_dereference(mm->futex_phash); 1419 return rcuref_is_dead(&fph->users); 1420 } 1421 1422 static bool futex_hash_less(struct futex_private_hash *a, 1423 struct futex_private_hash *b) 1424 { 1425 /* user provided always wins */ 1426 if (!a->custom && b->custom) 1427 return true; 1428 if (a->custom && !b->custom) 1429 return false; 1430 1431 /* zero-sized hash wins */ 1432 if (!b->hash_mask) 1433 return true; 1434 if (!a->hash_mask) 1435 return false; 1436 1437 /* keep the biggest */ 1438 if (a->hash_mask < b->hash_mask) 1439 return true; 1440 if (a->hash_mask > b->hash_mask) 1441 return false; 1442 1443 return false; /* equal */ 1444 } 1445 1446 static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) 1447 { 1448 struct mm_struct *mm = current->mm; 1449 struct futex_private_hash *fph; 1450 bool custom = flags & FH_CUSTOM; 1451 int i; 1452 1453 if (hash_slots && (hash_slots == 1 || 
static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
{
	struct mm_struct *mm = current->mm;
	struct futex_private_hash *fph;
	bool custom = flags & FH_CUSTOM;
	int i;

	if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots)))
		return -EINVAL;

	/*
	 * Once we've disabled the global hash there is no way back.
	 */
	scoped_guard(rcu) {
		fph = rcu_dereference(mm->futex_phash);
		if (fph && (!fph->hash_mask || fph->immutable)) {
			if (custom)
				return -EBUSY;
			return 0;
		}
	}

	fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
	if (!fph)
		return -ENOMEM;

	rcuref_init(&fph->users, 1);
	fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
	fph->custom = custom;
	fph->immutable = !!(flags & FH_IMMUTABLE);
	fph->mm = mm;

	for (i = 0; i < hash_slots; i++)
		futex_hash_bucket_init(&fph->queues[i], fph);

	if (custom) {
		/*
		 * Only let prctl() wait / retry; don't unduly delay clone().
		 */
again:
		wait_var_event(mm, futex_pivot_pending(mm));
	}

	scoped_guard(mutex, &mm->futex_hash_lock) {
		struct futex_private_hash *free __free(kvfree) = NULL;
		struct futex_private_hash *cur, *new;

		cur = rcu_dereference_protected(mm->futex_phash,
						lockdep_is_held(&mm->futex_hash_lock));
		new = mm->futex_phash_new;
		mm->futex_phash_new = NULL;

		if (fph) {
			if (cur && !new) {
				/*
				 * If we have an existing hash, but have not yet
				 * allocated a replacement hash, drop the initial
				 * reference on the existing hash.
				 */
				futex_private_hash_put(cur);
			}

			if (new) {
				/*
				 * Two updates raced; throw out the lesser one.
				 */
				if (futex_hash_less(new, fph)) {
					free = new;
					new = fph;
				} else {
					free = fph;
				}
			} else {
				new = fph;
			}
			fph = NULL;
		}

		if (new) {
			/*
			 * Will set mm->futex_phash_new on failure;
			 * futex_private_hash_get() will try again.
			 */
			if (!__futex_pivot_hash(mm, new) && custom)
				goto again;
		}
	}
	return 0;
}

int futex_hash_allocate_default(void)
{
	unsigned int threads, buckets, current_buckets = 0;
	struct futex_private_hash *fph;

	if (!current->mm)
		return 0;

	scoped_guard(rcu) {
		threads = min_t(unsigned int,
				get_nr_threads(current),
				num_online_cpus());

		fph = rcu_dereference(current->mm->futex_phash);
		if (fph) {
			if (fph->custom)
				return 0;

			current_buckets = fph->hash_mask + 1;
		}
	}

	/*
	 * The default allocation will remain within
	 *   16 <= threads * 4 <= global hash size
	 */
	buckets = roundup_pow_of_two(4 * threads);
	buckets = clamp(buckets, 16, futex_hashmask + 1);

	if (current_buckets >= buckets)
		return 0;

	return futex_hash_allocate(buckets, 0);
}

static int futex_hash_get_slots(void)
{
	struct futex_private_hash *fph;

	guard(rcu)();
	fph = rcu_dereference(current->mm->futex_phash);
	if (fph && fph->hash_mask)
		return fph->hash_mask + 1;
	return 0;
}

static int futex_hash_get_immutable(void)
{
	struct futex_private_hash *fph;

	guard(rcu)();
	fph = rcu_dereference(current->mm->futex_phash);
	if (fph && fph->immutable)
		return 1;
	if (fph && !fph->hash_mask)
		return 1;
	return 0;
}

#else

static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
{
	return -EINVAL;
}

static int futex_hash_get_slots(void)
{
	return 0;
}

static int futex_hash_get_immutable(void)
{
	return 0;
}
#endif

int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
{
	unsigned int flags = FH_CUSTOM;
	int ret;

	switch (arg2) {
	case PR_FUTEX_HASH_SET_SLOTS:
		if (arg4 & ~FH_FLAG_IMMUTABLE)
			return -EINVAL;
		if (arg4 & FH_FLAG_IMMUTABLE)
			flags |= FH_IMMUTABLE;
		ret = futex_hash_allocate(arg3, flags);
		break;

	case PR_FUTEX_HASH_GET_SLOTS:
		ret = futex_hash_get_slots();
		break;

	case PR_FUTEX_HASH_GET_IMMUTABLE:
		ret = futex_hash_get_immutable();
		break;

	default:
		ret = -EINVAL;
		break;
	}
	return ret;
}

static int __init futex_init(void)
{
	unsigned long hashsize, i;
	unsigned int futex_shift;

#ifdef CONFIG_BASE_SMALL
	hashsize = 16;
#else
	hashsize = roundup_pow_of_two(256 * num_possible_cpus());
#endif

	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
					       hashsize, 0, 0,
					       &futex_shift, NULL,
					       hashsize, hashsize);
	hashsize = 1UL << futex_shift;

	for (i = 0; i < hashsize; i++)
		futex_hash_bucket_init(&futex_queues[i], NULL);

	futex_hashmask = hashsize - 1;
	return 0;
}
core_initcall(futex_init);
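
/*
 * Userspace usage sketch for the prctl() interface above (a hedged example;
 * the constants come from <linux/prctl.h>, see prctl(2) for the exact
 * argument conventions):
 *
 *	// request a private hash with 64 buckets
 *	prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 64, 0, 0);
 *	// query the current number of buckets (0 means no private hash)
 *	int slots = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS, 0, 0, 0);
 */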