// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Fast Userspace Mutexes (which I call "Futexes!").
 * (C) Rusty Russell, IBM 2002
 *
 * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
 * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
 *
 * Removed page pinning, fix privately mapped COW pages and other cleanups
 * (C) Copyright 2003, 2004 Jamie Lokier
 *
 * Robust futex support started by Ingo Molnar
 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
 *
 * PI-futex support started by Ingo Molnar and Thomas Gleixner
 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
 *
 * PRIVATE futexes by Eric Dumazet
 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
 *
 * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
 * Copyright (C) IBM Corporation, 2009
 * Thanks to Thomas Gleixner for conceptual design and careful reviews.
 *
 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
 * enough at me, Linus for the original (flawed) idea, Matthew
 * Kirkwood for proof-of-concept implementation.
 *
 * "The futexes are also cursed."
 * "But they come in a choice of three flavours!"
 */
#include <linux/compat.h>
#include <linux/jhash.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/plist.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/slab.h>
#include <linux/prctl.h>
#include <linux/rcuref.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * The base of the bucket array and its size are always used together
 * (after initialization only in futex_hash()), so ensure that they
 * reside in the same cacheline.
 */
static struct {
	struct futex_hash_bucket *queues;
	unsigned long		 hashmask;
} __futex_data __read_mostly __aligned(2*sizeof(long));
#define futex_queues	(__futex_data.queues)
#define futex_hashmask	(__futex_data.hashmask)

struct futex_private_hash {
	rcuref_t	users;
	unsigned int	hash_mask;
	struct rcu_head	rcu;
	void		*mm;
	bool		custom;
	struct futex_hash_bucket queues[];
};

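/*
 * Lifetime of a private hash (summary of the code below): the current hash
 * is published via mm_struct::futex_phash and carries one initial reference
 * plus one per futex_private_hash_get() user. A resize allocates a
 * replacement in mm_struct::futex_phash_new, which is only installed once
 * the old hash has dropped its last user, see futex_pivot_hash() and
 * futex_hash_allocate().
 */
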
/*
 * Fault injections for futexes.
 */
#ifdef CONFIG_FAIL_FUTEX

static struct {
	struct fault_attr attr;

	bool ignore_private;
} fail_futex = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_private = false,
};

static int __init setup_fail_futex(char *str)
{
	return setup_fault_attr(&fail_futex.attr, str);
}
__setup("fail_futex=", setup_fail_futex);

bool should_fail_futex(bool fshared)
{
	if (fail_futex.ignore_private && !fshared)
		return false;

	return should_fail(&fail_futex.attr, 1);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

static int __init fail_futex_debugfs(void)
{
	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_futex", NULL,
					&fail_futex.attr);
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	debugfs_create_bool("ignore-private", mode, dir,
			    &fail_futex.ignore_private);
	return 0;
}

late_initcall(fail_futex_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

#endif /* CONFIG_FAIL_FUTEX */

static struct futex_hash_bucket *
__futex_hash(union futex_key *key, struct futex_private_hash *fph);

#ifdef CONFIG_FUTEX_PRIVATE_HASH
static inline bool futex_key_is_private(union futex_key *key)
{
	/*
	 * Relies on get_futex_key() to set either bit for shared
	 * futexes -- see comment with union futex_key.
	 */
	return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
}

bool futex_private_hash_get(struct futex_private_hash *fph)
{
	return rcuref_get(&fph->users);
}

void futex_private_hash_put(struct futex_private_hash *fph)
{
	/* Ignore return value, last put is verified via rcuref_is_dead() */
	if (rcuref_put(&fph->users))
		wake_up_var(fph->mm);
}

/**
 * futex_hash_get - Get an additional reference for the local hash.
 * @hb:	ptr to the private local hash.
 *
 * Obtain an additional reference for the already obtained hash bucket. The
 * caller must already own a reference.
 */
void futex_hash_get(struct futex_hash_bucket *hb)
{
	struct futex_private_hash *fph = hb->priv;

	if (!fph)
		return;
	WARN_ON_ONCE(!futex_private_hash_get(fph));
}

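/**
 * futex_hash_put - Drop the reference on the private hash
 * @hb:	ptr to the hash bucket
 *
 * Counterpart to futex_hash() and futex_hash_get(). Buckets of the global
 * hash are not reference counted and are ignored here.
 */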
void futex_hash_put(struct futex_hash_bucket *hb)
{
	struct futex_private_hash *fph = hb->priv;

	if (!fph)
		return;
	futex_private_hash_put(fph);
}

static struct futex_hash_bucket *
__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
{
	u32 hash;

	if (!futex_key_is_private(key))
		return NULL;

	if (!fph)
		fph = rcu_dereference(key->private.mm->futex_phash);
	if (!fph || !fph->hash_mask)
		return NULL;

	hash = jhash2((void *)&key->private.address,
		      sizeof(key->private.address) / 4,
		      key->both.offset);
	return &fph->queues[hash & fph->hash_mask];
}

static void futex_rehash_private(struct futex_private_hash *old,
				 struct futex_private_hash *new)
{
	struct futex_hash_bucket *hb_old, *hb_new;
	unsigned int slots = old->hash_mask + 1;
	unsigned int i;

	for (i = 0; i < slots; i++) {
		struct futex_q *this, *tmp;

		hb_old = &old->queues[i];

		spin_lock(&hb_old->lock);
		plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {

			plist_del(&this->list, &hb_old->chain);
			futex_hb_waiters_dec(hb_old);

			WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);

			hb_new = __futex_hash(&this->key, new);
			futex_hb_waiters_inc(hb_new);
			/*
			 * The new pointer isn't published yet but an already
			 * moved user can be unqueued due to timeout or signal.
			 */
			spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING);
			plist_add(&this->list, &hb_new->chain);
			this->lock_ptr = &hb_new->lock;
			spin_unlock(&hb_new->lock);
		}
		spin_unlock(&hb_old->lock);
	}
}

static bool __futex_pivot_hash(struct mm_struct *mm,
			       struct futex_private_hash *new)
{
	struct futex_private_hash *fph;

	WARN_ON_ONCE(mm->futex_phash_new);

	fph = rcu_dereference_protected(mm->futex_phash,
					lockdep_is_held(&mm->futex_hash_lock));
	if (fph) {
		if (!rcuref_is_dead(&fph->users)) {
			mm->futex_phash_new = new;
			return false;
		}

		futex_rehash_private(fph, new);
	}
	rcu_assign_pointer(mm->futex_phash, new);
	kvfree_rcu(fph, rcu);
	return true;
}

static void futex_pivot_hash(struct mm_struct *mm)
{
	scoped_guard(mutex, &mm->futex_hash_lock) {
		struct futex_private_hash *fph;

		fph = mm->futex_phash_new;
		if (fph) {
			mm->futex_phash_new = NULL;
			__futex_pivot_hash(mm, fph);
		}
	}
}

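/*
 * futex_private_hash - Get a reference on the current private hash
 *
 * Returns the mm's private hash with the user count incremented, or NULL if
 * this mm has no private hash. If the current hash is dead because a
 * replacement is pending, help to install the replacement via
 * futex_pivot_hash() and retry.
 */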
struct futex_private_hash *futex_private_hash(void)
{
	struct mm_struct *mm = current->mm;
	/*
	 * Ideally we don't loop. If there is a replacement in progress
	 * then a new private hash is already prepared and a reference can't be
	 * obtained once the last user dropped it.
	 * In that case we block on mm_struct::futex_hash_lock and either have
	 * to perform the replacement or wait while someone else is doing the
	 * job. Either way, on the second iteration we acquire a reference on the
	 * new private hash or loop again because a new replacement has been
	 * requested.
	 */
again:
	scoped_guard(rcu) {
		struct futex_private_hash *fph;

		fph = rcu_dereference(mm->futex_phash);
		if (!fph)
			return NULL;

		if (rcuref_get(&fph->users))
			return fph;
	}
	futex_pivot_hash(mm);
	goto again;
}

struct futex_hash_bucket *futex_hash(union futex_key *key)
{
	struct futex_private_hash *fph;
	struct futex_hash_bucket *hb;

again:
	scoped_guard(rcu) {
		hb = __futex_hash(key, NULL);
		fph = hb->priv;

		if (!fph || futex_private_hash_get(fph))
			return hb;
	}
	futex_pivot_hash(key->private.mm);
	goto again;
}

#else /* !CONFIG_FUTEX_PRIVATE_HASH */

static struct futex_hash_bucket *
__futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
{
	return NULL;
}

struct futex_hash_bucket *futex_hash(union futex_key *key)
{
	return __futex_hash(key, NULL);
}

#endif /* CONFIG_FUTEX_PRIVATE_HASH */

/**
 * __futex_hash - Return the hash bucket
 * @key:	Pointer to the futex key for which the hash is calculated
 * @fph:	Pointer to private hash if known
 *
 * We hash on the keys returned from get_futex_key (see below) and return the
 * corresponding hash bucket.
 * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the
 * private hash) is returned if it exists. Otherwise a hash bucket from the
 * global hash is returned.
 */
static struct futex_hash_bucket *
__futex_hash(union futex_key *key, struct futex_private_hash *fph)
{
	struct futex_hash_bucket *hb;
	u32 hash;

	hb = __futex_hash_private(key, fph);
	if (hb)
		return hb;

	hash = jhash2((u32 *)key,
		      offsetof(typeof(*key), both.offset) / 4,
		      key->both.offset);
	return &futex_queues[hash & futex_hashmask];
}

/**
 * futex_setup_timer - set up the sleeping hrtimer.
 * @time:	ptr to the given timeout value
 * @timeout:	the hrtimer_sleeper structure to be set up
 * @flags:	futex flags
 * @range_ns:	optional range in ns
 *
 * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
 *	   value given
 */
struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
		  int flags, u64 range_ns)
{
	if (!time)
		return NULL;

	hrtimer_setup_sleeper_on_stack(timeout,
		(flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC,
		HRTIMER_MODE_ABS);
	/*
	 * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
	 * effectively the same as calling hrtimer_set_expires().
	 */
	hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);

	return timeout;
}

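/*
 * futex_setup_timer() above is used by the wait paths roughly like this
 * (illustrative sketch, not a verbatim caller):
 *
 *	to = futex_setup_timer(abs_time, &timeout, flags, current->timer_slack_ns);
 *	...
 *	if (to) {
 *		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
 *		... wait ...
 *		hrtimer_cancel(&to->timer);
 *		destroy_hrtimer_on_stack(&to->timer);
 *	}
 */
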
/*
 * Generate a machine wide unique identifier for this inode.
 *
 * This relies on u64 not wrapping in the life-time of the machine; which with
 * 1ns resolution means almost 585 years.
 *
 * This further relies on the fact that a well formed program will not unmap
 * the file while it has a (shared) futex waiting on it. This mapping will have
 * a file reference which pins the mount and inode.
 *
 * If for some reason an inode gets evicted and read back in again, it will get
 * a new sequence number and will _NOT_ match, even though it is the exact same
 * file.
 *
 * It is important that futex_match() will never have a false-positive, esp.
 * for PI futexes that can mess up the state. The above argues that
 * false-negatives are only possible for malformed programs.
 */
static u64 get_inode_sequence_number(struct inode *inode)
{
	static atomic64_t i_seq;
	u64 old;

	/* Does the inode already have a sequence number? */
	old = atomic64_read(&inode->i_sequence);
	if (likely(old))
		return old;

	for (;;) {
		u64 new = atomic64_inc_return(&i_seq);
		if (WARN_ON_ONCE(!new))
			continue;

		old = 0;
		if (!atomic64_try_cmpxchg_relaxed(&inode->i_sequence, &old, new))
			return old;
		return new;
	}
}

/**
 * get_futex_key() - Get parameters which are the keys for a futex
 * @uaddr:	virtual address of the futex
 * @flags:	FLAGS_*
 * @key:	address where result is stored.
 * @rw:		mapping needs to be read/write (values: FUTEX_READ,
 *		FUTEX_WRITE)
 *
 * Return: a negative error code or 0
 *
 * The key words are stored in @key on success.
 *
 * For shared mappings (when @fshared), the key is:
 *
 *   ( inode->i_sequence, page->index, offset_within_page )
 *
 * [ also see get_inode_sequence_number() ]
 *
 * For private mappings (or when !@fshared), the key is:
 *
 *   ( current->mm, address, 0 )
 *
 * This allows (cross process, where applicable) identification of the futex
 * without keeping the page pinned for the duration of the FUTEX_WAIT.
 *
 * lock_page() might sleep, the caller should not hold a spinlock.
 */
int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
		  enum futex_access rw)
{
	unsigned long address = (unsigned long)uaddr;
	struct mm_struct *mm = current->mm;
	struct page *page;
	struct folio *folio;
	struct address_space *mapping;
	int err, ro = 0;
	bool fshared;

	fshared = flags & FLAGS_SHARED;

	/*
	 * The futex address must be "naturally" aligned.
	 */
	key->both.offset = address % PAGE_SIZE;
	if (unlikely((address % sizeof(u32)) != 0))
		return -EINVAL;
	address -= key->both.offset;

	if (unlikely(!access_ok(uaddr, sizeof(u32))))
		return -EFAULT;

	if (unlikely(should_fail_futex(fshared)))
		return -EFAULT;

	/*
	 * PROCESS_PRIVATE futexes are fast.
	 * As the mm cannot disappear under us and the 'key' only needs the
	 * virtual address, we don't even have to find the underlying vma.
	 * Note: We do have to check 'uaddr' is a valid user address,
	 * but access_ok() should be faster than find_vma().
	 */
	if (!fshared) {
		/*
		 * On no-MMU, shared futexes are treated as private, therefore
		 * we must not include the current process in the key. Since
		 * there is only one address space, the address is a unique key
		 * on its own.
		 */
		if (IS_ENABLED(CONFIG_MMU))
			key->private.mm = mm;
		else
			key->private.mm = NULL;

		key->private.address = address;
		return 0;
	}

again:
	/* Ignore any VERIFY_READ mapping (futex common case) */
	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
	/*
	 * If write access is not required (eg. FUTEX_WAIT), try
	 * and get read-only access.
	 */
	if (err == -EFAULT && rw == FUTEX_READ) {
		err = get_user_pages_fast(address, 1, 0, &page);
		ro = 1;
	}
	if (err < 0)
		return err;
	else
		err = 0;

	/*
	 * The treatment of mapping from this point on is critical. The folio
	 * lock protects many things but in this context the folio lock
	 * stabilizes mapping, prevents inode freeing in the shared
	 * file-backed region case and guards against movement to swap cache.
	 *
	 * Strictly speaking the folio lock is not needed in all cases being
	 * considered here and the folio lock forces unnecessary serialization.
	 * From this point on, mapping will be re-verified if necessary and
	 * the folio lock will be acquired only if it is unavoidable.
	 *
	 * Mapping checks require the folio so it is looked up now. For
	 * anonymous pages, it does not matter if the folio is split
	 * in the future as the key is based on the address. For
	 * filesystem-backed pages, the precise page is required as the
	 * index of the page determines the key.
	 */
	folio = page_folio(page);
	mapping = READ_ONCE(folio->mapping);

	/*
	 * If folio->mapping is NULL, then it cannot be an anonymous
	 * page; but it might be the ZERO_PAGE or in the gate area or
	 * in a special mapping (all cases which we are happy to fail);
	 * or it may have been a good file page when get_user_pages_fast
	 * found it, but truncated or holepunched or subjected to
	 * invalidate_complete_page2 before we got the folio lock (also
	 * cases which we are happy to fail). And we hold a reference,
	 * so refcount care in invalidate_inode_page's remove_mapping
	 * prevents drop_caches from setting mapping to NULL beneath us.
	 *
	 * The case we do have to guard against is when memory pressure made
	 * shmem_writepage move it from filecache to swapcache beneath us:
	 * an unlikely race, but we do need to retry for folio->mapping.
	 */
	if (unlikely(!mapping)) {
		int shmem_swizzled;

		/*
		 * Folio lock is required to identify which special case above
		 * applies. If this is really a shmem page then the folio lock
		 * will prevent unexpected transitions.
		 */
		folio_lock(folio);
		shmem_swizzled = folio_test_swapcache(folio) || folio->mapping;
		folio_unlock(folio);
		folio_put(folio);

		if (shmem_swizzled)
			goto again;

		return -EFAULT;
	}

	/*
	 * Private mappings are handled in a simple way.
	 *
	 * If the futex key is stored in anonymous memory, then the associated
	 * object is the mm which is implicitly pinned by the calling process.
	 *
	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
	 * it's a read-only handle, it's expected that futexes attach to
	 * the object not the particular process.
	 */
	if (folio_test_anon(folio)) {
		/*
		 * A RO anonymous page will never change and thus doesn't make
		 * sense for futex operations.
		 */
		if (unlikely(should_fail_futex(true)) || ro) {
			err = -EFAULT;
			goto out;
		}

		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
		key->private.mm = mm;
		key->private.address = address;

	} else {
		struct inode *inode;

		/*
		 * The associated futex object in this case is the inode and
		 * the folio->mapping must be traversed. Ordinarily this should
		 * be stabilised under folio lock but it's not strictly
		 * necessary in this case as we just want to pin the inode, not
		 * update i_pages or anything like that.
		 *
		 * The RCU read lock is taken as the inode is finally freed
		 * under RCU. If the mapping still matches expectations then the
		 * mapping->host can be safely accessed as being a valid inode.
		 */
		rcu_read_lock();

		if (READ_ONCE(folio->mapping) != mapping) {
			rcu_read_unlock();
			folio_put(folio);

			goto again;
		}

		inode = READ_ONCE(mapping->host);
		if (!inode) {
			rcu_read_unlock();
			folio_put(folio);

			goto again;
		}

		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
		key->shared.i_seq = get_inode_sequence_number(inode);
		key->shared.pgoff = page_pgoff(folio, page);
		rcu_read_unlock();
	}

out:
	folio_put(folio);
	return err;
}

/**
 * fault_in_user_writeable() - Fault in user address and verify RW access
 * @uaddr:	pointer to faulting user space address
 *
 * Slow path to fixup the fault we just took in the atomic write
 * access to @uaddr.
 *
 * We have no generic implementation of a non-destructive write to the
 * user address. We know that we faulted in the atomic pagefault
 * disabled section so we can as well avoid the #PF overhead by
 * calling get_user_pages() right away.
 */
int fault_in_user_writeable(u32 __user *uaddr)
{
	struct mm_struct *mm = current->mm;
	int ret;

	mmap_read_lock(mm);
	ret = fixup_user_fault(mm, (unsigned long)uaddr,
			       FAULT_FLAG_WRITE, NULL);
	mmap_read_unlock(mm);

	return ret < 0 ? ret : 0;
}

/**
 * futex_top_waiter() - Return the highest priority waiter on a futex
 * @hb:		the hash bucket the futex_q's reside in
 * @key:	the futex key (to distinguish it from other futex_q's)
 *
 * Must be called with the hb lock held.
 */
struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
{
	struct futex_q *this;

	plist_for_each_entry(this, &hb->chain, list) {
		if (futex_match(&this->key, key))
			return this;
	}
	return NULL;
}

/**
 * wait_for_owner_exiting - Block until the owner has exited
 * @ret: owner's current futex lock status
 * @exiting:	Pointer to the exiting task
 *
 * Caller must hold a refcount on @exiting.
 */
void wait_for_owner_exiting(int ret, struct task_struct *exiting)
{
	if (ret != -EBUSY) {
		WARN_ON_ONCE(exiting);
		return;
	}

	if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
		return;

	mutex_lock(&exiting->futex_exit_mutex);
	/*
	 * No point in doing state checking here. If the waiter got here
	 * while the task was in exec()->exec_futex_release() then it can
	 * have any FUTEX_STATE_* value when the waiter has acquired the
	 * mutex. OK, if running, EXITING or DEAD if it reached exit()
	 * already. Highly unlikely and not a problem. Just one more round
	 * through the futex maze.
	 */
	mutex_unlock(&exiting->futex_exit_mutex);

	put_task_struct(exiting);
}

/**
 * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be NULL and must be held by the caller.
 */
void __futex_unqueue(struct futex_q *q)
{
	struct futex_hash_bucket *hb;

	if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
		return;
	lockdep_assert_held(q->lock_ptr);

	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
	plist_del(&q->list, &hb->chain);
	futex_hb_waiters_dec(hb);
}

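/*
 * Queueing protocol used by the wait paths (illustrative sketch only):
 *
 *	hb = futex_hash(&q->key);
 *	futex_q_lock(q, hb);
 *	read and check the futex value under hb->lock;
 *	on fault or value mismatch:	futex_q_unlock(hb);
 *	otherwise:			futex_queue(q, hb, current);
 */
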
/* The key must be already stored in q->key. */
void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb)
	__acquires(&hb->lock)
{
	/*
	 * Increment the counter before taking the lock so that
	 * a potential waker won't miss a to-be-slept task that is
	 * waiting for the spinlock. This is safe as all futex_q_lock()
	 * users end up calling futex_queue(). Similarly, for housekeeping,
	 * decrement the counter at futex_q_unlock() when some error has
	 * occurred and we don't end up adding the task to the list.
	 */
	futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */

	q->lock_ptr = &hb->lock;

	spin_lock(&hb->lock);
}

void futex_q_unlock(struct futex_hash_bucket *hb)
	__releases(&hb->lock)
{
	futex_hb_waiters_dec(hb);
	spin_unlock(&hb->lock);
}

void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
		   struct task_struct *task)
{
	int prio;

	/*
	 * The priority used to register this element is
	 * - either the real thread-priority for the real-time threads
	 * (i.e. threads with a priority lower than MAX_RT_PRIO)
	 * - or MAX_RT_PRIO for non-RT threads.
	 * Thus, all RT-threads are woken first in priority order, and
	 * the others are woken last, in FIFO order.
	 */
	prio = min(current->normal_prio, MAX_RT_PRIO);

	plist_node_init(&q->list, prio);
	plist_add(&q->list, &hb->chain);
	q->task = task;
}

/**
 * futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must
 * be paired with exactly one earlier call to futex_queue().
 *
 * Return:
 *  - 1 - if the futex_q was still queued (and we removed it);
 *  - 0 - if the futex_q was already removed by the waking thread
 */
int futex_unqueue(struct futex_q *q)
{
	spinlock_t *lock_ptr;
	int ret = 0;

	/* RCU so lock_ptr is not going away during locking. */
	guard(rcu)();
	/* In the common case we don't take the spinlock, which is nice. */
retry:
	/*
	 * q->lock_ptr can change between this read and the following spin_lock.
	 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
	 * optimizing lock_ptr out of the logic below.
	 */
	lock_ptr = READ_ONCE(q->lock_ptr);
	if (lock_ptr != NULL) {
		spin_lock(lock_ptr);
		/*
		 * q->lock_ptr can change between reading it and
		 * spin_lock(), causing us to take the wrong lock. This
		 * corrects the race condition.
		 *
		 * Reasoning goes like this: if we have the wrong lock,
		 * q->lock_ptr must have changed (maybe several times)
		 * between reading it and the spin_lock(). It can
		 * change again after the spin_lock() but only if it was
		 * already changed before the spin_lock(). It cannot,
		 * however, change back to the original value. Therefore
		 * we can detect whether we acquired the correct lock.
		 */
		if (unlikely(lock_ptr != q->lock_ptr)) {
			spin_unlock(lock_ptr);
			goto retry;
		}
		__futex_unqueue(q);

		BUG_ON(q->pi_state);

		spin_unlock(lock_ptr);
		ret = 1;
	}

	return ret;
}

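/**
 * futex_q_lockptr_lock() - Acquire the hash bucket lock the futex_q is queued on
 * @q:	The futex_q whose lock_ptr should be acquired
 *
 * Retries until the lock taken still matches q->lock_ptr, which can change
 * underneath us, e.g. due to a requeue or a private hash resize.
 */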
void futex_q_lockptr_lock(struct futex_q *q)
{
	spinlock_t *lock_ptr;

	/*
	 * See futex_unqueue() for why lock_ptr can change.
	 */
	guard(rcu)();
retry:
	lock_ptr = READ_ONCE(q->lock_ptr);
	spin_lock(lock_ptr);

	if (unlikely(lock_ptr != q->lock_ptr)) {
		spin_unlock(lock_ptr);
		goto retry;
	}
}

/*
 * PI futexes can not be requeued and must remove themselves from the hash
 * bucket. The hash bucket lock (i.e. lock_ptr) is held.
 */
void futex_unqueue_pi(struct futex_q *q)
{
	/*
	 * If the lock was not acquired (due to timeout or signal) then the
	 * rt_waiter is removed before futex_q is. If this is observed by
	 * an unlocker after dropping the rtmutex wait lock and before
	 * acquiring the hash bucket lock, then the unlocker dequeues the
	 * futex_q from the hash bucket list to guarantee consistent state
	 * vs. userspace. Therefore the dequeue here must be conditional.
	 */
	if (!plist_node_empty(&q->list))
		__futex_unqueue(q);

	BUG_ON(!q->pi_state);
	put_pi_state(q->pi_state);
	q->pi_state = NULL;
}

/* Constants for the pending_op argument of handle_futex_death */
#define HANDLE_DEATH_PENDING	true
#define HANDLE_DEATH_LIST	false

/*
 * Process a futex-list entry, check whether it's owned by the
 * dying task, and do notification if so:
 */
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
			      bool pi, bool pending_op)
{
	u32 uval, nval, mval;
	pid_t owner;
	int err;

	/* Futex address must be 32bit aligned */
	if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
		return -1;

retry:
	if (get_user(uval, uaddr))
		return -1;

	/*
	 * Special case for regular (non PI) futexes. The unlock path in
	 * user space has two race scenarios:
	 *
	 * 1. The unlock path releases the user space futex value and
	 *    before it can execute the futex() syscall to wake up
	 *    waiters it is killed.
	 *
	 * 2. A woken up waiter is killed before it can acquire the
	 *    futex in user space.
	 *
	 * In the second case, the wake up notification could be generated
	 * by the unlock path in user space after setting the futex value
	 * to zero or by the kernel after setting the OWNER_DIED bit below.
	 *
	 * In both cases the TID validation below prevents a wakeup of
	 * potential waiters which can cause these waiters to block
	 * forever.
	 *
	 * In both cases the following conditions are met:
	 *
	 *	1) task->robust_list->list_op_pending != NULL
	 *	   @pending_op == true
	 *	2) The owner part of user space futex value == 0
	 *	3) Regular futex: @pi == false
	 *
	 * If these conditions are met, it is safe to attempt waking up a
	 * potential waiter without touching the user space futex value and
	 * trying to set the OWNER_DIED bit. If the futex value is zero,
	 * the rest of the user space mutex state is consistent, so a woken
	 * waiter will just take over the uncontended futex. Setting the
	 * OWNER_DIED bit would create inconsistent state and malfunction
	 * of the user space owner died handling. Otherwise, the OWNER_DIED
	 * bit is already set, and the woken waiter is expected to deal with
	 * this.
	 */
	owner = uval & FUTEX_TID_MASK;

	if (pending_op && !pi && !owner) {
		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
			   FUTEX_BITSET_MATCH_ANY);
		return 0;
	}

	if (owner != task_pid_vnr(curr))
		return 0;

	/*
	 * Ok, this dying thread is truly holding a futex
	 * of interest. Set the OWNER_DIED bit atomically
	 * via cmpxchg, and if the value had FUTEX_WAITERS
	 * set, wake up a waiter (if any). (We have to do a
	 * futex_wake() even if OWNER_DIED is already set -
	 * to handle the rare but possible case of recursive
	 * thread-death.) The rest of the cleanup is done in
	 * userspace.
	 */
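	/*
	 * For example, uval == (TID | FUTEX_WAITERS) yields
	 * mval == (FUTEX_WAITERS | FUTEX_OWNER_DIED) below.
	 */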
	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;

	/*
	 * We are not holding a lock here, but we want to have
	 * the pagefault_disable/enable() protection because
	 * we want to handle the fault gracefully. If the
	 * access fails we try to fault in the futex with R/W
	 * verification via get_user_pages. get_user() above
	 * does not guarantee R/W access. If that fails we
	 * give up and leave the futex locked.
	 */
	if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
		switch (err) {
		case -EFAULT:
			if (fault_in_user_writeable(uaddr))
				return -1;
			goto retry;

		case -EAGAIN:
			cond_resched();
			goto retry;

		default:
			WARN_ON_ONCE(1);
			return err;
		}
	}

	if (nval != uval)
		goto retry;

	/*
	 * Wake robust non-PI futexes here. The wakeup of
	 * PI futexes happens in exit_pi_state_list():
	 */
	if (!pi && (uval & FUTEX_WAITERS)) {
		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
			   FUTEX_BITSET_MATCH_ANY);
	}

	return 0;
}

/*
 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 */
static inline int fetch_robust_entry(struct robust_list __user **entry,
				     struct robust_list __user * __user *head,
				     unsigned int *pi)
{
	unsigned long uentry;

	if (get_user(uentry, (unsigned long __user *)head))
		return -EFAULT;

	*entry = (void __user *)(uentry & ~1UL);
	*pi = uentry & 1;

	return 0;
}

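/*
 * Layout of the userspace robust list walked below (see struct
 * robust_list_head in include/uapi/linux/futex.h): the entries are
 * struct robust_list nodes linked via ->next and embedded in the userspace
 * locks themselves; the futex word of an entry lives at
 * (void __user *)entry + futex_offset. Bit 0 of each pointer flags a
 * PI futex.
 */
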
/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
static void exit_robust_list(struct task_struct *curr)
{
	struct robust_list_head __user *head = curr->robust_list;
	struct robust_list __user *entry, *next_entry, *pending;
	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
	unsigned int next_pi;
	unsigned long futex_offset;
	int rc;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (fetch_robust_entry(&entry, &head->list.next, &pi))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
		return;

	next_entry = NULL;	/* avoid warning with gcc */
	while (entry != &head->list) {
		/*
		 * Fetch the next entry in the list before calling
		 * handle_futex_death:
		 */
		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice:
		 */
		if (entry != pending) {
			if (handle_futex_death((void __user *)entry + futex_offset,
					       curr, pi, HANDLE_DEATH_LIST))
				return;
		}
		if (rc)
			return;
		entry = next_entry;
		pi = next_pi;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		cond_resched();
	}

	if (pending) {
		handle_futex_death((void __user *)pending + futex_offset,
				   curr, pip, HANDLE_DEATH_PENDING);
	}
}

#ifdef CONFIG_COMPAT
static void __user *futex_uaddr(struct robust_list __user *entry,
				compat_long_t futex_offset)
{
	compat_uptr_t base = ptr_to_compat(entry);
	void __user *uaddr = compat_ptr(base + futex_offset);

	return uaddr;
}

/*
 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 */
static inline int
compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
			  compat_uptr_t __user *head, unsigned int *pi)
{
	if (get_user(*uentry, head))
		return -EFAULT;

	*entry = compat_ptr((*uentry) & ~1);
	*pi = (unsigned int)(*uentry) & 1;

	return 0;
}

/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
static void compat_exit_robust_list(struct task_struct *curr)
{
	struct compat_robust_list_head __user *head = curr->compat_robust_list;
	struct robust_list __user *entry, *next_entry, *pending;
	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
	unsigned int next_pi;
	compat_uptr_t uentry, next_uentry, upending;
	compat_long_t futex_offset;
	int rc;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (compat_fetch_robust_entry(&upending, &pending,
				      &head->list_op_pending, &pip))
		return;

	next_entry = NULL;	/* avoid warning with gcc */
	while (entry != (struct robust_list __user *) &head->list) {
		/*
		 * Fetch the next entry in the list before calling
		 * handle_futex_death:
		 */
		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
			(compat_uptr_t __user *)&entry->next, &next_pi);
		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice:
		 */
		if (entry != pending) {
			void __user *uaddr = futex_uaddr(entry, futex_offset);

			if (handle_futex_death(uaddr, curr, pi,
					       HANDLE_DEATH_LIST))
				return;
		}
		if (rc)
			return;
		uentry = next_uentry;
		entry = next_entry;
		pi = next_pi;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		cond_resched();
	}
	if (pending) {
		void __user *uaddr = futex_uaddr(pending, futex_offset);

		handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
	}
}
#endif

#ifdef CONFIG_FUTEX_PI

/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 */
static void exit_pi_state_list(struct task_struct *curr)
{
	struct list_head *next, *head = &curr->pi_state_list;
	struct futex_pi_state *pi_state;
	union futex_key key = FUTEX_KEY_INIT;

	/*
	 * The mutex mm_struct::futex_hash_lock might be acquired.
	 */
	might_sleep();
	/*
	 * Ensure the hash remains stable (no resize) during the while loop
	 * below. The hb pointer is acquired under the pi_lock so we can't block
	 * on the mutex.
	 */
	WARN_ON(curr != current);
	guard(private_hash)();
	/*
	 * We are a ZOMBIE and nobody can enqueue itself on
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
	raw_spin_lock_irq(&curr->pi_lock);
	while (!list_empty(head)) {
		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		key = pi_state->key;
		if (1) {
			CLASS(hb, hb)(&key);

			/*
			 * We can race against put_pi_state() removing itself from the
			 * list (a waiter going away). put_pi_state() will first
			 * decrement the reference count and then modify the list, so
			 * it's possible to see the list entry but fail this reference
			 * acquire.
1219 * 1220 * In that case; drop the locks to let put_pi_state() make 1221 * progress and retry the loop. 1222 */ 1223 if (!refcount_inc_not_zero(&pi_state->refcount)) { 1224 raw_spin_unlock_irq(&curr->pi_lock); 1225 cpu_relax(); 1226 raw_spin_lock_irq(&curr->pi_lock); 1227 continue; 1228 } 1229 raw_spin_unlock_irq(&curr->pi_lock); 1230 1231 spin_lock(&hb->lock); 1232 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 1233 raw_spin_lock(&curr->pi_lock); 1234 /* 1235 * We dropped the pi-lock, so re-check whether this 1236 * task still owns the PI-state: 1237 */ 1238 if (head->next != next) { 1239 /* retain curr->pi_lock for the loop invariant */ 1240 raw_spin_unlock(&pi_state->pi_mutex.wait_lock); 1241 spin_unlock(&hb->lock); 1242 put_pi_state(pi_state); 1243 continue; 1244 } 1245 1246 WARN_ON(pi_state->owner != curr); 1247 WARN_ON(list_empty(&pi_state->list)); 1248 list_del_init(&pi_state->list); 1249 pi_state->owner = NULL; 1250 1251 raw_spin_unlock(&curr->pi_lock); 1252 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); 1253 spin_unlock(&hb->lock); 1254 } 1255 1256 rt_mutex_futex_unlock(&pi_state->pi_mutex); 1257 put_pi_state(pi_state); 1258 1259 raw_spin_lock_irq(&curr->pi_lock); 1260 } 1261 raw_spin_unlock_irq(&curr->pi_lock); 1262 } 1263 #else 1264 static inline void exit_pi_state_list(struct task_struct *curr) { } 1265 #endif 1266 1267 static void futex_cleanup(struct task_struct *tsk) 1268 { 1269 if (unlikely(tsk->robust_list)) { 1270 exit_robust_list(tsk); 1271 tsk->robust_list = NULL; 1272 } 1273 1274 #ifdef CONFIG_COMPAT 1275 if (unlikely(tsk->compat_robust_list)) { 1276 compat_exit_robust_list(tsk); 1277 tsk->compat_robust_list = NULL; 1278 } 1279 #endif 1280 1281 if (unlikely(!list_empty(&tsk->pi_state_list))) 1282 exit_pi_state_list(tsk); 1283 } 1284 1285 /** 1286 * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD 1287 * @tsk: task to set the state on 1288 * 1289 * Set the futex exit state of the task lockless. The futex waiter code 1290 * observes that state when a task is exiting and loops until the task has 1291 * actually finished the futex cleanup. The worst case for this is that the 1292 * waiter runs through the wait loop until the state becomes visible. 1293 * 1294 * This is called from the recursive fault handling path in make_task_dead(). 1295 * 1296 * This is best effort. Either the futex exit code has run already or 1297 * not. If the OWNER_DIED bit has been set on the futex then the waiter can 1298 * take it over. If not, the problem is pushed back to user space. If the 1299 * futex exit code did not run yet, then an already queued waiter might 1300 * block forever, but there is nothing which can be done about that. 1301 */ 1302 void futex_exit_recursive(struct task_struct *tsk) 1303 { 1304 /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ 1305 if (tsk->futex_state == FUTEX_STATE_EXITING) 1306 mutex_unlock(&tsk->futex_exit_mutex); 1307 tsk->futex_state = FUTEX_STATE_DEAD; 1308 } 1309 1310 static void futex_cleanup_begin(struct task_struct *tsk) 1311 { 1312 /* 1313 * Prevent various race issues against a concurrent incoming waiter 1314 * including live locks by forcing the waiter to block on 1315 * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in 1316 * attach_to_pi_owner(). 1317 */ 1318 mutex_lock(&tsk->futex_exit_mutex); 1319 1320 /* 1321 * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. 
static void futex_cleanup_begin(struct task_struct *tsk)
{
	/*
	 * Prevent various race issues against a concurrent incoming waiter
	 * including live locks by forcing the waiter to block on
	 * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
	 * attach_to_pi_owner().
	 */
	mutex_lock(&tsk->futex_exit_mutex);

	/*
	 * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
	 *
	 * This ensures that all subsequent checks of tsk->futex_state in
	 * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
	 * tsk->pi_lock held.
	 *
	 * It guarantees also that a pi_state which was queued right before
	 * the state change under tsk->pi_lock by a concurrent waiter must
	 * be observed in exit_pi_state_list().
	 */
	raw_spin_lock_irq(&tsk->pi_lock);
	tsk->futex_state = FUTEX_STATE_EXITING;
	raw_spin_unlock_irq(&tsk->pi_lock);
}

static void futex_cleanup_end(struct task_struct *tsk, int state)
{
	/*
	 * Lockless store. The only side effect is that an observer might
	 * take another loop until it becomes visible.
	 */
	tsk->futex_state = state;
	/*
	 * Drop the exit protection. This unblocks waiters which observed
	 * FUTEX_STATE_EXITING to reevaluate the state.
	 */
	mutex_unlock(&tsk->futex_exit_mutex);
}

void futex_exec_release(struct task_struct *tsk)
{
	/*
	 * The state handling is done for consistency, but in the case of
	 * exec() there is no way to prevent further damage as the PID stays
	 * the same. But for the unlikely and arguably buggy case that a
	 * futex is held on exec(), this provides at least as much state
	 * consistency protection as is possible.
	 */
	futex_cleanup_begin(tsk);
	futex_cleanup(tsk);
	/*
	 * Reset the state to FUTEX_STATE_OK. The task is alive and about to
	 * exec a new binary.
	 */
	futex_cleanup_end(tsk, FUTEX_STATE_OK);
}

void futex_exit_release(struct task_struct *tsk)
{
	futex_cleanup_begin(tsk);
	futex_cleanup(tsk);
	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}

static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
				   struct futex_private_hash *fph)
{
#ifdef CONFIG_FUTEX_PRIVATE_HASH
	fhb->priv = fph;
#endif
	atomic_set(&fhb->waiters, 0);
	plist_head_init(&fhb->chain);
	spin_lock_init(&fhb->lock);
}

#ifdef CONFIG_FUTEX_PRIVATE_HASH
void futex_hash_free(struct mm_struct *mm)
{
	struct futex_private_hash *fph;

	kvfree(mm->futex_phash_new);
	fph = rcu_dereference_raw(mm->futex_phash);
	if (fph) {
		WARN_ON_ONCE(rcuref_read(&fph->users) > 1);
		kvfree(fph);
	}
}

static bool futex_pivot_pending(struct mm_struct *mm)
{
	struct futex_private_hash *fph;

	guard(rcu)();

	if (!mm->futex_phash_new)
		return true;

	fph = rcu_dereference(mm->futex_phash);
	return rcuref_is_dead(&fph->users);
}

static bool futex_hash_less(struct futex_private_hash *a,
			    struct futex_private_hash *b)
{
	/* user provided always wins */
	if (!a->custom && b->custom)
		return true;
	if (a->custom && !b->custom)
		return false;

	/* zero-sized hash wins */
	if (!b->hash_mask)
		return true;
	if (!a->hash_mask)
		return false;

	/* keep the biggest */
	if (a->hash_mask < b->hash_mask)
		return true;
	if (a->hash_mask > b->hash_mask)
		return false;

	return false; /* equal */
}

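/*
 * Allocate and install a private hash with @hash_slots buckets.
 * @hash_slots is either 0 (request a zero-sized hash, i.e. keep using the
 * global hash) or a power of two >= 2. @custom distinguishes explicit
 * prctl() requests from the automatic resizing triggered by clone(), see
 * futex_hash_allocate_default().
 */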
static int futex_hash_allocate(unsigned int hash_slots, bool custom)
{
	struct mm_struct *mm = current->mm;
	struct futex_private_hash *fph;
	int i;

	if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots)))
		return -EINVAL;

	/*
	 * Once a zero-sized hash has been installed, i.e. the process has
	 * been pinned to the global hash, there is no way back.
	 */
	scoped_guard(rcu) {
		fph = rcu_dereference(mm->futex_phash);
		if (fph && !fph->hash_mask) {
			if (custom)
				return -EBUSY;
			return 0;
		}
	}

	fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
	if (!fph)
		return -ENOMEM;

	rcuref_init(&fph->users, 1);
	fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
	fph->custom = custom;
	fph->mm = mm;

	for (i = 0; i < hash_slots; i++)
		futex_hash_bucket_init(&fph->queues[i], fph);

	if (custom) {
		/*
		 * Only let prctl() wait / retry; don't unduly delay clone().
		 */
again:
		wait_var_event(mm, futex_pivot_pending(mm));
	}

	scoped_guard(mutex, &mm->futex_hash_lock) {
		struct futex_private_hash *free __free(kvfree) = NULL;
		struct futex_private_hash *cur, *new;

		cur = rcu_dereference_protected(mm->futex_phash,
						lockdep_is_held(&mm->futex_hash_lock));
		new = mm->futex_phash_new;
		mm->futex_phash_new = NULL;

		if (fph) {
			if (cur && !new) {
				/*
				 * If we have an existing hash, but have not yet
				 * allocated a replacement hash, drop the initial
				 * reference on the existing hash.
				 */
				futex_private_hash_put(cur);
			}

			if (new) {
				/*
				 * Two updates raced; throw out the lesser one.
				 */
				if (futex_hash_less(new, fph)) {
					free = new;
					new = fph;
				} else {
					free = fph;
				}
			} else {
				new = fph;
			}
			fph = NULL;
		}

		if (new) {
			/*
			 * Will set mm->futex_phash_new on failure;
			 * futex_private_hash_get() will try again.
			 */
			if (!__futex_pivot_hash(mm, new) && custom)
				goto again;
		}
	}
	return 0;
}

int futex_hash_allocate_default(void)
{
	unsigned int threads, buckets, current_buckets = 0;
	struct futex_private_hash *fph;

	if (!current->mm)
		return 0;

	scoped_guard(rcu) {
		threads = min_t(unsigned int,
				get_nr_threads(current),
				num_online_cpus());

		fph = rcu_dereference(current->mm->futex_phash);
		if (fph) {
			if (fph->custom)
				return 0;

			current_buckets = fph->hash_mask + 1;
		}
	}

	/*
	 * The default allocation will remain within
	 *   16 <= threads * 4 <= global hash size
	 */
	buckets = roundup_pow_of_two(4 * threads);
	buckets = clamp(buckets, 16, futex_hashmask + 1);

	if (current_buckets >= buckets)
		return 0;

	return futex_hash_allocate(buckets, false);
}

static int futex_hash_get_slots(void)
{
	struct futex_private_hash *fph;

	guard(rcu)();
	fph = rcu_dereference(current->mm->futex_phash);
	if (fph && fph->hash_mask)
		return fph->hash_mask + 1;
	return 0;
}

#else

static int futex_hash_allocate(unsigned int hash_slots, bool custom)
{
	return -EINVAL;
}

static int futex_hash_get_slots(void)
{
	return 0;
}
#endif

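/*
 * futex_hash_prctl() below is reached from the prctl() syscall. Userspace
 * usage sketch (illustrative only, assuming the PR_FUTEX_HASH prctl option
 * is wired up to this function):
 *
 *	prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 16, 0);
 *	slots = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS);
 */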
int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
{
	int ret;

	switch (arg2) {
	case PR_FUTEX_HASH_SET_SLOTS:
		if (arg4 != 0)
			return -EINVAL;
		ret = futex_hash_allocate(arg3, true);
		break;

	case PR_FUTEX_HASH_GET_SLOTS:
		ret = futex_hash_get_slots();
		break;

	default:
		ret = -EINVAL;
		break;
	}
	return ret;
}

static int __init futex_init(void)
{
	unsigned long hashsize, i;
	unsigned int futex_shift;

#ifdef CONFIG_BASE_SMALL
	hashsize = 16;
#else
	hashsize = roundup_pow_of_two(256 * num_possible_cpus());
#endif

	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
					       hashsize, 0, 0,
					       &futex_shift, NULL,
					       hashsize, hashsize);
	hashsize = 1UL << futex_shift;

	for (i = 0; i < hashsize; i++)
		futex_hash_bucket_init(&futex_queues[i], NULL);

	futex_hashmask = hashsize - 1;
	return 0;
}
core_initcall(futex_init);