xref: /linux/kernel/futex/core.c (revision a734d9fca84e1d4fa0cb442ef5f84c88f8212d32)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *  Fast Userspace Mutexes (which I call "Futexes!").
4  *  (C) Rusty Russell, IBM 2002
5  *
6  *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
7  *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
8  *
9  *  Removed page pinning, fix privately mapped COW pages and other cleanups
10  *  (C) Copyright 2003, 2004 Jamie Lokier
11  *
12  *  Robust futex support started by Ingo Molnar
13  *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
14  *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
15  *
16  *  PI-futex support started by Ingo Molnar and Thomas Gleixner
17  *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
18  *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
19  *
20  *  PRIVATE futexes by Eric Dumazet
21  *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
22  *
23  *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
24  *  Copyright (C) IBM Corporation, 2009
25  *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
26  *
27  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
28  *  enough at me, Linus for the original (flawed) idea, Matthew
29  *  Kirkwood for proof-of-concept implementation.
30  *
31  *  "The futexes are also cursed."
32  *  "But they come in a choice of three flavours!"
33  */
34 #include <linux/compat.h>
35 #include <linux/debugfs.h>
36 #include <linux/fault-inject.h>
37 #include <linux/gfp.h>
38 #include <linux/jhash.h>
39 #include <linux/memblock.h>
40 #include <linux/mempolicy.h>
41 #include <linux/mmap_lock.h>
42 #include <linux/pagemap.h>
43 #include <linux/plist.h>
44 #include <linux/prctl.h>
45 #include <linux/rseq.h>
46 #include <linux/slab.h>
47 #include <linux/vmalloc.h>
48 
49 #include <vdso/futex.h>
50 
51 #include "futex.h"
52 #include "../locking/rtmutex_common.h"
53 
54 /*
55  * The base of the bucket array and its size are always used together
56  * (after initialization only in futex_hash()), so ensure that they
57  * reside in the same cacheline.
 *
 * @hashmask:  ANDed with the key hash in __futex_hash() to select the bucket
 *             inside the chosen node's table.
 * @hashshift: The key hash is shifted right by this amount in __futex_hash()
 *             before being reduced modulo nr_node_ids to pick a node for
 *             keys that carry no explicit node.
 * @queues:    One bucket table per node, indexed as queues[node][bucket].
58  */
59 static struct {
60 	unsigned long            hashmask;
61 	unsigned int		 hashshift;
62 	struct futex_hash_bucket *queues[MAX_NUMNODES];
63 } __futex_data __read_mostly __aligned(2*sizeof(long));
64 
/* Shorthand accessors for the members of __futex_data. */
65 #define futex_hashmask	(__futex_data.hashmask)
66 #define futex_hashshift	(__futex_data.hashshift)
67 #define futex_queues	(__futex_data.queues)
68 
/*
 * A per-mm private futex hash table, published through mm->futex.phash.hash
 * and replaced as a whole by __futex_pivot_hash().
 *
 * @state:     FR_PERCPU or FR_ATOMIC; __futex_pivot_hash() sets FR_PERCPU when
 *             it installs the table.
 * @hash_mask: Number of buckets minus one (futex_rehash_private() walks
 *             hash_mask + 1 slots). A value of 0 makes __futex_hash() ignore
 *             the private table and use the global hash instead.
 * @rcu:       RCU head handed to kvfree_rcu() when the table is replaced.
 * @mm:        Address passed to wake_up_var() once futex_ref_put() reports
 *             true; presumably the owning mm_struct - TODO confirm.
 * @custom:    Not referenced in the part of the file visible here.
 * @queues:    The buckets themselves (flexible array member).
 */
69 struct futex_private_hash {
70 	int		state;
71 	unsigned int	hash_mask;
72 	struct rcu_head	rcu;
73 	void		*mm;
74 	bool		custom;
75 	struct futex_hash_bucket queues[];
76 };
77 
78 /*
79  * Fault injections for futexes.
80  */
81 #ifdef CONFIG_FAIL_FUTEX
82 
/*
 * Fault injection state for futexes.
 *
 * @attr:           Fault attributes; configured from the "fail_futex=" boot
 *                  parameter and consulted through should_fail().
 * @ignore_private: When true, should_fail_futex() never injects a failure
 *                  for non-shared futexes. Exposed as the "ignore-private"
 *                  debugfs file when CONFIG_FAULT_INJECTION_DEBUG_FS is set.
 */
83 static struct {
84 	struct fault_attr attr;
85 
86 	bool ignore_private;
87 } fail_futex = {
88 	.attr = FAULT_ATTR_INITIALIZER,
89 	.ignore_private = false,
90 };
91 
92 static int __init setup_fail_futex(char *str)
93 {
94 	return setup_fault_attr(&fail_futex.attr, str);
95 }
96 __setup("fail_futex=", setup_fail_futex);
97 
98 bool should_fail_futex(bool fshared)
99 {
100 	if (fail_futex.ignore_private && !fshared)
101 		return false;
102 
103 	return should_fail(&fail_futex.attr, 1);
104 }
105 
106 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
107 
108 static int __init fail_futex_debugfs(void)
109 {
110 	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
111 	struct dentry *dir;
112 
113 	dir = fault_create_debugfs_attr("fail_futex", NULL,
114 					&fail_futex.attr);
115 	if (IS_ERR(dir))
116 		return PTR_ERR(dir);
117 
118 	debugfs_create_bool("ignore-private", mode, dir,
119 			    &fail_futex.ignore_private);
120 	return 0;
121 }
122 
123 late_initcall(fail_futex_debugfs);
124 
125 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
126 
127 #endif /* CONFIG_FAIL_FUTEX */
128 
129 static struct futex_hash_bucket *
130 __futex_hash(union futex_key *key, struct futex_private_hash *fph, struct futex_private_hash **fph_p);
131 
132 #ifdef CONFIG_FUTEX_PRIVATE_HASH
133 static bool futex_ref_get(struct futex_private_hash *fph);
134 static bool futex_ref_put(struct futex_private_hash *fph);
135 static bool futex_ref_is_dead(struct futex_private_hash *fph);
136 
/*
 * Values for futex_private_hash::state. __futex_pivot_hash() puts a table
 * into FR_PERCPU when it installs it.
 * NOTE(review): FR_ATOMIC is not assigned in the part of the file visible
 * here; presumably the futex_ref_*() helpers switch to it - confirm.
 */
137 enum { FR_PERCPU = 0, FR_ATOMIC };
138 
139 static bool futex_private_hash_get(struct futex_private_hash *fph)
140 {
141 	return futex_ref_get(fph);
142 }
143 
144 void futex_private_hash_put(struct futex_private_hash *fph)
145 {
146 	if (fph && futex_ref_put(fph))
147 		wake_up_var(fph->mm);
148 }
149 
150 static struct futex_hash_bucket *
151 __futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
152 {
153 	u32 hash;
154 
155 	hash = jhash2((void *)&key->private.address, sizeof(key->private.address) / 4,
156 		      key->both.offset);
157 
158 	return &fph->queues[hash & fph->hash_mask];
159 }
160 
/*
 * futex_rehash_private - Move all queued waiters from @old to @new
 * @old:	private hash that is being replaced
 * @new:	private hash that takes over
 *
 * Every futex_q found in a bucket of @old is unlinked and added to the
 * bucket which __futex_hash() selects for its key in @new. The waiter
 * counters of both buckets are adjusted and the futex_q's lock_ptr is
 * switched to the lock of the new bucket.
 */
161 static void futex_rehash_private(struct futex_private_hash *old,
162 				 struct futex_private_hash *new)
163 {
164 	struct futex_hash_bucket *hb_old, *hb_new;
165 	unsigned int slots = old->hash_mask + 1;
166 	unsigned int i;
167 
168 	for (i = 0; i < slots; i++) {
169 		struct futex_q *this, *tmp;
170 
171 		hb_old = &old->queues[i];
172 
		/* The old bucket stays locked while its whole chain is drained. */
173 		spin_lock(&hb_old->lock);
		/* _safe variant: the current entry is unlinked inside the loop. */
174 		plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) {
175 			plist_del(&this->list, &hb_old->chain);
176 			futex_hb_waiters_dec(hb_old);
177 
			/* A queued entry is expected to still point at the old bucket lock. */
178 			WARN_ON_ONCE(this->lock_ptr != &hb_old->lock);
179 
			/* @new is passed explicitly, so the lookup uses the new table. */
180 			hb_new = __futex_hash(&this->key, new, NULL);
			/* Count the waiter on the new bucket before taking its lock. */
181 			futex_hb_waiters_inc(hb_new);
182 			/*
183 			 * The new pointer isn't published yet but an already
184 			 * moved user can be unqueued due to timeout or signal.
185 			 */
			/* Second bucket lock nested inside the first one. */
186 			spin_lock_nested(&hb_new->lock, SINGLE_DEPTH_NESTING);
187 			plist_add(&this->list, &hb_new->chain);
			/* Switched while the new bucket lock is held. */
188 			this->lock_ptr = &hb_new->lock;
189 			spin_unlock(&hb_new->lock);
190 		}
191 		spin_unlock(&hb_old->lock);
192 	}
193 }
194 
195 static bool __futex_pivot_hash(struct mm_struct *mm, struct futex_private_hash *new)
196 {
197 	struct futex_mm_phash *mmph = &mm->futex.phash;
198 	struct futex_private_hash *fph;
199 
200 	WARN_ON_ONCE(mmph->hash_new);
201 
202 	fph = rcu_dereference_protected(mmph->hash, lockdep_is_held(&mmph->lock));
203 	if (fph) {
204 		if (!futex_ref_is_dead(fph)) {
205 			mmph->hash_new = new;
206 			return false;
207 		}
208 
209 		futex_rehash_private(fph, new);
210 	}
211 	new->state = FR_PERCPU;
212 	scoped_guard(rcu) {
213 		mmph->batches = get_state_synchronize_rcu();
214 		rcu_assign_pointer(mmph->hash, new);
215 	}
216 	kvfree_rcu(fph, rcu);
217 	return true;
218 }
219 
220 static void futex_pivot_hash(struct mm_struct *mm)
221 {
222 	scoped_guard(mutex, &mm->futex.phash.lock) {
223 		struct futex_private_hash *fph;
224 
225 		fph = mm->futex.phash.hash_new;
226 		if (fph) {
227 			mm->futex.phash.hash_new = NULL;
228 			__futex_pivot_hash(mm, fph);
229 		}
230 	}
231 }
232 
233 struct futex_private_hash *futex_private_hash(struct mm_struct *mm)
234 {
235 	/*
236 	 * Ideally we don't loop. If there is a replacement in progress
237 	 * then a new private hash is already prepared and a reference can't be
238 	 * obtained once the last user dropped it's.
239 	 * In that case we block on mm_struct::futex_hash_lock and either have
240 	 * to perform the replacement or wait while someone else is doing the
241 	 * job. Eitherway, on the second iteration we acquire a reference on the
242 	 * new private hash or loop again because a new replacement has been
243 	 * requested.
244 	 */
245 again:
246 	scoped_guard(rcu) {
247 		struct futex_private_hash *fph;
248 
249 		fph = rcu_dereference(mm->futex.phash.hash);
250 		if (!fph)
251 			return NULL;
252 
253 		if (futex_private_hash_get(fph))
254 			return fph;
255 	}
256 	futex_pivot_hash(mm);
257 	goto again;
258 }
259 
260 struct futex_bucket_ref futex_hash(union futex_key *key)
261 {
262 again:
263 	scoped_guard(rcu) {
264 		struct futex_private_hash *fph = NULL;
265 		struct futex_hash_bucket *hb;
266 
267 		hb = __futex_hash(key, NULL, &fph);
268 
269 		if (!fph || futex_private_hash_get(fph))
270 			return (struct futex_bucket_ref){ .hb = hb, .fph = fph };
271 	}
272 	futex_pivot_hash(key->private.mm);
273 	goto again;
274 }
275 
276 #else /* !CONFIG_FUTEX_PRIVATE_HASH */
277 
278 struct futex_bucket_ref futex_hash(union futex_key *key)
279 {
280 	return (struct futex_bucket_ref){ .hb = __futex_hash(key, NULL, NULL), .fph = NULL };
281 }
282 
283 #endif /* CONFIG_FUTEX_PRIVATE_HASH */
284 
285 #ifdef CONFIG_FUTEX_MPOL
286 
287 static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr)
288 {
289 	struct vm_area_struct *vma = vma_lookup(mm, addr);
290 	struct mempolicy *mpol;
291 	int node = FUTEX_NO_NODE;
292 
293 	if (!vma)
294 		return FUTEX_NO_NODE;
295 
296 	mpol = READ_ONCE(vma->vm_policy);
297 	if (!mpol)
298 		return FUTEX_NO_NODE;
299 
300 	switch (mpol->mode) {
301 	case MPOL_PREFERRED:
302 		node = first_node(mpol->nodes);
303 		break;
304 	case MPOL_PREFERRED_MANY:
305 	case MPOL_BIND:
306 		if (mpol->home_node != NUMA_NO_NODE)
307 			node = mpol->home_node;
308 		break;
309 	default:
310 		break;
311 	}
312 
313 	return node;
314 }
315 
316 static int futex_key_to_node_opt(struct mm_struct *mm, unsigned long addr)
317 {
318 	int seq, node;
319 
320 	guard(rcu)();
321 
322 	if (!mmap_lock_speculate_try_begin(mm, &seq))
323 		return -EBUSY;
324 
325 	node = __futex_key_to_node(mm, addr);
326 
327 	if (mmap_lock_speculate_retry(mm, seq))
328 		return -EAGAIN;
329 
330 	return node;
331 }
332 
333 static int futex_mpol(struct mm_struct *mm, unsigned long addr)
334 {
335 	int node;
336 
337 	node = futex_key_to_node_opt(mm, addr);
338 	if (node >= FUTEX_NO_NODE)
339 		return node;
340 
341 	guard(mmap_read_lock)(mm);
342 	return __futex_key_to_node(mm, addr);
343 }
344 
345 #else /* !CONFIG_FUTEX_MPOL */
346 
347 static int futex_mpol(struct mm_struct *mm, unsigned long addr)
348 {
349 	return FUTEX_NO_NODE;
350 }
351 
352 #endif /* CONFIG_FUTEX_MPOL */
353 
354 /**
355  * __futex_hash - Return the hash bucket
356  * @key:	Pointer to the futex key for which the hash is calculated
357  * @fph:	Pointer to private hash if known
358  * @fph_p:	Pointer to a private hash pointer; output for the private hash
359  *              used when set.
360  *
361  * We hash on the keys returned from get_futex_key (see below) and return the
362  * corresponding hash bucket.
363  * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the
364  * private hash) is returned if existing. Otherwise a hash bucket from the
365  * global hash is returned.
366  */
367 static struct futex_hash_bucket *
368 __futex_hash(union futex_key *key, struct futex_private_hash *fph, struct futex_private_hash **fph_p)
369 {
370 	int node = key->both.node;
371 	u32 hash;
372 
373 #ifdef CONFIG_FUTEX_PRIVATE_HASH
374 	if (node == FUTEX_NO_NODE && futex_key_is_private(key)) {
375 		if (!fph)
376 			fph = rcu_dereference(key->private.mm->futex.phash.hash);
377 		if (fph && fph->hash_mask) {
378 			if (fph_p)
379 				*fph_p = fph;
380 			return __futex_hash_private(key, fph);
381 		}
382 	}
383 #endif
384 
385 	hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / sizeof(u32),
386 		      key->both.offset);
387 
388 	if (node == FUTEX_NO_NODE) {
389 		/*
390 		 * In case of !FLAGS_NUMA, use some unused hash bits to pick a
391 		 * node -- this ensures regular futexes are interleaved across
392 		 * the nodes and avoids having to allocate multiple
393 		 * hash-tables.
394 		 *
395 		 * NOTE: this isn't perfectly uniform, but it is fast and
396 		 * handles sparse node masks.
397 		 */
398 		node = (hash >> futex_hashshift) % nr_node_ids;
399 		if (!node_possible(node)) {
400 			node = find_next_bit_wrap(node_possible_map.bits, nr_node_ids, node);
401 		}
402 	}
403 
404 	return &futex_queues[node][hash & futex_hashmask];
405 }
406 
407 /**
408  * futex_setup_timer - set up the sleeping hrtimer.
409  * @time:	ptr to the given timeout value
410  * @timeout:	the hrtimer_sleeper structure to be set up
411  * @flags:	futex flags
412  * @range_ns:	optional range in ns
413  *
414  * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
415  *	   value given
416  */
417 struct hrtimer_sleeper *futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
418 					  int flags, u64 range_ns)
419 {
420 	if (!time)
421 		return NULL;
422 
423 	hrtimer_setup_sleeper_on_stack(timeout,
424 				       (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC,
425 				       HRTIMER_MODE_ABS);
426 	/*
427 	 * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
428 	 * effectively the same as calling hrtimer_set_expires().
429 	 */
430 	hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
431 
432 	return timeout;
433 }
434 
435 /*
436  * Generate a machine wide unique identifier for this inode.
437  *
438  * This relies on u64 not wrapping in the life-time of the machine; which with
439  * 1ns resolution means almost 585 years.
440  *
441  * This further relies on the fact that a well formed program will not unmap
442  * the file while it has a (shared) futex waiting on it. This mapping will have
443  * a file reference which pins the mount and inode.
444  *
445  * If for some reason an inode gets evicted and read back in again, it will get
446  * a new sequence number and will _NOT_ match, even though it is the exact same
447  * file.
448  *
449  * It is important that futex_match() will never have a false-positive, esp.
450  * for PI futexes that can mess up the state. The above argues that false-negatives
451  * are only possible for malformed programs.
452  */
453 static u64 get_inode_sequence_number(struct inode *inode)
454 {
455 	static atomic64_t i_seq;
456 	u64 old;
457 
458 	/* Does the inode already have a sequence number? */
459 	old = atomic64_read(&inode->i_sequence);
460 	if (likely(old))
461 		return old;
462 
463 	for (;;) {
464 		u64 new = atomic64_inc_return(&i_seq);
465 		if (WARN_ON_ONCE(!new))
466 			continue;
467 
468 		old = 0;
469 		if (!atomic64_try_cmpxchg_relaxed(&inode->i_sequence, &old, new))
470 			return old;
471 		return new;
472 	}
473 }
474 
475 /**
476  * get_futex_key() - Get parameters which are the keys for a futex
477  * @uaddr:	virtual address of the futex
478  * @flags:	FLAGS_*
479  * @key:	address where result is stored.
480  * @rw:		mapping needs to be read/write (values: FUTEX_READ,
481  *              FUTEX_WRITE)
482  *
483  * Return: a negative error code or 0
 *
 * Error codes produced directly by this function:
 *  - -EINVAL: @uaddr is not aligned to the futex size, or the node read from
 *    user space with FLAGS_NUMA is neither FUTEX_NO_NODE nor a possible node
 *  - -EFAULT: access_ok() failed, the node word could not be read or written,
 *    fault injection triggered, the page has no usable mapping, or the page
 *    is anonymous and only read-only access was obtained
 * Errors from get_user_pages_fast() are passed through.
484  *
485  * The key words are stored in @key on success.
486  *
487  * For shared mappings (when @fshared), the key is:
488  *
489  *   ( inode->i_sequence, page offset within mapping, offset_within_page )
490  *
491  * [ also see get_inode_sequence_number() ]
492  *
493  * For private mappings (or when !@fshared), the key is:
494  *
495  *   ( current->mm, address, 0 )
496  *
497  * This allows (cross process, where applicable) identification of the futex
498  * without keeping the page pinned for the duration of the FUTEX_WAIT.
 *
 * With FLAGS_NUMA the futex occupies twice the plain futex size: the second
 * half holds the node id, which is read, validated and - when this function
 * chose the node itself - written back to user space.
499  *
500  * lock_page() might sleep, the caller should not hold a spinlock.
501  */
502 int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
503 		  enum futex_access rw)
504 {
505 	unsigned long address = (unsigned long)uaddr;
506 	struct mm_struct *mm = current->mm;
507 	struct page *page;
508 	struct folio *folio;
509 	struct address_space *mapping;
510 	int node, err, size, ro = 0;
511 	bool node_updated = false;
512 	bool fshared;
513 
514 	fshared = flags & FLAGS_SHARED;
515 	size = futex_size(flags);
516 	if (flags & FLAGS_NUMA)
517 		size *= 2;
518 
519 	/*
520 	 * The futex address must be "naturally" aligned.
521 	 */
522 	key->both.offset = address % PAGE_SIZE;
523 	if (unlikely((address % size) != 0))
524 		return -EINVAL;
	/* From here on, address is the page aligned base of the futex page. */
525 	address -= key->both.offset;
526 
527 	if (unlikely(!access_ok(uaddr, size)))
528 		return -EFAULT;
529 
530 	if (unlikely(should_fail_futex(fshared)))
531 		return -EFAULT;
532 
533 	node = FUTEX_NO_NODE;
534 
	/* NUMA futex: fetch and validate the node id stored behind the futex word. */
535 	if (flags & FLAGS_NUMA) {
536 		u32 __user *naddr = (void *)uaddr + size / 2;
537 
538 		if (get_user_inline(node, naddr))
539 			return -EFAULT;
540 
541 		if ((node != FUTEX_NO_NODE) &&
542 		    ((unsigned int)node >= MAX_NUMNODES || !node_possible(node)))
543 			return -EINVAL;
544 	}
545 
	/* No explicit node: let the memory policy of the mapping decide, if requested. */
546 	if (node == FUTEX_NO_NODE && (flags & FLAGS_MPOL)) {
547 		node = futex_mpol(mm, address);
548 		node_updated = true;
549 	}
550 
	/*
	 * NUMA futex still without a node: use the current one. Any node
	 * chosen here or by the policy lookup is written back to user space.
	 */
551 	if (flags & FLAGS_NUMA) {
552 		u32 __user *naddr = (void *)uaddr + size / 2;
553 
554 		if (node == FUTEX_NO_NODE) {
555 			node = numa_node_id();
556 			node_updated = true;
557 		}
558 		if (node_updated && put_user_inline(node, naddr))
559 			return -EFAULT;
560 	}
561 
562 	key->both.node = node;
563 
564 	/*
565 	 * PROCESS_PRIVATE futexes are fast.
566 	 * As the mm cannot disappear under us and the 'key' only needs
567 	 * virtual address, we dont even have to find the underlying vma.
568 	 * Note : We do have to check 'uaddr' is a valid user address,
569 	 *        but access_ok() should be faster than find_vma()
570 	 */
571 	if (!fshared) {
572 		/*
573 		 * On no-MMU, shared futexes are treated as private, therefore
574 		 * we must not include the current process in the key. Since
575 		 * there is only one address space, the address is a unique key
576 		 * on its own.
577 		 */
578 		if (IS_ENABLED(CONFIG_MMU))
579 			key->private.mm = mm;
580 		else
581 			key->private.mm = NULL;
582 
583 		key->private.address = address;
584 		return 0;
585 	}
586 
587 again:
588 	/* Ignore any VERIFY_READ mapping (futex common case) */
589 	if (unlikely(should_fail_futex(true)))
590 		return -EFAULT;
591 
592 	err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
593 	/*
594 	 * If write access is not required (eg. FUTEX_WAIT), try
595 	 * and get read-only access.
596 	 */
	/*
	 * NOTE(review): ro is set here and never cleared again, so it stays
	 * set across the "goto again" retries below even if a later
	 * FOLL_WRITE attempt succeeds - confirm this is intended.
	 */
597 	if (err == -EFAULT && rw == FUTEX_READ) {
598 		err = get_user_pages_fast(address, 1, 0, &page);
599 		ro = 1;
600 	}
601 	if (err < 0)
602 		return err;
603 	else
604 		err = 0;
605 
606 	/*
607 	 * The treatment of mapping from this point on is critical. The folio
608 	 * lock protects many things but in this context the folio lock
609 	 * stabilizes mapping, prevents inode freeing in the shared
610 	 * file-backed region case and guards against movement to swap cache.
611 	 *
612 	 * Strictly speaking the folio lock is not needed in all cases being
613 	 * considered here and folio lock forces unnecessarily serialization.
614 	 * From this point on, mapping will be re-verified if necessary and
615 	 * folio lock will be acquired only if it is unavoidable
616 	 *
617 	 * Mapping checks require the folio so it is looked up now. For
618 	 * anonymous pages, it does not matter if the folio is split
619 	 * in the future as the key is based on the address. For
620 	 * filesystem-backed pages, the precise page is required as the
621 	 * index of the page determines the key.
622 	 */
623 	folio = page_folio(page);
624 	mapping = READ_ONCE(folio->mapping);
625 
626 	/*
627 	 * If folio->mapping is NULL, then it cannot be an anonymous
628 	 * page; but it might be the ZERO_PAGE or in the gate area or
629 	 * in a special mapping (all cases which we are happy to fail);
630 	 * or it may have been a good file page when get_user_pages_fast
631 	 * found it, but truncated or holepunched or subjected to
632 	 * invalidate_complete_page2 before we got the folio lock (also
633 	 * cases which we are happy to fail).  And we hold a reference,
634 	 * so refcount care in invalidate_inode_page's remove_mapping
635 	 * prevents drop_caches from setting mapping to NULL beneath us.
636 	 *
637 	 * The case we do have to guard against is when memory pressure made
638 	 * shmem_writepage move it from filecache to swapcache beneath us:
639 	 * an unlikely race, but we do need to retry for folio->mapping.
640 	 */
641 	if (unlikely(!mapping)) {
642 		int shmem_swizzled;
643 
644 		/*
645 		 * Folio lock is required to identify which special case above
646 		 * applies. If this is really a shmem page then the folio lock
647 		 * will prevent unexpected transitions.
648 		 */
649 		folio_lock(folio);
650 		shmem_swizzled = folio_test_swapcache(folio) || folio->mapping;
651 		folio_unlock(folio);
		/* The page reference is dropped on both the retry and the failure path. */
652 		folio_put(folio);
653 
654 		if (shmem_swizzled)
655 			goto again;
656 
657 		return -EFAULT;
658 	}
659 
660 	/*
661 	 * Private mappings are handled in a simple way.
662 	 *
663 	 * If the futex key is stored in anonymous memory, then the associated
664 	 * object is the mm which is implicitly pinned by the calling process.
665 	 *
666 	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
667 	 * it's a read-only handle, it's expected that futexes attach to
668 	 * the object not the particular process.
669 	 */
670 	if (folio_test_anon(folio)) {
671 		/*
672 		 * A RO anonymous page will never change and thus doesn't make
673 		 * sense for futex operations.
674 		 */
675 		if (unlikely(should_fail_futex(true)) || ro) {
676 			err = -EFAULT;
677 			goto out;
678 		}
679 
680 		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
681 		key->private.mm = mm;
682 		key->private.address = address;
683 
684 	} else {
685 		struct inode *inode;
686 
687 		/*
688 		 * The associated futex object in this case is the inode and
689 		 * the folio->mapping must be traversed. Ordinarily this should
690 		 * be stabilised under folio lock but it's not strictly
691 		 * necessary in this case as we just want to pin the inode, not
692 		 * update i_pages or anything like that.
693 		 *
694 		 * The RCU read lock is taken as the inode is finally freed
695 		 * under RCU. If the mapping still matches expectations then the
696 		 * mapping->host can be safely accessed as being a valid inode.
697 		 */
698 		rcu_read_lock();
699 
		/* The mapping changed since it was sampled above: start over. */
700 		if (READ_ONCE(folio->mapping) != mapping) {
701 			rcu_read_unlock();
702 			folio_put(folio);
703 
704 			goto again;
705 		}
706 
707 		inode = READ_ONCE(mapping->host);
708 		if (!inode) {
709 			rcu_read_unlock();
710 			folio_put(folio);
711 
712 			goto again;
713 		}
714 
715 		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
716 		key->shared.i_seq = get_inode_sequence_number(inode);
717 		key->shared.pgoff = page_pgoff(folio, page);
718 		rcu_read_unlock();
719 	}
720 
721 out:
	/* Drop the reference obtained by get_user_pages_fast(). */
722 	folio_put(folio);
723 	return err;
724 }
725 
726 /**
727  * fault_in_user_writeable() - Fault in user address and verify RW access
728  * @uaddr:	pointer to faulting user space address
729  *
730  * Slow path to fixup the fault we just took in the atomic write
731  * access to @uaddr.
732  *
733  * We have no generic implementation of a non-destructive write to the
734  * user address. We know that we faulted in the atomic pagefault
735  * disabled section so we can as well avoid the #PF overhead by
736  * calling get_user_pages() right away.
737  */
738 int fault_in_user_writeable(u32 __user *uaddr)
739 {
740 	struct mm_struct *mm = current->mm;
741 	int ret;
742 
743 	mmap_read_lock(mm);
744 	ret = fixup_user_fault(mm, (unsigned long)uaddr,
745 			       FAULT_FLAG_WRITE, NULL);
746 	mmap_read_unlock(mm);
747 
748 	return ret < 0 ? ret : 0;
749 }
750 
751 /**
752  * futex_top_waiter() - Return the highest priority waiter on a futex
753  * @hb:		the hash bucket the futex_q's reside in
754  * @key:	the futex key (to distinguish it from other futex futex_q's)
755  *
756  * Must be called with the hb lock held.
757  */
758 struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
759 {
760 	struct futex_q *this;
761 
762 	plist_for_each_entry(this, &hb->chain, list) {
763 		if (futex_match(&this->key, key))
764 			return this;
765 	}
766 	return NULL;
767 }
768 
769 /**
770  * wait_for_owner_exiting - Block until the owner has exited
771  * @ret: owner's current futex lock status
772  * @exiting:	Pointer to the exiting task
773  *
774  * Caller must hold a refcount on @exiting.
775  */
776 void wait_for_owner_exiting(int ret, struct task_struct *exiting)
777 {
778 	if (ret != -EBUSY) {
779 		WARN_ON_ONCE(exiting);
780 		return;
781 	}
782 
783 	if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
784 		return;
785 
786 	mutex_lock(&exiting->futex.exit_mutex);
787 	/*
788 	 * No point in doing state checking here. If the waiter got here
789 	 * while the task was in exec()->exec_futex_release() then it can
790 	 * have any FUTEX_STATE_* value when the waiter has acquired the
791 	 * mutex. OK, if running, EXITING or DEAD if it reached exit()
792 	 * already. Highly unlikely and not a problem. Just one more round
793 	 * through the futex maze.
794 	 */
795 	mutex_unlock(&exiting->futex.exit_mutex);
796 
797 	put_task_struct(exiting);
798 }
799 
800 /**
801  * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket
802  * @q:	The futex_q to unqueue
803  *
804  * The q->lock_ptr must not be NULL and must be held by the caller.
805  */
806 void __futex_unqueue(struct futex_q *q)
807 {
808 	struct futex_hash_bucket *hb;
809 
810 	if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
811 		return;
812 	lockdep_assert_held(q->lock_ptr);
813 
814 	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
815 	plist_del(&q->list, &hb->chain);
816 	futex_hb_waiters_dec(hb);
817 }
818 
819 /**
 * futex_q_lock() - Account @q as a waiter on @hb and lock the bucket
 * @q:	the futex_q about to be queued; the key must be already stored in
 *	q->key
 * @hb:	the hash bucket to lock
 *
 * On return hb->lock is held and q->lock_ptr points to it.
 */
820 void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb)
821 {
822 	/*
823 	 * Increment the counter before taking the lock so that
824 	 * a potential waker won't miss a to-be-slept task that is
825 	 * waiting for the spinlock. This is safe as all futex_q_lock()
826 	 * users end up calling futex_queue(). Similarly, for housekeeping,
827 	 * decrement the counter at futex_q_unlock() when some error has
828 	 * occurred and we don't end up adding the task to the list.
829 	 */
830 	futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */
831 
832 	q->lock_ptr = &hb->lock;
833 
834 	spin_lock(&hb->lock);
	/*
	 * The lock itself was taken on the line above; presumably this is only
	 * an annotation telling static lock checking that q->lock_ptr is now
	 * held - confirm.
	 */
835 	__acquire(q->lock_ptr);
836 }
837 
/*
 * futex_q_unlock() - Back out of futex_q_lock() without queueing
 * @hb:	the hash bucket locked by futex_q_lock()
 *
 * Drops the waiter count that futex_q_lock() took and releases hb->lock.
 * The counter is decremented while the bucket lock is still held.
 */
838 void futex_q_unlock(struct futex_hash_bucket *hb)
839 {
840 	futex_hb_waiters_dec(hb);
841 	spin_unlock(&hb->lock);
842 }
843 
844 void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
845 		   struct task_struct *task)
846 {
847 	int prio;
848 
849 	/*
850 	 * The priority used to register this element is
851 	 * - either the real thread-priority for the real-time threads
852 	 * (i.e. threads with a priority lower than MAX_RT_PRIO)
853 	 * - or MAX_RT_PRIO for non-RT threads.
854 	 * Thus, all RT-threads are woken first in priority order, and
855 	 * the others are woken last, in FIFO order.
856 	 */
857 	prio = min(current->normal_prio, MAX_RT_PRIO);
858 
859 	plist_node_init(&q->list, prio);
860 	plist_add(&q->list, &hb->chain);
861 	q->task = task;
862 }
863 
864 /**
865  * futex_unqueue() - Remove the futex_q from its futex_hash_bucket
866  * @q:	The futex_q to unqueue
867  *
868  * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must
869  * be paired with exactly one earlier call to futex_queue().
870  *
871  * Return:
872  *  - 1 - if the futex_q was still queued (and we removed unqueued it);
873  *  - 0 - if the futex_q was already removed by the waking thread
874  */
875 int futex_unqueue(struct futex_q *q)
876 {
877 	spinlock_t *lock_ptr;
878 	int ret = 0;
879 
880 	/* RCU so lock_ptr is not going away during locking. */
881 	guard(rcu)();
882 	/* In the common case we don't take the spinlock, which is nice. */
883 retry:
884 	/*
885 	 * q->lock_ptr can change between this read and the following spin_lock.
886 	 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
887 	 * optimizing lock_ptr out of the logic below.
888 	 */
889 	lock_ptr = READ_ONCE(q->lock_ptr);
890 	if (lock_ptr != NULL) {
891 		spin_lock(lock_ptr);
892 		/*
893 		 * q->lock_ptr can change between reading it and
894 		 * spin_lock(), causing us to take the wrong lock.  This
895 		 * corrects the race condition.
896 		 *
897 		 * Reasoning goes like this: if we have the wrong lock,
898 		 * q->lock_ptr must have changed (maybe several times)
899 		 * between reading it and the spin_lock().  It can
900 		 * change again after the spin_lock() but only if it was
901 		 * already changed before the spin_lock().  It cannot,
902 		 * however, change back to the original value.  Therefore
903 		 * we can detect whether we acquired the correct lock.
904 		 */
905 		if (unlikely(lock_ptr != q->lock_ptr)) {
906 			spin_unlock(lock_ptr);
907 			goto retry;
908 		}
909 		__futex_unqueue(q);
910 
911 		BUG_ON(q->pi_state);
912 
913 		spin_unlock(lock_ptr);
914 		ret = 1;
915 	}
916 
917 	return ret;
918 }
919 
920 void futex_q_lockptr_lock(struct futex_q *q)
921 {
922 	spinlock_t *lock_ptr;
923 
924 	/*
925 	 * See futex_unqueue() why lock_ptr can change.
926 	 */
927 	guard(rcu)();
928 retry:
929 	lock_ptr = READ_ONCE(q->lock_ptr);
930 	spin_lock(lock_ptr);
931 
932 	if (unlikely(lock_ptr != q->lock_ptr)) {
933 		spin_unlock(lock_ptr);
934 		goto retry;
935 	}
936 }
937 
938 /*
939  * PI futexes can not be requeued and must remove themselves from the hash
940  * bucket. The hash bucket lock (i.e. lock_ptr) is held.
941  */
942 void futex_unqueue_pi(struct futex_q *q)
943 {
944 	/*
945 	 * If the lock was not acquired (due to timeout or signal) then the
946 	 * rt_waiter is removed before futex_q is. If this is observed by
947 	 * an unlocker after dropping the rtmutex wait lock and before
948 	 * acquiring the hash bucket lock, then the unlocker dequeues the
949 	 * futex_q from the hash bucket list to guarantee consistent state
950 	 * vs. userspace. Therefore the dequeue here must be conditional.
951 	 */
952 	if (!plist_node_empty(&q->list))
953 		__futex_unqueue(q);
954 
955 	BUG_ON(!q->pi_state);
956 	put_pi_state(q->pi_state);
957 	q->pi_state = NULL;
958 }
959 
960 /* Constants for the pending_op argument of handle_futex_death */
961 #define HANDLE_DEATH_PENDING	true
962 #define HANDLE_DEATH_LIST	false
963 
/*
 * Process a futex-list entry, check whether it's owned by the
 * dying task, and do notification if so.
 *
 * @uaddr:	user space address of the futex word
 * @curr:	task whose TID is compared against the futex owner field
 * @mod:	modifier bits of the list entry; FUTEX_ROBUST_MOD_PI marks
 *		the futex as a PI futex
 * @pending_op:	true if @uaddr comes from a pending list operation rather
 *		than from a regular list entry
 *
 * Returns 0 if the entry was handled or is not owned by @curr, -1 if
 * @uaddr is misaligned or cannot be read or faulted in writable, or the
 * unexpected error code of futex_cmpxchg_value_locked().
 */
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
			      unsigned int mod, bool pending_op)
{
	bool pi = !!(mod & FUTEX_ROBUST_MOD_PI);
	u32 uval, nval, mval;
	pid_t owner;
	int err;

	/* Futex address must be 32bit aligned */
	if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
		return -1;

retry:
	/* Re-read the futex word on every retry; user space may have changed it */
	if (get_user(uval, uaddr))
		return -1;

	/*
	 * Special case for regular (non PI) futexes. The unlock path in
	 * user space has two race scenarios:
	 *
	 * 1. The unlock path releases the user space futex value and
	 *    before it can execute the futex() syscall to wake up
	 *    waiters it is killed.
	 *
	 * 2. A woken up waiter is killed before it can acquire the
	 *    futex in user space.
	 *
	 * In the second case, the wake up notification could be generated
	 * by the unlock path in user space after setting the futex value
	 * to zero or by the kernel after setting the OWNER_DIED bit below.
	 *
	 * In both cases the TID validation below prevents a wakeup of
	 * potential waiters which can cause these waiters to block
	 * forever.
	 *
	 * In both cases the following conditions are met:
	 *
	 *	1) task->futex.robust_list->list_op_pending != NULL
	 *	   @pending_op == true
	 *	2) The owner part of user space futex value == 0
	 *	3) Regular futex: @pi == false
	 *
	 * If these conditions are met, it is safe to attempt waking up a
	 * potential waiter without touching the user space futex value and
	 * trying to set the OWNER_DIED bit. If the futex value is zero,
	 * the rest of the user space mutex state is consistent, so a woken
	 * waiter will just take over the uncontended futex. Setting the
	 * OWNER_DIED bit would create inconsistent state and malfunction
	 * of the user space owner died handling. Otherwise, the OWNER_DIED
	 * bit is already set, and the woken waiter is expected to deal with
	 * this.
	 */
	owner = uval & FUTEX_TID_MASK;

	if (pending_op && !pi && !owner) {
		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, NULL, 1,
			   FUTEX_BITSET_MATCH_ANY);
		return 0;
	}

	/* Not owned by the dying task: nothing to do for this entry */
	if (owner != task_pid_vnr(curr))
		return 0;

	/*
	 * Ok, this dying thread is truly holding a futex
	 * of interest. Set the OWNER_DIED bit atomically
	 * via cmpxchg, and if the value had FUTEX_WAITERS
	 * set, wake up a waiter (if any). (We have to do a
	 * futex_wake() even if OWNER_DIED is already set -
	 * to handle the rare but possible case of recursive
	 * thread-death.) The rest of the cleanup is done in
	 * userspace.
	 */
	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;

	/*
	 * We are not holding a lock here, but we want to have
	 * the pagefault_disable/enable() protection because
	 * we want to handle the fault gracefully. If the
	 * access fails we try to fault in the futex with R/W
	 * verification via get_user_pages. get_user() above
	 * does not guarantee R/W access. If that fails we
	 * give up and leave the futex locked.
	 */
	if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
		switch (err) {
		case -EFAULT:
			if (fault_in_user_writeable(uaddr))
				return -1;
			goto retry;

		case -EAGAIN:
			cond_resched();
			goto retry;

		default:
			WARN_ON_ONCE(1);
			return err;
		}
	}

	/* The value changed under us; start over with the new value */
	if (nval != uval)
		goto retry;

	/*
	 * Wake robust non-PI futexes here. The wakeup of
	 * PI futexes happens in exit_pi_state():
	 */
	if (!pi && (uval & FUTEX_WAITERS)) {
		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, NULL, 1,
			   FUTEX_BITSET_MATCH_ANY);
	}

	return 0;
}
1083 
1084 /*
1085  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
1086  */
1087 static inline int fetch_robust_entry(struct robust_list __user **entry,
1088 				     struct robust_list __user * __user *head,
1089 				     unsigned int *mod)
1090 {
1091 	unsigned long uentry;
1092 
1093 	if (get_user(uentry, (unsigned long __user *)head))
1094 		return -EFAULT;
1095 
1096 	*entry = (void __user *)(uentry & ~FUTEX_ROBUST_MOD_MASK);
1097 	*mod = uentry & FUTEX_ROBUST_MOD_MASK;
1098 
1099 	return 0;
1100 }
1101 
/*
 * Walk curr->futex.robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * At most ROBUST_LIST_LIMIT entries are processed. The pending entry is
 * skipped during the walk and handled once, after the walk.
 *
 * We silently return on any sign of list-walking problem.
 */
static void exit_robust_list(struct task_struct *curr)
{
	struct robust_list_head __user *head = curr->futex.robust_list;
	unsigned int limit = ROBUST_LIST_LIMIT, cur_mod, next_mod, pend_mod;
	struct robust_list __user *entry, *next_entry, *pending;
	unsigned long futex_offset;
	int rc;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (fetch_robust_entry(&entry, &head->list.next, &cur_mod))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (fetch_robust_entry(&pending, &head->list_op_pending, &pend_mod))
		return;

	next_entry = NULL;	/* avoid warning with gcc */
	/* The list is circular: it ends when it points back at the head */
	while (entry != &head->list) {
		/*
		 * Fetch the next entry in the list before calling
		 * handle_futex_death:
		 */
		rc = fetch_robust_entry(&next_entry, &entry->next, &next_mod);
		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice:
		 */
		if (entry != pending) {
			if (handle_futex_death((void __user *)entry + futex_offset,
						curr, cur_mod, HANDLE_DEATH_LIST))
				return;
		}
		/* The current entry was still handled; stop if the next fetch failed */
		if (rc)
			return;
		entry = next_entry;
		cur_mod = next_mod;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		cond_resched();
	}

	/* Handle the pending entry last; its result is intentionally ignored */
	if (pending) {
		handle_futex_death((void __user *)pending + futex_offset,
				   curr, pend_mod, HANDLE_DEATH_PENDING);
	}
}
1168 
1169 static bool robust_list_clear_pending(unsigned long __user *pop)
1170 {
1171 	struct robust_list_head __user *head = current->futex.robust_list;
1172 
1173 	if (!put_user(0UL, pop))
1174 		return true;
1175 
1176 	/*
1177 	 * Just give up. The robust list head is usually part of TLS, so the
1178 	 * chance that this gets resolved is close to zero.
1179 	 *
1180 	 * If @pop_addr is the robust_list_head::list_op_pending pointer then
1181 	 * clear the robust list head pointer to prevent further damage when the
1182 	 * task exits.  Better a few stale futexes than corrupted memory. But
1183 	 * that's mostly an academic exercise.
1184 	 */
1185 	if (pop == (unsigned long __user *)&head->list_op_pending)
1186 		current->futex.robust_list = NULL;
1187 	return false;
1188 }
1189 
1190 #ifdef CONFIG_COMPAT
1191 static void __user *futex_uaddr(struct robust_list __user *entry,
1192 				compat_long_t futex_offset)
1193 {
1194 	compat_uptr_t base = ptr_to_compat(entry);
1195 	void __user *uaddr = compat_ptr(base + futex_offset);
1196 
1197 	return uaddr;
1198 }
1199 
1200 /*
1201  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
1202  */
1203 static inline int
1204 compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
1205 		   compat_uptr_t __user *head, unsigned int *pflags)
1206 {
1207 	if (get_user(*uentry, head))
1208 		return -EFAULT;
1209 
1210 	*entry = compat_ptr((*uentry) & ~FUTEX_ROBUST_MOD_MASK);
1211 	*pflags = (unsigned int)(*uentry) & FUTEX_ROBUST_MOD_MASK;
1212 
1213 	return 0;
1214 }
1215 
1216 /*
1217  * Walk curr->futex.robust_list (very carefully, it's a userspace list!)
1218  * and mark any locks found there dead, and notify any waiters.
1219  *
1220  * We silently return on any sign of list-walking problem.
1221  */
1222 static void compat_exit_robust_list(struct task_struct *curr)
1223 {
1224 	struct compat_robust_list_head __user *head = current->futex.compat_robust_list;
1225 	unsigned int limit = ROBUST_LIST_LIMIT, cur_mod, next_mod, pend_mod;
1226 	struct robust_list __user *entry, *next_entry, *pending;
1227 	compat_uptr_t uentry, next_uentry, upending;
1228 	compat_long_t futex_offset;
1229 	int rc;
1230 
1231 	/*
1232 	 * Fetch the list head (which was registered earlier, via
1233 	 * sys_set_robust_list()):
1234 	 */
1235 	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &cur_mod))
1236 		return;
1237 	/*
1238 	 * Fetch the relative futex offset:
1239 	 */
1240 	if (get_user(futex_offset, &head->futex_offset))
1241 		return;
1242 	/*
1243 	 * Fetch any possibly pending lock-add first, and handle it
1244 	 * if it exists:
1245 	 */
1246 	if (compat_fetch_robust_entry(&upending, &pending, &head->list_op_pending, &pend_mod))
1247 		return;
1248 
1249 	next_entry = NULL;	/* avoid warning with gcc */
1250 	while (entry != (struct robust_list __user *) &head->list) {
1251 		/*
1252 		 * Fetch the next entry in the list before calling
1253 		 * handle_futex_death:
1254 		 */
1255 		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
1256 			(compat_uptr_t __user *)&entry->next, &next_mod);
1257 		/*
1258 		 * A pending lock might already be on the list, so
1259 		 * dont process it twice:
1260 		 */
1261 		if (entry != pending) {
1262 			void __user *uaddr = futex_uaddr(entry, futex_offset);
1263 
1264 			if (handle_futex_death(uaddr, curr, cur_mod, HANDLE_DEATH_LIST))
1265 				return;
1266 		}
1267 		if (rc)
1268 			return;
1269 		uentry = next_uentry;
1270 		entry = next_entry;
1271 		cur_mod = next_mod;
1272 		/*
1273 		 * Avoid excessively long or circular lists:
1274 		 */
1275 		if (!--limit)
1276 			break;
1277 
1278 		cond_resched();
1279 	}
1280 	if (pending) {
1281 		void __user *uaddr = futex_uaddr(pending, futex_offset);
1282 
1283 		handle_futex_death(uaddr, curr, pend_mod, HANDLE_DEATH_PENDING);
1284 	}
1285 }
1286 
1287 static bool compat_robust_list_clear_pending(u32 __user *pop)
1288 {
1289 	struct compat_robust_list_head __user *head = current->futex.compat_robust_list;
1290 
1291 	if (!put_user(0U, pop))
1292 		return true;
1293 
1294 	/* See comment in robust_list_clear_pending(). */
1295 	if (pop == &head->list_op_pending)
1296 		current->futex.compat_robust_list = NULL;
1297 	return false;
1298 }
1299 #else
1300 static bool compat_robust_list_clear_pending(u32 __user *pop_addr) { return false; }
1301 #endif
1302 
1303 #ifdef CONFIG_FUTEX_PI
1304 
/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 *
 * Empties curr->futex.pi_state_list: each pi_state is removed from the
 * list, its owner is cleared, the associated rt_mutex is unlocked and the
 * reference taken here is dropped again.
 */
static void exit_pi_state_list(struct task_struct *curr)
{
	struct list_head *next, *head = &curr->futex.pi_state_list;
	struct futex_pi_state *pi_state;
	union futex_key key = FUTEX_KEY_INIT;

	/*
	 * The mutex mm_struct::futex_hash_lock might be acquired.
	 */
	might_sleep();
	/*
	 * Ensure the hash remains stable (no resize) during the while loop
	 * below. The hb pointer is acquired under the pi_lock so we can't block
	 * on the mutex.
	 */
	WARN_ON(curr != current);
	guard(private_hash)(current->mm);
	/*
	 * We are a ZOMBIE and nobody can enqueue itself on
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
	raw_spin_lock_irq(&curr->pi_lock);
	/* Loop invariant: curr->pi_lock is held at the top of every iteration */
	while (!list_empty(head)) {
		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		key = pi_state->key;
		/* Extra scope so that the hbr class goes out of scope before the unlock below */
		if (1) {
			CLASS(hbr, hbr)(&key);
			auto hb = hbr.hb;

			/*
			 * We can race against put_pi_state() removing itself from the
			 * list (a waiter going away). put_pi_state() will first
			 * decrement the reference count and then modify the list, so
			 * its possible to see the list entry but fail this reference
			 * acquire.
			 *
			 * In that case; drop the locks to let put_pi_state() make
			 * progress and retry the loop.
			 */
			if (!refcount_inc_not_zero(&pi_state->refcount)) {
				raw_spin_unlock_irq(&curr->pi_lock);
				cpu_relax();
				raw_spin_lock_irq(&curr->pi_lock);
				continue;
			}
			raw_spin_unlock_irq(&curr->pi_lock);

			/* Lock order used here: hb->lock, then wait_lock, then pi_lock */
			spin_lock(&hb->lock);
			raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
			raw_spin_lock(&curr->pi_lock);
			/*
			 * We dropped the pi-lock, so re-check whether this
			 * task still owns the PI-state:
			 */
			if (head->next != next) {
				/* retain curr->pi_lock for the loop invariant */
				raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
				spin_unlock(&hb->lock);
				put_pi_state(pi_state);
				continue;
			}

			WARN_ON(pi_state->owner != curr);
			WARN_ON(list_empty(&pi_state->list));
			list_del_init(&pi_state->list);
			pi_state->owner = NULL;

			raw_spin_unlock(&curr->pi_lock);
			raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
			spin_unlock(&hb->lock);
		}

		/* All spinlocks are dropped here; release the rt_mutex and our reference */
		rt_mutex_futex_unlock(&pi_state->pi_mutex);
		put_pi_state(pi_state);

		raw_spin_lock_irq(&curr->pi_lock);
	}
	raw_spin_unlock_irq(&curr->pi_lock);
}
1391 #else
/* !CONFIG_FUTEX_PI stub: there is no PI state to clean up. */
static inline void exit_pi_state_list(struct task_struct *curr)
{
}
1393 #endif
1394 
1395 bool futex_robust_list_clear_pending(void __user *pop, unsigned int flags)
1396 {
1397 	bool size32bit = !!(flags & FLAGS_ROBUST_LIST32);
1398 
1399 	if (!IS_ENABLED(CONFIG_64BIT) && !size32bit)
1400 		return false;
1401 
1402 	if (IS_ENABLED(CONFIG_64BIT) && size32bit)
1403 		return compat_robust_list_clear_pending(pop);
1404 
1405 	return robust_list_clear_pending(pop);
1406 }
1407 
1408 #ifdef CONFIG_FUTEX_ROBUST_UNLOCK
1409 void __futex_fixup_robust_unlock(struct pt_regs *regs, struct futex_unlock_cs_range *csr)
1410 {
1411 	/*
1412 	 * arch_futex_robust_unlock_get_pop() returns the list pending op pointer from
1413 	 * @regs if the try_cmpxchg() succeeded.
1414 	 */
1415 	void __user *pop = arch_futex_robust_unlock_get_pop(regs);
1416 
1417 	if (!pop)
1418 		return;
1419 
1420 	futex_robust_list_clear_pending(pop, csr->pop_size32 ? FLAGS_ROBUST_LIST32 : 0);
1421 }
1422 #endif /* CONFIG_FUTEX_ROBUST_UNLOCK */
1423 
/*
 * Run the futex cleanup for @tsk: process and forget the native robust
 * list, then (with CONFIG_COMPAT) the compat robust list, and finally
 * clean up any PI state still queued on the task.
 */
static void futex_cleanup(struct task_struct *tsk)
{
	if (unlikely(tsk->futex.robust_list)) {
		exit_robust_list(tsk);
		/* Forget the list so it is not walked again */
		tsk->futex.robust_list = NULL;
	}

#ifdef CONFIG_COMPAT
	if (unlikely(tsk->futex.compat_robust_list)) {
		compat_exit_robust_list(tsk);
		tsk->futex.compat_robust_list = NULL;
	}
#endif

	if (unlikely(!list_empty(&tsk->futex.pi_state_list)))
		exit_pi_state_list(tsk);
}
1441 
/**
 * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
 * @tsk:	task to set the state on
 *
 * Set the futex exit state of the task lockless. The futex waiter code
 * observes that state when a task is exiting and loops until the task has
 * actually finished the futex cleanup. The worst case for this is that the
 * waiter runs through the wait loop until the state becomes visible.
 *
 * This is called from the recursive fault handling path in make_task_dead().
 *
 * This is best effort. Either the futex exit code has run already or
 * not. If the OWNER_DIED bit has been set on the futex then the waiter can
 * take it over. If not, the problem is pushed back to user space. If the
 * futex exit code did not run yet, then an already queued waiter might
 * block forever, but there is nothing which can be done about that.
 */
void futex_exit_recursive(struct task_struct *tsk)
{
	/* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
	if (tsk->futex.state == FUTEX_STATE_EXITING) {
		/*
		 * NOTE(review): __assume_ctx_lock() is presumably an annotation
		 * telling lock-context analysis that the mutex is held here;
		 * confirm against its definition.
		 */
		__assume_ctx_lock(&tsk->futex.exit_mutex);
		mutex_unlock(&tsk->futex.exit_mutex);
	}
	tsk->futex.state = FUTEX_STATE_DEAD;
}
1468 
/*
 * Start the futex cleanup of @tsk: take tsk->futex.exit_mutex and switch
 * the futex state to FUTEX_STATE_EXITING. The mutex is still held on
 * return; futex_cleanup_end() releases it.
 */
static void futex_cleanup_begin(struct task_struct *tsk)
	__acquires(&tsk->futex.exit_mutex)
{
	/*
	 * Prevent various race issues against a concurrent incoming waiter
	 * including live locks by forcing the waiter to block on
	 * tsk->futex.exit_mutex when it observes FUTEX_STATE_EXITING in
	 * attach_to_pi_owner().
	 */
	mutex_lock(&tsk->futex.exit_mutex);

	/*
	 * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
	 *
	 * This ensures that all subsequent checks of tsk->futex.state in
	 * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
	 * tsk->pi_lock held.
	 *
	 * It guarantees also that a pi_state which was queued right before
	 * the state change under tsk->pi_lock by a concurrent waiter must
	 * be observed in exit_pi_state_list().
	 */
	raw_spin_lock_irq(&tsk->pi_lock);
	tsk->futex.state = FUTEX_STATE_EXITING;
	raw_spin_unlock_irq(&tsk->pi_lock);
}
1495 
/*
 * Finish the futex cleanup of @tsk: store the final futex @state and
 * release tsk->futex.exit_mutex, which the caller must hold.
 */
static void futex_cleanup_end(struct task_struct *tsk, int state)
	__releases(&tsk->futex.exit_mutex)
{
	/*
	 * Lockless store. The only side effect is that an observer might
	 * take another loop until it becomes visible.
	 */
	tsk->futex.state = state;
	/*
	 * Drop the exit protection. This unblocks waiters which observed
	 * FUTEX_STATE_EXITING to reevaluate the state.
	 */
	mutex_unlock(&tsk->futex.exit_mutex);
}
1510 
/*
 * Futex cleanup for exec(): run the full cleanup for @tsk and leave the
 * futex state at FUTEX_STATE_OK afterwards.
 */
void futex_exec_release(struct task_struct *tsk)
{
	/*
	 * The state handling is done for consistency, but in the case of
	 * exec() there is no way to prevent further damage as the PID stays
	 * the same. But for the unlikely and arguably buggy case that a
	 * futex is held on exec(), this provides at least as much state
	 * consistency protection which is possible.
	 */
	futex_cleanup_begin(tsk);
	futex_cleanup(tsk);
	/*
	 * Reset the state to FUTEX_STATE_OK. The task is alive and about
	 * exec a new binary.
	 */
	futex_cleanup_end(tsk, FUTEX_STATE_OK);
}
1528 
/*
 * Futex cleanup for an exiting task: run the full cleanup for @tsk and
 * leave the futex state at FUTEX_STATE_DEAD afterwards.
 */
void futex_exit_release(struct task_struct *tsk)
{
	futex_cleanup_begin(tsk);
	futex_cleanup(tsk);
	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}
1535 
1536 static void futex_hash_bucket_init(struct futex_hash_bucket *fhb)
1537 {
1538 	atomic_set(&fhb->waiters, 0);
1539 	plist_head_init(&fhb->chain);
1540 	spin_lock_init(&fhb->lock);
1541 }
1542 
/* futex_hash_allocate() flag: mark the new private hash as custom. */
#define FH_CUSTOM	0x01
1544 
1545 #ifdef CONFIG_FUTEX_PRIVATE_HASH
1546 
1547 /*
1548  * futex-ref
1549  *
1550  * Heavily inspired by percpu-rwsem/percpu-refcount; not reusing any of that
1551  * code because it just doesn't fit right.
1552  *
1553  * Dual counter, per-cpu / atomic approach like percpu-refcount, except it
1554  * re-initializes the state automatically, such that the fph swizzle is also a
1555  * transition back to per-cpu.
1556  */
1557 
1558 static void futex_ref_rcu(struct rcu_head *head);
1559 
/*
 * Start switching the reference count of @fph over to the atomic counter:
 * bias the atomic, publish FR_ATOMIC and queue futex_ref_rcu() to run
 * after a grace period.
 */
static void __futex_ref_atomic_begin(struct futex_private_hash *fph)
{
	struct mm_struct *mm = fph->mm;

	/*
	 * The counter we're about to switch to must have fully switched;
	 * otherwise it would be impossible for it to have reported success
	 * from futex_ref_is_dead().
	 */
	WARN_ON_ONCE(atomic_long_read(&mm->futex.phash.atomic) != 0);

	/*
	 * Set the atomic to the bias value such that futex_ref_{get,put}()
	 * will never observe 0. Will be fixed up in __futex_ref_atomic_end()
	 * when folding in the percpu count.
	 */
	atomic_long_set(&mm->futex.phash.atomic, LONG_MAX);
	/* Release store: the biased atomic is set before the state changes */
	smp_store_release(&fph->state, FR_ATOMIC);

	call_rcu_hurry(&mm->futex.phash.rcu, futex_ref_rcu);
}
1581 
/*
 * Complete the switch to the atomic counter for @fph: fold the per-CPU
 * counts into the atomic, remove the bias and the initial reference, and
 * wake waiters on the mm if the count dropped to zero.
 */
static void __futex_ref_atomic_end(struct futex_private_hash *fph)
{
	struct mm_struct *mm = fph->mm;
	unsigned int count = 0;
	long ret;
	int cpu;

	/*
	 * Per __futex_ref_atomic_begin() the state of the fph must be ATOMIC
	 * and per this RCU callback, everybody must now observe this state and
	 * use the atomic variable.
	 */
	WARN_ON_ONCE(fph->state != FR_ATOMIC);

	/*
	 * Therefore the per-cpu counter is now stable, sum and reset.
	 */
	for_each_possible_cpu(cpu) {
		unsigned int *ptr = per_cpu_ptr(mm->futex.phash.ref, cpu);
		count += *ptr;
		*ptr = 0;
	}

	/*
	 * Re-init for the next cycle.
	 */
	this_cpu_inc(*mm->futex.phash.ref); /* 0 -> 1 */

	/*
	 * Add actual count, subtract bias and initial refcount.
	 *
	 * The moment this atomic operation happens, futex_ref_is_dead() can
	 * become true.
	 */
	ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex.phash.atomic);
	if (!ret)
		wake_up_var(mm);

	WARN_ON_ONCE(ret < 0);
	/* NOTE(review): presumably pairs with the mmget() in futex_ref_drop() */
	mmput_async(mm);
}
1623 
1624 static void futex_ref_rcu(struct rcu_head *head)
1625 {
1626 	struct mm_struct *mm = container_of(head, struct mm_struct, futex.phash.rcu);
1627 	struct futex_private_hash *fph = rcu_dereference_raw(mm->futex.phash.hash);
1628 
1629 	if (fph->state == FR_PERCPU) {
1630 		/*
1631 		 * Per this extra grace-period, everybody must now observe
1632 		 * fph as the current fph and no previously observed fph's
1633 		 * are in-flight.
1634 		 *
1635 		 * Notably, nobody will now rely on the atomic
1636 		 * futex_ref_is_dead() state anymore so we can begin the
1637 		 * migration of the per-cpu counter into the atomic.
1638 		 */
1639 		__futex_ref_atomic_begin(fph);
1640 		return;
1641 	}
1642 
1643 	__futex_ref_atomic_end(fph);
1644 }
1645 
/*
 * Drop the initial refcount and transition to atomics.
 *
 * Takes a reference on the mm and either starts the transition right
 * away, if a grace period has elapsed since the recorded RCU state, or
 * defers the start to the futex_ref_rcu() callback.
 */
static void futex_ref_drop(struct futex_private_hash *fph)
{
	struct mm_struct *mm = fph->mm;

	/*
	 * Can only transition the current fph;
	 */
	WARN_ON_ONCE(rcu_dereference_raw(mm->futex.phash.hash) != fph);
	/*
	 * We enqueue at least one RCU callback. Ensure mm stays if the task
	 * exits before the transition is completed.
	 */
	mmget(mm);

	/*
	 * In order to avoid the following scenario:
	 *
	 * futex_hash()			__futex_pivot_hash()
	 *   guard(rcu);		  guard(mm->futex.phash.lock);
	 *   fph = mm->futex.phash.hash;
	 *				  rcu_assign_pointer(&mm->futex.phash.hash, new);
	 *				futex_hash_allocate()
	 *				  futex_ref_drop()
	 *				    fph->state = FR_ATOMIC;
	 *				    atomic_set(, BIAS);
	 *
	 *   futex_private_hash_get(fph); // OOPS
	 *
	 * Where an old fph (which is FR_ATOMIC) and should fail on
	 * inc_not_zero, will succeed because a new transition is started and
	 * the atomic is bias'ed away from 0.
	 *
	 * There must be at least one full grace-period between publishing a
	 * new fph and trying to replace it.
	 */
	if (poll_state_synchronize_rcu(mm->futex.phash.batches)) {
		/*
		 * There was a grace-period, we can begin now.
		 */
		__futex_ref_atomic_begin(fph);
		return;
	}

	/* No grace period yet: let the RCU callback start the transition */
	call_rcu_hurry(&mm->futex.phash.rcu, futex_ref_rcu);
}
1694 
1695 static bool futex_ref_get(struct futex_private_hash *fph)
1696 {
1697 	struct mm_struct *mm = fph->mm;
1698 
1699 	guard(preempt)();
1700 
1701 	if (READ_ONCE(fph->state) == FR_PERCPU) {
1702 		__this_cpu_inc(*mm->futex.phash.ref);
1703 		return true;
1704 	}
1705 
1706 	return atomic_long_inc_not_zero(&mm->futex.phash.atomic);
1707 }
1708 
1709 static bool futex_ref_put(struct futex_private_hash *fph)
1710 {
1711 	struct mm_struct *mm = fph->mm;
1712 
1713 	guard(preempt)();
1714 
1715 	if (READ_ONCE(fph->state) == FR_PERCPU) {
1716 		__this_cpu_dec(*mm->futex.phash.ref);
1717 		return false;
1718 	}
1719 
1720 	return atomic_long_dec_and_test(&mm->futex.phash.atomic);
1721 }
1722 
1723 static bool futex_ref_is_dead(struct futex_private_hash *fph)
1724 {
1725 	struct mm_struct *mm = fph->mm;
1726 
1727 	guard(rcu)();
1728 
1729 	if (smp_load_acquire(&fph->state) == FR_PERCPU)
1730 		return false;
1731 
1732 	return atomic_long_read(&mm->futex.phash.atomic) == 0;
1733 }
1734 
/*
 * Initialize the private hash state in @fd: zero the whole phash
 * structure, initialize its mutex and record the current RCU grace
 * period state, which futex_ref_drop() polls later.
 */
static void futex_hash_init_mm(struct futex_mm_data *fd)
{
	/* Must come first: it also wipes the lock and batches fields */
	memset(&fd->phash, 0, sizeof(fd->phash));
	mutex_init(&fd->phash.lock);
	fd->phash.batches = get_state_synchronize_rcu();
}
1741 
1742 void futex_hash_free(struct mm_struct *mm)
1743 {
1744 	struct futex_private_hash *fph;
1745 
1746 	free_percpu(mm->futex.phash.ref);
1747 	kvfree(mm->futex.phash.hash_new);
1748 	fph = rcu_dereference_raw(mm->futex.phash.hash);
1749 	if (fph)
1750 		kvfree(fph);
1751 }
1752 
1753 static bool futex_pivot_pending(struct mm_struct *mm)
1754 {
1755 	struct futex_private_hash *fph;
1756 
1757 	guard(rcu)();
1758 
1759 	if (!mm->futex.phash.hash_new)
1760 		return true;
1761 
1762 	fph = rcu_dereference(mm->futex.phash.hash);
1763 	return futex_ref_is_dead(fph);
1764 }
1765 
1766 static bool futex_hash_less(struct futex_private_hash *a,
1767 			    struct futex_private_hash *b)
1768 {
1769 	/* user provided always wins */
1770 	if (!a->custom && b->custom)
1771 		return true;
1772 	if (a->custom && !b->custom)
1773 		return false;
1774 
1775 	/* zero-sized hash wins */
1776 	if (!b->hash_mask)
1777 		return true;
1778 	if (!a->hash_mask)
1779 		return false;
1780 
1781 	/* keep the biggest */
1782 	if (a->hash_mask < b->hash_mask)
1783 		return true;
1784 	if (a->hash_mask > b->hash_mask)
1785 		return false;
1786 
1787 	return false; /* equal */
1788 }
1789 
/*
 * Allocate a private futex hash for current->mm and try to install it.
 *
 * @hash_slots:	number of buckets; must be 0 or a power of two larger
 *		than 1
 * @flags:	FH_CUSTOM marks the hash as custom and makes this function
 *		wait and retry until the new hash could be installed
 *
 * Returns 0 on success (including when there is nothing to do), -EINVAL
 * for an invalid @hash_slots, -ENOMEM if an allocation failed, and -EBUSY
 * if a zero-sized hash is already installed (only reported to FH_CUSTOM
 * callers on the early check).
 */
static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
{
	struct mm_struct *mm = current->mm;
	struct futex_private_hash *fph;
	bool custom = flags & FH_CUSTOM;
	int i;

	if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots)))
		return -EINVAL;

	/*
	 * Once we've disabled the global hash there is no way back.
	 */
	scoped_guard(rcu) {
		fph = rcu_dereference(mm->futex.phash.hash);
		if (fph && !fph->hash_mask) {
			if (custom)
				return -EBUSY;
			return 0;
		}
	}

	if (!mm->futex.phash.ref) {
		/*
		 * This will always be allocated by the first thread and
		 * therefore requires no locking.
		 */
		mm->futex.phash.ref = alloc_percpu(unsigned int);
		if (!mm->futex.phash.ref)
			return -ENOMEM;
		this_cpu_inc(*mm->futex.phash.ref); /* 0 -> 1 */
	}

	fph = kvzalloc(struct_size(fph, queues, hash_slots),
		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
	if (!fph)
		return -ENOMEM;

	fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
	fph->custom = custom;
	fph->mm = mm;

	for (i = 0; i < hash_slots; i++)
		futex_hash_bucket_init(&fph->queues[i]);

	if (custom) {
		/*
		 * Only let prctl() wait / retry; don't unduly delay clone().
		 */
again:
		wait_var_event(mm, futex_pivot_pending(mm));
	}

	scoped_guard(mutex, &mm->futex.phash.lock) {
		/* Whatever ends up in @free is released when this scope is left */
		struct futex_private_hash *free __free(kvfree) = NULL;
		struct futex_private_hash *cur, *new;

		cur = rcu_dereference_protected(mm->futex.phash.hash,
						lockdep_is_held(&mm->futex.phash.lock));
		/* Take over a queued replacement, if any */
		new = mm->futex.phash.hash_new;
		mm->futex.phash.hash_new = NULL;

		/* @fph is NULL here when coming back via the "again" retry */
		if (fph) {
			if (cur && !cur->hash_mask) {
				/*
				 * If two threads simultaneously request the global
				 * hash then the first one performs the switch,
				 * the second one returns here.
				 */
				free = fph;
				mm->futex.phash.hash_new = new;
				return -EBUSY;
			}
			if (cur && !new) {
				/*
				 * If we have an existing hash, but do not yet have
				 * allocated a replacement hash, drop the initial
				 * reference on the existing hash.
				 */
				futex_ref_drop(cur);
			}

			if (new) {
				/*
				 * Two updates raced; throw out the lesser one.
				 */
				if (futex_hash_less(new, fph)) {
					free = new;
					new = fph;
				} else {
					free = fph;
				}
			} else {
				new = fph;
			}
			fph = NULL;
		}

		if (new) {
			/*
			 * Will set mm->futex.phash.hash_new on failure;
			 * futex_private_hash_get() will try again.
			 */
			if (!__futex_pivot_hash(mm, new) && custom)
				goto again;
		}
	}
	return 0;
}
1899 
1900 int futex_hash_allocate_default(void)
1901 {
1902 	unsigned int threads, buckets, current_buckets = 0;
1903 	struct futex_private_hash *fph;
1904 
1905 	if (!current->mm)
1906 		return 0;
1907 
1908 	scoped_guard(rcu) {
1909 		threads = min_t(unsigned int, get_nr_threads(current), num_online_cpus());
1910 
1911 		fph = rcu_dereference(current->mm->futex.phash.hash);
1912 		if (fph) {
1913 			if (fph->custom)
1914 				return 0;
1915 
1916 			current_buckets = fph->hash_mask + 1;
1917 		}
1918 	}
1919 
1920 	/*
1921 	 * The default allocation will remain within
1922 	 *   16 <= threads * 4 <= global hash size
1923 	 */
1924 	buckets = roundup_pow_of_two(4 * threads);
1925 	buckets = clamp(buckets, 16, futex_hashmask + 1);
1926 
1927 	if (current_buckets >= buckets)
1928 		return 0;
1929 
1930 	return futex_hash_allocate(buckets, 0);
1931 }
1932 
1933 static int futex_hash_get_slots(void)
1934 {
1935 	struct futex_private_hash *fph;
1936 
1937 	guard(rcu)();
1938 	fph = rcu_dereference(current->mm->futex.phash.hash);
1939 	if (fph && fph->hash_mask)
1940 		return fph->hash_mask + 1;
1941 	return 0;
1942 }
1943 #else  /* CONFIG_FUTEX_PRIVATE_HASH */
/*
 * !CONFIG_FUTEX_PRIVATE_HASH stubs: allocation requests are rejected with
 * -EINVAL, the slot count is always 0 and there is no state to initialize.
 */
static inline int futex_hash_allocate(unsigned int hslots, unsigned int flags) { return -EINVAL; }
static inline int futex_hash_get_slots(void) { return 0; }
static inline void futex_hash_init_mm(struct futex_mm_data *fd) { }
1947 #endif /* !CONFIG_FUTEX_PRIVATE_HASH */
1948 
1949 #ifdef CONFIG_FUTEX_ROBUST_UNLOCK
1950 static void futex_invalidate_cs_ranges(struct futex_mm_data *fd)
1951 {
1952 	/*
1953 	 * Invalidate start_ip so that the quick check fails for ip >= start_ip
1954 	 * if VDSO is not mapped or the second slot is not available for compat
1955 	 * tasks as they use VDSO32 which does not provide the 64-bit pointer
1956 	 * variant.
1957 	 */
1958 	for (int i = 0; i < FUTEX_ROBUST_MAX_CS_RANGES; i++)
1959 		fd->unlock.cs_ranges[i].start_ip = ~0UL;
1960 }
1961 
/*
 * Reset all critical section ranges in @fd: zero the whole array, then
 * mark every range invalid again.
 */
void futex_reset_cs_ranges(struct futex_mm_data *fd)
{
	memset(fd->unlock.cs_ranges, 0, sizeof(fd->unlock.cs_ranges));
	/* Must follow the memset(): a zeroed start_ip does not count as invalid */
	futex_invalidate_cs_ranges(fd);
}
1967 
1968 static void futex_robust_unlock_init_mm(struct futex_mm_data *fd)
1969 {
1970 	/* mm_dup() preserves the range, mm_alloc() clears it */
1971 	if (!fd->unlock.cs_ranges[0].start_ip)
1972 		futex_invalidate_cs_ranges(fd);
1973 }
1974 #else  /* CONFIG_FUTEX_ROBUST_UNLOCK */
/* !CONFIG_FUTEX_ROBUST_UNLOCK stub: there are no ranges to initialize. */
static inline void futex_robust_unlock_init_mm(struct futex_mm_data *fd)
{
}
1976 #endif /* !CONFIG_FUTEX_ROBUST_UNLOCK */
1977 
1978 #if defined(CONFIG_FUTEX_PRIVATE_HASH) || defined(CONFIG_FUTEX_ROBUST_UNLOCK)
/*
 * Initialize the futex related state of @mm: the private hash data and
 * the robust unlock critical section ranges. Each of the two helpers is
 * an empty stub when its config option is disabled.
 */
void futex_mm_init(struct mm_struct *mm)
{
	futex_hash_init_mm(&mm->futex);
	futex_robust_unlock_init_mm(&mm->futex);
}
1984 #endif
1985 
1986 int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
1987 {
1988 	unsigned int flags = FH_CUSTOM;
1989 	int ret;
1990 
1991 	switch (arg2) {
1992 	case PR_FUTEX_HASH_SET_SLOTS:
1993 		if (arg4)
1994 			return -EINVAL;
1995 		ret = futex_hash_allocate(arg3, flags);
1996 		break;
1997 
1998 	case PR_FUTEX_HASH_GET_SLOTS:
1999 		ret = futex_hash_get_slots();
2000 		break;
2001 
2002 	default:
2003 		ret = -EINVAL;
2004 		break;
2005 	}
2006 	return ret;
2007 }
2008 
/*
 * Boot time setup of the global futex hash: compute the table size,
 * allocate and initialize one table per node and publish the hash shift
 * and mask. Runs as a core_initcall and always returns 0; an allocation
 * failure is fatal (BUG_ON).
 */
static int __init futex_init(void)
{
	unsigned long hashsize, i;
	unsigned int order, n;
	unsigned long size;

#ifdef CONFIG_BASE_SMALL
	hashsize = 16;
#else
	/* 256 buckets per possible CPU, split across the possible nodes */
	hashsize = 256 * num_possible_cpus();
	hashsize /= num_possible_nodes();
	hashsize = max(4, hashsize);
	hashsize = roundup_pow_of_two(hashsize);
#endif
	futex_hashshift = ilog2(hashsize);
	size = sizeof(struct futex_hash_bucket) * hashsize;
	order = get_order(size);

	for_each_node(n) {
		struct futex_hash_bucket *table;

		/* Tables above MAX_PAGE_ORDER are allocated with vmalloc */
		if (order > MAX_PAGE_ORDER)
			table = vmalloc_huge_node(size, GFP_KERNEL, n);
		else
			table = alloc_pages_exact_nid(n, size, GFP_KERNEL);

		BUG_ON(!table);

		for (i = 0; i < hashsize; i++)
			futex_hash_bucket_init(&table[i]);

		/* Publish the table only after all buckets are initialized */
		futex_queues[n] = table;
	}

	futex_hashmask = hashsize - 1;
	pr_info("futex hash table entries: %lu (%lu bytes on %d NUMA nodes, total %lu KiB, %s).\n",
		hashsize, size, num_possible_nodes(), size * num_possible_nodes() / 1024,
		order > MAX_PAGE_ORDER ? "vmalloc" : "linear");
	return 0;
}
core_initcall(futex_init);
2050