xref: /linux/kernel/futex/core.c (revision 7c4f75a21f636486d2969d9b6680403ea8483539)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *  Fast Userspace Mutexes (which I call "Futexes!").
4  *  (C) Rusty Russell, IBM 2002
5  *
6  *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
7  *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
8  *
9  *  Removed page pinning, fix privately mapped COW pages and other cleanups
10  *  (C) Copyright 2003, 2004 Jamie Lokier
11  *
12  *  Robust futex support started by Ingo Molnar
13  *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
14  *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
15  *
16  *  PI-futex support started by Ingo Molnar and Thomas Gleixner
17  *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
18  *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
19  *
20  *  PRIVATE futexes by Eric Dumazet
21  *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
22  *
23  *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
24  *  Copyright (C) IBM Corporation, 2009
25  *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
26  *
27  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
28  *  enough at me, Linus for the original (flawed) idea, Matthew
29  *  Kirkwood for proof-of-concept implementation.
30  *
31  *  "The futexes are also cursed."
32  *  "But they come in a choice of three flavours!"
33  */
34 #include <linux/compat.h>
35 #include <linux/jhash.h>
36 #include <linux/pagemap.h>
37 #include <linux/debugfs.h>
38 #include <linux/plist.h>
39 #include <linux/memblock.h>
40 #include <linux/fault-inject.h>
41 #include <linux/slab.h>
42 #include <linux/prctl.h>
43 
44 #include "futex.h"
45 #include "../locking/rtmutex_common.h"
46 
47 /*
48  * The base of the bucket array and its size are always used together
49  * (after initialization only in futex_hash()), so ensure that they
50  * reside in the same cacheline.
51  */
52 static struct {
53 	struct futex_hash_bucket *queues;
54 	unsigned long            hashmask;
55 } __futex_data __read_mostly __aligned(2*sizeof(long));
56 #define futex_queues   (__futex_data.queues)
57 #define futex_hashmask (__futex_data.hashmask)
58 
59 struct futex_private_hash {
60 	unsigned int	hash_mask;
61 	void		*mm;
62 	bool		custom;
63 	struct futex_hash_bucket queues[];
64 };
65 
66 /*
67  * Fault injections for futexes.
68  */
69 #ifdef CONFIG_FAIL_FUTEX
70 
71 static struct {
72 	struct fault_attr attr;
73 
74 	bool ignore_private;
75 } fail_futex = {
76 	.attr = FAULT_ATTR_INITIALIZER,
77 	.ignore_private = false,
78 };
79 
80 static int __init setup_fail_futex(char *str)
81 {
82 	return setup_fault_attr(&fail_futex.attr, str);
83 }
84 __setup("fail_futex=", setup_fail_futex);
85 
86 bool should_fail_futex(bool fshared)
87 {
88 	if (fail_futex.ignore_private && !fshared)
89 		return false;
90 
91 	return should_fail(&fail_futex.attr, 1);
92 }
93 
94 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
95 
96 static int __init fail_futex_debugfs(void)
97 {
98 	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
99 	struct dentry *dir;
100 
101 	dir = fault_create_debugfs_attr("fail_futex", NULL,
102 					&fail_futex.attr);
103 	if (IS_ERR(dir))
104 		return PTR_ERR(dir);
105 
106 	debugfs_create_bool("ignore-private", mode, dir,
107 			    &fail_futex.ignore_private);
108 	return 0;
109 }
110 
111 late_initcall(fail_futex_debugfs);
112 
113 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
114 
115 #endif /* CONFIG_FAIL_FUTEX */
116 
117 static struct futex_hash_bucket *
118 __futex_hash(union futex_key *key, struct futex_private_hash *fph);
119 
120 #ifdef CONFIG_FUTEX_PRIVATE_HASH
121 static inline bool futex_key_is_private(union futex_key *key)
122 {
123 	/*
124 	 * Relies on get_futex_key() to set either bit for shared
125 	 * futexes -- see comment with union futex_key.
126 	 */
127 	return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
128 }
129 
130 bool futex_private_hash_get(struct futex_private_hash *fph)
131 {
132 	return false;
133 }
134 
135 void futex_private_hash_put(struct futex_private_hash *fph)
136 {
137 }
138 
139 /**
140  * futex_hash_get - Get an additional reference for the local hash.
141  * @hb:                    ptr to the private local hash.
142  *
143  * Obtain an additional reference for the already obtained hash bucket. The
144  * caller must already own an reference.
145  */
146 void futex_hash_get(struct futex_hash_bucket *hb) { }
147 void futex_hash_put(struct futex_hash_bucket *hb) { }
148 
149 static struct futex_hash_bucket *
150 __futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
151 {
152 	u32 hash;
153 
154 	if (!futex_key_is_private(key))
155 		return NULL;
156 
157 	if (!fph)
158 		fph = key->private.mm->futex_phash;
159 	if (!fph || !fph->hash_mask)
160 		return NULL;
161 
162 	hash = jhash2((void *)&key->private.address,
163 		      sizeof(key->private.address) / 4,
164 		      key->both.offset);
165 	return &fph->queues[hash & fph->hash_mask];
166 }
167 
168 struct futex_private_hash *futex_private_hash(void)
169 {
170 	struct mm_struct *mm = current->mm;
171 	struct futex_private_hash *fph;
172 
173 	fph = mm->futex_phash;
174 	return fph;
175 }
176 
177 struct futex_hash_bucket *futex_hash(union futex_key *key)
178 {
179 	struct futex_hash_bucket *hb;
180 
181 	hb = __futex_hash(key, NULL);
182 	return hb;
183 }
184 
185 #else /* !CONFIG_FUTEX_PRIVATE_HASH */
186 
187 static struct futex_hash_bucket *
188 __futex_hash_private(union futex_key *key, struct futex_private_hash *fph)
189 {
190 	return NULL;
191 }
192 
193 struct futex_hash_bucket *futex_hash(union futex_key *key)
194 {
195 	return __futex_hash(key, NULL);
196 }
197 
198 #endif /* CONFIG_FUTEX_PRIVATE_HASH */
199 
200 /**
201  * __futex_hash - Return the hash bucket
202  * @key:	Pointer to the futex key for which the hash is calculated
203  * @fph:	Pointer to private hash if known
204  *
205  * We hash on the keys returned from get_futex_key (see below) and return the
206  * corresponding hash bucket.
207  * If the FUTEX is PROCESS_PRIVATE then a per-process hash bucket (from the
208  * private hash) is returned if existing. Otherwise a hash bucket from the
209  * global hash is returned.
210  */
211 static struct futex_hash_bucket *
212 __futex_hash(union futex_key *key, struct futex_private_hash *fph)
213 {
214 	struct futex_hash_bucket *hb;
215 	u32 hash;
216 
217 	hb = __futex_hash_private(key, fph);
218 	if (hb)
219 		return hb;
220 
221 	hash = jhash2((u32 *)key,
222 		      offsetof(typeof(*key), both.offset) / 4,
223 		      key->both.offset);
224 	return &futex_queues[hash & futex_hashmask];
225 }
226 
227 /**
228  * futex_setup_timer - set up the sleeping hrtimer.
229  * @time:	ptr to the given timeout value
230  * @timeout:	the hrtimer_sleeper structure to be set up
231  * @flags:	futex flags
232  * @range_ns:	optional range in ns
233  *
234  * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
235  *	   value given
236  */
237 struct hrtimer_sleeper *
238 futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
239 		  int flags, u64 range_ns)
240 {
241 	if (!time)
242 		return NULL;
243 
244 	hrtimer_setup_sleeper_on_stack(timeout,
245 				       (flags & FLAGS_CLOCKRT) ? CLOCK_REALTIME : CLOCK_MONOTONIC,
246 				       HRTIMER_MODE_ABS);
247 	/*
248 	 * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
249 	 * effectively the same as calling hrtimer_set_expires().
250 	 */
251 	hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
252 
253 	return timeout;
254 }
255 
256 /*
257  * Generate a machine wide unique identifier for this inode.
258  *
259  * This relies on u64 not wrapping in the life-time of the machine; which with
260  * 1ns resolution means almost 585 years.
261  *
262  * This further relies on the fact that a well formed program will not unmap
263  * the file while it has a (shared) futex waiting on it. This mapping will have
264  * a file reference which pins the mount and inode.
265  *
266  * If for some reason an inode gets evicted and read back in again, it will get
267  * a new sequence number and will _NOT_ match, even though it is the exact same
268  * file.
269  *
270  * It is important that futex_match() will never have a false-positive, esp.
271  * for PI futexes that can mess up the state. The above argues that false-negatives
272  * are only possible for malformed programs.
273  */
274 static u64 get_inode_sequence_number(struct inode *inode)
275 {
276 	static atomic64_t i_seq;
277 	u64 old;
278 
279 	/* Does the inode already have a sequence number? */
280 	old = atomic64_read(&inode->i_sequence);
281 	if (likely(old))
282 		return old;
283 
284 	for (;;) {
285 		u64 new = atomic64_inc_return(&i_seq);
286 		if (WARN_ON_ONCE(!new))
287 			continue;
288 
289 		old = 0;
290 		if (!atomic64_try_cmpxchg_relaxed(&inode->i_sequence, &old, new))
291 			return old;
292 		return new;
293 	}
294 }
295 
296 /**
297  * get_futex_key() - Get parameters which are the keys for a futex
298  * @uaddr:	virtual address of the futex
299  * @flags:	FLAGS_*
300  * @key:	address where result is stored.
301  * @rw:		mapping needs to be read/write (values: FUTEX_READ,
302  *              FUTEX_WRITE)
303  *
304  * Return: a negative error code or 0
305  *
306  * The key words are stored in @key on success.
307  *
308  * For shared mappings (when @fshared), the key is:
309  *
310  *   ( inode->i_sequence, page->index, offset_within_page )
311  *
312  * [ also see get_inode_sequence_number() ]
313  *
314  * For private mappings (or when !@fshared), the key is:
315  *
316  *   ( current->mm, address, 0 )
317  *
318  * This allows (cross process, where applicable) identification of the futex
319  * without keeping the page pinned for the duration of the FUTEX_WAIT.
320  *
321  * lock_page() might sleep, the caller should not hold a spinlock.
322  */
323 int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
324 		  enum futex_access rw)
325 {
326 	unsigned long address = (unsigned long)uaddr;
327 	struct mm_struct *mm = current->mm;
328 	struct page *page;
329 	struct folio *folio;
330 	struct address_space *mapping;
331 	int err, ro = 0;
332 	bool fshared;
333 
334 	fshared = flags & FLAGS_SHARED;
335 
336 	/*
337 	 * The futex address must be "naturally" aligned.
338 	 */
339 	key->both.offset = address % PAGE_SIZE;
340 	if (unlikely((address % sizeof(u32)) != 0))
341 		return -EINVAL;
342 	address -= key->both.offset;
343 
344 	if (unlikely(!access_ok(uaddr, sizeof(u32))))
345 		return -EFAULT;
346 
347 	if (unlikely(should_fail_futex(fshared)))
348 		return -EFAULT;
349 
350 	/*
351 	 * PROCESS_PRIVATE futexes are fast.
352 	 * As the mm cannot disappear under us and the 'key' only needs
353 	 * virtual address, we dont even have to find the underlying vma.
354 	 * Note : We do have to check 'uaddr' is a valid user address,
355 	 *        but access_ok() should be faster than find_vma()
356 	 */
357 	if (!fshared) {
358 		/*
359 		 * On no-MMU, shared futexes are treated as private, therefore
360 		 * we must not include the current process in the key. Since
361 		 * there is only one address space, the address is a unique key
362 		 * on its own.
363 		 */
364 		if (IS_ENABLED(CONFIG_MMU))
365 			key->private.mm = mm;
366 		else
367 			key->private.mm = NULL;
368 
369 		key->private.address = address;
370 		return 0;
371 	}
372 
373 again:
374 	/* Ignore any VERIFY_READ mapping (futex common case) */
375 	if (unlikely(should_fail_futex(true)))
376 		return -EFAULT;
377 
378 	err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
379 	/*
380 	 * If write access is not required (eg. FUTEX_WAIT), try
381 	 * and get read-only access.
382 	 */
383 	if (err == -EFAULT && rw == FUTEX_READ) {
384 		err = get_user_pages_fast(address, 1, 0, &page);
385 		ro = 1;
386 	}
387 	if (err < 0)
388 		return err;
389 	else
390 		err = 0;
391 
392 	/*
393 	 * The treatment of mapping from this point on is critical. The folio
394 	 * lock protects many things but in this context the folio lock
395 	 * stabilizes mapping, prevents inode freeing in the shared
396 	 * file-backed region case and guards against movement to swap cache.
397 	 *
398 	 * Strictly speaking the folio lock is not needed in all cases being
399 	 * considered here and folio lock forces unnecessarily serialization.
400 	 * From this point on, mapping will be re-verified if necessary and
401 	 * folio lock will be acquired only if it is unavoidable
402 	 *
403 	 * Mapping checks require the folio so it is looked up now. For
404 	 * anonymous pages, it does not matter if the folio is split
405 	 * in the future as the key is based on the address. For
406 	 * filesystem-backed pages, the precise page is required as the
407 	 * index of the page determines the key.
408 	 */
409 	folio = page_folio(page);
410 	mapping = READ_ONCE(folio->mapping);
411 
412 	/*
413 	 * If folio->mapping is NULL, then it cannot be an anonymous
414 	 * page; but it might be the ZERO_PAGE or in the gate area or
415 	 * in a special mapping (all cases which we are happy to fail);
416 	 * or it may have been a good file page when get_user_pages_fast
417 	 * found it, but truncated or holepunched or subjected to
418 	 * invalidate_complete_page2 before we got the folio lock (also
419 	 * cases which we are happy to fail).  And we hold a reference,
420 	 * so refcount care in invalidate_inode_page's remove_mapping
421 	 * prevents drop_caches from setting mapping to NULL beneath us.
422 	 *
423 	 * The case we do have to guard against is when memory pressure made
424 	 * shmem_writepage move it from filecache to swapcache beneath us:
425 	 * an unlikely race, but we do need to retry for folio->mapping.
426 	 */
427 	if (unlikely(!mapping)) {
428 		int shmem_swizzled;
429 
430 		/*
431 		 * Folio lock is required to identify which special case above
432 		 * applies. If this is really a shmem page then the folio lock
433 		 * will prevent unexpected transitions.
434 		 */
435 		folio_lock(folio);
436 		shmem_swizzled = folio_test_swapcache(folio) || folio->mapping;
437 		folio_unlock(folio);
438 		folio_put(folio);
439 
440 		if (shmem_swizzled)
441 			goto again;
442 
443 		return -EFAULT;
444 	}
445 
446 	/*
447 	 * Private mappings are handled in a simple way.
448 	 *
449 	 * If the futex key is stored in anonymous memory, then the associated
450 	 * object is the mm which is implicitly pinned by the calling process.
451 	 *
452 	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
453 	 * it's a read-only handle, it's expected that futexes attach to
454 	 * the object not the particular process.
455 	 */
456 	if (folio_test_anon(folio)) {
457 		/*
458 		 * A RO anonymous page will never change and thus doesn't make
459 		 * sense for futex operations.
460 		 */
461 		if (unlikely(should_fail_futex(true)) || ro) {
462 			err = -EFAULT;
463 			goto out;
464 		}
465 
466 		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
467 		key->private.mm = mm;
468 		key->private.address = address;
469 
470 	} else {
471 		struct inode *inode;
472 
473 		/*
474 		 * The associated futex object in this case is the inode and
475 		 * the folio->mapping must be traversed. Ordinarily this should
476 		 * be stabilised under folio lock but it's not strictly
477 		 * necessary in this case as we just want to pin the inode, not
478 		 * update i_pages or anything like that.
479 		 *
480 		 * The RCU read lock is taken as the inode is finally freed
481 		 * under RCU. If the mapping still matches expectations then the
482 		 * mapping->host can be safely accessed as being a valid inode.
483 		 */
484 		rcu_read_lock();
485 
486 		if (READ_ONCE(folio->mapping) != mapping) {
487 			rcu_read_unlock();
488 			folio_put(folio);
489 
490 			goto again;
491 		}
492 
493 		inode = READ_ONCE(mapping->host);
494 		if (!inode) {
495 			rcu_read_unlock();
496 			folio_put(folio);
497 
498 			goto again;
499 		}
500 
501 		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
502 		key->shared.i_seq = get_inode_sequence_number(inode);
503 		key->shared.pgoff = page_pgoff(folio, page);
504 		rcu_read_unlock();
505 	}
506 
507 out:
508 	folio_put(folio);
509 	return err;
510 }
511 
512 /**
513  * fault_in_user_writeable() - Fault in user address and verify RW access
514  * @uaddr:	pointer to faulting user space address
515  *
516  * Slow path to fixup the fault we just took in the atomic write
517  * access to @uaddr.
518  *
519  * We have no generic implementation of a non-destructive write to the
520  * user address. We know that we faulted in the atomic pagefault
521  * disabled section so we can as well avoid the #PF overhead by
522  * calling get_user_pages() right away.
523  */
524 int fault_in_user_writeable(u32 __user *uaddr)
525 {
526 	struct mm_struct *mm = current->mm;
527 	int ret;
528 
529 	mmap_read_lock(mm);
530 	ret = fixup_user_fault(mm, (unsigned long)uaddr,
531 			       FAULT_FLAG_WRITE, NULL);
532 	mmap_read_unlock(mm);
533 
534 	return ret < 0 ? ret : 0;
535 }
536 
537 /**
538  * futex_top_waiter() - Return the highest priority waiter on a futex
539  * @hb:		the hash bucket the futex_q's reside in
540  * @key:	the futex key (to distinguish it from other futex futex_q's)
541  *
542  * Must be called with the hb lock held.
543  */
544 struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
545 {
546 	struct futex_q *this;
547 
548 	plist_for_each_entry(this, &hb->chain, list) {
549 		if (futex_match(&this->key, key))
550 			return this;
551 	}
552 	return NULL;
553 }
554 
555 /**
556  * wait_for_owner_exiting - Block until the owner has exited
557  * @ret: owner's current futex lock status
558  * @exiting:	Pointer to the exiting task
559  *
560  * Caller must hold a refcount on @exiting.
561  */
562 void wait_for_owner_exiting(int ret, struct task_struct *exiting)
563 {
564 	if (ret != -EBUSY) {
565 		WARN_ON_ONCE(exiting);
566 		return;
567 	}
568 
569 	if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
570 		return;
571 
572 	mutex_lock(&exiting->futex_exit_mutex);
573 	/*
574 	 * No point in doing state checking here. If the waiter got here
575 	 * while the task was in exec()->exec_futex_release() then it can
576 	 * have any FUTEX_STATE_* value when the waiter has acquired the
577 	 * mutex. OK, if running, EXITING or DEAD if it reached exit()
578 	 * already. Highly unlikely and not a problem. Just one more round
579 	 * through the futex maze.
580 	 */
581 	mutex_unlock(&exiting->futex_exit_mutex);
582 
583 	put_task_struct(exiting);
584 }
585 
586 /**
587  * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket
588  * @q:	The futex_q to unqueue
589  *
590  * The q->lock_ptr must not be NULL and must be held by the caller.
591  */
592 void __futex_unqueue(struct futex_q *q)
593 {
594 	struct futex_hash_bucket *hb;
595 
596 	if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
597 		return;
598 	lockdep_assert_held(q->lock_ptr);
599 
600 	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
601 	plist_del(&q->list, &hb->chain);
602 	futex_hb_waiters_dec(hb);
603 }
604 
605 /* The key must be already stored in q->key. */
606 void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb)
607 	__acquires(&hb->lock)
608 {
609 	/*
610 	 * Increment the counter before taking the lock so that
611 	 * a potential waker won't miss a to-be-slept task that is
612 	 * waiting for the spinlock. This is safe as all futex_q_lock()
613 	 * users end up calling futex_queue(). Similarly, for housekeeping,
614 	 * decrement the counter at futex_q_unlock() when some error has
615 	 * occurred and we don't end up adding the task to the list.
616 	 */
617 	futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */
618 
619 	q->lock_ptr = &hb->lock;
620 
621 	spin_lock(&hb->lock);
622 }
623 
624 void futex_q_unlock(struct futex_hash_bucket *hb)
625 	__releases(&hb->lock)
626 {
627 	futex_hb_waiters_dec(hb);
628 	spin_unlock(&hb->lock);
629 }
630 
631 void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
632 		   struct task_struct *task)
633 {
634 	int prio;
635 
636 	/*
637 	 * The priority used to register this element is
638 	 * - either the real thread-priority for the real-time threads
639 	 * (i.e. threads with a priority lower than MAX_RT_PRIO)
640 	 * - or MAX_RT_PRIO for non-RT threads.
641 	 * Thus, all RT-threads are woken first in priority order, and
642 	 * the others are woken last, in FIFO order.
643 	 */
644 	prio = min(current->normal_prio, MAX_RT_PRIO);
645 
646 	plist_node_init(&q->list, prio);
647 	plist_add(&q->list, &hb->chain);
648 	q->task = task;
649 }
650 
651 /**
652  * futex_unqueue() - Remove the futex_q from its futex_hash_bucket
653  * @q:	The futex_q to unqueue
654  *
655  * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must
656  * be paired with exactly one earlier call to futex_queue().
657  *
658  * Return:
659  *  - 1 - if the futex_q was still queued (and we removed unqueued it);
660  *  - 0 - if the futex_q was already removed by the waking thread
661  */
662 int futex_unqueue(struct futex_q *q)
663 {
664 	spinlock_t *lock_ptr;
665 	int ret = 0;
666 
667 	/* In the common case we don't take the spinlock, which is nice. */
668 retry:
669 	/*
670 	 * q->lock_ptr can change between this read and the following spin_lock.
671 	 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
672 	 * optimizing lock_ptr out of the logic below.
673 	 */
674 	lock_ptr = READ_ONCE(q->lock_ptr);
675 	if (lock_ptr != NULL) {
676 		spin_lock(lock_ptr);
677 		/*
678 		 * q->lock_ptr can change between reading it and
679 		 * spin_lock(), causing us to take the wrong lock.  This
680 		 * corrects the race condition.
681 		 *
682 		 * Reasoning goes like this: if we have the wrong lock,
683 		 * q->lock_ptr must have changed (maybe several times)
684 		 * between reading it and the spin_lock().  It can
685 		 * change again after the spin_lock() but only if it was
686 		 * already changed before the spin_lock().  It cannot,
687 		 * however, change back to the original value.  Therefore
688 		 * we can detect whether we acquired the correct lock.
689 		 */
690 		if (unlikely(lock_ptr != q->lock_ptr)) {
691 			spin_unlock(lock_ptr);
692 			goto retry;
693 		}
694 		__futex_unqueue(q);
695 
696 		BUG_ON(q->pi_state);
697 
698 		spin_unlock(lock_ptr);
699 		ret = 1;
700 	}
701 
702 	return ret;
703 }
704 
705 void futex_q_lockptr_lock(struct futex_q *q)
706 {
707 	spinlock_t *lock_ptr;
708 
709 	/*
710 	 * See futex_unqueue() why lock_ptr can change.
711 	 */
712 	guard(rcu)();
713 retry:
714 	lock_ptr = READ_ONCE(q->lock_ptr);
715 	spin_lock(lock_ptr);
716 
717 	if (unlikely(lock_ptr != q->lock_ptr)) {
718 		spin_unlock(lock_ptr);
719 		goto retry;
720 	}
721 }
722 
723 /*
724  * PI futexes can not be requeued and must remove themselves from the hash
725  * bucket. The hash bucket lock (i.e. lock_ptr) is held.
726  */
727 void futex_unqueue_pi(struct futex_q *q)
728 {
729 	/*
730 	 * If the lock was not acquired (due to timeout or signal) then the
731 	 * rt_waiter is removed before futex_q is. If this is observed by
732 	 * an unlocker after dropping the rtmutex wait lock and before
733 	 * acquiring the hash bucket lock, then the unlocker dequeues the
734 	 * futex_q from the hash bucket list to guarantee consistent state
735 	 * vs. userspace. Therefore the dequeue here must be conditional.
736 	 */
737 	if (!plist_node_empty(&q->list))
738 		__futex_unqueue(q);
739 
740 	BUG_ON(!q->pi_state);
741 	put_pi_state(q->pi_state);
742 	q->pi_state = NULL;
743 }
744 
745 /* Constants for the pending_op argument of handle_futex_death */
746 #define HANDLE_DEATH_PENDING	true
747 #define HANDLE_DEATH_LIST	false
748 
749 /*
750  * Process a futex-list entry, check whether it's owned by the
751  * dying task, and do notification if so:
752  */
753 static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
754 			      bool pi, bool pending_op)
755 {
756 	u32 uval, nval, mval;
757 	pid_t owner;
758 	int err;
759 
760 	/* Futex address must be 32bit aligned */
761 	if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
762 		return -1;
763 
764 retry:
765 	if (get_user(uval, uaddr))
766 		return -1;
767 
768 	/*
769 	 * Special case for regular (non PI) futexes. The unlock path in
770 	 * user space has two race scenarios:
771 	 *
772 	 * 1. The unlock path releases the user space futex value and
773 	 *    before it can execute the futex() syscall to wake up
774 	 *    waiters it is killed.
775 	 *
776 	 * 2. A woken up waiter is killed before it can acquire the
777 	 *    futex in user space.
778 	 *
779 	 * In the second case, the wake up notification could be generated
780 	 * by the unlock path in user space after setting the futex value
781 	 * to zero or by the kernel after setting the OWNER_DIED bit below.
782 	 *
783 	 * In both cases the TID validation below prevents a wakeup of
784 	 * potential waiters which can cause these waiters to block
785 	 * forever.
786 	 *
787 	 * In both cases the following conditions are met:
788 	 *
789 	 *	1) task->robust_list->list_op_pending != NULL
790 	 *	   @pending_op == true
791 	 *	2) The owner part of user space futex value == 0
792 	 *	3) Regular futex: @pi == false
793 	 *
794 	 * If these conditions are met, it is safe to attempt waking up a
795 	 * potential waiter without touching the user space futex value and
796 	 * trying to set the OWNER_DIED bit. If the futex value is zero,
797 	 * the rest of the user space mutex state is consistent, so a woken
798 	 * waiter will just take over the uncontended futex. Setting the
799 	 * OWNER_DIED bit would create inconsistent state and malfunction
800 	 * of the user space owner died handling. Otherwise, the OWNER_DIED
801 	 * bit is already set, and the woken waiter is expected to deal with
802 	 * this.
803 	 */
804 	owner = uval & FUTEX_TID_MASK;
805 
806 	if (pending_op && !pi && !owner) {
807 		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
808 			   FUTEX_BITSET_MATCH_ANY);
809 		return 0;
810 	}
811 
812 	if (owner != task_pid_vnr(curr))
813 		return 0;
814 
815 	/*
816 	 * Ok, this dying thread is truly holding a futex
817 	 * of interest. Set the OWNER_DIED bit atomically
818 	 * via cmpxchg, and if the value had FUTEX_WAITERS
819 	 * set, wake up a waiter (if any). (We have to do a
820 	 * futex_wake() even if OWNER_DIED is already set -
821 	 * to handle the rare but possible case of recursive
822 	 * thread-death.) The rest of the cleanup is done in
823 	 * userspace.
824 	 */
825 	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
826 
827 	/*
828 	 * We are not holding a lock here, but we want to have
829 	 * the pagefault_disable/enable() protection because
830 	 * we want to handle the fault gracefully. If the
831 	 * access fails we try to fault in the futex with R/W
832 	 * verification via get_user_pages. get_user() above
833 	 * does not guarantee R/W access. If that fails we
834 	 * give up and leave the futex locked.
835 	 */
836 	if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
837 		switch (err) {
838 		case -EFAULT:
839 			if (fault_in_user_writeable(uaddr))
840 				return -1;
841 			goto retry;
842 
843 		case -EAGAIN:
844 			cond_resched();
845 			goto retry;
846 
847 		default:
848 			WARN_ON_ONCE(1);
849 			return err;
850 		}
851 	}
852 
853 	if (nval != uval)
854 		goto retry;
855 
856 	/*
857 	 * Wake robust non-PI futexes here. The wakeup of
858 	 * PI futexes happens in exit_pi_state():
859 	 */
860 	if (!pi && (uval & FUTEX_WAITERS)) {
861 		futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
862 			   FUTEX_BITSET_MATCH_ANY);
863 	}
864 
865 	return 0;
866 }
867 
868 /*
869  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
870  */
871 static inline int fetch_robust_entry(struct robust_list __user **entry,
872 				     struct robust_list __user * __user *head,
873 				     unsigned int *pi)
874 {
875 	unsigned long uentry;
876 
877 	if (get_user(uentry, (unsigned long __user *)head))
878 		return -EFAULT;
879 
880 	*entry = (void __user *)(uentry & ~1UL);
881 	*pi = uentry & 1;
882 
883 	return 0;
884 }
885 
886 /*
887  * Walk curr->robust_list (very carefully, it's a userspace list!)
888  * and mark any locks found there dead, and notify any waiters.
889  *
890  * We silently return on any sign of list-walking problem.
891  */
892 static void exit_robust_list(struct task_struct *curr)
893 {
894 	struct robust_list_head __user *head = curr->robust_list;
895 	struct robust_list __user *entry, *next_entry, *pending;
896 	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
897 	unsigned int next_pi;
898 	unsigned long futex_offset;
899 	int rc;
900 
901 	/*
902 	 * Fetch the list head (which was registered earlier, via
903 	 * sys_set_robust_list()):
904 	 */
905 	if (fetch_robust_entry(&entry, &head->list.next, &pi))
906 		return;
907 	/*
908 	 * Fetch the relative futex offset:
909 	 */
910 	if (get_user(futex_offset, &head->futex_offset))
911 		return;
912 	/*
913 	 * Fetch any possibly pending lock-add first, and handle it
914 	 * if it exists:
915 	 */
916 	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
917 		return;
918 
919 	next_entry = NULL;	/* avoid warning with gcc */
920 	while (entry != &head->list) {
921 		/*
922 		 * Fetch the next entry in the list before calling
923 		 * handle_futex_death:
924 		 */
925 		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
926 		/*
927 		 * A pending lock might already be on the list, so
928 		 * don't process it twice:
929 		 */
930 		if (entry != pending) {
931 			if (handle_futex_death((void __user *)entry + futex_offset,
932 						curr, pi, HANDLE_DEATH_LIST))
933 				return;
934 		}
935 		if (rc)
936 			return;
937 		entry = next_entry;
938 		pi = next_pi;
939 		/*
940 		 * Avoid excessively long or circular lists:
941 		 */
942 		if (!--limit)
943 			break;
944 
945 		cond_resched();
946 	}
947 
948 	if (pending) {
949 		handle_futex_death((void __user *)pending + futex_offset,
950 				   curr, pip, HANDLE_DEATH_PENDING);
951 	}
952 }
953 
954 #ifdef CONFIG_COMPAT
955 static void __user *futex_uaddr(struct robust_list __user *entry,
956 				compat_long_t futex_offset)
957 {
958 	compat_uptr_t base = ptr_to_compat(entry);
959 	void __user *uaddr = compat_ptr(base + futex_offset);
960 
961 	return uaddr;
962 }
963 
964 /*
965  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
966  */
967 static inline int
968 compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
969 		   compat_uptr_t __user *head, unsigned int *pi)
970 {
971 	if (get_user(*uentry, head))
972 		return -EFAULT;
973 
974 	*entry = compat_ptr((*uentry) & ~1);
975 	*pi = (unsigned int)(*uentry) & 1;
976 
977 	return 0;
978 }
979 
980 /*
981  * Walk curr->robust_list (very carefully, it's a userspace list!)
982  * and mark any locks found there dead, and notify any waiters.
983  *
984  * We silently return on any sign of list-walking problem.
985  */
986 static void compat_exit_robust_list(struct task_struct *curr)
987 {
988 	struct compat_robust_list_head __user *head = curr->compat_robust_list;
989 	struct robust_list __user *entry, *next_entry, *pending;
990 	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
991 	unsigned int next_pi;
992 	compat_uptr_t uentry, next_uentry, upending;
993 	compat_long_t futex_offset;
994 	int rc;
995 
996 	/*
997 	 * Fetch the list head (which was registered earlier, via
998 	 * sys_set_robust_list()):
999 	 */
1000 	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
1001 		return;
1002 	/*
1003 	 * Fetch the relative futex offset:
1004 	 */
1005 	if (get_user(futex_offset, &head->futex_offset))
1006 		return;
1007 	/*
1008 	 * Fetch any possibly pending lock-add first, and handle it
1009 	 * if it exists:
1010 	 */
1011 	if (compat_fetch_robust_entry(&upending, &pending,
1012 			       &head->list_op_pending, &pip))
1013 		return;
1014 
1015 	next_entry = NULL;	/* avoid warning with gcc */
1016 	while (entry != (struct robust_list __user *) &head->list) {
1017 		/*
1018 		 * Fetch the next entry in the list before calling
1019 		 * handle_futex_death:
1020 		 */
1021 		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
1022 			(compat_uptr_t __user *)&entry->next, &next_pi);
1023 		/*
1024 		 * A pending lock might already be on the list, so
1025 		 * dont process it twice:
1026 		 */
1027 		if (entry != pending) {
1028 			void __user *uaddr = futex_uaddr(entry, futex_offset);
1029 
1030 			if (handle_futex_death(uaddr, curr, pi,
1031 					       HANDLE_DEATH_LIST))
1032 				return;
1033 		}
1034 		if (rc)
1035 			return;
1036 		uentry = next_uentry;
1037 		entry = next_entry;
1038 		pi = next_pi;
1039 		/*
1040 		 * Avoid excessively long or circular lists:
1041 		 */
1042 		if (!--limit)
1043 			break;
1044 
1045 		cond_resched();
1046 	}
1047 	if (pending) {
1048 		void __user *uaddr = futex_uaddr(pending, futex_offset);
1049 
1050 		handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
1051 	}
1052 }
1053 #endif
1054 
1055 #ifdef CONFIG_FUTEX_PI
1056 
1057 /*
1058  * This task is holding PI mutexes at exit time => bad.
1059  * Kernel cleans up PI-state, but userspace is likely hosed.
1060  * (Robust-futex cleanup is separate and might save the day for userspace.)
1061  */
1062 static void exit_pi_state_list(struct task_struct *curr)
1063 {
1064 	struct list_head *next, *head = &curr->pi_state_list;
1065 	struct futex_pi_state *pi_state;
1066 	union futex_key key = FUTEX_KEY_INIT;
1067 
1068 	/*
1069 	 * Ensure the hash remains stable (no resize) during the while loop
1070 	 * below. The hb pointer is acquired under the pi_lock so we can't block
1071 	 * on the mutex.
1072 	 */
1073 	WARN_ON(curr != current);
1074 	guard(private_hash)();
1075 	/*
1076 	 * We are a ZOMBIE and nobody can enqueue itself on
1077 	 * pi_state_list anymore, but we have to be careful
1078 	 * versus waiters unqueueing themselves:
1079 	 */
1080 	raw_spin_lock_irq(&curr->pi_lock);
1081 	while (!list_empty(head)) {
1082 		next = head->next;
1083 		pi_state = list_entry(next, struct futex_pi_state, list);
1084 		key = pi_state->key;
1085 		if (1) {
1086 			CLASS(hb, hb)(&key);
1087 
1088 			/*
1089 			 * We can race against put_pi_state() removing itself from the
1090 			 * list (a waiter going away). put_pi_state() will first
1091 			 * decrement the reference count and then modify the list, so
1092 			 * its possible to see the list entry but fail this reference
1093 			 * acquire.
1094 			 *
1095 			 * In that case; drop the locks to let put_pi_state() make
1096 			 * progress and retry the loop.
1097 			 */
1098 			if (!refcount_inc_not_zero(&pi_state->refcount)) {
1099 				raw_spin_unlock_irq(&curr->pi_lock);
1100 				cpu_relax();
1101 				raw_spin_lock_irq(&curr->pi_lock);
1102 				continue;
1103 			}
1104 			raw_spin_unlock_irq(&curr->pi_lock);
1105 
1106 			spin_lock(&hb->lock);
1107 			raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
1108 			raw_spin_lock(&curr->pi_lock);
1109 			/*
1110 			 * We dropped the pi-lock, so re-check whether this
1111 			 * task still owns the PI-state:
1112 			 */
1113 			if (head->next != next) {
1114 				/* retain curr->pi_lock for the loop invariant */
1115 				raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1116 				spin_unlock(&hb->lock);
1117 				put_pi_state(pi_state);
1118 				continue;
1119 			}
1120 
1121 			WARN_ON(pi_state->owner != curr);
1122 			WARN_ON(list_empty(&pi_state->list));
1123 			list_del_init(&pi_state->list);
1124 			pi_state->owner = NULL;
1125 
1126 			raw_spin_unlock(&curr->pi_lock);
1127 			raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1128 			spin_unlock(&hb->lock);
1129 		}
1130 
1131 		rt_mutex_futex_unlock(&pi_state->pi_mutex);
1132 		put_pi_state(pi_state);
1133 
1134 		raw_spin_lock_irq(&curr->pi_lock);
1135 	}
1136 	raw_spin_unlock_irq(&curr->pi_lock);
1137 }
1138 #else
1139 static inline void exit_pi_state_list(struct task_struct *curr) { }
1140 #endif
1141 
1142 static void futex_cleanup(struct task_struct *tsk)
1143 {
1144 	if (unlikely(tsk->robust_list)) {
1145 		exit_robust_list(tsk);
1146 		tsk->robust_list = NULL;
1147 	}
1148 
1149 #ifdef CONFIG_COMPAT
1150 	if (unlikely(tsk->compat_robust_list)) {
1151 		compat_exit_robust_list(tsk);
1152 		tsk->compat_robust_list = NULL;
1153 	}
1154 #endif
1155 
1156 	if (unlikely(!list_empty(&tsk->pi_state_list)))
1157 		exit_pi_state_list(tsk);
1158 }
1159 
1160 /**
1161  * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
1162  * @tsk:	task to set the state on
1163  *
1164  * Set the futex exit state of the task lockless. The futex waiter code
1165  * observes that state when a task is exiting and loops until the task has
1166  * actually finished the futex cleanup. The worst case for this is that the
1167  * waiter runs through the wait loop until the state becomes visible.
1168  *
1169  * This is called from the recursive fault handling path in make_task_dead().
1170  *
1171  * This is best effort. Either the futex exit code has run already or
1172  * not. If the OWNER_DIED bit has been set on the futex then the waiter can
1173  * take it over. If not, the problem is pushed back to user space. If the
1174  * futex exit code did not run yet, then an already queued waiter might
1175  * block forever, but there is nothing which can be done about that.
1176  */
1177 void futex_exit_recursive(struct task_struct *tsk)
1178 {
1179 	/* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
1180 	if (tsk->futex_state == FUTEX_STATE_EXITING)
1181 		mutex_unlock(&tsk->futex_exit_mutex);
1182 	tsk->futex_state = FUTEX_STATE_DEAD;
1183 }
1184 
1185 static void futex_cleanup_begin(struct task_struct *tsk)
1186 {
1187 	/*
1188 	 * Prevent various race issues against a concurrent incoming waiter
1189 	 * including live locks by forcing the waiter to block on
1190 	 * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
1191 	 * attach_to_pi_owner().
1192 	 */
1193 	mutex_lock(&tsk->futex_exit_mutex);
1194 
1195 	/*
1196 	 * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
1197 	 *
1198 	 * This ensures that all subsequent checks of tsk->futex_state in
1199 	 * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
1200 	 * tsk->pi_lock held.
1201 	 *
1202 	 * It guarantees also that a pi_state which was queued right before
1203 	 * the state change under tsk->pi_lock by a concurrent waiter must
1204 	 * be observed in exit_pi_state_list().
1205 	 */
1206 	raw_spin_lock_irq(&tsk->pi_lock);
1207 	tsk->futex_state = FUTEX_STATE_EXITING;
1208 	raw_spin_unlock_irq(&tsk->pi_lock);
1209 }
1210 
1211 static void futex_cleanup_end(struct task_struct *tsk, int state)
1212 {
1213 	/*
1214 	 * Lockless store. The only side effect is that an observer might
1215 	 * take another loop until it becomes visible.
1216 	 */
1217 	tsk->futex_state = state;
1218 	/*
1219 	 * Drop the exit protection. This unblocks waiters which observed
1220 	 * FUTEX_STATE_EXITING to reevaluate the state.
1221 	 */
1222 	mutex_unlock(&tsk->futex_exit_mutex);
1223 }
1224 
1225 void futex_exec_release(struct task_struct *tsk)
1226 {
1227 	/*
1228 	 * The state handling is done for consistency, but in the case of
1229 	 * exec() there is no way to prevent further damage as the PID stays
1230 	 * the same. But for the unlikely and arguably buggy case that a
1231 	 * futex is held on exec(), this provides at least as much state
1232 	 * consistency protection which is possible.
1233 	 */
1234 	futex_cleanup_begin(tsk);
1235 	futex_cleanup(tsk);
1236 	/*
1237 	 * Reset the state to FUTEX_STATE_OK. The task is alive and about
1238 	 * exec a new binary.
1239 	 */
1240 	futex_cleanup_end(tsk, FUTEX_STATE_OK);
1241 }
1242 
1243 void futex_exit_release(struct task_struct *tsk)
1244 {
1245 	futex_cleanup_begin(tsk);
1246 	futex_cleanup(tsk);
1247 	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
1248 }
1249 
1250 static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
1251 				   struct futex_private_hash *fph)
1252 {
1253 #ifdef CONFIG_FUTEX_PRIVATE_HASH
1254 	fhb->priv = fph;
1255 #endif
1256 	atomic_set(&fhb->waiters, 0);
1257 	plist_head_init(&fhb->chain);
1258 	spin_lock_init(&fhb->lock);
1259 }
1260 
1261 #ifdef CONFIG_FUTEX_PRIVATE_HASH
1262 void futex_hash_free(struct mm_struct *mm)
1263 {
1264 	kvfree(mm->futex_phash);
1265 }
1266 
1267 static int futex_hash_allocate(unsigned int hash_slots, bool custom)
1268 {
1269 	struct mm_struct *mm = current->mm;
1270 	struct futex_private_hash *fph;
1271 	int i;
1272 
1273 	if (hash_slots && (hash_slots == 1 || !is_power_of_2(hash_slots)))
1274 		return -EINVAL;
1275 
1276 	if (mm->futex_phash)
1277 		return -EALREADY;
1278 
1279 	if (!thread_group_empty(current))
1280 		return -EINVAL;
1281 
1282 	fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
1283 	if (!fph)
1284 		return -ENOMEM;
1285 
1286 	fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
1287 	fph->custom = custom;
1288 	fph->mm = mm;
1289 
1290 	for (i = 0; i < hash_slots; i++)
1291 		futex_hash_bucket_init(&fph->queues[i], fph);
1292 
1293 	mm->futex_phash = fph;
1294 	return 0;
1295 }
1296 
1297 int futex_hash_allocate_default(void)
1298 {
1299 	if (!current->mm)
1300 		return 0;
1301 
1302 	if (current->mm->futex_phash)
1303 		return 0;
1304 
1305 	return futex_hash_allocate(16, false);
1306 }
1307 
1308 static int futex_hash_get_slots(void)
1309 {
1310 	struct futex_private_hash *fph;
1311 
1312 	fph = current->mm->futex_phash;
1313 	if (fph && fph->hash_mask)
1314 		return fph->hash_mask + 1;
1315 	return 0;
1316 }
1317 
1318 #else
1319 
1320 static int futex_hash_allocate(unsigned int hash_slots, bool custom)
1321 {
1322 	return -EINVAL;
1323 }
1324 
1325 static int futex_hash_get_slots(void)
1326 {
1327 	return 0;
1328 }
1329 #endif
1330 
1331 int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
1332 {
1333 	int ret;
1334 
1335 	switch (arg2) {
1336 	case PR_FUTEX_HASH_SET_SLOTS:
1337 		if (arg4 != 0)
1338 			return -EINVAL;
1339 		ret = futex_hash_allocate(arg3, true);
1340 		break;
1341 
1342 	case PR_FUTEX_HASH_GET_SLOTS:
1343 		ret = futex_hash_get_slots();
1344 		break;
1345 
1346 	default:
1347 		ret = -EINVAL;
1348 		break;
1349 	}
1350 	return ret;
1351 }
1352 
1353 static int __init futex_init(void)
1354 {
1355 	unsigned long hashsize, i;
1356 	unsigned int futex_shift;
1357 
1358 #ifdef CONFIG_BASE_SMALL
1359 	hashsize = 16;
1360 #else
1361 	hashsize = roundup_pow_of_two(256 * num_possible_cpus());
1362 #endif
1363 
1364 	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
1365 					       hashsize, 0, 0,
1366 					       &futex_shift, NULL,
1367 					       hashsize, hashsize);
1368 	hashsize = 1UL << futex_shift;
1369 
1370 	for (i = 0; i < hashsize; i++)
1371 		futex_hash_bucket_init(&futex_queues[i], NULL);
1372 
1373 	futex_hashmask = hashsize - 1;
1374 	return 0;
1375 }
1376 core_initcall(futex_init);
1377