xref: /freebsd/sys/compat/linux/linux_futex.c (revision d90df8ac1324d731f8423b5501158ee870fbd5ba)
1 /*	$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */
2 
3 /*-
4  * SPDX-License-Identifier: BSD-4-Clause
5  *
6  * Copyright (c) 2005 Emmanuel Dreyfus
7  * All rights reserved.
8  * Copyright (c) 2009-2016 Dmitry Chagin <dchagin@FreeBSD.org>
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by Emmanuel Dreyfus
21  * 4. The name of the author may not be used to endorse or promote
22  *    products derived from this software without specific prior written
23  *    permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS''
26  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
27  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
29  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35  * POSSIBILITY OF SUCH DAMAGE.
36  */
37 
38 #include <sys/cdefs.h>
39 __FBSDID("$FreeBSD$");
40 #if 0
41 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $");
42 #endif
43 
44 #include "opt_compat.h"
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/imgact.h>
49 #include <sys/imgact_elf.h>
50 #include <sys/kernel.h>
51 #include <sys/ktr.h>
52 #include <sys/lock.h>
53 #include <sys/malloc.h>
54 #include <sys/mutex.h>
55 #include <sys/priv.h>
56 #include <sys/proc.h>
57 #include <sys/queue.h>
58 #include <sys/sched.h>
59 #include <sys/umtxvar.h>
60 
61 #include <vm/vm_extern.h>
62 
63 #ifdef COMPAT_LINUX32
64 #include <machine/../linux32/linux.h>
65 #include <machine/../linux32/linux32_proto.h>
66 #else
67 #include <machine/../linux/linux.h>
68 #include <machine/../linux/linux_proto.h>
69 #endif
70 #include <compat/linux/linux_emul.h>
71 #include <compat/linux/linux_futex.h>
72 #include <compat/linux/linux_misc.h>
73 #include <compat/linux/linux_timer.h>
74 #include <compat/linux/linux_util.h>
75 
76 #define	FUTEX_SHARED	0x8     /* shared futex */
77 
78 #define	GET_SHARED(a)	(a->flags & FUTEX_SHARED) ? AUTO_SHARE : THREAD_SHARE
79 
80 static int futex_atomic_op(struct thread *, int, uint32_t *);
81 static int handle_futex_death(struct thread *td, struct linux_emuldata *,
82     uint32_t *, unsigned int, bool);
83 static int fetch_robust_entry(struct linux_robust_list **,
84     struct linux_robust_list **, unsigned int *);
85 
86 struct linux_futex_args {
87 	uint32_t	*uaddr;
88 	int32_t		op;
89 	uint32_t	flags;
90 	bool		clockrt;
91 	uint32_t	val;
92 	struct timespec	*ts;
93 	uint32_t	*uaddr2;
94 	uint32_t	val3;
95 	bool		val3_compare;
96 	struct timespec	kts;
97 };
98 
99 static inline int futex_key_get(const void *, int, int, struct umtx_key *);
100 static void linux_umtx_abs_timeout_init(struct umtx_abs_timeout *,
101 	    struct linux_futex_args *);
102 static int	linux_futex(struct thread *, struct linux_futex_args *);
103 static int linux_futex_wait(struct thread *, struct linux_futex_args *);
104 static int linux_futex_wake(struct thread *, struct linux_futex_args *);
105 static int linux_futex_requeue(struct thread *, struct linux_futex_args *);
106 static int linux_futex_wakeop(struct thread *, struct linux_futex_args *);
107 static int linux_futex_lock_pi(struct thread *, bool, struct linux_futex_args *);
108 static int linux_futex_unlock_pi(struct thread *, bool,
109 	    struct linux_futex_args *);
110 static int futex_wake_pi(struct thread *, uint32_t *, bool);
111 
112 static int
113 futex_key_get(const void *uaddr, int type, int share, struct umtx_key *key)
114 {
115 
116 	/* Check that futex address is a 32bit aligned. */
117 	if (!__is_aligned(uaddr, sizeof(uint32_t)))
118 		return (EINVAL);
119 	return (umtx_key_get(uaddr, type, share, key));
120 }
121 
122 int
123 futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared)
124 {
125 	struct linux_futex_args args;
126 
127 	bzero(&args, sizeof(args));
128 	args.op = LINUX_FUTEX_WAKE;
129 	args.uaddr = uaddr;
130 	args.flags = shared == true ? FUTEX_SHARED : 0;
131 	args.val = val;
132 	args.val3 = FUTEX_BITSET_MATCH_ANY;
133 
134 	return (linux_futex_wake(td, &args));
135 }
136 
137 static int
138 futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared)
139 {
140 	struct linux_futex_args args;
141 
142 	bzero(&args, sizeof(args));
143 	args.op = LINUX_FUTEX_UNLOCK_PI;
144 	args.uaddr = uaddr;
145 	args.flags = shared == true ? FUTEX_SHARED : 0;
146 
147 	return (linux_futex_unlock_pi(td, true, &args));
148 }
149 
/*
 * Decode and execute the operation half of a FUTEX_WAKE_OP request.
 *
 * The Linux encoding packs four fields into 'encoded_op':
 *   bits 28-31  arithmetic op (FUTEX_OP_*), optionally carrying the
 *               FUTEX_OP_OPARG_SHIFT modifier bit,
 *   bits 24-27  comparison op (FUTEX_OP_CMP_*),
 *   bits 12-23  operation argument (sign-extended below),
 *   bits  0-11  comparison argument (sign-extended below).
 *
 * The arithmetic op is applied atomically to the futex word at
 * 'uaddr'; the previous word value is then compared with 'cmparg'.
 *
 * Returns a negative errno on failure, otherwise the boolean result
 * (0 or 1) of the comparison on the old value.
 */
static int
futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr)
{
	int op = (encoded_op >> 28) & 7;
	int cmp = (encoded_op >> 24) & 15;
	/* Shift left, then arithmetic shift right: sign-extends the field. */
	int oparg = (encoded_op << 8) >> 20;
	int cmparg = (encoded_op << 20) >> 20;
	int oldval = 0, ret;

	/* FUTEX_OP_OPARG_SHIFT: operate with (1 << oparg) instead of oparg. */
	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
		oparg = 1 << oparg;

	/* Apply the op atomically; 'oldval' receives the previous value. */
	switch (op) {
	case FUTEX_OP_SET:
		ret = futex_xchgl(oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_ADD:
		ret = futex_addl(oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_OR:
		ret = futex_orl(oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_ANDN:
		ret = futex_andl(~oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_XOR:
		ret = futex_xorl(oparg, uaddr, &oldval);
		break;
	default:
		ret = -ENOSYS;
		break;
	}

	if (ret)
		return (ret);

	/* Evaluate the encoded comparison against the old word value. */
	switch (cmp) {
	case FUTEX_OP_CMP_EQ:
		ret = (oldval == cmparg);
		break;
	case FUTEX_OP_CMP_NE:
		ret = (oldval != cmparg);
		break;
	case FUTEX_OP_CMP_LT:
		ret = (oldval < cmparg);
		break;
	case FUTEX_OP_CMP_GE:
		ret = (oldval >= cmparg);
		break;
	case FUTEX_OP_CMP_LE:
		ret = (oldval <= cmparg);
		break;
	case FUTEX_OP_CMP_GT:
		ret = (oldval > cmparg);
		break;
	default:
		ret = -ENOSYS;
	}

	return (ret);
}
211 
/*
 * Central dispatcher for the futex(2) family.  Strips the modifier
 * bits (PRIVATE_FLAG, CLOCK_REALTIME) from args->op and forwards to
 * the per-operation implementation.
 */
static int
linux_futex(struct thread *td, struct linux_futex_args *args)
{
	struct linux_pemuldata *pem;
	struct proc *p;

	/* Process-private futexes skip cross-process key sharing. */
	if (args->op & LINUX_FUTEX_PRIVATE_FLAG) {
		args->flags = 0;
		args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
	} else
		args->flags = FUTEX_SHARED;

	args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
	args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;

	/*
	 * Linux permits the CLOCK_REALTIME modifier only for these
	 * operations; anything else is rejected with ENOSYS.
	 */
	if (args->clockrt &&
	    args->op != LINUX_FUTEX_WAIT_BITSET &&
	    args->op != LINUX_FUTEX_WAIT_REQUEUE_PI &&
	    args->op != LINUX_FUTEX_LOCK_PI2)
		return (ENOSYS);

	switch (args->op) {
	case LINUX_FUTEX_WAIT:
		/* Plain WAIT is WAIT_BITSET with an all-ones bitset. */
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAIT_BITSET:
		LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wait(td, args));

	case LINUX_FUTEX_WAKE:
		/* Plain WAKE is WAKE_BITSET with an all-ones bitset. */
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAKE_BITSET:
		LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wake(td, args));

	case LINUX_FUTEX_REQUEUE:
		/*
		 * Glibc does not use this operation since version 2.3.3,
		 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation.
		 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when
		 * FUTEX_REQUEUE returned EINVAL.
		 */
		pem = pem_find(td->td_proc);
		/* Complain about the deprecated op only once per process. */
		if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
			linux_msg(td, "unsupported FUTEX_REQUEUE");
			pem->flags |= LINUX_XDEPR_REQUEUEOP;
		}

		/*
		 * The above is true, however musl libc does make use of the
		 * futex requeue operation, allow operation for brands which
		 * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags.
		 */
		p = td->td_proc;
		Elf_Brandinfo *bi = p->p_elf_brandinfo;
		if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0)
			return (EINVAL);
		/* Unlike CMP_REQUEUE, plain REQUEUE skips the word compare. */
		args->val3_compare = false;
		/* FALLTHROUGH */

	case LINUX_FUTEX_CMP_REQUEUE:
		LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
		    "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x",
		    args->uaddr, args->val, args->val3, args->uaddr2,
		    args->ts);

		return (linux_futex_requeue(td, args));

	case LINUX_FUTEX_WAKE_OP:
		LINUX_CTR5(sys_futex, "WAKE_OP "
		    "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x",
		    args->uaddr, args->val, args->uaddr2, args->val3,
		    args->ts);

		return (linux_futex_wakeop(td, args));

	case LINUX_FUTEX_LOCK_PI:
		/* LOCK_PI timeouts are always against CLOCK_REALTIME. */
		args->clockrt = true;
		/* FALLTHROUGH */

	case LINUX_FUTEX_LOCK_PI2:
		LINUX_CTR2(sys_futex, "LOCKPI uaddr %p val 0x%x",
		    args->uaddr, args->val);

		return (linux_futex_lock_pi(td, false, args));

	case LINUX_FUTEX_UNLOCK_PI:
		LINUX_CTR1(sys_futex, "UNLOCKPI uaddr %p",
		    args->uaddr);

		return (linux_futex_unlock_pi(td, false, args));

	case LINUX_FUTEX_TRYLOCK_PI:
		LINUX_CTR1(sys_futex, "TRYLOCKPI uaddr %p",
		    args->uaddr);

		return (linux_futex_lock_pi(td, true, args));

	/*
	 * Current implementation of FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI
	 * can't be used anymore to implement conditional variables.
	 * A detailed explanation can be found here:
	 *
	 * https://sourceware.org/bugzilla/show_bug.cgi?id=13165
	 * and here http://austingroupbugs.net/view.php?id=609
	 *
	 * And since commit
	 * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=ed19993b5b0d05d62cc883571519a67dae481a14
	 * glibc does not use it.
	 */
	case LINUX_FUTEX_WAIT_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	case LINUX_FUTEX_CMP_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	default:
		linux_msg(td, "unsupported futex op %d", args->op);
		return (ENOSYS);
	}
}
352 
353 /*
354  * pi protocol:
355  * - 0 futex word value means unlocked.
356  * - TID futex word value means locked.
357  * Userspace uses atomic ops to lock/unlock these futexes without entering the
358  * kernel. If the lock-acquire fastpath fails, (transition from 0 to TID fails),
359  * then FUTEX_LOCK_PI is called.
 * The kernel atomically sets the FUTEX_WAITERS bit in the futex word value;
 * if no other waiters exist, it looks up the thread that owns the futex (it
 * has put its own TID into the futex value) and makes this thread the owner
 * of the internal pi-aware lock object (mutex). Then the kernel tries to
 * lock the internal lock
364  * object, on which it blocks. Once it returns, it has the mutex acquired, and it
365  * sets the futex value to its own TID and returns (futex value contains
366  * FUTEX_WAITERS|TID).
367  * The unlock fastpath would fail (because the FUTEX_WAITERS bit is set) and
368  * FUTEX_UNLOCK_PI will be called.
369  * If a futex is found to be held at exit time, the kernel sets the OWNER_DIED
370  * bit of the futex word and wakes up the next futex waiter (if any), WAITERS
371  * bit is preserved (if any).
372  * If OWNER_DIED bit is set the kernel sanity checks the futex word value against
373  * the internal futex state and if correct, acquire futex.
374  */
/*
 * FUTEX_LOCK_PI/FUTEX_LOCK_PI2/FUTEX_TRYLOCK_PI implementation (see
 * the pi protocol description above).  'try' selects the trylock
 * variant, which returns EBUSY instead of sleeping on contention.
 */
static int
linux_futex_lock_pi(struct thread *td, bool try, struct linux_futex_args *args)
{
	struct umtx_abs_timeout timo;
	struct linux_emuldata *em;
	struct umtx_pi *pi, *new_pi;
	struct thread *td1;
	struct umtx_q *uq;
	int error, rv;
	uint32_t owner, old_owner;

	em = em_find(td);
	uq = td->td_umtxq;
	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args),
	    &uq->uq_key);
	if (error != 0)
		return (error);
	if (args->ts != NULL)
		linux_umtx_abs_timeout_init(&timo, args);

	/*
	 * Find (or create) the kernel-side pi object for this key.  The
	 * M_WAITOK allocation drops the queue lock, so re-check for a
	 * concurrently inserted pi afterwards.
	 */
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);
	for (;;) {
		/* Try uncontested case first. */
		rv = casueword32(args->uaddr, 0, &owner, em->em_tid);
		/* The acquire succeeded. */
		if (rv == 0) {
			error = 0;
			break;
		}
		if (rv == -1) {
			error = EFAULT;
			break;
		}

		/*
		 * Avoid overwriting a possible error from sleep due
		 * to the pending signal with suspension check result.
		 */
		if (error == 0) {
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
		}

		/* The futex word at *uaddr is already locked by the caller. */
		if ((owner & FUTEX_TID_MASK) == em->em_tid) {
			error = EDEADLK;
			break;
		}

		/*
		 * Futex owner died, handle_futex_death() set the OWNER_DIED bit
		 * and clear tid. Try to acquire it.
		 */
		if ((owner & FUTEX_TID_MASK) == 0) {
			old_owner = owner;
			/* Keep WAITERS/OWNER_DIED, install our tid as owner. */
			owner = owner & (FUTEX_WAITERS | FUTEX_OWNER_DIED);
			owner |= em->em_tid;
			rv = casueword32(args->uaddr, old_owner, &owner, owner);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
			if (rv == 1) {
				if (error == 0) {
					error = thread_check_susp(td, true);
					if (error != 0)
						break;
				}

				/*
				 * If this failed the lock could have
				 * changed, restart.
				 */
				continue;
			}

			/* Take over kernel-side ownership of the pi object. */
			umtxq_lock(&uq->uq_key);
			umtxq_busy(&uq->uq_key);
			error = umtx_pi_claim(pi, td);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			if (error != 0) {
				/*
				 * Since we're going to return an
				 * error, restore the futex to its
				 * previous, unowned state to avoid
				 * compounding the problem.
				 */
				(void)casuword32(args->uaddr, owner, old_owner);
			}
			break;
		}

		/*
		 * Inconsistent state: OWNER_DIED is set and tid is not 0.
		 * Linux does some checks of futex state, we return EINVAL,
		 * as the user space can take care of this.
		 */
		if ((owner & FUTEX_OWNER_DIED) != 0) {
			error = EINVAL;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space knows
		 * to use the system call for unlock. If this fails either some
		 * one else has acquired the lock or it has been released.
		 */
		rv = casueword32(args->uaddr, owner, &owner,
		    owner | FUTEX_WAITERS);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		if (rv == 1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = thread_check_susp(td, true);
			if (error != 0)
				break;

			/*
			 * The lock changed and we need to retry or we
			 * lost a race to the thread unlocking the umtx.
			 */
			continue;
		}

		/*
		 * Substitute Linux thread id by native thread id to
		 * avoid refactoring code of umtxq_sleep_pi().
		 */
		td1 = linux_tdfind(td, owner & FUTEX_TID_MASK, -1);
		if (td1 != NULL) {
			owner = td1->td_tid;
			PROC_UNLOCK(td1->td_proc);
		} else {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EINVAL;
			break;
		}

		umtxq_lock(&uq->uq_key);

		/* We set the contested bit, sleep. */
		error = umtxq_sleep_pi(uq, pi, owner, "futexp",
		    args->ts == NULL ? NULL : &timo,
		    (args->flags & FUTEX_SHARED) != 0);
		if (error != 0)
			continue;

		error = thread_check_susp(td, false);
		if (error != 0)
			break;
	}

	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
575 
/*
 * FUTEX_UNLOCK_PI and the robust-list unlock path.  With 'rb' false
 * (regular unlock) the caller must own the futex (its tid in the
 * futex word); the kernel pi state is dropped and the word is
 * rewritten to 0 or FUTEX_WAITERS depending on the remaining waiter
 * count.  With 'rb' true (robust cleanup) the ownership check and
 * the futex word rewrite are skipped; only kernel pi state is dropped.
 */
static int
linux_futex_unlock_pi(struct thread *td, bool rb, struct linux_futex_args *args)
{
	struct linux_emuldata *em;
	struct umtx_key key;
	uint32_t old, owner, new_owner;
	int count, error;

	em = em_find(td);

	/*
	 * Make sure we own this mtx.
	 */
	error = fueword32(args->uaddr, &owner);
	if (error == -1)
		return (EFAULT);
	if (!rb && (owner & FUTEX_TID_MASK) != em->em_tid)
		return (EPERM);

	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	error = umtx_pi_drop(td, &key, rb, &count);
	if (error != 0 || rb) {
		umtxq_unbusy(&key);
		umtxq_unlock(&key);
		umtx_key_release(&key);
		return (error);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the futex, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	if (count > 1)
		new_owner = FUTEX_WAITERS;
	else
		new_owner = 0;

again:
	error = casueword32(args->uaddr, owner, &old, new_owner);
	if (error == 1) {
		/* CAS was interrupted; retry after the suspension check. */
		error = thread_check_susp(td, false);
		if (error == 0)
			goto again;
	}
	umtxq_unbusy_unlocked(&key);
	umtx_key_release(&key);
	if (error == -1)
		return (EFAULT);
	/* The word changed behind our back: userspace raced us. */
	if (error == 0 && old != owner)
		return (EINVAL);
	return (error);
}
634 
/*
 * FUTEX_WAKE_OP: atomically apply the op encoded in args->val3 to the
 * futex word at args->uaddr2, wake up to args->val waiters on
 * args->uaddr, and, when the encoded comparison on the old *uaddr2
 * value succeeds, wake additional waiters.  Per the Linux ABI the
 * second wake count is carried in the timeout argument (args->ts)
 * reinterpreted as an integer.
 */
static int
linux_futex_wakeop(struct thread *td, struct linux_futex_args *args)
{
	struct umtx_key key, key2;
	int nrwake, op_ret, ret;
	int error, count;

	/* Identical futex addresses are rejected. */
	if (args->uaddr == args->uaddr2)
		return (EINVAL);

	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
	if (error != 0) {
		umtx_key_release(&key);
		return (error);
	}
	/* Busy the first queue while the atomic op runs on *uaddr2. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	op_ret = futex_atomic_op(td, args->val3, args->uaddr2);
	/* Negative op_ret is a negated errno from the atomic op. */
	if (op_ret < 0) {
		if (op_ret == -ENOSYS)
			error = ENOSYS;
		else
			error = EFAULT;
	}
	umtxq_lock(&key);
	umtxq_unbusy(&key);
	if (error != 0)
		goto out;
	ret = umtxq_signal_mask(&key, args->val, args->val3);
	/* Positive op_ret means the encoded comparison succeeded. */
	if (op_ret > 0) {
		nrwake = (int)(unsigned long)args->ts;
		umtxq_lock(&key2);
		count = umtxq_count(&key2);
		if (count > 0)
			ret += umtxq_signal_mask(&key2, nrwake, args->val3);
		else
			ret += umtxq_signal_mask(&key, nrwake, args->val3);
		umtxq_unlock(&key2);
	}
	/* Report the total number of threads woken. */
	td->td_retval[0] = ret;
out:
	umtxq_unlock(&key);
	umtx_key_release(&key2);
	umtx_key_release(&key);
	return (error);
}
685 
/*
 * FUTEX_REQUEUE/FUTEX_CMP_REQUEUE: wake up to 'nrwake' waiters on
 * args->uaddr and move up to 'nrrequeue' further waiters onto
 * args->uaddr2.  When args->val3_compare is set (CMP_REQUEUE) the
 * futex word must still equal args->val3, else EWOULDBLOCK.  Per the
 * Linux ABI the requeue count is carried in the timeout argument
 * (args->ts) reinterpreted as an integer.
 */
static int
linux_futex_requeue(struct thread *td, struct linux_futex_args *args)
{
	int nrwake, nrrequeue;
	struct umtx_key key, key2;
	int error;
	uint32_t uval;

	/*
	 * Linux allows this, we would not, it is an incorrect
	 * usage of declared ABI, so return EINVAL.
	 */
	if (args->uaddr == args->uaddr2)
		return (EINVAL);

	nrrequeue = (int)(unsigned long)args->ts;
	nrwake = args->val;
	/*
	 * Sanity check to prevent signed integer overflow,
	 * see Linux CVE-2018-6927
	 */
	if (nrwake < 0 || nrrequeue < 0)
		return (EINVAL);

	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
	if (error != 0) {
		umtx_key_release(&key);
		return (error);
	}
	/* Busy the source queue while the futex word is examined. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	error = fueword32(args->uaddr, &uval);
	if (error != 0)
		error = EFAULT;
	else if (args->val3_compare == true && uval != args->val3)
		error = EWOULDBLOCK;
	umtxq_lock(&key);
	umtxq_unbusy(&key);
	if (error == 0) {
		umtxq_lock(&key2);
		/* Returns the number of threads woken + requeued. */
		td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue);
		umtxq_unlock(&key2);
	}
	umtxq_unlock(&key);
	umtx_key_release(&key2);
	umtx_key_release(&key);
	return (error);
}
738 
739 static int
740 linux_futex_wake(struct thread *td, struct linux_futex_args *args)
741 {
742 	struct umtx_key key;
743 	int error;
744 
745 	if (args->val3 == 0)
746 		return (EINVAL);
747 
748 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
749 	if (error != 0)
750 		return (error);
751 	umtxq_lock(&key);
752 	td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3);
753 	umtxq_unlock(&key);
754 	umtx_key_release(&key);
755 	return (0);
756 }
757 
758 static int
759 linux_futex_wait(struct thread *td, struct linux_futex_args *args)
760 {
761 	struct umtx_abs_timeout timo;
762 	struct umtx_q *uq;
763 	uint32_t uval;
764 	int error;
765 
766 	if (args->val3 == 0)
767 		error = EINVAL;
768 
769 	uq = td->td_umtxq;
770 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args),
771 	    &uq->uq_key);
772 	if (error != 0)
773 		return (error);
774 	if (args->ts != NULL)
775 		linux_umtx_abs_timeout_init(&timo, args);
776 	umtxq_lock(&uq->uq_key);
777 	umtxq_busy(&uq->uq_key);
778 	uq->uq_bitset = args->val3;
779 	umtxq_insert(uq);
780 	umtxq_unlock(&uq->uq_key);
781 	error = fueword32(args->uaddr, &uval);
782 	if (error != 0)
783 		error = EFAULT;
784 	else if (uval != args->val)
785 		error = EWOULDBLOCK;
786 	umtxq_lock(&uq->uq_key);
787 	umtxq_unbusy(&uq->uq_key);
788 	if (error == 0) {
789 		error = umtxq_sleep(uq, "futex",
790 		    args->ts == NULL ? NULL : &timo);
791 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
792 			error = 0;
793 		else
794 			umtxq_remove(uq);
795 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
796 		umtxq_remove(uq);
797 	}
798 	umtxq_unlock(&uq->uq_key);
799 	umtx_key_release(&uq->uq_key);
800 	if (error == ERESTART)
801 		error = EINTR;
802 	return (error);
803 }
804 
805 static void
806 linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo,
807     struct linux_futex_args *args)
808 {
809 	int clockid, absolute;
810 
811 	/*
812 	 * The FUTEX_CLOCK_REALTIME option bit can be employed only with the
813 	 * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI, FUTEX_LOCK_PI2.
814 	 * For FUTEX_WAIT, timeout is interpreted as a relative value, for other
815 	 * futex operations timeout is interpreted as an absolute value.
816 	 * If FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel measures
817 	 * the timeout against the CLOCK_REALTIME clock, otherwise the kernel
818 	 * measures the timeout against the CLOCK_MONOTONIC clock.
819 	 */
820 	clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC;
821 	absolute = args->op == LINUX_FUTEX_WAIT ? false : true;
822 	umtx_abs_timeout_init(timo, clockid, absolute, args->ts);
823 }
824 
825 int
826 linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
827 {
828 	struct linux_futex_args fargs = {
829 		.uaddr = args->uaddr,
830 		.op = args->op,
831 		.val = args->val,
832 		.ts = NULL,
833 		.uaddr2 = args->uaddr2,
834 		.val3 = args->val3,
835 		.val3_compare = true,
836 	};
837 	struct l_timespec lts;
838 	int error;
839 
840 	switch (args->op & LINUX_FUTEX_CMD_MASK) {
841 	case LINUX_FUTEX_WAIT:
842 	case LINUX_FUTEX_WAIT_BITSET:
843 	case LINUX_FUTEX_LOCK_PI:
844 	case LINUX_FUTEX_LOCK_PI2:
845 		if (args->timeout != NULL) {
846 			error = copyin(args->timeout, &lts, sizeof(lts));
847 			if (error != 0)
848 				return (error);
849 			error = linux_to_native_timespec(&fargs.kts, &lts);
850 			if (error != 0)
851 				return (error);
852 			fargs.ts = &fargs.kts;
853 		}
854 		break;
855 	default:
856 		fargs.ts = PTRIN(args->timeout);
857 	}
858 	return (linux_futex(td, &fargs));
859 }
860 
861 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
862 int
863 linux_sys_futex_time64(struct thread *td,
864     struct linux_sys_futex_time64_args *args)
865 {
866 	struct linux_futex_args fargs = {
867 		.uaddr = args->uaddr,
868 		.op = args->op,
869 		.val = args->val,
870 		.ts = NULL,
871 		.uaddr2 = args->uaddr2,
872 		.val3 = args->val3,
873 		.val3_compare = true,
874 	};
875 	struct l_timespec64 lts;
876 	int error;
877 
878 	switch (args->op & LINUX_FUTEX_CMD_MASK) {
879 	case LINUX_FUTEX_WAIT:
880 	case LINUX_FUTEX_WAIT_BITSET:
881 	case LINUX_FUTEX_LOCK_PI:
882 	case LINUX_FUTEX_LOCK_PI2:
883 		if (args->timeout != NULL) {
884 			error = copyin(args->timeout, &lts, sizeof(lts));
885 			if (error != 0)
886 				return (error);
887 			error = linux_to_native_timespec64(&fargs.kts, &lts);
888 			if (error != 0)
889 				return (error);
890 			fargs.ts = &fargs.kts;
891 		}
892 		break;
893 	default:
894 		fargs.ts = PTRIN(args->timeout);
895 	}
896 	return (linux_futex(td, &fargs));
897 }
898 #endif
899 
900 int
901 linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
902 {
903 	struct linux_emuldata *em;
904 
905 	if (args->len != sizeof(struct linux_robust_list_head))
906 		return (EINVAL);
907 
908 	em = em_find(td);
909 	em->robust_futexes = args->head;
910 
911 	return (0);
912 }
913 
/*
 * get_robust_list(2): copy out the registered robust futex list head
 * pointer (and the fixed ABI length) of the calling thread, or of the
 * thread with Linux tid args->pid subject to credential checks.
 */
int
linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
{
	struct linux_emuldata *em;
	struct linux_robust_list_head *head;
	l_size_t len;
	struct thread *td2;
	int error;

	if (!args->pid) {
		em = em_find(td);
		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
		head = em->robust_futexes;
	} else {
		/* linux_tdfind() returns with the target proc locked. */
		td2 = linux_tdfind(td, args->pid, -1);
		if (td2 == NULL)
			return (ESRCH);
		if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) {
			PROC_UNLOCK(td2->td_proc);
			return (EPERM);
		}

		em = em_find(td2);
		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
		/* XXX: ptrace? */
		if (priv_check(td, PRIV_CRED_SETUID) ||
		    priv_check(td, PRIV_CRED_SETEUID) ||
		    p_candebug(td, td2->td_proc)) {
			PROC_UNLOCK(td2->td_proc);
			return (EPERM);
		}
		head = em->robust_futexes;

		PROC_UNLOCK(td2->td_proc);
	}

	len = sizeof(struct linux_robust_list_head);
	error = copyout(&len, args->len, sizeof(l_size_t));
	if (error != 0)
		return (EFAULT);

	/* Note: copies out the head pointer itself, not the structure. */
	return (copyout(&head, args->head, sizeof(head)));
}
957 
/*
 * Clean up one robust futex at thread exit.  If the exiting thread
 * (em->em_tid) owns the futex word at 'uaddr', set OWNER_DIED and
 * clear the owner tid while preserving the WAITERS bit, then wake one
 * waiter.  'pi' marks a PI futex; 'pending_op' marks the entry taken
 * from the robust list's pending slot.
 */
static int
handle_futex_death(struct thread *td, struct linux_emuldata *em, uint32_t *uaddr,
    unsigned int pi, bool pending_op)
{
	uint32_t uval, nval, mval;
	int error;

retry:
	error = fueword32(uaddr, &uval);
	if (error != 0)
		return (EFAULT);

	/*
	 * Special case for regular (non PI) futexes. The unlock path in
	 * user space has two race scenarios:
	 *
	 * 1. The unlock path releases the user space futex value and
	 *    before it can execute the futex() syscall to wake up
	 *    waiters it is killed.
	 *
	 * 2. A woken up waiter is killed before it can acquire the
	 *    futex in user space.
	 *
	 * In both cases the TID validation below prevents a wakeup of
	 * potential waiters which can cause these waiters to block
	 * forever.
	 *
	 * In both cases it is safe to attempt waking up a potential
	 * waiter without touching the user space futex value and trying
	 * to set the OWNER_DIED bit.
	 */
	if (pending_op && !pi && !uval) {
		(void)futex_wake(td, uaddr, 1, true);
		return (0);
	}

	/* Only touch futexes actually owned by the exiting thread. */
	if ((uval & FUTEX_TID_MASK) == em->em_tid) {
		/* Preserve WAITERS, drop the tid, flag the owner's death. */
		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
		error = casueword32(uaddr, uval, &nval, mval);
		if (error == -1)
			return (EFAULT);
		if (error == 1) {
			/* The word changed under us; re-read and retry. */
			error = thread_check_susp(td, false);
			if (error != 0)
				return (error);
			goto retry;
		}

		if (!pi && (uval & FUTEX_WAITERS)) {
			error = futex_wake(td, uaddr, 1, true);
			if (error != 0)
				return (error);
		} else if (pi && (uval & FUTEX_WAITERS)) {
			error = futex_wake_pi(td, uaddr, true);
			if (error != 0)
				return (error);
		}
	}

	return (0);
}
1019 
1020 static int
1021 fetch_robust_entry(struct linux_robust_list **entry,
1022     struct linux_robust_list **head, unsigned int *pi)
1023 {
1024 	l_ulong uentry;
1025 	int error;
1026 
1027 	error = copyin((const void *)head, &uentry, sizeof(uentry));
1028 	if (error != 0)
1029 		return (EFAULT);
1030 
1031 	*entry = (void *)(uentry & ~1UL);
1032 	*pi = uentry & 1;
1033 
1034 	return (0);
1035 }
1036 
1037 #define	LINUX_HANDLE_DEATH_PENDING	true
1038 #define	LINUX_HANDLE_DEATH_LIST		false
1039 
/*
 * This walks the list of robust futexes releasing them.
 * Called at thread exit: every entry owned by the dead thread is
 * handed to handle_futex_death().  The walk is bounded by 'limit' so
 * a corrupted (circular) user-space list cannot wedge the kernel.
 */
void
release_futexes(struct thread *td, struct linux_emuldata *em)
{
	struct linux_robust_list_head *head;
	struct linux_robust_list *entry, *next_entry, *pending;
	unsigned int limit = 2048, pi, next_pi, pip;
	uint32_t *uaddr;
	l_long futex_offset;
	int error;

	head = em->robust_futexes;
	if (head == NULL)
		return;

	if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi))
		return;

	error = copyin(&head->futex_offset, &futex_offset,
	    sizeof(futex_offset));
	if (error != 0)
		return;

	if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip))
		return;

	while (entry != &head->list) {
		/* Fetch the next entry before releasing the current one. */
		error = fetch_robust_entry(&next_entry, PTRIN(&entry->next),
		    &next_pi);

		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice.
		 */
		if (entry != pending) {
			/* The futex word lives at futex_offset within each entry. */
			uaddr = (uint32_t *)((caddr_t)entry + futex_offset);
			if (handle_futex_death(td, em, uaddr, pi,
			    LINUX_HANDLE_DEATH_LIST))
				return;
		}
		if (error != 0)
			return;

		entry = next_entry;
		pi = next_pi;

		if (!--limit)
			break;

		/* Yield the CPU between entries on long lists. */
		sched_relinquish(curthread);
	}

	if (pending) {
		uaddr = (uint32_t *)((caddr_t)pending + futex_offset);
		(void)handle_futex_death(td, em, uaddr, pip,
		    LINUX_HANDLE_DEATH_PENDING);
	}
}
1098