xref: /freebsd/sys/compat/linux/linux_futex.c (revision 1f88aa09417f1cfb3929fd37531b1ab51213c2d6)
1 /*	$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */
2 
3 /*-
4  * SPDX-License-Identifier: BSD-4-Clause
5  *
6  * Copyright (c) 2005 Emmanuel Dreyfus
7  * All rights reserved.
8  * Copyright (c) 2009-2016 Dmitry Chagin <dchagin@FreeBSD.org>
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by Emmanuel Dreyfus
21  * 4. The name of the author may not be used to endorse or promote
22  *    products derived from this software without specific prior written
23  *    permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS''
26  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
27  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
29  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35  * POSSIBILITY OF SUCH DAMAGE.
36  */
37 
38 #include <sys/cdefs.h>
39 __FBSDID("$FreeBSD$");
40 #if 0
41 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $");
42 #endif
43 
44 #include "opt_compat.h"
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/imgact.h>
49 #include <sys/imgact_elf.h>
50 #include <sys/ktr.h>
51 #include <sys/mutex.h>
52 #include <sys/priv.h>
53 #include <sys/proc.h>
54 #include <sys/sched.h>
55 #include <sys/umtxvar.h>
56 
57 #ifdef COMPAT_LINUX32
58 #include <machine/../linux32/linux.h>
59 #include <machine/../linux32/linux32_proto.h>
60 #else
61 #include <machine/../linux/linux.h>
62 #include <machine/../linux/linux_proto.h>
63 #endif
64 #include <compat/linux/linux_emul.h>
65 #include <compat/linux/linux_futex.h>
66 #include <compat/linux/linux_misc.h>
67 #include <compat/linux/linux_timer.h>
68 #include <compat/linux/linux_util.h>
69 
70 #define	FUTEX_SHARED	0x8     /* shared futex */
71 
72 #define	GET_SHARED(a)	(a->flags & FUTEX_SHARED) ? AUTO_SHARE : THREAD_SHARE
73 
74 static int futex_atomic_op(struct thread *, int, uint32_t *);
75 static int handle_futex_death(struct thread *td, struct linux_emuldata *,
76     uint32_t *, unsigned int, bool);
77 static int fetch_robust_entry(struct linux_robust_list **,
78     struct linux_robust_list **, unsigned int *);
79 
80 struct linux_futex_args {
81 	uint32_t	*uaddr;
82 	int32_t		op;
83 	uint32_t	flags;
84 	bool		clockrt;
85 	uint32_t	val;
86 	struct timespec	*ts;
87 	uint32_t	*uaddr2;
88 	uint32_t	val3;
89 	bool		val3_compare;
90 	struct timespec	kts;
91 };
92 
93 static inline int futex_key_get(const void *, int, int, struct umtx_key *);
94 static void linux_umtx_abs_timeout_init(struct umtx_abs_timeout *,
95 	    struct linux_futex_args *);
96 static int	linux_futex(struct thread *, struct linux_futex_args *);
97 static int linux_futex_wait(struct thread *, struct linux_futex_args *);
98 static int linux_futex_wake(struct thread *, struct linux_futex_args *);
99 static int linux_futex_requeue(struct thread *, struct linux_futex_args *);
100 static int linux_futex_wakeop(struct thread *, struct linux_futex_args *);
101 static int linux_futex_lock_pi(struct thread *, bool, struct linux_futex_args *);
102 static int linux_futex_unlock_pi(struct thread *, bool,
103 	    struct linux_futex_args *);
104 static int futex_wake_pi(struct thread *, uint32_t *, bool);
105 
106 static int
107 futex_key_get(const void *uaddr, int type, int share, struct umtx_key *key)
108 {
109 
110 	/* Check that futex address is a 32bit aligned. */
111 	if (!__is_aligned(uaddr, sizeof(uint32_t)))
112 		return (EINVAL);
113 	return (umtx_key_get(uaddr, type, share, key));
114 }
115 
116 int
117 futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared)
118 {
119 	struct linux_futex_args args;
120 
121 	bzero(&args, sizeof(args));
122 	args.op = LINUX_FUTEX_WAKE;
123 	args.uaddr = uaddr;
124 	args.flags = shared == true ? FUTEX_SHARED : 0;
125 	args.val = val;
126 	args.val3 = FUTEX_BITSET_MATCH_ANY;
127 
128 	return (linux_futex_wake(td, &args));
129 }
130 
131 static int
132 futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared)
133 {
134 	struct linux_futex_args args;
135 
136 	bzero(&args, sizeof(args));
137 	args.op = LINUX_FUTEX_UNLOCK_PI;
138 	args.uaddr = uaddr;
139 	args.flags = shared == true ? FUTEX_SHARED : 0;
140 
141 	return (linux_futex_unlock_pi(td, true, &args));
142 }
143 
144 static int
145 futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr)
146 {
147 	int op = (encoded_op >> 28) & 7;
148 	int cmp = (encoded_op >> 24) & 15;
149 	int oparg = (encoded_op << 8) >> 20;
150 	int cmparg = (encoded_op << 20) >> 20;
151 	int oldval = 0, ret;
152 
153 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
154 		oparg = 1 << oparg;
155 
156 	switch (op) {
157 	case FUTEX_OP_SET:
158 		ret = futex_xchgl(oparg, uaddr, &oldval);
159 		break;
160 	case FUTEX_OP_ADD:
161 		ret = futex_addl(oparg, uaddr, &oldval);
162 		break;
163 	case FUTEX_OP_OR:
164 		ret = futex_orl(oparg, uaddr, &oldval);
165 		break;
166 	case FUTEX_OP_ANDN:
167 		ret = futex_andl(~oparg, uaddr, &oldval);
168 		break;
169 	case FUTEX_OP_XOR:
170 		ret = futex_xorl(oparg, uaddr, &oldval);
171 		break;
172 	default:
173 		ret = -ENOSYS;
174 		break;
175 	}
176 
177 	if (ret)
178 		return (ret);
179 
180 	switch (cmp) {
181 	case FUTEX_OP_CMP_EQ:
182 		ret = (oldval == cmparg);
183 		break;
184 	case FUTEX_OP_CMP_NE:
185 		ret = (oldval != cmparg);
186 		break;
187 	case FUTEX_OP_CMP_LT:
188 		ret = (oldval < cmparg);
189 		break;
190 	case FUTEX_OP_CMP_GE:
191 		ret = (oldval >= cmparg);
192 		break;
193 	case FUTEX_OP_CMP_LE:
194 		ret = (oldval <= cmparg);
195 		break;
196 	case FUTEX_OP_CMP_GT:
197 		ret = (oldval > cmparg);
198 		break;
199 	default:
200 		ret = -ENOSYS;
201 	}
202 
203 	return (ret);
204 }
205 
/*
 * Common futex entry point: normalize the operation word — fold the
 * PRIVATE flag into args->flags and the CLOCK_REALTIME flag into
 * args->clockrt — then dispatch to the per-operation handler.
 */
static int
linux_futex(struct thread *td, struct linux_futex_args *args)
{
	struct linux_pemuldata *pem;
	struct proc *p;

	if (args->op & LINUX_FUTEX_PRIVATE_FLAG) {
		args->flags = 0;
		args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
	} else
		args->flags = FUTEX_SHARED;

	args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
	args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;

	/*
	 * FUTEX_CLOCK_REALTIME is valid only for the operations below;
	 * reject it elsewhere, as Linux does.
	 */
	if (args->clockrt &&
	    args->op != LINUX_FUTEX_WAIT_BITSET &&
	    args->op != LINUX_FUTEX_WAIT_REQUEUE_PI &&
	    args->op != LINUX_FUTEX_LOCK_PI2)
		return (ENOSYS);

	switch (args->op) {
	case LINUX_FUTEX_WAIT:
		/* Plain WAIT is WAIT_BITSET with an all-ones bitset. */
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAIT_BITSET:
		LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wait(td, args));

	case LINUX_FUTEX_WAKE:
		/* Plain WAKE matches any waiter bitset. */
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAKE_BITSET:
		LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wake(td, args));

	case LINUX_FUTEX_REQUEUE:
		/*
		 * Glibc does not use this operation since version 2.3.3,
		 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation.
		 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when
		 * FUTEX_REQUEUE returned EINVAL.
		 */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
			/* Warn only once per process. */
			linux_msg(td, "unsupported FUTEX_REQUEUE");
			pem->flags |= LINUX_XDEPR_REQUEUEOP;
		}

		/*
		 * The above is true, however musl libc does make use of the
		 * futex requeue operation, allow operation for brands which
		 * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags.
		 */
		p = td->td_proc;
		Elf_Brandinfo *bi = p->p_elf_brandinfo;
		if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0)
			return (EINVAL);
		/* REQUEUE skips the *uaddr == val3 check of CMP_REQUEUE. */
		args->val3_compare = false;
		/* FALLTHROUGH */

	case LINUX_FUTEX_CMP_REQUEUE:
		LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
		    "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x",
		    args->uaddr, args->val, args->val3, args->uaddr2,
		    args->ts);

		return (linux_futex_requeue(td, args));

	case LINUX_FUTEX_WAKE_OP:
		LINUX_CTR5(sys_futex, "WAKE_OP "
		    "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x",
		    args->uaddr, args->val, args->uaddr2, args->val3,
		    args->ts);

		return (linux_futex_wakeop(td, args));

	case LINUX_FUTEX_LOCK_PI:
		/* LOCK_PI timeouts are always measured on CLOCK_REALTIME. */
		args->clockrt = true;
		/* FALLTHROUGH */

	case LINUX_FUTEX_LOCK_PI2:
		LINUX_CTR2(sys_futex, "LOCKPI uaddr %p val 0x%x",
		    args->uaddr, args->val);

		return (linux_futex_lock_pi(td, false, args));

	case LINUX_FUTEX_UNLOCK_PI:
		LINUX_CTR1(sys_futex, "UNLOCKPI uaddr %p",
		    args->uaddr);

		return (linux_futex_unlock_pi(td, false, args));

	case LINUX_FUTEX_TRYLOCK_PI:
		LINUX_CTR1(sys_futex, "TRYLOCKPI uaddr %p",
		    args->uaddr);

		return (linux_futex_lock_pi(td, true, args));

	/*
	 * Current implementation of FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI
	 * can't be used anymore to implement conditional variables.
	 * A detailed explanation can be found here:
	 *
	 * https://sourceware.org/bugzilla/show_bug.cgi?id=13165
	 * and here http://austingroupbugs.net/view.php?id=609
	 *
	 * And since commit
	 * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=ed19993b5b0d05d62cc883571519a67dae481a14
	 * glibc does not uses it.
	 */
	case LINUX_FUTEX_WAIT_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	case LINUX_FUTEX_CMP_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	default:
		linux_msg(td, "unsupported futex op %d", args->op);
		return (ENOSYS);
	}
}
346 
347 /*
348  * pi protocol:
349  * - 0 futex word value means unlocked.
350  * - TID futex word value means locked.
351  * Userspace uses atomic ops to lock/unlock these futexes without entering the
352  * kernel. If the lock-acquire fastpath fails, (transition from 0 to TID fails),
353  * then FUTEX_LOCK_PI is called.
354  * The kernel atomically set FUTEX_WAITERS bit in the futex word value, if no
355  * other waiters exists looks up the thread that owns the futex (it has put its
356  * own TID into the futex value) and made this thread the owner of the internal
357  * pi-aware lock object (mutex). Then the kernel tries to lock the internal lock
358  * object, on which it blocks. Once it returns, it has the mutex acquired, and it
359  * sets the futex value to its own TID and returns (futex value contains
360  * FUTEX_WAITERS|TID).
361  * The unlock fastpath would fail (because the FUTEX_WAITERS bit is set) and
362  * FUTEX_UNLOCK_PI will be called.
363  * If a futex is found to be held at exit time, the kernel sets the OWNER_DIED
364  * bit of the futex word and wakes up the next futex waiter (if any), WAITERS
365  * bit is preserved (if any).
366  * If OWNER_DIED bit is set the kernel sanity checks the futex word value against
367  * the internal futex state and if correct, acquire futex.
368  */
/*
 * FUTEX_LOCK_PI / FUTEX_TRYLOCK_PI: acquire a pi-aware futex.  See the
 * protocol description above.  'try' requests a non-blocking attempt
 * (EBUSY instead of sleeping).
 */
static int
linux_futex_lock_pi(struct thread *td, bool try, struct linux_futex_args *args)
{
	struct umtx_abs_timeout timo;
	struct linux_emuldata *em;
	struct umtx_pi *pi, *new_pi;
	struct thread *td1;
	struct umtx_q *uq;
	int error, rv;
	uint32_t owner, old_owner;

	em = em_find(td);
	uq = td->td_umtxq;
	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args),
	    &uq->uq_key);
	if (error != 0)
		return (error);
	if (args->ts != NULL)
		linux_umtx_abs_timeout_init(&timo, args);

	/* Look up or create the in-kernel pi state for this futex key. */
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			/*
			 * Drop the queue lock for the sleeping allocation,
			 * then re-check for a concurrently inserted pi.
			 */
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);
	for (;;) {
		/* Try uncontested case first. */
		rv = casueword32(args->uaddr, 0, &owner, em->em_tid);
		/* The acquire succeeded. */
		if (rv == 0) {
			error = 0;
			break;
		}
		if (rv == -1) {
			error = EFAULT;
			break;
		}

		/*
		 * Avoid overwriting a possible error from sleep due
		 * to the pending signal with suspension check result.
		 */
		if (error == 0) {
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
		}

		/* The futex word at *uaddr is already locked by the caller. */
		if ((owner & FUTEX_TID_MASK) == em->em_tid) {
			error = EDEADLK;
			break;
		}

		/*
		 * Futex owner died, handle_futex_death() set the OWNER_DIED bit
		 * and clear tid. Try to acquire it.
		 */
		if ((owner & FUTEX_TID_MASK) == 0) {
			old_owner = owner;
			/* Preserve WAITERS/OWNER_DIED, install our tid. */
			owner = owner & (FUTEX_WAITERS | FUTEX_OWNER_DIED);
			owner |= em->em_tid;
			rv = casueword32(args->uaddr, old_owner, &owner, owner);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
			if (rv == 1) {
				if (error == 0) {
					error = thread_check_susp(td, true);
					if (error != 0)
						break;
				}

				/*
				 * If this failed the lock could
				 * changed, restart.
				 */
				continue;
			}

			/* We own the futex word; claim the kernel pi state. */
			umtxq_lock(&uq->uq_key);
			umtxq_busy(&uq->uq_key);
			error = umtx_pi_claim(pi, td);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			if (error != 0) {
				/*
				 * Since we're going to return an
				 * error, restore the futex to its
				 * previous, unowned state to avoid
				 * compounding the problem.
				 */
				(void)casuword32(args->uaddr, owner, old_owner);
			}
			break;
		}

		/*
		 * Inconsistent state: OWNER_DIED is set and tid is not 0.
		 * Linux does some checks of futex state, we return EINVAL,
		 * as the user space can take care of this.
		 */
		if ((owner & FUTEX_OWNER_DIED) != 0) {
			error = EINVAL;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space knows
		 * to use the system call for unlock. If this fails either some
		 * one else has acquired the lock or it has been released.
		 */
		rv = casueword32(args->uaddr, owner, &owner,
		    owner | FUTEX_WAITERS);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		if (rv == 1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = thread_check_susp(td, true);
			if (error != 0)
				break;

			/*
			 * The lock changed and we need to retry or we
			 * lost a race to the thread unlocking the umtx.
			 */
			continue;
		}

		/*
		 * Substitute Linux thread id by native thread id to
		 * avoid refactoring code of umtxq_sleep_pi().
		 */
		td1 = linux_tdfind(td, owner & FUTEX_TID_MASK, -1);
		if (td1 != NULL) {
			owner = td1->td_tid;
			PROC_UNLOCK(td1->td_proc);
		} else {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EINVAL;
			break;
		}

		umtxq_lock(&uq->uq_key);

		/* We set the contested bit, sleep. */
		error = umtxq_sleep_pi(uq, pi, owner, "futexp",
		    args->ts == NULL ? NULL : &timo,
		    (args->flags & FUTEX_SHARED) != 0);
		if (error != 0)
			continue;

		error = thread_check_susp(td, false);
		if (error != 0)
			break;
	}

	/* Drop our reference on the pi state and release the key. */
	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
569 
/*
 * FUTEX_UNLOCK_PI: release a pi futex.  'rb' is true when invoked from
 * robust-list cleanup (futex_wake_pi()); in that case the ownership
 * check is skipped and only the kernel pi state is dropped — the futex
 * word itself is not rewritten here.
 */
static int
linux_futex_unlock_pi(struct thread *td, bool rb, struct linux_futex_args *args)
{
	struct linux_emuldata *em;
	struct umtx_key key;
	uint32_t old, owner, new_owner;
	int count, error;

	em = em_find(td);

	/*
	 * Make sure we own this mtx.
	 */
	error = fueword32(args->uaddr, &owner);
	if (error == -1)
		return (EFAULT);
	if (!rb && (owner & FUTEX_TID_MASK) != em->em_tid)
		return (EPERM);

	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	/* Hand the pi state to the next waiter (if any); count remaining. */
	error = umtx_pi_drop(td, &key, rb, &count);
	if (error != 0 || rb) {
		umtxq_unbusy(&key);
		umtxq_unlock(&key);
		umtx_key_release(&key);
		return (error);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the futex, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	if (count > 1)
		new_owner = FUTEX_WAITERS;
	else
		new_owner = 0;

again:
	error = casueword32(args->uaddr, owner, &old, new_owner);
	if (error == 1) {
		/* CAS failed spuriously; check for suspension and retry. */
		error = thread_check_susp(td, false);
		if (error == 0)
			goto again;
	}
	umtxq_unbusy_unlocked(&key);
	umtx_key_release(&key);
	if (error == -1)
		return (EFAULT);
	/* The futex word changed under us: userspace raced the unlock. */
	if (error == 0 && old != owner)
		return (EINVAL);
	return (error);
}
628 
/*
 * FUTEX_WAKE_OP: atomically apply the operation encoded in args->val3
 * to *uaddr2, wake up to args->val waiters on uaddr, and, if the
 * encoded comparison on the old *uaddr2 value succeeded (op_ret > 0),
 * wake additional waiters.  The second wake count ("nrwake2") is
 * carried in args->ts as a plain integer, as in Linux.
 */
static int
linux_futex_wakeop(struct thread *td, struct linux_futex_args *args)
{
	struct umtx_key key, key2;
	int nrwake, op_ret, ret;
	int error, count;

	if (args->uaddr == args->uaddr2)
		return (EINVAL);

	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
	if (error != 0) {
		umtx_key_release(&key);
		return (error);
	}
	/* Busy the first queue while the user memory is operated on. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	op_ret = futex_atomic_op(td, args->val3, args->uaddr2);
	if (op_ret < 0) {
		/* futex_atomic_op() returns negative errnos. */
		if (op_ret == -ENOSYS)
			error = ENOSYS;
		else
			error = EFAULT;
	}
	umtxq_lock(&key);
	umtxq_unbusy(&key);
	if (error != 0)
		goto out;
	ret = umtxq_signal_mask(&key, args->val, args->val3);
	if (op_ret > 0) {
		nrwake = (int)(unsigned long)args->ts;
		umtxq_lock(&key2);
		count = umtxq_count(&key2);
		/*
		 * NOTE(review): when uaddr2 has no waiters the extra
		 * wakeups are applied to uaddr instead — verify this
		 * matches the intended Linux FUTEX_WAKE_OP semantics.
		 */
		if (count > 0)
			ret += umtxq_signal_mask(&key2, nrwake, args->val3);
		else
			ret += umtxq_signal_mask(&key, nrwake, args->val3);
		umtxq_unlock(&key2);
	}
	/* Return the total number of woken waiters. */
	td->td_retval[0] = ret;
out:
	umtxq_unlock(&key);
	umtx_key_release(&key2);
	umtx_key_release(&key);
	return (error);
}
679 
/*
 * FUTEX_REQUEUE / FUTEX_CMP_REQUEUE: wake up to args->val waiters on
 * uaddr and requeue up to 'nrrequeue' more onto uaddr2.  For
 * CMP_REQUEUE (args->val3_compare) fail with EWOULDBLOCK when *uaddr
 * no longer holds args->val3.  The requeue count is carried in
 * args->ts as a plain integer, as in Linux.
 */
static int
linux_futex_requeue(struct thread *td, struct linux_futex_args *args)
{
	int nrwake, nrrequeue;
	struct umtx_key key, key2;
	int error;
	uint32_t uval;

	/*
	 * Linux allows this, we would not, it is an incorrect
	 * usage of declared ABI, so return EINVAL.
	 */
	if (args->uaddr == args->uaddr2)
		return (EINVAL);

	nrrequeue = (int)(unsigned long)args->ts;
	nrwake = args->val;
	/*
	 * Sanity check to prevent signed integer overflow,
	 * see Linux CVE-2018-6927
	 */
	if (nrwake < 0 || nrrequeue < 0)
		return (EINVAL);

	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
	if (error != 0) {
		umtx_key_release(&key);
		return (error);
	}
	/* Busy the source queue while we examine the user futex word. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	error = fueword32(args->uaddr, &uval);
	if (error != 0)
		error = EFAULT;
	else if (args->val3_compare == true && uval != args->val3)
		error = EWOULDBLOCK;
	umtxq_lock(&key);
	umtxq_unbusy(&key);
	if (error == 0) {
		umtxq_lock(&key2);
		/* Returns the number of woken + requeued waiters. */
		td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue);
		umtxq_unlock(&key2);
	}
	umtxq_unlock(&key);
	umtx_key_release(&key2);
	umtx_key_release(&key);
	return (error);
}
732 
733 static int
734 linux_futex_wake(struct thread *td, struct linux_futex_args *args)
735 {
736 	struct umtx_key key;
737 	int error;
738 
739 	if (args->val3 == 0)
740 		return (EINVAL);
741 
742 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
743 	if (error != 0)
744 		return (error);
745 	umtxq_lock(&key);
746 	td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3);
747 	umtxq_unlock(&key);
748 	umtx_key_release(&key);
749 	return (0);
750 }
751 
752 static int
753 linux_futex_wait(struct thread *td, struct linux_futex_args *args)
754 {
755 	struct umtx_abs_timeout timo;
756 	struct umtx_q *uq;
757 	uint32_t uval;
758 	int error;
759 
760 	if (args->val3 == 0)
761 		error = EINVAL;
762 
763 	uq = td->td_umtxq;
764 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args),
765 	    &uq->uq_key);
766 	if (error != 0)
767 		return (error);
768 	if (args->ts != NULL)
769 		linux_umtx_abs_timeout_init(&timo, args);
770 	umtxq_lock(&uq->uq_key);
771 	umtxq_busy(&uq->uq_key);
772 	uq->uq_bitset = args->val3;
773 	umtxq_insert(uq);
774 	umtxq_unlock(&uq->uq_key);
775 	error = fueword32(args->uaddr, &uval);
776 	if (error != 0)
777 		error = EFAULT;
778 	else if (uval != args->val)
779 		error = EWOULDBLOCK;
780 	umtxq_lock(&uq->uq_key);
781 	umtxq_unbusy(&uq->uq_key);
782 	if (error == 0) {
783 		error = umtxq_sleep(uq, "futex",
784 		    args->ts == NULL ? NULL : &timo);
785 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
786 			error = 0;
787 		else
788 			umtxq_remove(uq);
789 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
790 		umtxq_remove(uq);
791 	}
792 	umtxq_unlock(&uq->uq_key);
793 	umtx_key_release(&uq->uq_key);
794 	if (error == ERESTART)
795 		error = EINTR;
796 	return (error);
797 }
798 
799 static void
800 linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo,
801     struct linux_futex_args *args)
802 {
803 	int clockid, absolute;
804 
805 	/*
806 	 * The FUTEX_CLOCK_REALTIME option bit can be employed only with the
807 	 * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI, FUTEX_LOCK_PI2.
808 	 * For FUTEX_WAIT, timeout is interpreted as a relative value, for other
809 	 * futex operations timeout is interpreted as an absolute value.
810 	 * If FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel measures
811 	 * the timeout against the CLOCK_REALTIME clock, otherwise the kernel
812 	 * measures the timeout against the CLOCK_MONOTONIC clock.
813 	 */
814 	clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC;
815 	absolute = args->op == LINUX_FUTEX_WAIT ? false : true;
816 	umtx_abs_timeout_init(timo, clockid, absolute, args->ts);
817 }
818 
819 int
820 linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
821 {
822 	struct linux_futex_args fargs = {
823 		.uaddr = args->uaddr,
824 		.op = args->op,
825 		.val = args->val,
826 		.ts = NULL,
827 		.uaddr2 = args->uaddr2,
828 		.val3 = args->val3,
829 		.val3_compare = true,
830 	};
831 	struct l_timespec lts;
832 	int error;
833 
834 	switch (args->op & LINUX_FUTEX_CMD_MASK) {
835 	case LINUX_FUTEX_WAIT:
836 	case LINUX_FUTEX_WAIT_BITSET:
837 	case LINUX_FUTEX_LOCK_PI:
838 	case LINUX_FUTEX_LOCK_PI2:
839 		if (args->timeout != NULL) {
840 			error = copyin(args->timeout, &lts, sizeof(lts));
841 			if (error != 0)
842 				return (error);
843 			error = linux_to_native_timespec(&fargs.kts, &lts);
844 			if (error != 0)
845 				return (error);
846 			fargs.ts = &fargs.kts;
847 		}
848 		break;
849 	default:
850 		fargs.ts = PTRIN(args->timeout);
851 	}
852 	return (linux_futex(td, &fargs));
853 }
854 
855 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
856 int
857 linux_sys_futex_time64(struct thread *td,
858     struct linux_sys_futex_time64_args *args)
859 {
860 	struct linux_futex_args fargs = {
861 		.uaddr = args->uaddr,
862 		.op = args->op,
863 		.val = args->val,
864 		.ts = NULL,
865 		.uaddr2 = args->uaddr2,
866 		.val3 = args->val3,
867 		.val3_compare = true,
868 	};
869 	struct l_timespec64 lts;
870 	int error;
871 
872 	switch (args->op & LINUX_FUTEX_CMD_MASK) {
873 	case LINUX_FUTEX_WAIT:
874 	case LINUX_FUTEX_WAIT_BITSET:
875 	case LINUX_FUTEX_LOCK_PI:
876 	case LINUX_FUTEX_LOCK_PI2:
877 		if (args->timeout != NULL) {
878 			error = copyin(args->timeout, &lts, sizeof(lts));
879 			if (error != 0)
880 				return (error);
881 			error = linux_to_native_timespec64(&fargs.kts, &lts);
882 			if (error != 0)
883 				return (error);
884 			fargs.ts = &fargs.kts;
885 		}
886 		break;
887 	default:
888 		fargs.ts = PTRIN(args->timeout);
889 	}
890 	return (linux_futex(td, &fargs));
891 }
892 #endif
893 
894 int
895 linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
896 {
897 	struct linux_emuldata *em;
898 
899 	if (args->len != sizeof(struct linux_robust_list_head))
900 		return (EINVAL);
901 
902 	em = em_find(td);
903 	em->robust_futexes = args->head;
904 
905 	return (0);
906 }
907 
/*
 * get_robust_list(2): copy out the robust-futex list head pointer of
 * the calling thread (pid == 0) or of another Linux thread, subject to
 * credential checks.
 */
int
linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
{
	struct linux_emuldata *em;
	struct linux_robust_list_head *head;
	l_size_t len;
	struct thread *td2;
	int error;

	if (!args->pid) {
		em = em_find(td);
		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
		head = em->robust_futexes;
	} else {
		/* linux_tdfind() returns with the target proc locked. */
		td2 = linux_tdfind(td, args->pid, -1);
		if (td2 == NULL)
			return (ESRCH);
		if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) {
			PROC_UNLOCK(td2->td_proc);
			return (EPERM);
		}

		em = em_find(td2);
		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
		/* XXX: ptrace? */
		if (priv_check(td, PRIV_CRED_SETUID) ||
		    priv_check(td, PRIV_CRED_SETEUID) ||
		    p_candebug(td, td2->td_proc)) {
			PROC_UNLOCK(td2->td_proc);
			return (EPERM);
		}
		head = em->robust_futexes;

		PROC_UNLOCK(td2->td_proc);
	}

	len = sizeof(struct linux_robust_list_head);
	error = copyout(&len, args->len, sizeof(l_size_t));
	if (error != 0)
		return (EFAULT);

	/* Copy out the pointer value itself, not the structure. */
	return (copyout(&head, args->head, sizeof(head)));
}
951 
/*
 * Process one robust futex at thread exit: if the futex word is still
 * owned by the exiting thread, mark it OWNER_DIED (preserving the
 * WAITERS bit) and wake one waiter.  'pi' is set for pi futexes;
 * 'pending_op' is set for the pending (not-yet-listed) entry.
 * Returns 0 on success or an errno.
 */
static int
handle_futex_death(struct thread *td, struct linux_emuldata *em, uint32_t *uaddr,
    unsigned int pi, bool pending_op)
{
	uint32_t uval, nval, mval;
	int error;

retry:
	error = fueword32(uaddr, &uval);
	if (error != 0)
		return (EFAULT);

	/*
	 * Special case for regular (non PI) futexes. The unlock path in
	 * user space has two race scenarios:
	 *
	 * 1. The unlock path releases the user space futex value and
	 *    before it can execute the futex() syscall to wake up
	 *    waiters it is killed.
	 *
	 * 2. A woken up waiter is killed before it can acquire the
	 *    futex in user space.
	 *
	 * In both cases the TID validation below prevents a wakeup of
	 * potential waiters which can cause these waiters to block
	 * forever.
	 *
	 * In both cases it is safe to attempt waking up a potential
	 * waiter without touching the user space futex value and trying
	 * to set the OWNER_DIED bit.
	 */
	if (pending_op && !pi && !uval) {
		(void)futex_wake(td, uaddr, 1, true);
		return (0);
	}

	if ((uval & FUTEX_TID_MASK) == em->em_tid) {
		/* Clear the tid, keep WAITERS, flag the owner's death. */
		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
		error = casueword32(uaddr, uval, &nval, mval);
		if (error == -1)
			return (EFAULT);
		if (error == 1) {
			/* CAS lost a race; re-read the futex word. */
			error = thread_check_susp(td, false);
			if (error != 0)
				return (error);
			goto retry;
		}

		if (!pi && (uval & FUTEX_WAITERS)) {
			error = futex_wake(td, uaddr, 1, true);
			if (error != 0)
				return (error);
		} else if (pi && (uval & FUTEX_WAITERS)) {
			error = futex_wake_pi(td, uaddr, true);
			if (error != 0)
				return (error);
		}
	}

	return (0);
}
1013 
1014 static int
1015 fetch_robust_entry(struct linux_robust_list **entry,
1016     struct linux_robust_list **head, unsigned int *pi)
1017 {
1018 	l_ulong uentry;
1019 	int error;
1020 
1021 	error = copyin((const void *)head, &uentry, sizeof(uentry));
1022 	if (error != 0)
1023 		return (EFAULT);
1024 
1025 	*entry = (void *)(uentry & ~1UL);
1026 	*pi = uentry & 1;
1027 
1028 	return (0);
1029 }
1030 
1031 #define	LINUX_HANDLE_DEATH_PENDING	true
1032 #define	LINUX_HANDLE_DEATH_LIST		false
1033 
1034 /* This walks the list of robust futexes releasing them. */
1035 void
1036 release_futexes(struct thread *td, struct linux_emuldata *em)
1037 {
1038 	struct linux_robust_list_head *head;
1039 	struct linux_robust_list *entry, *next_entry, *pending;
1040 	unsigned int limit = 2048, pi, next_pi, pip;
1041 	uint32_t *uaddr;
1042 	l_long futex_offset;
1043 	int error;
1044 
1045 	head = em->robust_futexes;
1046 	if (head == NULL)
1047 		return;
1048 
1049 	if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi))
1050 		return;
1051 
1052 	error = copyin(&head->futex_offset, &futex_offset,
1053 	    sizeof(futex_offset));
1054 	if (error != 0)
1055 		return;
1056 
1057 	if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip))
1058 		return;
1059 
1060 	while (entry != &head->list) {
1061 		error = fetch_robust_entry(&next_entry, PTRIN(&entry->next),
1062 		    &next_pi);
1063 
1064 		/*
1065 		 * A pending lock might already be on the list, so
1066 		 * don't process it twice.
1067 		 */
1068 		if (entry != pending) {
1069 			uaddr = (uint32_t *)((caddr_t)entry + futex_offset);
1070 			if (handle_futex_death(td, em, uaddr, pi,
1071 			    LINUX_HANDLE_DEATH_LIST))
1072 				return;
1073 		}
1074 		if (error != 0)
1075 			return;
1076 
1077 		entry = next_entry;
1078 		pi = next_pi;
1079 
1080 		if (!--limit)
1081 			break;
1082 
1083 		sched_relinquish(curthread);
1084 	}
1085 
1086 	if (pending) {
1087 		uaddr = (uint32_t *)((caddr_t)pending + futex_offset);
1088 		(void)handle_futex_death(td, em, uaddr, pip,
1089 		    LINUX_HANDLE_DEATH_PENDING);
1090 	}
1091 }
1092