xref: /freebsd/sys/compat/linux/linux_futex.c (revision 2cd662064a0cd8d179cbb5a06378fce0ec458747)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2009-2021 Dmitry Chagin <dchagin@FreeBSD.org>
5  * Copyright (c) 2008 Roman Divacky
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include "opt_compat.h"
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/imgact.h>
37 #include <sys/imgact_elf.h>
38 #include <sys/ktr.h>
39 #include <sys/mutex.h>
40 #include <sys/priv.h>
41 #include <sys/proc.h>
42 #include <sys/sched.h>
43 #include <sys/umtxvar.h>
44 
45 #ifdef COMPAT_LINUX32
46 #include <machine/../linux32/linux.h>
47 #include <machine/../linux32/linux32_proto.h>
48 #else
49 #include <machine/../linux/linux.h>
50 #include <machine/../linux/linux_proto.h>
51 #endif
52 #include <compat/linux/linux_emul.h>
53 #include <compat/linux/linux_futex.h>
54 #include <compat/linux/linux_misc.h>
55 #include <compat/linux/linux_timer.h>
56 #include <compat/linux/linux_util.h>
57 
58 #define	FUTEX_SHARED	0x8     /* shared futex */
59 
60 #define	GET_SHARED(a)	(a->flags & FUTEX_SHARED) ? AUTO_SHARE : THREAD_SHARE
61 
62 static int futex_atomic_op(struct thread *, int, uint32_t *, int *);
63 static int handle_futex_death(struct thread *td, struct linux_emuldata *,
64     uint32_t *, unsigned int, bool);
65 static int fetch_robust_entry(struct linux_robust_list **,
66     struct linux_robust_list **, unsigned int *);
67 
68 struct linux_futex_args {
69 	uint32_t	*uaddr;
70 	int32_t		op;
71 	uint32_t	flags;
72 	bool		clockrt;
73 	uint32_t	val;
74 	struct timespec	*ts;
75 	uint32_t	*uaddr2;
76 	uint32_t	val3;
77 	bool		val3_compare;
78 	struct timespec	kts;
79 };
80 
81 static inline int futex_key_get(const void *, int, int, struct umtx_key *);
82 static void linux_umtx_abs_timeout_init(struct umtx_abs_timeout *,
83 	    struct linux_futex_args *);
84 static int linux_futex(struct thread *, struct linux_futex_args *);
85 static int linux_futex_wait(struct thread *, struct linux_futex_args *);
86 static int linux_futex_wake(struct thread *, struct linux_futex_args *);
87 static int linux_futex_requeue(struct thread *, struct linux_futex_args *);
88 static int linux_futex_wakeop(struct thread *, struct linux_futex_args *);
89 static int linux_futex_lock_pi(struct thread *, bool, struct linux_futex_args *);
90 static int linux_futex_unlock_pi(struct thread *, bool,
91 	    struct linux_futex_args *);
92 static int futex_wake_pi(struct thread *, uint32_t *, bool);
93 
94 static int
95 futex_key_get(const void *uaddr, int type, int share, struct umtx_key *key)
96 {
97 
98 	/* Check that futex address is a 32bit aligned. */
99 	if (!__is_aligned(uaddr, sizeof(uint32_t)))
100 		return (EINVAL);
101 	return (umtx_key_get(uaddr, type, share, key));
102 }
103 
104 int
105 futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared)
106 {
107 	struct linux_futex_args args;
108 
109 	bzero(&args, sizeof(args));
110 	args.op = LINUX_FUTEX_WAKE;
111 	args.uaddr = uaddr;
112 	args.flags = shared == true ? FUTEX_SHARED : 0;
113 	args.val = val;
114 	args.val3 = FUTEX_BITSET_MATCH_ANY;
115 
116 	return (linux_futex_wake(td, &args));
117 }
118 
119 static int
120 futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared)
121 {
122 	struct linux_futex_args args;
123 
124 	bzero(&args, sizeof(args));
125 	args.op = LINUX_FUTEX_UNLOCK_PI;
126 	args.uaddr = uaddr;
127 	args.flags = shared == true ? FUTEX_SHARED : 0;
128 
129 	return (linux_futex_unlock_pi(td, true, &args));
130 }
131 
132 static int
133 futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr,
134     int *res)
135 {
136 	int op = (encoded_op >> 28) & 7;
137 	int cmp = (encoded_op >> 24) & 15;
138 	int oparg = (encoded_op << 8) >> 20;
139 	int cmparg = (encoded_op << 20) >> 20;
140 	int oldval = 0, ret;
141 
142 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
143 		oparg = 1 << oparg;
144 
145 	switch (op) {
146 	case FUTEX_OP_SET:
147 		ret = futex_xchgl(oparg, uaddr, &oldval);
148 		break;
149 	case FUTEX_OP_ADD:
150 		ret = futex_addl(oparg, uaddr, &oldval);
151 		break;
152 	case FUTEX_OP_OR:
153 		ret = futex_orl(oparg, uaddr, &oldval);
154 		break;
155 	case FUTEX_OP_ANDN:
156 		ret = futex_andl(~oparg, uaddr, &oldval);
157 		break;
158 	case FUTEX_OP_XOR:
159 		ret = futex_xorl(oparg, uaddr, &oldval);
160 		break;
161 	default:
162 		ret = ENOSYS;
163 		break;
164 	}
165 
166 	if (ret != 0)
167 		return (ret);
168 
169 	switch (cmp) {
170 	case FUTEX_OP_CMP_EQ:
171 		*res = (oldval == cmparg);
172 		break;
173 	case FUTEX_OP_CMP_NE:
174 		*res = (oldval != cmparg);
175 		break;
176 	case FUTEX_OP_CMP_LT:
177 		*res = (oldval < cmparg);
178 		break;
179 	case FUTEX_OP_CMP_GE:
180 		*res = (oldval >= cmparg);
181 		break;
182 	case FUTEX_OP_CMP_LE:
183 		*res = (oldval <= cmparg);
184 		break;
185 	case FUTEX_OP_CMP_GT:
186 		*res = (oldval > cmparg);
187 		break;
188 	default:
189 		ret = ENOSYS;
190 	}
191 
192 	return (ret);
193 }
194 
/*
 * Dispatch a decoded futex operation. Strips the PRIVATE and
 * CLOCK_REALTIME flag bits from args->op, records them in args->flags
 * and args->clockrt, then routes to the per-operation handler.
 * Returns 0 or an errno value; ENOSYS for unimplemented operations.
 */
static int
linux_futex(struct thread *td, struct linux_futex_args *args)
{
	struct linux_pemuldata *pem;
	struct proc *p;

	/* PRIVATE futexes are per-process; others use a shared key. */
	if (args->op & LINUX_FUTEX_PRIVATE_FLAG) {
		args->flags = 0;
		args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
	} else
		args->flags = FUTEX_SHARED;

	args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
	args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;

	/*
	 * CLOCK_REALTIME is only valid with these three operations;
	 * any other combination is rejected, as on Linux.
	 */
	if (args->clockrt &&
	    args->op != LINUX_FUTEX_WAIT_BITSET &&
	    args->op != LINUX_FUTEX_WAIT_REQUEUE_PI &&
	    args->op != LINUX_FUTEX_LOCK_PI2)
		return (ENOSYS);

	switch (args->op) {
	case LINUX_FUTEX_WAIT:
		/* Plain WAIT is WAIT_BITSET with an all-ones bitset. */
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAIT_BITSET:
		LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wait(td, args));

	case LINUX_FUTEX_WAKE:
		/* Plain WAKE is WAKE_BITSET with an all-ones bitset. */
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAKE_BITSET:
		LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wake(td, args));

	case LINUX_FUTEX_REQUEUE:
		/*
		 * Glibc does not use this operation since version 2.3.3,
		 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation.
		 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when
		 * FUTEX_REQUEUE returned EINVAL.
		 */
		pem = pem_find(td->td_proc);
		/* Log the deprecation message once per process. */
		if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
			linux_msg(td, "unsupported FUTEX_REQUEUE");
			pem->flags |= LINUX_XDEPR_REQUEUEOP;
		}

		/*
		 * The above is true, however musl libc does make use of the
		 * futex requeue operation, allow operation for brands which
		 * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags.
		 */
		p = td->td_proc;
		Elf_Brandinfo *bi = p->p_elf_brandinfo;
		if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0)
			return (EINVAL);
		/* FUTEX_REQUEUE does not compare the futex word. */
		args->val3_compare = false;
		/* FALLTHROUGH */

	case LINUX_FUTEX_CMP_REQUEUE:
		LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
		    "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x",
		    args->uaddr, args->val, args->val3, args->uaddr2,
		    args->ts);

		return (linux_futex_requeue(td, args));

	case LINUX_FUTEX_WAKE_OP:
		LINUX_CTR5(sys_futex, "WAKE_OP "
		    "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x",
		    args->uaddr, args->val, args->uaddr2, args->val3,
		    args->ts);

		return (linux_futex_wakeop(td, args));

	case LINUX_FUTEX_LOCK_PI:
		/* LOCK_PI timeouts are always against CLOCK_REALTIME. */
		args->clockrt = true;
		/* FALLTHROUGH */

	case LINUX_FUTEX_LOCK_PI2:
		LINUX_CTR2(sys_futex, "LOCKPI uaddr %p val 0x%x",
		    args->uaddr, args->val);

		return (linux_futex_lock_pi(td, false, args));

	case LINUX_FUTEX_UNLOCK_PI:
		LINUX_CTR1(sys_futex, "UNLOCKPI uaddr %p",
		    args->uaddr);

		return (linux_futex_unlock_pi(td, false, args));

	case LINUX_FUTEX_TRYLOCK_PI:
		LINUX_CTR1(sys_futex, "TRYLOCKPI uaddr %p",
		    args->uaddr);

		return (linux_futex_lock_pi(td, true, args));

	/*
	 * Current implementation of FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI
	 * can't be used anymore to implement conditional variables.
	 * A detailed explanation can be found here:
	 *
	 * https://sourceware.org/bugzilla/show_bug.cgi?id=13165
	 * and here http://austingroupbugs.net/view.php?id=609
	 *
	 * And since commit
	 * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=ed19993b5b0d05d62cc883571519a67dae481a14
	 * glibc does not use them.
	 */
	case LINUX_FUTEX_WAIT_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	case LINUX_FUTEX_CMP_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	default:
		linux_msg(td, "unsupported futex op %d", args->op);
		return (ENOSYS);
	}
}
335 
336 /*
337  * pi protocol:
338  * - 0 futex word value means unlocked.
339  * - TID futex word value means locked.
340  * Userspace uses atomic ops to lock/unlock these futexes without entering the
341  * kernel. If the lock-acquire fastpath fails, (transition from 0 to TID fails),
342  * then FUTEX_LOCK_PI is called.
343  * The kernel atomically set FUTEX_WAITERS bit in the futex word value, if no
344  * other waiters exists looks up the thread that owns the futex (it has put its
345  * own TID into the futex value) and made this thread the owner of the internal
346  * pi-aware lock object (mutex). Then the kernel tries to lock the internal lock
347  * object, on which it blocks. Once it returns, it has the mutex acquired, and it
348  * sets the futex value to its own TID and returns (futex value contains
349  * FUTEX_WAITERS|TID).
350  * The unlock fastpath would fail (because the FUTEX_WAITERS bit is set) and
351  * FUTEX_UNLOCK_PI will be called.
352  * If a futex is found to be held at exit time, the kernel sets the OWNER_DIED
353  * bit of the futex word and wakes up the next futex waiter (if any), WAITERS
354  * bit is preserved (if any).
355  * If OWNER_DIED bit is set the kernel sanity checks the futex word value against
356  * the internal futex state and if correct, acquire futex.
357  */
/*
 * FUTEX_LOCK_PI / FUTEX_LOCK_PI2 / FUTEX_TRYLOCK_PI implementation
 * (see the pi protocol description above). 'try' selects trylock
 * semantics: return EBUSY instead of sleeping when contended.
 */
static int
linux_futex_lock_pi(struct thread *td, bool try, struct linux_futex_args *args)
{
	struct umtx_abs_timeout timo;
	struct linux_emuldata *em;
	struct umtx_pi *pi, *new_pi;
	struct thread *td1;
	struct umtx_q *uq;
	int error, rv;
	uint32_t owner, old_owner;

	em = em_find(td);
	uq = td->td_umtxq;
	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args),
	    &uq->uq_key);
	if (error != 0)
		return (error);
	if (args->ts != NULL)
		linux_umtx_abs_timeout_init(&timo, args);

	/*
	 * Find, or allocate and insert, the kernel-side PI state for
	 * this key. The M_NOWAIT attempt avoids dropping the queue
	 * lock; on failure we allocate with M_WAITOK and re-lookup in
	 * case another thread inserted the state meanwhile.
	 */
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	/* Hold a reference on the PI state for the duration of the loop. */
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);
	for (;;) {
		/* Try uncontested case first. */
		rv = casueword32(args->uaddr, 0, &owner, em->em_tid);
		/* The acquire succeeded. */
		if (rv == 0) {
			error = 0;
			break;
		}
		if (rv == -1) {
			error = EFAULT;
			break;
		}

		/*
		 * Nobody owns it, but the acquire failed. This can happen
		 * with ll/sc atomic.
		 */
		if (owner == 0) {
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
			continue;
		}

		/*
		 * Avoid overwriting a possible error from sleep due
		 * to the pending signal with suspension check result.
		 */
		if (error == 0) {
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
		}

		/* The futex word at *uaddr is already locked by the caller. */
		if ((owner & FUTEX_TID_MASK) == em->em_tid) {
			error = EDEADLK;
			break;
		}

		/*
		 * Futex owner died, handle_futex_death() set the OWNER_DIED bit
		 * and clear tid. Try to acquire it.
		 */
		if ((owner & FUTEX_TID_MASK) == 0) {
			old_owner = owner;
			/* Preserve WAITERS/OWNER_DIED bits, install our tid. */
			owner = owner & (FUTEX_WAITERS | FUTEX_OWNER_DIED);
			owner |= em->em_tid;
			rv = casueword32(args->uaddr, old_owner, &owner, owner);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
			if (rv == 1) {
				if (error == 0) {
					error = thread_check_susp(td, true);
					if (error != 0)
						break;
				}

				/*
				 * If this failed the lock could
				 * changed, restart.
				 */
				continue;
			}

			/* Take over the kernel PI state from the dead owner. */
			umtxq_lock(&uq->uq_key);
			umtxq_busy(&uq->uq_key);
			error = umtx_pi_claim(pi, td);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			if (error != 0) {
				/*
				 * Since we're going to return an
				 * error, restore the futex to its
				 * previous, unowned state to avoid
				 * compounding the problem.
				 */
				(void)casuword32(args->uaddr, owner, old_owner);
			}
			break;
		}

		/*
		 * Inconsistent state: OWNER_DIED is set and tid is not 0.
		 * Linux does some checks of futex state, we return EINVAL,
		 * as the user space can take care of this.
		 */
		if ((owner & FUTEX_OWNER_DIED) != 0) {
			error = EINVAL;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space knows
		 * to use the system call for unlock. If this fails either some
		 * one else has acquired the lock or it has been released.
		 */
		rv = casueword32(args->uaddr, owner, &owner,
		    owner | FUTEX_WAITERS);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		if (rv == 1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = thread_check_susp(td, true);
			if (error != 0)
				break;

			/*
			 * The lock changed and we need to retry or we
			 * lost a race to the thread unlocking the umtx.
			 */
			continue;
		}

		/*
		 * Substitute Linux thread id by native thread id to
		 * avoid refactoring code of umtxq_sleep_pi().
		 */
		td1 = linux_tdfind(td, owner & FUTEX_TID_MASK, -1);
		if (td1 != NULL) {
			owner = td1->td_tid;
			/* linux_tdfind() returns with the proc locked. */
			PROC_UNLOCK(td1->td_proc);
		} else {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EINVAL;
			break;
		}

		umtxq_lock(&uq->uq_key);

		/* We set the contested bit, sleep. */
		error = umtxq_sleep_pi(uq, pi, owner, "futexp",
		    args->ts == NULL ? NULL : &timo,
		    (args->flags & FUTEX_SHARED) != 0);
		if (error != 0)
			continue;

		error = thread_check_susp(td, false);
		if (error != 0)
			break;
	}

	/* Drop the PI state reference taken before the loop. */
	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
569 
/*
 * FUTEX_UNLOCK_PI implementation. 'rb' is true when called from the
 * robust-list death path (futex_wake_pi()), in which case the
 * current-owner check is skipped and only the kernel PI state is
 * dropped; the futex word itself is handled by the caller.
 */
static int
linux_futex_unlock_pi(struct thread *td, bool rb, struct linux_futex_args *args)
{
	struct linux_emuldata *em;
	struct umtx_key key;
	uint32_t old, owner, new_owner;
	int count, error;

	em = em_find(td);

	/*
	 * Make sure we own this mtx.
	 */
	error = fueword32(args->uaddr, &owner);
	if (error == -1)
		return (EFAULT);
	if (!rb && (owner & FUTEX_TID_MASK) != em->em_tid)
		return (EPERM);

	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	/* Drop the kernel PI state; count gets the number of waiters. */
	error = umtx_pi_drop(td, &key, rb, &count);
	if (error != 0 || rb) {
		umtxq_unbusy(&key);
		umtxq_unlock(&key);
		umtx_key_release(&key);
		return (error);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the futex, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	if (count > 1)
		new_owner = FUTEX_WAITERS;
	else
		new_owner = 0;

again:
	/* Publish the new owner word; retry on spurious CAS failure. */
	error = casueword32(args->uaddr, owner, &old, new_owner);
	if (error == 1) {
		error = thread_check_susp(td, false);
		if (error == 0)
			goto again;
	}
	umtxq_unbusy_unlocked(&key);
	umtx_key_release(&key);
	if (error == -1)
		return (EFAULT);
	/* The futex word changed under us: userspace raced the unlock. */
	if (error == 0 && old != owner)
		return (EINVAL);
	return (error);
}
628 
/*
 * FUTEX_WAKE_OP implementation: atomically run the encoded operation
 * (args->val3) on the word at uaddr2, wake up to args->val waiters on
 * uaddr, and, if the operation's comparison was true, wake up to
 * nrwake2 waiters (smuggled through args->ts as an integer) on uaddr2.
 */
static int
linux_futex_wakeop(struct thread *td, struct linux_futex_args *args)
{
	struct umtx_key key, key2;
	int nrwake, op_ret, ret;
	int error, count;

	if (args->uaddr == args->uaddr2)
		return (EINVAL);

	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
	if (error != 0) {
		umtx_key_release(&key);
		return (error);
	}
	/* Busy the primary queue while touching user memory unlocked. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	error = futex_atomic_op(td, args->val3, args->uaddr2, &op_ret);
	umtxq_lock(&key);
	umtxq_unbusy(&key);
	if (error != 0)
		goto out;
	ret = umtxq_signal_mask(&key, args->val, args->val3);
	if (op_ret > 0) {
		/* nrwake2 is passed in place of the timeout pointer. */
		nrwake = (int)(unsigned long)args->ts;
		umtxq_lock(&key2);
		count = umtxq_count(&key2);
		if (count > 0)
			ret += umtxq_signal_mask(&key2, nrwake, args->val3);
		else
			/*
			 * NOTE(review): when uaddr2 has no waiters the second
			 * wakeup is applied to uaddr instead — verify this
			 * against Linux FUTEX_WAKE_OP semantics.
			 */
			ret += umtxq_signal_mask(&key, nrwake, args->val3);
		umtxq_unlock(&key2);
	}
	/* Total number of woken waiters is the syscall's return value. */
	td->td_retval[0] = ret;
out:
	umtxq_unlock(&key);
	umtx_key_release(&key2);
	umtx_key_release(&key);
	return (error);
}
673 
/*
 * FUTEX_REQUEUE / FUTEX_CMP_REQUEUE implementation: wake up to nrwake
 * waiters on uaddr and move up to nrrequeue (passed in place of the
 * timeout pointer) of the remaining waiters to uaddr2. For
 * CMP_REQUEUE (val3_compare == true), fail with EWOULDBLOCK unless
 * the futex word still equals args->val3.
 */
static int
linux_futex_requeue(struct thread *td, struct linux_futex_args *args)
{
	int nrwake, nrrequeue;
	struct umtx_key key, key2;
	int error;
	uint32_t uval;

	/*
	 * Linux allows this, we would not, it is an incorrect
	 * usage of declared ABI, so return EINVAL.
	 */
	if (args->uaddr == args->uaddr2)
		return (EINVAL);

	/* nrrequeue is encoded in the timeout-pointer argument slot. */
	nrrequeue = (int)(unsigned long)args->ts;
	nrwake = args->val;
	/*
	 * Sanity check to prevent signed integer overflow,
	 * see Linux CVE-2018-6927
	 */
	if (nrwake < 0 || nrrequeue < 0)
		return (EINVAL);

	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
	if (error != 0) {
		umtx_key_release(&key);
		return (error);
	}
	/* Busy the queue while the futex word is read unlocked. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	error = fueword32(args->uaddr, &uval);
	if (error != 0)
		error = EFAULT;
	else if (args->val3_compare == true && uval != args->val3)
		error = EWOULDBLOCK;
	umtxq_lock(&key);
	umtxq_unbusy(&key);
	if (error == 0) {
		umtxq_lock(&key2);
		/* Returns the number of woken plus requeued waiters. */
		td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue);
		umtxq_unlock(&key2);
	}
	umtxq_unlock(&key);
	umtx_key_release(&key2);
	umtx_key_release(&key);
	return (error);
}
726 
727 static int
728 linux_futex_wake(struct thread *td, struct linux_futex_args *args)
729 {
730 	struct umtx_key key;
731 	int error;
732 
733 	if (args->val3 == 0)
734 		return (EINVAL);
735 
736 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
737 	if (error != 0)
738 		return (error);
739 	umtxq_lock(&key);
740 	td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3);
741 	umtxq_unlock(&key);
742 	umtx_key_release(&key);
743 	return (0);
744 }
745 
746 static int
747 linux_futex_wait(struct thread *td, struct linux_futex_args *args)
748 {
749 	struct umtx_abs_timeout timo;
750 	struct umtx_q *uq;
751 	uint32_t uval;
752 	int error;
753 
754 	if (args->val3 == 0)
755 		error = EINVAL;
756 
757 	uq = td->td_umtxq;
758 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args),
759 	    &uq->uq_key);
760 	if (error != 0)
761 		return (error);
762 	if (args->ts != NULL)
763 		linux_umtx_abs_timeout_init(&timo, args);
764 	umtxq_lock(&uq->uq_key);
765 	umtxq_busy(&uq->uq_key);
766 	uq->uq_bitset = args->val3;
767 	umtxq_insert(uq);
768 	umtxq_unlock(&uq->uq_key);
769 	error = fueword32(args->uaddr, &uval);
770 	if (error != 0)
771 		error = EFAULT;
772 	else if (uval != args->val)
773 		error = EWOULDBLOCK;
774 	umtxq_lock(&uq->uq_key);
775 	umtxq_unbusy(&uq->uq_key);
776 	if (error == 0) {
777 		error = umtxq_sleep(uq, "futex",
778 		    args->ts == NULL ? NULL : &timo);
779 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
780 			error = 0;
781 		else
782 			umtxq_remove(uq);
783 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
784 		umtxq_remove(uq);
785 	}
786 	umtxq_unlock(&uq->uq_key);
787 	umtx_key_release(&uq->uq_key);
788 	return (error);
789 }
790 
791 static void
792 linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo,
793     struct linux_futex_args *args)
794 {
795 	int clockid, absolute;
796 
797 	/*
798 	 * The FUTEX_CLOCK_REALTIME option bit can be employed only with the
799 	 * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI, FUTEX_LOCK_PI2.
800 	 * For FUTEX_WAIT, timeout is interpreted as a relative value, for other
801 	 * futex operations timeout is interpreted as an absolute value.
802 	 * If FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel measures
803 	 * the timeout against the CLOCK_REALTIME clock, otherwise the kernel
804 	 * measures the timeout against the CLOCK_MONOTONIC clock.
805 	 */
806 	clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC;
807 	absolute = args->op == LINUX_FUTEX_WAIT ? false : true;
808 	umtx_abs_timeout_init(timo, clockid, absolute, args->ts);
809 }
810 
811 int
812 linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
813 {
814 	struct linux_futex_args fargs = {
815 		.uaddr = args->uaddr,
816 		.op = args->op,
817 		.val = args->val,
818 		.ts = NULL,
819 		.uaddr2 = args->uaddr2,
820 		.val3 = args->val3,
821 		.val3_compare = true,
822 	};
823 	int error;
824 
825 	switch (args->op & LINUX_FUTEX_CMD_MASK) {
826 	case LINUX_FUTEX_WAIT:
827 	case LINUX_FUTEX_WAIT_BITSET:
828 	case LINUX_FUTEX_LOCK_PI:
829 	case LINUX_FUTEX_LOCK_PI2:
830 		if (args->timeout != NULL) {
831 			error = linux_get_timespec(&fargs.kts, args->timeout);
832 			if (error != 0)
833 				return (error);
834 			fargs.ts = &fargs.kts;
835 		}
836 		break;
837 	default:
838 		fargs.ts = PTRIN(args->timeout);
839 	}
840 	return (linux_futex(td, &fargs));
841 }
842 
843 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
844 int
845 linux_sys_futex_time64(struct thread *td,
846     struct linux_sys_futex_time64_args *args)
847 {
848 	struct linux_futex_args fargs = {
849 		.uaddr = args->uaddr,
850 		.op = args->op,
851 		.val = args->val,
852 		.ts = NULL,
853 		.uaddr2 = args->uaddr2,
854 		.val3 = args->val3,
855 		.val3_compare = true,
856 	};
857 	int error;
858 
859 	switch (args->op & LINUX_FUTEX_CMD_MASK) {
860 	case LINUX_FUTEX_WAIT:
861 	case LINUX_FUTEX_WAIT_BITSET:
862 	case LINUX_FUTEX_LOCK_PI:
863 	case LINUX_FUTEX_LOCK_PI2:
864 		if (args->timeout != NULL) {
865 			error = linux_get_timespec64(&fargs.kts, args->timeout);
866 			if (error != 0)
867 				return (error);
868 			fargs.ts = &fargs.kts;
869 		}
870 		break;
871 	default:
872 		fargs.ts = PTRIN(args->timeout);
873 	}
874 	return (linux_futex(td, &fargs));
875 }
876 #endif
877 
878 int
879 linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
880 {
881 	struct linux_emuldata *em;
882 
883 	if (args->len != sizeof(struct linux_robust_list_head))
884 		return (EINVAL);
885 
886 	em = em_find(td);
887 	em->robust_futexes = args->head;
888 
889 	return (0);
890 }
891 
/*
 * get_robust_list(2): copy out the robust list head pointer (and its
 * fixed length) of the current thread, or of the thread identified by
 * args->pid subject to permission checks.
 */
int
linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
{
	struct linux_emuldata *em;
	struct linux_robust_list_head *head;
	l_size_t len;
	struct thread *td2;
	int error;

	if (!args->pid) {
		em = em_find(td);
		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
		head = em->robust_futexes;
	} else {
		/* linux_tdfind() returns with the target proc locked. */
		td2 = linux_tdfind(td, args->pid, -1);
		if (td2 == NULL)
			return (ESRCH);
		if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) {
			PROC_UNLOCK(td2->td_proc);
			return (EPERM);
		}

		em = em_find(td2);
		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
		/* XXX: ptrace? */
		/*
		 * Require setuid/seteuid privilege and debug rights over
		 * the target process before revealing its list head.
		 */
		if (priv_check(td, PRIV_CRED_SETUID) ||
		    priv_check(td, PRIV_CRED_SETEUID) ||
		    p_candebug(td, td2->td_proc)) {
			PROC_UNLOCK(td2->td_proc);
			return (EPERM);
		}
		head = em->robust_futexes;

		PROC_UNLOCK(td2->td_proc);
	}

	len = sizeof(struct linux_robust_list_head);
	error = copyout(&len, args->len, sizeof(l_size_t));
	if (error != 0)
		return (EFAULT);

	/* Copy out the pointer value itself, hence &head. */
	return (copyout(&head, args->head, sizeof(head)));
}
935 
/*
 * Release one robust futex held by an exiting thread: set OWNER_DIED
 * (preserving WAITERS), clear the owner tid, and wake one waiter.
 * 'pi' marks a PI futex (tagged bit from the robust list entry);
 * 'pending_op' marks the list head's pending entry.
 * Returns 0 or an errno value.
 */
static int
handle_futex_death(struct thread *td, struct linux_emuldata *em, uint32_t *uaddr,
    unsigned int pi, bool pending_op)
{
	uint32_t uval, nval, mval;
	int error;

retry:
	error = fueword32(uaddr, &uval);
	if (error != 0)
		return (EFAULT);

	/*
	 * Special case for regular (non PI) futexes. The unlock path in
	 * user space has two race scenarios:
	 *
	 * 1. The unlock path releases the user space futex value and
	 *    before it can execute the futex() syscall to wake up
	 *    waiters it is killed.
	 *
	 * 2. A woken up waiter is killed before it can acquire the
	 *    futex in user space.
	 *
	 * In both cases the TID validation below prevents a wakeup of
	 * potential waiters which can cause these waiters to block
	 * forever.
	 *
	 * In both cases it is safe to attempt waking up a potential
	 * waiter without touching the user space futex value and trying
	 * to set the OWNER_DIED bit.
	 */
	if (pending_op && !pi && !uval) {
		(void)futex_wake(td, uaddr, 1, true);
		return (0);
	}

	/* Only touch futexes actually owned by the dying thread. */
	if ((uval & FUTEX_TID_MASK) == em->em_tid) {
		/* Keep WAITERS, drop the tid, flag the owner as dead. */
		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
		error = casueword32(uaddr, uval, &nval, mval);
		if (error == -1)
			return (EFAULT);
		if (error == 1) {
			/* Spurious CAS failure: recheck and retry. */
			error = thread_check_susp(td, false);
			if (error != 0)
				return (error);
			goto retry;
		}

		if (!pi && (uval & FUTEX_WAITERS)) {
			error = futex_wake(td, uaddr, 1, true);
			if (error != 0)
				return (error);
		} else if (pi && (uval & FUTEX_WAITERS)) {
			error = futex_wake_pi(td, uaddr, true);
			if (error != 0)
				return (error);
		}
	}

	return (0);
}
997 
998 static int
999 fetch_robust_entry(struct linux_robust_list **entry,
1000     struct linux_robust_list **head, unsigned int *pi)
1001 {
1002 	l_ulong uentry;
1003 	int error;
1004 
1005 	error = copyin((const void *)head, &uentry, sizeof(uentry));
1006 	if (error != 0)
1007 		return (EFAULT);
1008 
1009 	*entry = (void *)(uentry & ~1UL);
1010 	*pi = uentry & 1;
1011 
1012 	return (0);
1013 }
1014 
1015 #define	LINUX_HANDLE_DEATH_PENDING	true
1016 #define	LINUX_HANDLE_DEATH_LIST		false
1017 
/* This walks the list of robust futexes releasing them. */
void
release_futexes(struct thread *td, struct linux_emuldata *em)
{
	struct linux_robust_list_head *head;
	struct linux_robust_list *entry, *next_entry, *pending;
	unsigned int limit = 2048, pi, next_pi, pip;
	uint32_t *uaddr;
	l_long futex_offset;
	int error;

	head = em->robust_futexes;
	if (head == NULL)
		return;

	/* All copyins below are best-effort: bail out silently on fault. */
	if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi))
		return;

	error = copyin(&head->futex_offset, &futex_offset,
	    sizeof(futex_offset));
	if (error != 0)
		return;

	if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip))
		return;

	/* The list is circular; stop when we come back to the head. */
	while (entry != &head->list) {
		/*
		 * Fetch the next entry before processing this one, so a
		 * fault on the next pointer still releases the current
		 * futex below.
		 */
		error = fetch_robust_entry(&next_entry, PTRIN(&entry->next),
		    &next_pi);

		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice.
		 */
		if (entry != pending) {
			/* The futex word lives at a fixed offset from the entry. */
			uaddr = (uint32_t *)((caddr_t)entry + futex_offset);
			if (handle_futex_death(td, em, uaddr, pi,
			    LINUX_HANDLE_DEATH_LIST))
				return;
		}
		if (error != 0)
			return;

		entry = next_entry;
		pi = next_pi;

		/* Bound the walk to guard against circular/hostile lists. */
		if (!--limit)
			break;

		sched_relinquish(curthread);
	}

	if (pending) {
		uaddr = (uint32_t *)((caddr_t)pending + futex_offset);
		(void)handle_futex_death(td, em, uaddr, pip,
		    LINUX_HANDLE_DEATH_PENDING);
	}
}
1076