xref: /freebsd/sys/compat/linux/linux_futex.c (revision 397e83df75e0fcd0d3fcb95ae4d794cb7600fc89)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2009-2021 Dmitry Chagin <dchagin@FreeBSD.org>
5  * Copyright (c) 2008 Roman Divacky
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/param.h>
30 #include <sys/imgact.h>
31 #include <sys/imgact_elf.h>
32 #include <sys/ktr.h>
33 #include <sys/lock.h>
34 #include <sys/mutex.h>
35 #include <sys/priv.h>
36 #include <sys/proc.h>
37 #include <sys/sched.h>
38 #include <sys/sysent.h>
39 #include <sys/vnode.h>
40 #include <sys/umtxvar.h>
41 
42 #ifdef COMPAT_LINUX32
43 #include <machine/../linux32/linux.h>
44 #include <machine/../linux32/linux32_proto.h>
45 #else
46 #include <machine/../linux/linux.h>
47 #include <machine/../linux/linux_proto.h>
48 #endif
49 #include <compat/linux/linux_emul.h>
50 #include <compat/linux/linux_futex.h>
51 #include <compat/linux/linux_misc.h>
52 #include <compat/linux/linux_time.h>
53 #include <compat/linux/linux_util.h>
54 
55 #define	FUTEX_SHARED	0x8     /* shared futex */
56 #define	FUTEX_UNOWNED	0
57 
58 #define	GET_SHARED(a)	(a->flags & FUTEX_SHARED) ? AUTO_SHARE : THREAD_SHARE
59 
60 static int futex_atomic_op(struct thread *, int, uint32_t *, int *);
61 static int handle_futex_death(struct thread *td, struct linux_emuldata *,
62     uint32_t *, unsigned int, bool);
63 static int fetch_robust_entry(struct linux_robust_list **,
64     struct linux_robust_list **, unsigned int *);
65 
66 struct linux_futex_args {
67 	uint32_t	*uaddr;
68 	int32_t		op;
69 	uint32_t	flags;
70 	bool		clockrt;
71 	uint32_t	val;
72 	struct timespec	*ts;
73 	uint32_t	*uaddr2;
74 	uint32_t	val3;
75 	bool		val3_compare;
76 	struct timespec	kts;
77 };
78 
79 static inline int futex_key_get(const void *, int, int, struct umtx_key *);
80 static void linux_umtx_abs_timeout_init(struct umtx_abs_timeout *,
81 	    struct linux_futex_args *);
82 static int linux_futex(struct thread *, struct linux_futex_args *);
83 static int linux_futex_wait(struct thread *, struct linux_futex_args *);
84 static int linux_futex_wake(struct thread *, struct linux_futex_args *);
85 static int linux_futex_requeue(struct thread *, struct linux_futex_args *);
86 static int linux_futex_wakeop(struct thread *, struct linux_futex_args *);
87 static int linux_futex_lock_pi(struct thread *, bool, struct linux_futex_args *);
88 static int linux_futex_unlock_pi(struct thread *, bool,
89 	    struct linux_futex_args *);
90 static int futex_wake_pi(struct thread *, uint32_t *, bool);
91 
92 static int
93 futex_key_get(const void *uaddr, int type, int share, struct umtx_key *key)
94 {
95 
96 	/* Check that futex address is a 32bit aligned. */
97 	if (!__is_aligned(uaddr, sizeof(uint32_t)))
98 		return (EINVAL);
99 	return (umtx_key_get(uaddr, type, share, key));
100 }
101 
102 int
103 futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared)
104 {
105 	struct linux_futex_args args;
106 
107 	bzero(&args, sizeof(args));
108 	args.op = LINUX_FUTEX_WAKE;
109 	args.uaddr = uaddr;
110 	args.flags = shared == true ? FUTEX_SHARED : 0;
111 	args.val = val;
112 	args.val3 = FUTEX_BITSET_MATCH_ANY;
113 
114 	return (linux_futex_wake(td, &args));
115 }
116 
117 static int
118 futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared)
119 {
120 	struct linux_futex_args args;
121 
122 	bzero(&args, sizeof(args));
123 	args.op = LINUX_FUTEX_UNLOCK_PI;
124 	args.uaddr = uaddr;
125 	args.flags = shared == true ? FUTEX_SHARED : 0;
126 
127 	return (linux_futex_unlock_pi(td, true, &args));
128 }
129 
/*
 * Decode and execute the FUTEX_WAKE_OP encoded operation.
 *
 * The 32-bit encoded_op packs: op (bits 28-31), cmp (bits 24-27),
 * oparg (bits 12-23) and cmparg (bits 0-11).  The operation is applied
 * atomically to the futex word at 'uaddr'; the word's previous value is
 * then compared against cmparg and the boolean outcome stored in *res.
 * Returns 0 on success, ENOSYS for an unknown op/cmp code, or the error
 * from the atomic user-memory helper.
 */
static int
futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr,
    int *res)
{
	int op = (encoded_op >> 28) & 7;
	int cmp = (encoded_op >> 24) & 15;
	/*
	 * The shift-left/shift-right pairs sign-extend the 12-bit
	 * oparg/cmparg fields out of the packed word (arithmetic right
	 * shift of a signed int), matching the Linux encoding.
	 */
	int oparg = (encoded_op << 8) >> 20;
	int cmparg = (encoded_op << 20) >> 20;
	int oldval = 0, ret;

	/* FUTEX_OP_OPARG_SHIFT: interpret oparg as a shift count. */
	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
		oparg = 1 << oparg;

	/*
	 * Apply the arithmetic op atomically to the user futex word;
	 * the helpers return the previous value through 'oldval'.
	 */
	switch (op) {
	case FUTEX_OP_SET:
		ret = futex_xchgl(oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_ADD:
		ret = futex_addl(oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_OR:
		ret = futex_orl(oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_ANDN:
		ret = futex_andl(~oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_XOR:
		ret = futex_xorl(oparg, uaddr, &oldval);
		break;
	default:
		ret = ENOSYS;
		break;
	}

	if (ret != 0)
		return (ret);

	/* Evaluate the comparison against the word's previous value. */
	switch (cmp) {
	case FUTEX_OP_CMP_EQ:
		*res = (oldval == cmparg);
		break;
	case FUTEX_OP_CMP_NE:
		*res = (oldval != cmparg);
		break;
	case FUTEX_OP_CMP_LT:
		*res = (oldval < cmparg);
		break;
	case FUTEX_OP_CMP_GE:
		*res = (oldval >= cmparg);
		break;
	case FUTEX_OP_CMP_LE:
		*res = (oldval <= cmparg);
		break;
	case FUTEX_OP_CMP_GT:
		*res = (oldval > cmparg);
		break;
	default:
		ret = ENOSYS;
	}

	return (ret);
}
192 
/*
 * Common entry point for the futex(2) family: normalize the op word
 * (private/shared flag, CLOCK_REALTIME flag) and dispatch to the
 * per-operation handlers.  Called from linux_sys_futex() and
 * linux_sys_futex_time64() with timeouts already copied in.
 */
static int
linux_futex(struct thread *td, struct linux_futex_args *args)
{
	struct linux_pemuldata *pem;
	struct proc *p;

	/* Split the private/shared bit out of the op word. */
	if (args->op & LINUX_FUTEX_PRIVATE_FLAG) {
		args->flags = 0;
		args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
	} else
		args->flags = FUTEX_SHARED;

	/* Split the clock-selection bit out of the op word. */
	args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
	args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;

	/*
	 * Linux permits FUTEX_CLOCK_REALTIME only with these three
	 * operations; reject it otherwise, as Linux does.
	 */
	if (args->clockrt &&
	    args->op != LINUX_FUTEX_WAIT_BITSET &&
	    args->op != LINUX_FUTEX_WAIT_REQUEUE_PI &&
	    args->op != LINUX_FUTEX_LOCK_PI2)
		return (ENOSYS);

	switch (args->op) {
	case LINUX_FUTEX_WAIT:
		/* Plain WAIT is WAIT_BITSET with an all-ones bitset. */
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAIT_BITSET:
		LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wait(td, args));

	case LINUX_FUTEX_WAKE:
		/* Plain WAKE is WAKE_BITSET with an all-ones bitset. */
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAKE_BITSET:
		LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wake(td, args));

	case LINUX_FUTEX_REQUEUE:
		/*
		 * Glibc does not use this operation since version 2.3.3,
		 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation.
		 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when
		 * FUTEX_REQUEUE returned EINVAL.
		 */
		pem = pem_find(td->td_proc);
		/* Log the deprecated-op complaint once per process. */
		if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
			linux_msg(td, "unsupported FUTEX_REQUEUE");
			pem->flags |= LINUX_XDEPR_REQUEUEOP;
		}

		/*
		 * The above is true, however musl libc does make use of the
		 * futex requeue operation, allow operation for brands which
		 * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags.
		 */
		p = td->td_proc;
		Elf_Brandinfo *bi = p->p_elf_brandinfo;
		if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0)
			return (EINVAL);
		/* Plain REQUEUE does not compare the futex word. */
		args->val3_compare = false;
		/* FALLTHROUGH */

	case LINUX_FUTEX_CMP_REQUEUE:
		LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
		    "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x",
		    args->uaddr, args->val, args->val3, args->uaddr2,
		    args->ts);

		return (linux_futex_requeue(td, args));

	case LINUX_FUTEX_WAKE_OP:
		LINUX_CTR5(sys_futex, "WAKE_OP "
		    "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x",
		    args->uaddr, args->val, args->uaddr2, args->val3,
		    args->ts);

		return (linux_futex_wakeop(td, args));

	case LINUX_FUTEX_LOCK_PI:
		/* LOCK_PI always measures its timeout on CLOCK_REALTIME. */
		args->clockrt = true;
		/* FALLTHROUGH */

	case LINUX_FUTEX_LOCK_PI2:
		LINUX_CTR2(sys_futex, "LOCKPI uaddr %p val 0x%x",
		    args->uaddr, args->val);

		return (linux_futex_lock_pi(td, false, args));

	case LINUX_FUTEX_UNLOCK_PI:
		LINUX_CTR1(sys_futex, "UNLOCKPI uaddr %p",
		    args->uaddr);

		return (linux_futex_unlock_pi(td, false, args));

	case LINUX_FUTEX_TRYLOCK_PI:
		LINUX_CTR1(sys_futex, "TRYLOCKPI uaddr %p",
		    args->uaddr);

		return (linux_futex_lock_pi(td, true, args));

	/*
	 * Current implementation of FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI
	 * can't be used anymore to implement conditional variables.
	 * A detailed explanation can be found here:
	 *
	 * https://sourceware.org/bugzilla/show_bug.cgi?id=13165
	 * and here http://austingroupbugs.net/view.php?id=609
	 *
	 * And since commit
	 * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=ed19993b5b0d05d62cc883571519a67dae481a14
	 * glibc does not use them.
	 */
	case LINUX_FUTEX_WAIT_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	case LINUX_FUTEX_CMP_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	default:
		linux_msg(td, "unsupported futex op %d", args->op);
		return (ENOSYS);
	}
}
333 
334 /*
335  * pi protocol:
336  * - 0 futex word value means unlocked.
337  * - TID futex word value means locked.
338  * Userspace uses atomic ops to lock/unlock these futexes without entering the
339  * kernel. If the lock-acquire fastpath fails, (transition from 0 to TID fails),
340  * then FUTEX_LOCK_PI is called.
341  * The kernel atomically set FUTEX_WAITERS bit in the futex word value, if no
342  * other waiters exists looks up the thread that owns the futex (it has put its
343  * own TID into the futex value) and made this thread the owner of the internal
344  * pi-aware lock object (mutex). Then the kernel tries to lock the internal lock
345  * object, on which it blocks. Once it returns, it has the mutex acquired, and it
346  * sets the futex value to its own TID and returns (futex value contains
347  * FUTEX_WAITERS|TID).
348  * The unlock fastpath would fail (because the FUTEX_WAITERS bit is set) and
349  * FUTEX_UNLOCK_PI will be called.
350  * If a futex is found to be held at exit time, the kernel sets the OWNER_DIED
351  * bit of the futex word and wakes up the next futex waiter (if any), WAITERS
352  * bit is preserved (if any).
353  * If OWNER_DIED bit is set the kernel sanity checks the futex word value against
354  * the internal futex state and if correct, acquire futex.
355  */
/*
 * FUTEX_LOCK_PI / FUTEX_LOCK_PI2 / FUTEX_TRYLOCK_PI implementation
 * (see the pi protocol description above).  'try' selects trylock
 * semantics (EBUSY instead of sleeping).  The loop retries the
 * user-word CAS until the lock is acquired, an error occurs, or the
 * thread blocks on the in-kernel pi mutex.
 */
static int
linux_futex_lock_pi(struct thread *td, bool try, struct linux_futex_args *args)
{
	struct umtx_abs_timeout timo;
	struct linux_emuldata *em;
	struct umtx_pi *pi, *new_pi;
	struct thread *td1;
	struct umtx_q *uq;
	int error, rv;
	uint32_t owner, old_owner;

	em = em_find(td);
	uq = td->td_umtxq;
	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args),
	    &uq->uq_key);
	if (error != 0)
		return (error);
	if (args->ts != NULL)
		linux_umtx_abs_timeout_init(&timo, args);

	/* Find or create the in-kernel pi state for this futex key. */
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			/*
			 * Drop the queue lock for the sleepable allocation,
			 * then re-check: another thread may have inserted
			 * the pi state while we slept.
			 */
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	/* Hold a reference on the pi state across the acquire loop. */
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);
	for (;;) {
		/* Try uncontested case first. */
		rv = casueword32(args->uaddr, FUTEX_UNOWNED, &owner, em->em_tid);
		/* The acquire succeeded. */
		if (rv == 0) {
			error = 0;
			break;
		}
		if (rv == -1) {
			error = EFAULT;
			break;
		}

		/*
		 * Nobody owns it, but the acquire failed. This can happen
		 * with ll/sc atomic.
		 */
		if (owner == FUTEX_UNOWNED) {
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
			continue;
		}

		/*
		 * Avoid overwriting a possible error from sleep due
		 * to the pending signal with suspension check result.
		 */
		if (error == 0) {
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
		}

		/* The futex word at *uaddr is already locked by the caller. */
		if ((owner & FUTEX_TID_MASK) == em->em_tid) {
			error = EDEADLK;
			break;
		}

		/*
		 * Futex owner died, handle_futex_death() set the OWNER_DIED bit
		 * and clear tid. Try to acquire it.
		 */
		if ((owner & FUTEX_TID_MASK) == FUTEX_UNOWNED) {
			old_owner = owner;
			/* Preserve WAITERS/OWNER_DIED, install our tid. */
			owner = owner & (FUTEX_WAITERS | FUTEX_OWNER_DIED);
			owner |= em->em_tid;
			rv = casueword32(args->uaddr, old_owner, &owner, owner);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
			if (rv == 1) {
				if (error == 0) {
					error = thread_check_susp(td, true);
					if (error != 0)
						break;
				}

				/*
				 * If this failed the lock could
				 * changed, restart.
				 */
				continue;
			}

			/* We took over the dead owner's lock; claim the pi. */
			umtxq_lock(&uq->uq_key);
			umtxq_busy(&uq->uq_key);
			error = umtx_pi_claim(pi, td);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			if (error != 0) {
				/*
				 * Since we're going to return an
				 * error, restore the futex to its
				 * previous, unowned state to avoid
				 * compounding the problem.
				 */
				(void)casuword32(args->uaddr, owner, old_owner);
			}
			break;
		}

		/*
		 * Inconsistent state: OWNER_DIED is set and tid is not 0.
		 * Linux does some checks of futex state, we return EINVAL,
		 * as the user space can take care of this.
		 */
		if ((owner & FUTEX_OWNER_DIED) != FUTEX_UNOWNED) {
			error = EINVAL;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space knows
		 * to use the system call for unlock. If this fails either some
		 * one else has acquired the lock or it has been released.
		 */
		rv = casueword32(args->uaddr, owner, &owner,
		    owner | FUTEX_WAITERS);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		if (rv == 1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = thread_check_susp(td, true);
			if (error != 0)
				break;

			/*
			 * The lock changed and we need to retry or we
			 * lost a race to the thread unlocking the umtx.
			 */
			continue;
		}

		/*
		 * Substitute Linux thread id by native thread id to
		 * avoid refactoring code of umtxq_sleep_pi().
		 */
		td1 = linux_tdfind(td, owner & FUTEX_TID_MASK, -1);
		if (td1 != NULL) {
			owner = td1->td_tid;
			/* linux_tdfind() returned with the proc locked. */
			PROC_UNLOCK(td1->td_proc);
		} else {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EINVAL;
			break;
		}

		umtxq_lock(&uq->uq_key);

		/* We set the contested bit, sleep. */
		error = umtxq_sleep_pi(uq, pi, owner, "futexp",
		    args->ts == NULL ? NULL : &timo,
		    (args->flags & FUTEX_SHARED) != 0);
		if (error != 0)
			continue;

		error = thread_check_susp(td, false);
		if (error != 0)
			break;
	}

	/* Drop the pi reference and release the key taken above. */
	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
567 
/*
 * FUTEX_UNLOCK_PI implementation.  'rb' is true when called from the
 * robust-list death path (futex_wake_pi), in which case the caller's
 * tid ownership check is skipped and only the in-kernel pi state is
 * dropped.
 */
static int
linux_futex_unlock_pi(struct thread *td, bool rb, struct linux_futex_args *args)
{
	struct linux_emuldata *em;
	struct umtx_key key;
	uint32_t old, owner, new_owner;
	int count, error;

	em = em_find(td);

	/*
	 * Make sure we own this mtx.
	 */
	error = fueword32(args->uaddr, &owner);
	if (error == -1)
		return (EFAULT);
	if (!rb && (owner & FUTEX_TID_MASK) != em->em_tid)
		return (EPERM);

	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	/* Busy the chain so wakers/requeuers can't race with us. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	error = umtx_pi_drop(td, &key, rb, &count);
	if (error != 0 || rb) {
		/* Robust path stops here; futex word handled elsewhere. */
		umtxq_unbusy(&key);
		umtxq_unlock(&key);
		umtx_key_release(&key);
		return (error);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the futex, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	if (count > 1)
		new_owner = FUTEX_WAITERS;
	else
		new_owner = FUTEX_UNOWNED;

again:
	error = casueword32(args->uaddr, owner, &old, new_owner);
	if (error == 1) {
		/* CAS raced with userspace; retry after suspend check. */
		error = thread_check_susp(td, false);
		if (error == 0)
			goto again;
	}
	umtxq_unbusy_unlocked(&key);
	umtx_key_release(&key);
	if (error == -1)
		return (EFAULT);
	/* The futex word changed under us: inconsistent state. */
	if (error == 0 && old != owner)
		return (EINVAL);
	return (error);
}
626 
/*
 * FUTEX_WAKE_OP implementation: atomically apply the encoded operation
 * (args->val3) to the futex word at uaddr2, wake up to args->val
 * waiters on uaddr, and, if the operation's comparison succeeded, wake
 * additional waiters (count passed in the timeout slot, Linux ABI).
 */
static int
linux_futex_wakeop(struct thread *td, struct linux_futex_args *args)
{
	struct umtx_key key, key2;
	int nrwake, op_ret, ret;
	int error, count;

	/* The two futexes must be distinct. */
	if (args->uaddr == args->uaddr2)
		return (EINVAL);

	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
	if (error != 0) {
		umtx_key_release(&key);
		return (error);
	}
	/* Busy the first chain while touching user memory. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	error = futex_atomic_op(td, args->val3, args->uaddr2, &op_ret);
	umtxq_lock(&key);
	umtxq_unbusy(&key);
	if (error != 0)
		goto out;
	ret = umtxq_signal_mask(&key, args->val, args->val3);
	if (op_ret > 0) {
		/* Linux passes the second wake count in the timeout slot. */
		nrwake = (int)(unsigned long)args->ts;
		umtxq_lock(&key2);
		count = umtxq_count(&key2);
		/*
		 * Wake waiters on the second futex if there are any,
		 * otherwise wake more waiters on the first one.
		 */
		if (count > 0)
			ret += umtxq_signal_mask(&key2, nrwake, args->val3);
		else
			ret += umtxq_signal_mask(&key, nrwake, args->val3);
		umtxq_unlock(&key2);
	}
	td->td_retval[0] = ret;
out:
	umtxq_unlock(&key);
	umtx_key_release(&key2);
	umtx_key_release(&key);
	return (error);
}
671 
/*
 * FUTEX_REQUEUE / FUTEX_CMP_REQUEUE implementation: wake up to nrwake
 * waiters on uaddr and move up to nrrequeue of the remaining waiters
 * to uaddr2.  For CMP_REQUEUE (val3_compare set) the futex word must
 * still equal args->val3, otherwise EWOULDBLOCK is returned.
 */
static int
linux_futex_requeue(struct thread *td, struct linux_futex_args *args)
{
	int nrwake, nrrequeue;
	struct umtx_key key, key2;
	int error;
	uint32_t uval;

	/*
	 * Linux allows this, we would not, it is an incorrect
	 * usage of declared ABI, so return EINVAL.
	 */
	if (args->uaddr == args->uaddr2)
		return (EINVAL);

	/* The requeue count is passed in the timeout slot (Linux ABI). */
	nrrequeue = (int)(unsigned long)args->ts;
	nrwake = args->val;
	/*
	 * Sanity check to prevent signed integer overflow,
	 * see Linux CVE-2018-6927
	 */
	if (nrwake < 0 || nrrequeue < 0)
		return (EINVAL);

	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
	if (error != 0) {
		umtx_key_release(&key);
		return (error);
	}
	/* Busy the source chain while reading the futex word. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	error = fueword32(args->uaddr, &uval);
	if (error != 0)
		error = EFAULT;
	else if (args->val3_compare == true && uval != args->val3)
		error = EWOULDBLOCK;
	umtxq_lock(&key);
	umtxq_unbusy(&key);
	if (error == 0) {
		umtxq_lock(&key2);
		td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue);
		umtxq_unlock(&key2);
	}
	umtxq_unlock(&key);
	umtx_key_release(&key2);
	umtx_key_release(&key);
	return (error);
}
724 
725 static int
726 linux_futex_wake(struct thread *td, struct linux_futex_args *args)
727 {
728 	struct umtx_key key;
729 	int error;
730 
731 	if (args->val3 == 0)
732 		return (EINVAL);
733 
734 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
735 	if (error != 0)
736 		return (error);
737 	umtxq_lock(&key);
738 	td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3);
739 	umtxq_unlock(&key);
740 	umtx_key_release(&key);
741 	return (0);
742 }
743 
744 static int
745 linux_futex_wait(struct thread *td, struct linux_futex_args *args)
746 {
747 	struct umtx_abs_timeout timo;
748 	struct umtx_q *uq;
749 	uint32_t uval;
750 	int error;
751 
752 	if (args->val3 == 0)
753 		error = EINVAL;
754 
755 	uq = td->td_umtxq;
756 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args),
757 	    &uq->uq_key);
758 	if (error != 0)
759 		return (error);
760 	if (args->ts != NULL)
761 		linux_umtx_abs_timeout_init(&timo, args);
762 	umtxq_lock(&uq->uq_key);
763 	umtxq_busy(&uq->uq_key);
764 	uq->uq_bitset = args->val3;
765 	umtxq_insert(uq);
766 	umtxq_unlock(&uq->uq_key);
767 	error = fueword32(args->uaddr, &uval);
768 	if (error != 0)
769 		error = EFAULT;
770 	else if (uval != args->val)
771 		error = EWOULDBLOCK;
772 	umtxq_lock(&uq->uq_key);
773 	umtxq_unbusy(&uq->uq_key);
774 	if (error == 0) {
775 		error = umtxq_sleep(uq, "futex",
776 		    args->ts == NULL ? NULL : &timo);
777 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
778 			error = 0;
779 		else
780 			umtxq_remove(uq);
781 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
782 		umtxq_remove(uq);
783 	}
784 	umtxq_unlock(&uq->uq_key);
785 	umtx_key_release(&uq->uq_key);
786 	if (error == ERESTART)
787 		error = EINTR;
788 	return (error);
789 }
790 
791 static void
792 linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo,
793     struct linux_futex_args *args)
794 {
795 	int clockid, absolute;
796 
797 	/*
798 	 * The FUTEX_CLOCK_REALTIME option bit can be employed only with the
799 	 * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI, FUTEX_LOCK_PI2.
800 	 * For FUTEX_WAIT, timeout is interpreted as a relative value, for other
801 	 * futex operations timeout is interpreted as an absolute value.
802 	 * If FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel measures
803 	 * the timeout against the CLOCK_REALTIME clock, otherwise the kernel
804 	 * measures the timeout against the CLOCK_MONOTONIC clock.
805 	 */
806 	clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC;
807 	absolute = args->op == LINUX_FUTEX_WAIT ? false : true;
808 	umtx_abs_timeout_init(timo, clockid, absolute, args->ts);
809 }
810 
811 int
812 linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
813 {
814 	struct linux_futex_args fargs = {
815 		.uaddr = args->uaddr,
816 		.op = args->op,
817 		.val = args->val,
818 		.ts = NULL,
819 		.uaddr2 = args->uaddr2,
820 		.val3 = args->val3,
821 		.val3_compare = true,
822 	};
823 	int error;
824 
825 	switch (args->op & LINUX_FUTEX_CMD_MASK) {
826 	case LINUX_FUTEX_WAIT:
827 	case LINUX_FUTEX_WAIT_BITSET:
828 	case LINUX_FUTEX_LOCK_PI:
829 	case LINUX_FUTEX_LOCK_PI2:
830 		if (args->timeout != NULL) {
831 			error = linux_get_timespec(&fargs.kts, args->timeout);
832 			if (error != 0)
833 				return (error);
834 			fargs.ts = &fargs.kts;
835 		}
836 		break;
837 	default:
838 		fargs.ts = PTRIN(args->timeout);
839 	}
840 	return (linux_futex(td, &fargs));
841 }
842 
843 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
844 int
845 linux_sys_futex_time64(struct thread *td,
846     struct linux_sys_futex_time64_args *args)
847 {
848 	struct linux_futex_args fargs = {
849 		.uaddr = args->uaddr,
850 		.op = args->op,
851 		.val = args->val,
852 		.ts = NULL,
853 		.uaddr2 = args->uaddr2,
854 		.val3 = args->val3,
855 		.val3_compare = true,
856 	};
857 	int error;
858 
859 	switch (args->op & LINUX_FUTEX_CMD_MASK) {
860 	case LINUX_FUTEX_WAIT:
861 	case LINUX_FUTEX_WAIT_BITSET:
862 	case LINUX_FUTEX_LOCK_PI:
863 	case LINUX_FUTEX_LOCK_PI2:
864 		if (args->timeout != NULL) {
865 			error = linux_get_timespec64(&fargs.kts, args->timeout);
866 			if (error != 0)
867 				return (error);
868 			fargs.ts = &fargs.kts;
869 		}
870 		break;
871 	default:
872 		fargs.ts = PTRIN(args->timeout);
873 	}
874 	return (linux_futex(td, &fargs));
875 }
876 #endif
877 
878 int
879 linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
880 {
881 	struct linux_emuldata *em;
882 
883 	if (args->len != sizeof(struct linux_robust_list_head))
884 		return (EINVAL);
885 
886 	em = em_find(td);
887 	em->robust_futexes = args->head;
888 
889 	return (0);
890 }
891 
/*
 * get_robust_list(2): copy out the robust list head pointer and its
 * size for the calling thread (pid == 0) or for another Linux thread,
 * subject to credential/debug permission checks.
 */
int
linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
{
	struct linux_emuldata *em;
	struct linux_robust_list_head *head;
	l_size_t len;
	struct thread *td2;
	int error;

	if (!args->pid) {
		em = em_find(td);
		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
		head = em->robust_futexes;
	} else {
		/* linux_tdfind() returns with the target proc locked. */
		td2 = linux_tdfind(td, args->pid, -1);
		if (td2 == NULL)
			return (ESRCH);
		/* Only Linux processes have a robust list to expose. */
		if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) {
			PROC_UNLOCK(td2->td_proc);
			return (EPERM);
		}

		em = em_find(td2);
		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
		/* XXX: ptrace? */
		if (priv_check(td, PRIV_CRED_SETUID) ||
		    priv_check(td, PRIV_CRED_SETEUID) ||
		    p_candebug(td, td2->td_proc)) {
			PROC_UNLOCK(td2->td_proc);
			return (EPERM);
		}
		head = em->robust_futexes;

		PROC_UNLOCK(td2->td_proc);
	}

	len = sizeof(struct linux_robust_list_head);
	error = copyout(&len, args->len, sizeof(l_size_t));
	if (error != 0)
		return (EFAULT);

	/* Copy out only the user-pointer-sized prefix of 'head'. */
	return (copyout(&head, args->head, sizeof(l_uintptr_t)));
}
935 
/*
 * Handle one futex from a dying thread's robust list: if the dead
 * thread owned it, mark the futex word with OWNER_DIED (preserving the
 * WAITERS bit) and wake the next waiter.  'pi' selects the PI wakeup
 * path; 'pending_op' is true for the list head's pending entry.
 */
static int
handle_futex_death(struct thread *td, struct linux_emuldata *em, uint32_t *uaddr,
    unsigned int pi, bool pending_op)
{
	uint32_t uval, nval, mval;
	int error;

retry:
	error = fueword32(uaddr, &uval);
	if (error != 0)
		return (EFAULT);

	/*
	 * Special case for regular (non PI) futexes. The unlock path in
	 * user space has two race scenarios:
	 *
	 * 1. The unlock path releases the user space futex value and
	 *    before it can execute the futex() syscall to wake up
	 *    waiters it is killed.
	 *
	 * 2. A woken up waiter is killed before it can acquire the
	 *    futex in user space.
	 *
	 * In both cases the TID validation below prevents a wakeup of
	 * potential waiters which can cause these waiters to block
	 * forever.
	 *
	 * In both cases it is safe to attempt waking up a potential
	 * waiter without touching the user space futex value and trying
	 * to set the OWNER_DIED bit.
	 */
	if (pending_op && !pi && !uval) {
		(void)futex_wake(td, uaddr, 1, true);
		return (0);
	}

	/* Only act if the dying thread actually owns this futex. */
	if ((uval & FUTEX_TID_MASK) == em->em_tid) {
		/* Keep WAITERS, clear the tid, flag the dead owner. */
		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
		error = casueword32(uaddr, uval, &nval, mval);
		if (error == -1)
			return (EFAULT);
		if (error == 1) {
			/* CAS raced; re-read the word and try again. */
			error = thread_check_susp(td, false);
			if (error != 0)
				return (error);
			goto retry;
		}

		if (!pi && (uval & FUTEX_WAITERS)) {
			error = futex_wake(td, uaddr, 1, true);
			if (error != 0)
				return (error);
		} else if (pi && (uval & FUTEX_WAITERS)) {
			error = futex_wake_pi(td, uaddr, true);
			if (error != 0)
				return (error);
		}
	}

	return (0);
}
997 
998 static int
999 fetch_robust_entry(struct linux_robust_list **entry,
1000     struct linux_robust_list **head, unsigned int *pi)
1001 {
1002 	l_ulong uentry;
1003 	int error;
1004 
1005 	error = copyin((const void *)head, &uentry, sizeof(uentry));
1006 	if (error != 0)
1007 		return (EFAULT);
1008 
1009 	*entry = (void *)(uentry & ~1UL);
1010 	*pi = uentry & 1;
1011 
1012 	return (0);
1013 }
1014 
1015 #define	LINUX_HANDLE_DEATH_PENDING	true
1016 #define	LINUX_HANDLE_DEATH_LIST		false
1017 
1018 /* This walks the list of robust futexes releasing them. */
/* This walks the list of robust futexes releasing them. */
void
release_futexes(struct thread *td, struct linux_emuldata *em)
{
	struct linux_robust_list_head *head;
	struct linux_robust_list *entry, *next_entry, *pending;
	/* 'limit' bounds the walk: the list is user memory, may be cyclic. */
	unsigned int limit = 2048, pi, next_pi, pip;
	uint32_t *uaddr;
	l_long futex_offset;
	int error;

	head = em->robust_futexes;
	if (head == NULL)
		return;

	/* Any copyin failure aborts the walk silently: the list is gone. */
	if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi))
		return;

	/* Offset from a list entry to its futex word, set by user space. */
	error = copyin(&head->futex_offset, &futex_offset,
	    sizeof(futex_offset));
	if (error != 0)
		return;

	if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip))
		return;

	/* The list is circular; the head itself terminates it. */
	while (entry != &head->list) {
		/* Fetch the next link before releasing the current entry. */
		error = fetch_robust_entry(&next_entry, PTRIN(&entry->next),
		    &next_pi);

		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice.
		 */
		if (entry != pending) {
			uaddr = (uint32_t *)((caddr_t)entry + futex_offset);
			if (handle_futex_death(td, em, uaddr, pi,
			    LINUX_HANDLE_DEATH_LIST))
				return;
		}
		if (error != 0)
			return;

		entry = next_entry;
		pi = next_pi;

		if (!--limit)
			break;

		/* Yield periodically; the list length is user-controlled. */
		sched_relinquish(curthread);
	}

	/* Finally handle the lock the thread was acquiring when it died. */
	if (pending) {
		uaddr = (uint32_t *)((caddr_t)pending + futex_offset);
		(void)handle_futex_death(td, em, uaddr, pip,
		    LINUX_HANDLE_DEATH_PENDING);
	}
}
1076