xref: /freebsd/sys/compat/linux/linux_futex.c (revision d0b2dbfa0ecf2bbc9709efc5e20baf8e4b44bbbf)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2009-2021 Dmitry Chagin <dchagin@FreeBSD.org>
5  * Copyright (c) 2008 Roman Divacky
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
31 #include <sys/imgact.h>
32 #include <sys/imgact_elf.h>
33 #include <sys/ktr.h>
34 #include <sys/lock.h>
35 #include <sys/mutex.h>
36 #include <sys/priv.h>
37 #include <sys/proc.h>
38 #include <sys/sched.h>
39 #include <sys/sysent.h>
40 #include <sys/vnode.h>
41 #include <sys/umtxvar.h>
42 
43 #ifdef COMPAT_LINUX32
44 #include <machine/../linux32/linux.h>
45 #include <machine/../linux32/linux32_proto.h>
46 #else
47 #include <machine/../linux/linux.h>
48 #include <machine/../linux/linux_proto.h>
49 #endif
50 #include <compat/linux/linux_emul.h>
51 #include <compat/linux/linux_futex.h>
52 #include <compat/linux/linux_misc.h>
53 #include <compat/linux/linux_time.h>
54 #include <compat/linux/linux_util.h>
55 
#define	FUTEX_SHARED	0x8     /* shared futex */
#define	FUTEX_UNOWNED	0	/* futex word value of an unlocked futex */

/*
 * Pick the umtx sharing mode for the futex arguments pointed to by 'a':
 * AUTO_SHARE when FUTEX_SHARED is set in a->flags, THREAD_SHARE otherwise.
 * Both the argument and the whole expansion are parenthesized so that the
 * macro stays correct for any argument expression and inside any
 * surrounding expression.
 */
#define	GET_SHARED(a)							\
	((((a)->flags & FUTEX_SHARED) != 0) ? AUTO_SHARE : THREAD_SHARE)
60 
/* Forward declarations of helpers defined later in this file. */
static int	futex_atomic_op(struct thread *td, int encoded_op,
		    uint32_t *uaddr, int *res);
static int	handle_futex_death(struct thread *td,
		    struct linux_emuldata *em, uint32_t *uaddr,
		    unsigned int pi, bool pending_op);
static int	fetch_robust_entry(struct linux_robust_list **entry,
		    struct linux_robust_list **head, unsigned int *pi);
66 
/*
 * Kernel-side description of one futex request, filled in by the system
 * call entry points and consumed by linux_futex() and the per-operation
 * handlers.
 */
struct linux_futex_args {
	/* User address of the futex word. */
	uint32_t	*uaddr;
	/* Operation; linux_futex() strips the option bits from it. */
	int32_t		op;
	/* 0 or FUTEX_SHARED, derived from the PRIVATE option bit. */
	uint32_t	flags;
	/* True when the timeout is measured against CLOCK_REALTIME. */
	bool		clockrt;
	/* Operation specific: expected value or number of waiters to wake. */
	uint32_t	val;
	/*
	 * Timeout for the waiting operations (NULL means none); for the
	 * other operations it carries the raw fourth system call argument,
	 * which the requeue and wake_op handlers reinterpret as a count.
	 */
	struct timespec	*ts;
	/* Second futex word, used by the requeue and wake_op handlers. */
	uint32_t	*uaddr2;
	/* Operation specific: bitset, compare value or encoded operation. */
	uint32_t	val3;
	/* When true the requeue handler compares *uaddr against val3. */
	bool		val3_compare;
	/* Storage for the timeout copied in from user space. */
	struct timespec	kts;
};
79 
/* Forward declarations of the futex operation handlers in this file. */
static inline int	futex_key_get(const void *uaddr, int type, int share,
			    struct umtx_key *key);
static void	linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo,
		    struct linux_futex_args *args);
static int	linux_futex(struct thread *td, struct linux_futex_args *args);
static int	linux_futex_wait(struct thread *td,
		    struct linux_futex_args *args);
static int	linux_futex_wake(struct thread *td,
		    struct linux_futex_args *args);
static int	linux_futex_requeue(struct thread *td,
		    struct linux_futex_args *args);
static int	linux_futex_wakeop(struct thread *td,
		    struct linux_futex_args *args);
static int	linux_futex_lock_pi(struct thread *td, bool try,
		    struct linux_futex_args *args);
static int	linux_futex_unlock_pi(struct thread *td, bool rb,
		    struct linux_futex_args *args);
static int	futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared);
92 
/*
 * Obtain the umtx key for the futex word at uaddr.
 *
 * Returns EINVAL when uaddr is not aligned to sizeof(uint32_t), otherwise
 * the result of umtx_key_get().
 */
static int
futex_key_get(const void *uaddr, int type, int share, struct umtx_key *key)
{
	int error;

	if (__is_aligned(uaddr, sizeof(uint32_t)))
		error = umtx_key_get(uaddr, type, share, key);
	else
		error = EINVAL;
	return (error);
}
102 
103 int
104 futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared)
105 {
106 	struct linux_futex_args args;
107 
108 	bzero(&args, sizeof(args));
109 	args.op = LINUX_FUTEX_WAKE;
110 	args.uaddr = uaddr;
111 	args.flags = shared == true ? FUTEX_SHARED : 0;
112 	args.val = val;
113 	args.val3 = FUTEX_BITSET_MATCH_ANY;
114 
115 	return (linux_futex_wake(td, &args));
116 }
117 
118 static int
119 futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared)
120 {
121 	struct linux_futex_args args;
122 
123 	bzero(&args, sizeof(args));
124 	args.op = LINUX_FUTEX_UNLOCK_PI;
125 	args.uaddr = uaddr;
126 	args.flags = shared == true ? FUTEX_SHARED : 0;
127 
128 	return (linux_futex_unlock_pi(td, true, &args));
129 }
130 
131 static int
132 futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr,
133     int *res)
134 {
135 	int op = (encoded_op >> 28) & 7;
136 	int cmp = (encoded_op >> 24) & 15;
137 	int oparg = (encoded_op << 8) >> 20;
138 	int cmparg = (encoded_op << 20) >> 20;
139 	int oldval = 0, ret;
140 
141 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
142 		oparg = 1 << oparg;
143 
144 	switch (op) {
145 	case FUTEX_OP_SET:
146 		ret = futex_xchgl(oparg, uaddr, &oldval);
147 		break;
148 	case FUTEX_OP_ADD:
149 		ret = futex_addl(oparg, uaddr, &oldval);
150 		break;
151 	case FUTEX_OP_OR:
152 		ret = futex_orl(oparg, uaddr, &oldval);
153 		break;
154 	case FUTEX_OP_ANDN:
155 		ret = futex_andl(~oparg, uaddr, &oldval);
156 		break;
157 	case FUTEX_OP_XOR:
158 		ret = futex_xorl(oparg, uaddr, &oldval);
159 		break;
160 	default:
161 		ret = ENOSYS;
162 		break;
163 	}
164 
165 	if (ret != 0)
166 		return (ret);
167 
168 	switch (cmp) {
169 	case FUTEX_OP_CMP_EQ:
170 		*res = (oldval == cmparg);
171 		break;
172 	case FUTEX_OP_CMP_NE:
173 		*res = (oldval != cmparg);
174 		break;
175 	case FUTEX_OP_CMP_LT:
176 		*res = (oldval < cmparg);
177 		break;
178 	case FUTEX_OP_CMP_GE:
179 		*res = (oldval >= cmparg);
180 		break;
181 	case FUTEX_OP_CMP_LE:
182 		*res = (oldval <= cmparg);
183 		break;
184 	case FUTEX_OP_CMP_GT:
185 		*res = (oldval > cmparg);
186 		break;
187 	default:
188 		ret = ENOSYS;
189 	}
190 
191 	return (ret);
192 }
193 
194 static int
195 linux_futex(struct thread *td, struct linux_futex_args *args)
196 {
197 	struct linux_pemuldata *pem;
198 	struct proc *p;
199 
200 	if (args->op & LINUX_FUTEX_PRIVATE_FLAG) {
201 		args->flags = 0;
202 		args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
203 	} else
204 		args->flags = FUTEX_SHARED;
205 
206 	args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
207 	args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;
208 
209 	if (args->clockrt &&
210 	    args->op != LINUX_FUTEX_WAIT_BITSET &&
211 	    args->op != LINUX_FUTEX_WAIT_REQUEUE_PI &&
212 	    args->op != LINUX_FUTEX_LOCK_PI2)
213 		return (ENOSYS);
214 
215 	switch (args->op) {
216 	case LINUX_FUTEX_WAIT:
217 		args->val3 = FUTEX_BITSET_MATCH_ANY;
218 		/* FALLTHROUGH */
219 
220 	case LINUX_FUTEX_WAIT_BITSET:
221 		LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x",
222 		    args->uaddr, args->val, args->val3);
223 
224 		return (linux_futex_wait(td, args));
225 
226 	case LINUX_FUTEX_WAKE:
227 		args->val3 = FUTEX_BITSET_MATCH_ANY;
228 		/* FALLTHROUGH */
229 
230 	case LINUX_FUTEX_WAKE_BITSET:
231 		LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x",
232 		    args->uaddr, args->val, args->val3);
233 
234 		return (linux_futex_wake(td, args));
235 
236 	case LINUX_FUTEX_REQUEUE:
237 		/*
238 		 * Glibc does not use this operation since version 2.3.3,
239 		 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation.
240 		 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when
241 		 * FUTEX_REQUEUE returned EINVAL.
242 		 */
243 		pem = pem_find(td->td_proc);
244 		if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
245 			linux_msg(td, "unsupported FUTEX_REQUEUE");
246 			pem->flags |= LINUX_XDEPR_REQUEUEOP;
247 		}
248 
249 		/*
250 		 * The above is true, however musl libc does make use of the
251 		 * futex requeue operation, allow operation for brands which
252 		 * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags.
253 		 */
254 		p = td->td_proc;
255 		Elf_Brandinfo *bi = p->p_elf_brandinfo;
256 		if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0)
257 			return (EINVAL);
258 		args->val3_compare = false;
259 		/* FALLTHROUGH */
260 
261 	case LINUX_FUTEX_CMP_REQUEUE:
262 		LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
263 		    "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x",
264 		    args->uaddr, args->val, args->val3, args->uaddr2,
265 		    args->ts);
266 
267 		return (linux_futex_requeue(td, args));
268 
269 	case LINUX_FUTEX_WAKE_OP:
270 		LINUX_CTR5(sys_futex, "WAKE_OP "
271 		    "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x",
272 		    args->uaddr, args->val, args->uaddr2, args->val3,
273 		    args->ts);
274 
275 		return (linux_futex_wakeop(td, args));
276 
277 	case LINUX_FUTEX_LOCK_PI:
278 		args->clockrt = true;
279 		/* FALLTHROUGH */
280 
281 	case LINUX_FUTEX_LOCK_PI2:
282 		LINUX_CTR2(sys_futex, "LOCKPI uaddr %p val 0x%x",
283 		    args->uaddr, args->val);
284 
285 		return (linux_futex_lock_pi(td, false, args));
286 
287 	case LINUX_FUTEX_UNLOCK_PI:
288 		LINUX_CTR1(sys_futex, "UNLOCKPI uaddr %p",
289 		    args->uaddr);
290 
291 		return (linux_futex_unlock_pi(td, false, args));
292 
293 	case LINUX_FUTEX_TRYLOCK_PI:
294 		LINUX_CTR1(sys_futex, "TRYLOCKPI uaddr %p",
295 		    args->uaddr);
296 
297 		return (linux_futex_lock_pi(td, true, args));
298 
299 	/*
300 	 * Current implementation of FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI
301 	 * can't be used anymore to implement conditional variables.
302 	 * A detailed explanation can be found here:
303 	 *
304 	 * https://sourceware.org/bugzilla/show_bug.cgi?id=13165
305 	 * and here http://austingroupbugs.net/view.php?id=609
306 	 *
307 	 * And since commit
308 	 * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=ed19993b5b0d05d62cc883571519a67dae481a14
309 	 * glibc does not use them.
310 	 */
311 	case LINUX_FUTEX_WAIT_REQUEUE_PI:
312 		/* not yet implemented */
313 		pem = pem_find(td->td_proc);
314 		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
315 			linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI");
316 			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
317 		}
318 		return (ENOSYS);
319 
320 	case LINUX_FUTEX_CMP_REQUEUE_PI:
321 		/* not yet implemented */
322 		pem = pem_find(td->td_proc);
323 		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
324 			linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI");
325 			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
326 		}
327 		return (ENOSYS);
328 
329 	default:
330 		linux_msg(td, "unsupported futex op %d", args->op);
331 		return (ENOSYS);
332 	}
333 }
334 
335 /*
336  * pi protocol:
337  * - 0 futex word value means unlocked.
338  * - TID futex word value means locked.
339  * Userspace uses atomic ops to lock/unlock these futexes without entering the
340  * kernel. If the lock-acquire fastpath fails, (transition from 0 to TID fails),
341  * then FUTEX_LOCK_PI is called.
342  * The kernel atomically set FUTEX_WAITERS bit in the futex word value, if no
343  * other waiters exists looks up the thread that owns the futex (it has put its
344  * own TID into the futex value) and made this thread the owner of the internal
345  * pi-aware lock object (mutex). Then the kernel tries to lock the internal lock
346  * object, on which it blocks. Once it returns, it has the mutex acquired, and it
347  * sets the futex value to its own TID and returns (futex value contains
348  * FUTEX_WAITERS|TID).
349  * The unlock fastpath would fail (because the FUTEX_WAITERS bit is set) and
350  * FUTEX_UNLOCK_PI will be called.
351  * If a futex is found to be held at exit time, the kernel sets the OWNER_DIED
352  * bit of the futex word and wakes up the next futex waiter (if any), WAITERS
353  * bit is preserved (if any).
354  * If OWNER_DIED bit is set the kernel sanity checks the futex word value against
355  * the internal futex state and if correct, acquire futex.
356  */
357 static int
358 linux_futex_lock_pi(struct thread *td, bool try, struct linux_futex_args *args)
359 {
360 	struct umtx_abs_timeout timo;
361 	struct linux_emuldata *em;
362 	struct umtx_pi *pi, *new_pi;
363 	struct thread *td1;
364 	struct umtx_q *uq;
365 	int error, rv;
366 	uint32_t owner, old_owner;
367 
368 	em = em_find(td);
369 	uq = td->td_umtxq;
370 	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args),
371 	    &uq->uq_key);
372 	if (error != 0)
373 		return (error);
374 	if (args->ts != NULL)
375 		linux_umtx_abs_timeout_init(&timo, args);
376 
377 	umtxq_lock(&uq->uq_key);
378 	pi = umtx_pi_lookup(&uq->uq_key);
379 	if (pi == NULL) {
380 		new_pi = umtx_pi_alloc(M_NOWAIT);
381 		if (new_pi == NULL) {
382 			umtxq_unlock(&uq->uq_key);
383 			new_pi = umtx_pi_alloc(M_WAITOK);
384 			umtxq_lock(&uq->uq_key);
385 			pi = umtx_pi_lookup(&uq->uq_key);
386 			if (pi != NULL) {
387 				umtx_pi_free(new_pi);
388 				new_pi = NULL;
389 			}
390 		}
391 		if (new_pi != NULL) {
392 			new_pi->pi_key = uq->uq_key;
393 			umtx_pi_insert(new_pi);
394 			pi = new_pi;
395 		}
396 	}
397 	umtx_pi_ref(pi);
398 	umtxq_unlock(&uq->uq_key);
399 	for (;;) {
400 		/* Try uncontested case first. */
401 		rv = casueword32(args->uaddr, FUTEX_UNOWNED, &owner, em->em_tid);
402 		/* The acquire succeeded. */
403 		if (rv == 0) {
404 			error = 0;
405 			break;
406 		}
407 		if (rv == -1) {
408 			error = EFAULT;
409 			break;
410 		}
411 
412 		/*
413 		 * Nobody owns it, but the acquire failed. This can happen
414 		 * with ll/sc atomic.
415 		 */
416 		if (owner == FUTEX_UNOWNED) {
417 			error = thread_check_susp(td, true);
418 			if (error != 0)
419 				break;
420 			continue;
421 		}
422 
423 		/*
424 		 * Avoid overwriting a possible error from sleep due
425 		 * to the pending signal with suspension check result.
426 		 */
427 		if (error == 0) {
428 			error = thread_check_susp(td, true);
429 			if (error != 0)
430 				break;
431 		}
432 
433 		/* The futex word at *uaddr is already locked by the caller. */
434 		if ((owner & FUTEX_TID_MASK) == em->em_tid) {
435 			error = EDEADLK;
436 			break;
437 		}
438 
439 		/*
440 		 * Futex owner died, handle_futex_death() set the OWNER_DIED bit
441 		 * and clear tid. Try to acquire it.
442 		 */
443 		if ((owner & FUTEX_TID_MASK) == FUTEX_UNOWNED) {
444 			old_owner = owner;
445 			owner = owner & (FUTEX_WAITERS | FUTEX_OWNER_DIED);
446 			owner |= em->em_tid;
447 			rv = casueword32(args->uaddr, old_owner, &owner, owner);
448 			if (rv == -1) {
449 				error = EFAULT;
450 				break;
451 			}
452 			if (rv == 1) {
453 				if (error == 0) {
454 					error = thread_check_susp(td, true);
455 					if (error != 0)
456 						break;
457 				}
458 
459 				/*
460 				 * If this failed the lock could
461 				 * changed, restart.
462 				 */
463 				continue;
464 			}
465 
466 			umtxq_lock(&uq->uq_key);
467 			umtxq_busy(&uq->uq_key);
468 			error = umtx_pi_claim(pi, td);
469 			umtxq_unbusy(&uq->uq_key);
470 			umtxq_unlock(&uq->uq_key);
471 			if (error != 0) {
472 				/*
473 				 * Since we're going to return an
474 				 * error, restore the futex to its
475 				 * previous, unowned state to avoid
476 				 * compounding the problem.
477 				 */
478 				(void)casuword32(args->uaddr, owner, old_owner);
479 			}
480 			break;
481 		}
482 
483 		/*
484 		 * Inconsistent state: OWNER_DIED is set and tid is not 0.
485 		 * Linux does some checks of futex state, we return EINVAL,
486 		 * as the user space can take care of this.
487 		 */
488 		if ((owner & FUTEX_OWNER_DIED) != FUTEX_UNOWNED) {
489 			error = EINVAL;
490 			break;
491 		}
492 
493 		if (try != 0) {
494 			error = EBUSY;
495 			break;
496 		}
497 
498 		/*
499 		 * If we caught a signal, we have retried and now
500 		 * exit immediately.
501 		 */
502 		if (error != 0)
503 			break;
504 
505 		umtxq_lock(&uq->uq_key);
506 		umtxq_busy(&uq->uq_key);
507 		umtxq_unlock(&uq->uq_key);
508 
509 		/*
510 		 * Set the contested bit so that a release in user space knows
511 		 * to use the system call for unlock. If this fails either some
512 		 * one else has acquired the lock or it has been released.
513 		 */
514 		rv = casueword32(args->uaddr, owner, &owner,
515 		    owner | FUTEX_WAITERS);
516 		if (rv == -1) {
517 			umtxq_unbusy_unlocked(&uq->uq_key);
518 			error = EFAULT;
519 			break;
520 		}
521 		if (rv == 1) {
522 			umtxq_unbusy_unlocked(&uq->uq_key);
523 			error = thread_check_susp(td, true);
524 			if (error != 0)
525 				break;
526 
527 			/*
528 			 * The lock changed and we need to retry or we
529 			 * lost a race to the thread unlocking the umtx.
530 			 */
531 			continue;
532 		}
533 
534 		/*
535 		 * Substitute Linux thread id by native thread id to
536 		 * avoid refactoring code of umtxq_sleep_pi().
537 		 */
538 		td1 = linux_tdfind(td, owner & FUTEX_TID_MASK, -1);
539 		if (td1 != NULL) {
540 			owner = td1->td_tid;
541 			PROC_UNLOCK(td1->td_proc);
542 		} else {
543 			umtxq_unbusy_unlocked(&uq->uq_key);
544 			error = EINVAL;
545 			break;
546 		}
547 
548 		umtxq_lock(&uq->uq_key);
549 
550 		/* We set the contested bit, sleep. */
551 		error = umtxq_sleep_pi(uq, pi, owner, "futexp",
552 		    args->ts == NULL ? NULL : &timo,
553 		    (args->flags & FUTEX_SHARED) != 0);
554 		if (error != 0)
555 			continue;
556 
557 		error = thread_check_susp(td, false);
558 		if (error != 0)
559 			break;
560 	}
561 
562 	umtxq_lock(&uq->uq_key);
563 	umtx_pi_unref(pi);
564 	umtxq_unlock(&uq->uq_key);
565 	umtx_key_release(&uq->uq_key);
566 	return (error);
567 }
568 
569 static int
570 linux_futex_unlock_pi(struct thread *td, bool rb, struct linux_futex_args *args)
571 {
572 	struct linux_emuldata *em;
573 	struct umtx_key key;
574 	uint32_t old, owner, new_owner;
575 	int count, error;
576 
577 	em = em_find(td);
578 
579 	/*
580 	 * Make sure we own this mtx.
581 	 */
582 	error = fueword32(args->uaddr, &owner);
583 	if (error == -1)
584 		return (EFAULT);
585 	if (!rb && (owner & FUTEX_TID_MASK) != em->em_tid)
586 		return (EPERM);
587 
588 	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), &key);
589 	if (error != 0)
590 		return (error);
591 	umtxq_lock(&key);
592 	umtxq_busy(&key);
593 	error = umtx_pi_drop(td, &key, rb, &count);
594 	if (error != 0 || rb) {
595 		umtxq_unbusy(&key);
596 		umtxq_unlock(&key);
597 		umtx_key_release(&key);
598 		return (error);
599 	}
600 	umtxq_unlock(&key);
601 
602 	/*
603 	 * When unlocking the futex, it must be marked as unowned if
604 	 * there is zero or one thread only waiting for it.
605 	 * Otherwise, it must be marked as contested.
606 	 */
607 	if (count > 1)
608 		new_owner = FUTEX_WAITERS;
609 	else
610 		new_owner = FUTEX_UNOWNED;
611 
612 again:
613 	error = casueword32(args->uaddr, owner, &old, new_owner);
614 	if (error == 1) {
615 		error = thread_check_susp(td, false);
616 		if (error == 0)
617 			goto again;
618 	}
619 	umtxq_unbusy_unlocked(&key);
620 	umtx_key_release(&key);
621 	if (error == -1)
622 		return (EFAULT);
623 	if (error == 0 && old != owner)
624 		return (EINVAL);
625 	return (error);
626 }
627 
628 static int
629 linux_futex_wakeop(struct thread *td, struct linux_futex_args *args)
630 {
631 	struct umtx_key key, key2;
632 	int nrwake, op_ret, ret;
633 	int error, count;
634 
635 	if (args->uaddr == args->uaddr2)
636 		return (EINVAL);
637 
638 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
639 	if (error != 0)
640 		return (error);
641 	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
642 	if (error != 0) {
643 		umtx_key_release(&key);
644 		return (error);
645 	}
646 	umtxq_lock(&key);
647 	umtxq_busy(&key);
648 	umtxq_unlock(&key);
649 	error = futex_atomic_op(td, args->val3, args->uaddr2, &op_ret);
650 	umtxq_lock(&key);
651 	umtxq_unbusy(&key);
652 	if (error != 0)
653 		goto out;
654 	ret = umtxq_signal_mask(&key, args->val, args->val3);
655 	if (op_ret > 0) {
656 		nrwake = (int)(unsigned long)args->ts;
657 		umtxq_lock(&key2);
658 		count = umtxq_count(&key2);
659 		if (count > 0)
660 			ret += umtxq_signal_mask(&key2, nrwake, args->val3);
661 		else
662 			ret += umtxq_signal_mask(&key, nrwake, args->val3);
663 		umtxq_unlock(&key2);
664 	}
665 	td->td_retval[0] = ret;
666 out:
667 	umtxq_unlock(&key);
668 	umtx_key_release(&key2);
669 	umtx_key_release(&key);
670 	return (error);
671 }
672 
673 static int
674 linux_futex_requeue(struct thread *td, struct linux_futex_args *args)
675 {
676 	int nrwake, nrrequeue;
677 	struct umtx_key key, key2;
678 	int error;
679 	uint32_t uval;
680 
681 	/*
682 	 * Linux allows this, we would not, it is an incorrect
683 	 * usage of declared ABI, so return EINVAL.
684 	 */
685 	if (args->uaddr == args->uaddr2)
686 		return (EINVAL);
687 
688 	nrrequeue = (int)(unsigned long)args->ts;
689 	nrwake = args->val;
690 	/*
691 	 * Sanity check to prevent signed integer overflow,
692 	 * see Linux CVE-2018-6927
693 	 */
694 	if (nrwake < 0 || nrrequeue < 0)
695 		return (EINVAL);
696 
697 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
698 	if (error != 0)
699 		return (error);
700 	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
701 	if (error != 0) {
702 		umtx_key_release(&key);
703 		return (error);
704 	}
705 	umtxq_lock(&key);
706 	umtxq_busy(&key);
707 	umtxq_unlock(&key);
708 	error = fueword32(args->uaddr, &uval);
709 	if (error != 0)
710 		error = EFAULT;
711 	else if (args->val3_compare == true && uval != args->val3)
712 		error = EWOULDBLOCK;
713 	umtxq_lock(&key);
714 	umtxq_unbusy(&key);
715 	if (error == 0) {
716 		umtxq_lock(&key2);
717 		td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue);
718 		umtxq_unlock(&key2);
719 	}
720 	umtxq_unlock(&key);
721 	umtx_key_release(&key2);
722 	umtx_key_release(&key);
723 	return (error);
724 }
725 
726 static int
727 linux_futex_wake(struct thread *td, struct linux_futex_args *args)
728 {
729 	struct umtx_key key;
730 	int error;
731 
732 	if (args->val3 == 0)
733 		return (EINVAL);
734 
735 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
736 	if (error != 0)
737 		return (error);
738 	umtxq_lock(&key);
739 	td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3);
740 	umtxq_unlock(&key);
741 	umtx_key_release(&key);
742 	return (0);
743 }
744 
745 static int
746 linux_futex_wait(struct thread *td, struct linux_futex_args *args)
747 {
748 	struct umtx_abs_timeout timo;
749 	struct umtx_q *uq;
750 	uint32_t uval;
751 	int error;
752 
753 	if (args->val3 == 0)
754 		error = EINVAL;
755 
756 	uq = td->td_umtxq;
757 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args),
758 	    &uq->uq_key);
759 	if (error != 0)
760 		return (error);
761 	if (args->ts != NULL)
762 		linux_umtx_abs_timeout_init(&timo, args);
763 	umtxq_lock(&uq->uq_key);
764 	umtxq_busy(&uq->uq_key);
765 	uq->uq_bitset = args->val3;
766 	umtxq_insert(uq);
767 	umtxq_unlock(&uq->uq_key);
768 	error = fueword32(args->uaddr, &uval);
769 	if (error != 0)
770 		error = EFAULT;
771 	else if (uval != args->val)
772 		error = EWOULDBLOCK;
773 	umtxq_lock(&uq->uq_key);
774 	umtxq_unbusy(&uq->uq_key);
775 	if (error == 0) {
776 		error = umtxq_sleep(uq, "futex",
777 		    args->ts == NULL ? NULL : &timo);
778 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
779 			error = 0;
780 		else
781 			umtxq_remove(uq);
782 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
783 		umtxq_remove(uq);
784 	}
785 	umtxq_unlock(&uq->uq_key);
786 	umtx_key_release(&uq->uq_key);
787 	if (error == ERESTART)
788 		error = EINTR;
789 	return (error);
790 }
791 
792 static void
793 linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo,
794     struct linux_futex_args *args)
795 {
796 	int clockid, absolute;
797 
798 	/*
799 	 * The FUTEX_CLOCK_REALTIME option bit can be employed only with the
800 	 * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI, FUTEX_LOCK_PI2.
801 	 * For FUTEX_WAIT, timeout is interpreted as a relative value, for other
802 	 * futex operations timeout is interpreted as an absolute value.
803 	 * If FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel measures
804 	 * the timeout against the CLOCK_REALTIME clock, otherwise the kernel
805 	 * measures the timeout against the CLOCK_MONOTONIC clock.
806 	 */
807 	clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC;
808 	absolute = args->op == LINUX_FUTEX_WAIT ? false : true;
809 	umtx_abs_timeout_init(timo, clockid, absolute, args->ts);
810 }
811 
812 int
813 linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
814 {
815 	struct linux_futex_args fargs = {
816 		.uaddr = args->uaddr,
817 		.op = args->op,
818 		.val = args->val,
819 		.ts = NULL,
820 		.uaddr2 = args->uaddr2,
821 		.val3 = args->val3,
822 		.val3_compare = true,
823 	};
824 	int error;
825 
826 	switch (args->op & LINUX_FUTEX_CMD_MASK) {
827 	case LINUX_FUTEX_WAIT:
828 	case LINUX_FUTEX_WAIT_BITSET:
829 	case LINUX_FUTEX_LOCK_PI:
830 	case LINUX_FUTEX_LOCK_PI2:
831 		if (args->timeout != NULL) {
832 			error = linux_get_timespec(&fargs.kts, args->timeout);
833 			if (error != 0)
834 				return (error);
835 			fargs.ts = &fargs.kts;
836 		}
837 		break;
838 	default:
839 		fargs.ts = PTRIN(args->timeout);
840 	}
841 	return (linux_futex(td, &fargs));
842 }
843 
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * futex_time64 system call entry point.  Identical to linux_sys_futex()
 * except that a timeout is copied in with linux_get_timespec64().
 */
int
linux_sys_futex_time64(struct thread *td,
    struct linux_sys_futex_time64_args *args)
{
	struct linux_futex_args fargs;
	int error;

	bzero(&fargs, sizeof(fargs));
	fargs.uaddr = args->uaddr;
	fargs.op = args->op;
	fargs.val = args->val;
	fargs.ts = NULL;
	fargs.uaddr2 = args->uaddr2;
	fargs.val3 = args->val3;
	fargs.val3_compare = true;

	switch (args->op & LINUX_FUTEX_CMD_MASK) {
	case LINUX_FUTEX_WAIT:
	case LINUX_FUTEX_WAIT_BITSET:
	case LINUX_FUTEX_LOCK_PI:
	case LINUX_FUTEX_LOCK_PI2:
		if (args->timeout == NULL)
			break;
		error = linux_get_timespec64(&fargs.kts, args->timeout);
		if (error != 0)
			return (error);
		fargs.ts = &fargs.kts;
		break;
	default:
		fargs.ts = PTRIN(args->timeout);
		break;
	}
	return (linux_futex(td, &fargs));
}
#endif
878 
879 int
880 linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
881 {
882 	struct linux_emuldata *em;
883 
884 	if (args->len != sizeof(struct linux_robust_list_head))
885 		return (EINVAL);
886 
887 	em = em_find(td);
888 	em->robust_futexes = args->head;
889 
890 	return (0);
891 }
892 
893 int
894 linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
895 {
896 	struct linux_emuldata *em;
897 	struct linux_robust_list_head *head;
898 	l_size_t len;
899 	struct thread *td2;
900 	int error;
901 
902 	if (!args->pid) {
903 		em = em_find(td);
904 		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
905 		head = em->robust_futexes;
906 	} else {
907 		td2 = linux_tdfind(td, args->pid, -1);
908 		if (td2 == NULL)
909 			return (ESRCH);
910 		if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) {
911 			PROC_UNLOCK(td2->td_proc);
912 			return (EPERM);
913 		}
914 
915 		em = em_find(td2);
916 		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
917 		/* XXX: ptrace? */
918 		if (priv_check(td, PRIV_CRED_SETUID) ||
919 		    priv_check(td, PRIV_CRED_SETEUID) ||
920 		    p_candebug(td, td2->td_proc)) {
921 			PROC_UNLOCK(td2->td_proc);
922 			return (EPERM);
923 		}
924 		head = em->robust_futexes;
925 
926 		PROC_UNLOCK(td2->td_proc);
927 	}
928 
929 	len = sizeof(struct linux_robust_list_head);
930 	error = copyout(&len, args->len, sizeof(l_size_t));
931 	if (error != 0)
932 		return (EFAULT);
933 
934 	return (copyout(&head, args->head, sizeof(l_uintptr_t)));
935 }
936 
937 static int
938 handle_futex_death(struct thread *td, struct linux_emuldata *em, uint32_t *uaddr,
939     unsigned int pi, bool pending_op)
940 {
941 	uint32_t uval, nval, mval;
942 	int error;
943 
944 retry:
945 	error = fueword32(uaddr, &uval);
946 	if (error != 0)
947 		return (EFAULT);
948 
949 	/*
950 	 * Special case for regular (non PI) futexes. The unlock path in
951 	 * user space has two race scenarios:
952 	 *
953 	 * 1. The unlock path releases the user space futex value and
954 	 *    before it can execute the futex() syscall to wake up
955 	 *    waiters it is killed.
956 	 *
957 	 * 2. A woken up waiter is killed before it can acquire the
958 	 *    futex in user space.
959 	 *
960 	 * In both cases the TID validation below prevents a wakeup of
961 	 * potential waiters which can cause these waiters to block
962 	 * forever.
963 	 *
964 	 * In both cases it is safe to attempt waking up a potential
965 	 * waiter without touching the user space futex value and trying
966 	 * to set the OWNER_DIED bit.
967 	 */
968 	if (pending_op && !pi && !uval) {
969 		(void)futex_wake(td, uaddr, 1, true);
970 		return (0);
971 	}
972 
973 	if ((uval & FUTEX_TID_MASK) == em->em_tid) {
974 		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
975 		error = casueword32(uaddr, uval, &nval, mval);
976 		if (error == -1)
977 			return (EFAULT);
978 		if (error == 1) {
979 			error = thread_check_susp(td, false);
980 			if (error != 0)
981 				return (error);
982 			goto retry;
983 		}
984 
985 		if (!pi && (uval & FUTEX_WAITERS)) {
986 			error = futex_wake(td, uaddr, 1, true);
987 			if (error != 0)
988 				return (error);
989 		} else if (pi && (uval & FUTEX_WAITERS)) {
990 			error = futex_wake_pi(td, uaddr, true);
991 			if (error != 0)
992 				return (error);
993 		}
994 	}
995 
996 	return (0);
997 }
998 
999 static int
1000 fetch_robust_entry(struct linux_robust_list **entry,
1001     struct linux_robust_list **head, unsigned int *pi)
1002 {
1003 	l_ulong uentry;
1004 	int error;
1005 
1006 	error = copyin((const void *)head, &uentry, sizeof(uentry));
1007 	if (error != 0)
1008 		return (EFAULT);
1009 
1010 	*entry = (void *)(uentry & ~1UL);
1011 	*pi = uentry & 1;
1012 
1013 	return (0);
1014 }
1015 
1016 #define	LINUX_HANDLE_DEATH_PENDING	true
1017 #define	LINUX_HANDLE_DEATH_LIST		false
1018 
1019 /* This walks the list of robust futexes releasing them. */
1020 void
1021 release_futexes(struct thread *td, struct linux_emuldata *em)
1022 {
1023 	struct linux_robust_list_head *head;
1024 	struct linux_robust_list *entry, *next_entry, *pending;
1025 	unsigned int limit = 2048, pi, next_pi, pip;
1026 	uint32_t *uaddr;
1027 	l_long futex_offset;
1028 	int error;
1029 
1030 	head = em->robust_futexes;
1031 	if (head == NULL)
1032 		return;
1033 
1034 	if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi))
1035 		return;
1036 
1037 	error = copyin(&head->futex_offset, &futex_offset,
1038 	    sizeof(futex_offset));
1039 	if (error != 0)
1040 		return;
1041 
1042 	if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip))
1043 		return;
1044 
1045 	while (entry != &head->list) {
1046 		error = fetch_robust_entry(&next_entry, PTRIN(&entry->next),
1047 		    &next_pi);
1048 
1049 		/*
1050 		 * A pending lock might already be on the list, so
1051 		 * don't process it twice.
1052 		 */
1053 		if (entry != pending) {
1054 			uaddr = (uint32_t *)((caddr_t)entry + futex_offset);
1055 			if (handle_futex_death(td, em, uaddr, pi,
1056 			    LINUX_HANDLE_DEATH_LIST))
1057 				return;
1058 		}
1059 		if (error != 0)
1060 			return;
1061 
1062 		entry = next_entry;
1063 		pi = next_pi;
1064 
1065 		if (!--limit)
1066 			break;
1067 
1068 		sched_relinquish(curthread);
1069 	}
1070 
1071 	if (pending) {
1072 		uaddr = (uint32_t *)((caddr_t)pending + futex_offset);
1073 		(void)handle_futex_death(td, em, uaddr, pip,
1074 		    LINUX_HANDLE_DEATH_PENDING);
1075 	}
1076 }
1077