xref: /freebsd/sys/compat/linux/linux_futex.c (revision ae7e8a02e6e93455e026036132c4d053b2c12ad9)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2009-2021 Dmitry Chagin <dchagin@FreeBSD.org>
5  * Copyright (c) 2008 Roman Divacky
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include "opt_compat.h"
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/imgact.h>
37 #include <sys/imgact_elf.h>
38 #include <sys/ktr.h>
39 #include <sys/mutex.h>
40 #include <sys/priv.h>
41 #include <sys/proc.h>
42 #include <sys/sched.h>
43 #include <sys/umtxvar.h>
44 
45 #ifdef COMPAT_LINUX32
46 #include <machine/../linux32/linux.h>
47 #include <machine/../linux32/linux32_proto.h>
48 #else
49 #include <machine/../linux/linux.h>
50 #include <machine/../linux/linux_proto.h>
51 #endif
52 #include <compat/linux/linux_emul.h>
53 #include <compat/linux/linux_futex.h>
54 #include <compat/linux/linux_misc.h>
55 #include <compat/linux/linux_timer.h>
56 #include <compat/linux/linux_util.h>
57 
#define	FUTEX_SHARED	0x8     /* shared futex */

/*
 * Select the umtx key sharing mode from the FUTEX_SHARED bit of (a)->flags:
 * AUTO_SHARE when the bit is set, THREAD_SHARE otherwise.
 * Both the argument and the whole expansion are parenthesized so that the
 * macro stays correct when it is given an expression such as &args or is
 * used as an operand of a larger expression.
 */
#define	GET_SHARED(a)							\
	((((a)->flags & FUTEX_SHARED) != 0) ? AUTO_SHARE : THREAD_SHARE)
61 
/*
 * Forward declarations: the atomic-op helper used by FUTEX_WAKE_OP and the
 * robust-list helpers used by release_futexes().
 */
static int futex_atomic_op(struct thread *, int, uint32_t *);
static int handle_futex_death(struct thread *td, struct linux_emuldata *,
    uint32_t *, unsigned int, bool);
static int fetch_robust_entry(struct linux_robust_list **,
    struct linux_robust_list **, unsigned int *);
67 
/*
 * In-kernel form of a futex request.  It is filled in by the
 * linux_sys_futex*() entry points and by the internal futex_wake*() helpers
 * and then handed to the per-operation handlers.
 */
struct linux_futex_args {
	/* User address of the futex word. */
	uint32_t	*uaddr;
	/* LINUX_FUTEX_* operation; linux_futex() strips the option bits. */
	int32_t		op;
	/* FUTEX_SHARED for a shared futex, 0 for a private one. */
	uint32_t	flags;
	/* True when the timeout is measured against CLOCK_REALTIME. */
	bool		clockrt;
	/* Operation specific: expected futex value (wait) or wake count. */
	uint32_t	val;
	/*
	 * Timeout, or NULL for none.  The requeue and wake-op handlers
	 * instead read an integer count out of this pointer value.
	 */
	struct timespec	*ts;
	/* Second futex word address (requeue and wake-op). */
	uint32_t	*uaddr2;
	/* Operation specific: wake bitset, compare value or encoded op. */
	uint32_t	val3;
	/* When true the requeue handler compares *uaddr against val3. */
	bool		val3_compare;
	/* Storage for the converted timeout that ts points at. */
	struct timespec	kts;
};
80 
/* Forward declarations of the futex operation handlers defined below. */
static inline int futex_key_get(const void *, int, int, struct umtx_key *);
static void linux_umtx_abs_timeout_init(struct umtx_abs_timeout *,
	    struct linux_futex_args *);
static int	linux_futex(struct thread *, struct linux_futex_args *);
static int linux_futex_wait(struct thread *, struct linux_futex_args *);
static int linux_futex_wake(struct thread *, struct linux_futex_args *);
static int linux_futex_requeue(struct thread *, struct linux_futex_args *);
static int linux_futex_wakeop(struct thread *, struct linux_futex_args *);
static int linux_futex_lock_pi(struct thread *, bool, struct linux_futex_args *);
static int linux_futex_unlock_pi(struct thread *, bool,
	    struct linux_futex_args *);
static int futex_wake_pi(struct thread *, uint32_t *, bool);
93 
94 static int
95 futex_key_get(const void *uaddr, int type, int share, struct umtx_key *key)
96 {
97 
98 	/* Check that futex address is a 32bit aligned. */
99 	if (!__is_aligned(uaddr, sizeof(uint32_t)))
100 		return (EINVAL);
101 	return (umtx_key_get(uaddr, type, share, key));
102 }
103 
104 int
105 futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared)
106 {
107 	struct linux_futex_args args;
108 
109 	bzero(&args, sizeof(args));
110 	args.op = LINUX_FUTEX_WAKE;
111 	args.uaddr = uaddr;
112 	args.flags = shared == true ? FUTEX_SHARED : 0;
113 	args.val = val;
114 	args.val3 = FUTEX_BITSET_MATCH_ANY;
115 
116 	return (linux_futex_wake(td, &args));
117 }
118 
119 static int
120 futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared)
121 {
122 	struct linux_futex_args args;
123 
124 	bzero(&args, sizeof(args));
125 	args.op = LINUX_FUTEX_UNLOCK_PI;
126 	args.uaddr = uaddr;
127 	args.flags = shared == true ? FUTEX_SHARED : 0;
128 
129 	return (linux_futex_unlock_pi(td, true, &args));
130 }
131 
132 static int
133 futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr)
134 {
135 	int op = (encoded_op >> 28) & 7;
136 	int cmp = (encoded_op >> 24) & 15;
137 	int oparg = (encoded_op << 8) >> 20;
138 	int cmparg = (encoded_op << 20) >> 20;
139 	int oldval = 0, ret;
140 
141 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
142 		oparg = 1 << oparg;
143 
144 	switch (op) {
145 	case FUTEX_OP_SET:
146 		ret = futex_xchgl(oparg, uaddr, &oldval);
147 		break;
148 	case FUTEX_OP_ADD:
149 		ret = futex_addl(oparg, uaddr, &oldval);
150 		break;
151 	case FUTEX_OP_OR:
152 		ret = futex_orl(oparg, uaddr, &oldval);
153 		break;
154 	case FUTEX_OP_ANDN:
155 		ret = futex_andl(~oparg, uaddr, &oldval);
156 		break;
157 	case FUTEX_OP_XOR:
158 		ret = futex_xorl(oparg, uaddr, &oldval);
159 		break;
160 	default:
161 		ret = -ENOSYS;
162 		break;
163 	}
164 
165 	if (ret)
166 		return (ret);
167 
168 	switch (cmp) {
169 	case FUTEX_OP_CMP_EQ:
170 		ret = (oldval == cmparg);
171 		break;
172 	case FUTEX_OP_CMP_NE:
173 		ret = (oldval != cmparg);
174 		break;
175 	case FUTEX_OP_CMP_LT:
176 		ret = (oldval < cmparg);
177 		break;
178 	case FUTEX_OP_CMP_GE:
179 		ret = (oldval >= cmparg);
180 		break;
181 	case FUTEX_OP_CMP_LE:
182 		ret = (oldval <= cmparg);
183 		break;
184 	case FUTEX_OP_CMP_GT:
185 		ret = (oldval > cmparg);
186 		break;
187 	default:
188 		ret = -ENOSYS;
189 	}
190 
191 	return (ret);
192 }
193 
194 static int
195 linux_futex(struct thread *td, struct linux_futex_args *args)
196 {
197 	struct linux_pemuldata *pem;
198 	struct proc *p;
199 
200 	if (args->op & LINUX_FUTEX_PRIVATE_FLAG) {
201 		args->flags = 0;
202 		args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
203 	} else
204 		args->flags = FUTEX_SHARED;
205 
206 	args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
207 	args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;
208 
209 	if (args->clockrt &&
210 	    args->op != LINUX_FUTEX_WAIT_BITSET &&
211 	    args->op != LINUX_FUTEX_WAIT_REQUEUE_PI &&
212 	    args->op != LINUX_FUTEX_LOCK_PI2)
213 		return (ENOSYS);
214 
215 	switch (args->op) {
216 	case LINUX_FUTEX_WAIT:
217 		args->val3 = FUTEX_BITSET_MATCH_ANY;
218 		/* FALLTHROUGH */
219 
220 	case LINUX_FUTEX_WAIT_BITSET:
221 		LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x",
222 		    args->uaddr, args->val, args->val3);
223 
224 		return (linux_futex_wait(td, args));
225 
226 	case LINUX_FUTEX_WAKE:
227 		args->val3 = FUTEX_BITSET_MATCH_ANY;
228 		/* FALLTHROUGH */
229 
230 	case LINUX_FUTEX_WAKE_BITSET:
231 		LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x",
232 		    args->uaddr, args->val, args->val3);
233 
234 		return (linux_futex_wake(td, args));
235 
236 	case LINUX_FUTEX_REQUEUE:
237 		/*
238 		 * Glibc does not use this operation since version 2.3.3,
239 		 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation.
240 		 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when
241 		 * FUTEX_REQUEUE returned EINVAL.
242 		 */
243 		pem = pem_find(td->td_proc);
244 		if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
245 			linux_msg(td, "unsupported FUTEX_REQUEUE");
246 			pem->flags |= LINUX_XDEPR_REQUEUEOP;
247 		}
248 
249 		/*
250 		 * The above is true, however musl libc does make use of the
251 		 * futex requeue operation, allow operation for brands which
252 		 * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags.
253 		 */
254 		p = td->td_proc;
255 		Elf_Brandinfo *bi = p->p_elf_brandinfo;
256 		if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0)
257 			return (EINVAL);
258 		args->val3_compare = false;
259 		/* FALLTHROUGH */
260 
261 	case LINUX_FUTEX_CMP_REQUEUE:
262 		LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
263 		    "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x",
264 		    args->uaddr, args->val, args->val3, args->uaddr2,
265 		    args->ts);
266 
267 		return (linux_futex_requeue(td, args));
268 
269 	case LINUX_FUTEX_WAKE_OP:
270 		LINUX_CTR5(sys_futex, "WAKE_OP "
271 		    "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x",
272 		    args->uaddr, args->val, args->uaddr2, args->val3,
273 		    args->ts);
274 
275 		return (linux_futex_wakeop(td, args));
276 
277 	case LINUX_FUTEX_LOCK_PI:
278 		args->clockrt = true;
279 		/* FALLTHROUGH */
280 
281 	case LINUX_FUTEX_LOCK_PI2:
282 		LINUX_CTR2(sys_futex, "LOCKPI uaddr %p val 0x%x",
283 		    args->uaddr, args->val);
284 
285 		return (linux_futex_lock_pi(td, false, args));
286 
287 	case LINUX_FUTEX_UNLOCK_PI:
288 		LINUX_CTR1(sys_futex, "UNLOCKPI uaddr %p",
289 		    args->uaddr);
290 
291 		return (linux_futex_unlock_pi(td, false, args));
292 
293 	case LINUX_FUTEX_TRYLOCK_PI:
294 		LINUX_CTR1(sys_futex, "TRYLOCKPI uaddr %p",
295 		    args->uaddr);
296 
297 		return (linux_futex_lock_pi(td, true, args));
298 
299 	/*
300 	 * Current implementation of FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI
301 	 * can't be used anymore to implement conditional variables.
302 	 * A detailed explanation can be found here:
303 	 *
304 	 * https://sourceware.org/bugzilla/show_bug.cgi?id=13165
305 	 * and here http://austingroupbugs.net/view.php?id=609
306 	 *
307 	 * And since commit
308 	 * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=ed19993b5b0d05d62cc883571519a67dae481a14
309 	 * glibc does not use them.
310 	 */
311 	case LINUX_FUTEX_WAIT_REQUEUE_PI:
312 		/* not yet implemented */
313 		pem = pem_find(td->td_proc);
314 		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
315 			linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI");
316 			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
317 		}
318 		return (ENOSYS);
319 
320 	case LINUX_FUTEX_CMP_REQUEUE_PI:
321 		/* not yet implemented */
322 		pem = pem_find(td->td_proc);
323 		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
324 			linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI");
325 			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
326 		}
327 		return (ENOSYS);
328 
329 	default:
330 		linux_msg(td, "unsupported futex op %d", args->op);
331 		return (ENOSYS);
332 	}
333 }
334 
/*
 * pi protocol:
 * - 0 futex word value means unlocked.
 * - TID futex word value means locked.
 * Userspace uses atomic ops to lock/unlock these futexes without entering the
 * kernel. If the lock-acquire fastpath fails (the transition from 0 to TID
 * fails), then FUTEX_LOCK_PI is called.
 * The kernel atomically sets the FUTEX_WAITERS bit in the futex word value
 * and, if no other waiters exist, looks up the thread that owns the futex (it
 * has put its own TID into the futex value) and makes this thread the owner of
 * the internal pi-aware lock object (mutex). Then the kernel tries to lock the
 * internal lock object, on which it blocks. Once it returns, it has the mutex
 * acquired, and it sets the futex value to its own TID and returns (the futex
 * value contains FUTEX_WAITERS|TID).
 * The unlock fastpath then fails (because the FUTEX_WAITERS bit is set) and
 * FUTEX_UNLOCK_PI is called.
 * If a futex is found to be held at exit time, the kernel sets the OWNER_DIED
 * bit of the futex word and wakes up the next futex waiter (if any); the
 * WAITERS bit is preserved (if set).
 * If the OWNER_DIED bit is set, the kernel sanity checks the futex word value
 * against the internal futex state and, if correct, acquires the futex.
 */
/*
 * Acquire the priority-inheritance futex at args->uaddr for the calling
 * thread.
 *
 * td:   the calling thread.
 * try:  when true, fail with EBUSY instead of sleeping if the futex is owned
 *       by another thread.
 * args: the decoded request; uaddr, flags and ts are used here.
 *
 * Returns 0 once the futex word holds the caller's TID.  Otherwise returns
 * EFAULT (futex word not accessible), EDEADLK (the caller already owns the
 * futex), EINVAL (inconsistent futex word or unknown owner), EBUSY (try
 * lock failed) or an error from futex_key_get(), thread_check_susp(),
 * umtx_pi_claim() or umtxq_sleep_pi().
 */
static int
linux_futex_lock_pi(struct thread *td, bool try, struct linux_futex_args *args)
{
	struct umtx_abs_timeout timo;
	struct linux_emuldata *em;
	struct umtx_pi *pi, *new_pi;
	struct thread *td1;
	struct umtx_q *uq;
	int error, rv;
	uint32_t owner, old_owner;

	em = em_find(td);
	uq = td->td_umtxq;
	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args),
	    &uq->uq_key);
	if (error != 0)
		return (error);
	/* timo is only initialized, and only used below, with a timeout. */
	if (args->ts != NULL)
		linux_umtx_abs_timeout_init(&timo, args);

	/*
	 * Look up the umtx_pi for this key, creating it if there is none.
	 * The sleeping allocation is done with the queue unlocked, so the
	 * lookup is repeated afterwards in case one was inserted meanwhile.
	 */
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	/* Hold a reference for the duration of the loop; dropped at exit. */
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);
	for (;;) {
		/* Try uncontested case first. */
		rv = casueword32(args->uaddr, 0, &owner, em->em_tid);
		/* The acquire succeeded. */
		if (rv == 0) {
			error = 0;
			break;
		}
		if (rv == -1) {
			error = EFAULT;
			break;
		}

		/*
		 * Avoid overwriting a possible error from sleep due
		 * to the pending signal with suspension check result.
		 */
		if (error == 0) {
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
		}

		/* The futex word at *uaddr is already locked by the caller. */
		if ((owner & FUTEX_TID_MASK) == em->em_tid) {
			error = EDEADLK;
			break;
		}

		/*
		 * Futex owner died, handle_futex_death() set the OWNER_DIED bit
		 * and clear tid. Try to acquire it.
		 */
		if ((owner & FUTEX_TID_MASK) == 0) {
			old_owner = owner;
			/* Keep the state bits and install our TID. */
			owner = owner & (FUTEX_WAITERS | FUTEX_OWNER_DIED);
			owner |= em->em_tid;
			rv = casueword32(args->uaddr, old_owner, &owner, owner);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
			if (rv == 1) {
				if (error == 0) {
					error = thread_check_susp(td, true);
					if (error != 0)
						break;
				}

				/*
				 * If this failed the lock could have
				 * changed, restart.
				 */
				continue;
			}

			umtxq_lock(&uq->uq_key);
			umtxq_busy(&uq->uq_key);
			error = umtx_pi_claim(pi, td);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			if (error != 0) {
				/*
				 * Since we're going to return an
				 * error, restore the futex to its
				 * previous, unowned state to avoid
				 * compounding the problem.
				 */
				(void)casuword32(args->uaddr, owner, old_owner);
			}
			break;
		}

		/*
		 * Inconsistent state: OWNER_DIED is set and tid is not 0.
		 * Linux does some checks of futex state, we return EINVAL,
		 * as the user space can take care of this.
		 */
		if ((owner & FUTEX_OWNER_DIED) != 0) {
			error = EINVAL;
			break;
		}

		/* The futex is owned by someone else; a try lock stops here. */
		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		/* Mark the queue busy; every exit below has to unbusy it. */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space knows
		 * to use the system call for unlock. If this fails either some
		 * one else has acquired the lock or it has been released.
		 */
		rv = casueword32(args->uaddr, owner, &owner,
		    owner | FUTEX_WAITERS);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		if (rv == 1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = thread_check_susp(td, true);
			if (error != 0)
				break;

			/*
			 * The lock changed and we need to retry or we
			 * lost a race to the thread unlocking the umtx.
			 */
			continue;
		}

		/*
		 * Substitute Linux thread id by native thread id to
		 * avoid refactoring code of umtxq_sleep_pi().
		 */
		td1 = linux_tdfind(td, owner & FUTEX_TID_MASK, -1);
		if (td1 != NULL) {
			owner = td1->td_tid;
			PROC_UNLOCK(td1->td_proc);
		} else {
			/* No thread with the TID stored in the futex word. */
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EINVAL;
			break;
		}

		umtxq_lock(&uq->uq_key);

		/* We set the contested bit, sleep. */
		error = umtxq_sleep_pi(uq, pi, owner, "futexp",
		    args->ts == NULL ? NULL : &timo,
		    (args->flags & FUTEX_SHARED) != 0);
		/*
		 * On a sleep error make one more pass with error set; the
		 * checks above then leave the loop unless the futex can be
		 * acquired.
		 */
		if (error != 0)
			continue;

		error = thread_check_susp(td, false);
		if (error != 0)
			break;
	}

	/* Drop the reference taken before the loop and release the key. */
	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
557 
558 static int
559 linux_futex_unlock_pi(struct thread *td, bool rb, struct linux_futex_args *args)
560 {
561 	struct linux_emuldata *em;
562 	struct umtx_key key;
563 	uint32_t old, owner, new_owner;
564 	int count, error;
565 
566 	em = em_find(td);
567 
568 	/*
569 	 * Make sure we own this mtx.
570 	 */
571 	error = fueword32(args->uaddr, &owner);
572 	if (error == -1)
573 		return (EFAULT);
574 	if (!rb && (owner & FUTEX_TID_MASK) != em->em_tid)
575 		return (EPERM);
576 
577 	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), &key);
578 	if (error != 0)
579 		return (error);
580 	umtxq_lock(&key);
581 	umtxq_busy(&key);
582 	error = umtx_pi_drop(td, &key, rb, &count);
583 	if (error != 0 || rb) {
584 		umtxq_unbusy(&key);
585 		umtxq_unlock(&key);
586 		umtx_key_release(&key);
587 		return (error);
588 	}
589 	umtxq_unlock(&key);
590 
591 	/*
592 	 * When unlocking the futex, it must be marked as unowned if
593 	 * there is zero or one thread only waiting for it.
594 	 * Otherwise, it must be marked as contested.
595 	 */
596 	if (count > 1)
597 		new_owner = FUTEX_WAITERS;
598 	else
599 		new_owner = 0;
600 
601 again:
602 	error = casueword32(args->uaddr, owner, &old, new_owner);
603 	if (error == 1) {
604 		error = thread_check_susp(td, false);
605 		if (error == 0)
606 			goto again;
607 	}
608 	umtxq_unbusy_unlocked(&key);
609 	umtx_key_release(&key);
610 	if (error == -1)
611 		return (EFAULT);
612 	if (error == 0 && old != owner)
613 		return (EINVAL);
614 	return (error);
615 }
616 
617 static int
618 linux_futex_wakeop(struct thread *td, struct linux_futex_args *args)
619 {
620 	struct umtx_key key, key2;
621 	int nrwake, op_ret, ret;
622 	int error, count;
623 
624 	if (args->uaddr == args->uaddr2)
625 		return (EINVAL);
626 
627 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
628 	if (error != 0)
629 		return (error);
630 	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
631 	if (error != 0) {
632 		umtx_key_release(&key);
633 		return (error);
634 	}
635 	umtxq_lock(&key);
636 	umtxq_busy(&key);
637 	umtxq_unlock(&key);
638 	op_ret = futex_atomic_op(td, args->val3, args->uaddr2);
639 	if (op_ret < 0) {
640 		if (op_ret == -ENOSYS)
641 			error = ENOSYS;
642 		else
643 			error = EFAULT;
644 	}
645 	umtxq_lock(&key);
646 	umtxq_unbusy(&key);
647 	if (error != 0)
648 		goto out;
649 	ret = umtxq_signal_mask(&key, args->val, args->val3);
650 	if (op_ret > 0) {
651 		nrwake = (int)(unsigned long)args->ts;
652 		umtxq_lock(&key2);
653 		count = umtxq_count(&key2);
654 		if (count > 0)
655 			ret += umtxq_signal_mask(&key2, nrwake, args->val3);
656 		else
657 			ret += umtxq_signal_mask(&key, nrwake, args->val3);
658 		umtxq_unlock(&key2);
659 	}
660 	td->td_retval[0] = ret;
661 out:
662 	umtxq_unlock(&key);
663 	umtx_key_release(&key2);
664 	umtx_key_release(&key);
665 	return (error);
666 }
667 
668 static int
669 linux_futex_requeue(struct thread *td, struct linux_futex_args *args)
670 {
671 	int nrwake, nrrequeue;
672 	struct umtx_key key, key2;
673 	int error;
674 	uint32_t uval;
675 
676 	/*
677 	 * Linux allows this, we would not, it is an incorrect
678 	 * usage of declared ABI, so return EINVAL.
679 	 */
680 	if (args->uaddr == args->uaddr2)
681 		return (EINVAL);
682 
683 	nrrequeue = (int)(unsigned long)args->ts;
684 	nrwake = args->val;
685 	/*
686 	 * Sanity check to prevent signed integer overflow,
687 	 * see Linux CVE-2018-6927
688 	 */
689 	if (nrwake < 0 || nrrequeue < 0)
690 		return (EINVAL);
691 
692 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
693 	if (error != 0)
694 		return (error);
695 	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
696 	if (error != 0) {
697 		umtx_key_release(&key);
698 		return (error);
699 	}
700 	umtxq_lock(&key);
701 	umtxq_busy(&key);
702 	umtxq_unlock(&key);
703 	error = fueword32(args->uaddr, &uval);
704 	if (error != 0)
705 		error = EFAULT;
706 	else if (args->val3_compare == true && uval != args->val3)
707 		error = EWOULDBLOCK;
708 	umtxq_lock(&key);
709 	umtxq_unbusy(&key);
710 	if (error == 0) {
711 		umtxq_lock(&key2);
712 		td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue);
713 		umtxq_unlock(&key2);
714 	}
715 	umtxq_unlock(&key);
716 	umtx_key_release(&key2);
717 	umtx_key_release(&key);
718 	return (error);
719 }
720 
721 static int
722 linux_futex_wake(struct thread *td, struct linux_futex_args *args)
723 {
724 	struct umtx_key key;
725 	int error;
726 
727 	if (args->val3 == 0)
728 		return (EINVAL);
729 
730 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
731 	if (error != 0)
732 		return (error);
733 	umtxq_lock(&key);
734 	td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3);
735 	umtxq_unlock(&key);
736 	umtx_key_release(&key);
737 	return (0);
738 }
739 
740 static int
741 linux_futex_wait(struct thread *td, struct linux_futex_args *args)
742 {
743 	struct umtx_abs_timeout timo;
744 	struct umtx_q *uq;
745 	uint32_t uval;
746 	int error;
747 
748 	if (args->val3 == 0)
749 		error = EINVAL;
750 
751 	uq = td->td_umtxq;
752 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args),
753 	    &uq->uq_key);
754 	if (error != 0)
755 		return (error);
756 	if (args->ts != NULL)
757 		linux_umtx_abs_timeout_init(&timo, args);
758 	umtxq_lock(&uq->uq_key);
759 	umtxq_busy(&uq->uq_key);
760 	uq->uq_bitset = args->val3;
761 	umtxq_insert(uq);
762 	umtxq_unlock(&uq->uq_key);
763 	error = fueword32(args->uaddr, &uval);
764 	if (error != 0)
765 		error = EFAULT;
766 	else if (uval != args->val)
767 		error = EWOULDBLOCK;
768 	umtxq_lock(&uq->uq_key);
769 	umtxq_unbusy(&uq->uq_key);
770 	if (error == 0) {
771 		error = umtxq_sleep(uq, "futex",
772 		    args->ts == NULL ? NULL : &timo);
773 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
774 			error = 0;
775 		else
776 			umtxq_remove(uq);
777 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
778 		umtxq_remove(uq);
779 	}
780 	umtxq_unlock(&uq->uq_key);
781 	umtx_key_release(&uq->uq_key);
782 	if (error == ERESTART)
783 		error = EINTR;
784 	return (error);
785 }
786 
787 static void
788 linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo,
789     struct linux_futex_args *args)
790 {
791 	int clockid, absolute;
792 
793 	/*
794 	 * The FUTEX_CLOCK_REALTIME option bit can be employed only with the
795 	 * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI, FUTEX_LOCK_PI2.
796 	 * For FUTEX_WAIT, timeout is interpreted as a relative value, for other
797 	 * futex operations timeout is interpreted as an absolute value.
798 	 * If FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel measures
799 	 * the timeout against the CLOCK_REALTIME clock, otherwise the kernel
800 	 * measures the timeout against the CLOCK_MONOTONIC clock.
801 	 */
802 	clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC;
803 	absolute = args->op == LINUX_FUTEX_WAIT ? false : true;
804 	umtx_abs_timeout_init(timo, clockid, absolute, args->ts);
805 }
806 
807 int
808 linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
809 {
810 	struct linux_futex_args fargs = {
811 		.uaddr = args->uaddr,
812 		.op = args->op,
813 		.val = args->val,
814 		.ts = NULL,
815 		.uaddr2 = args->uaddr2,
816 		.val3 = args->val3,
817 		.val3_compare = true,
818 	};
819 	struct l_timespec lts;
820 	int error;
821 
822 	switch (args->op & LINUX_FUTEX_CMD_MASK) {
823 	case LINUX_FUTEX_WAIT:
824 	case LINUX_FUTEX_WAIT_BITSET:
825 	case LINUX_FUTEX_LOCK_PI:
826 	case LINUX_FUTEX_LOCK_PI2:
827 		if (args->timeout != NULL) {
828 			error = copyin(args->timeout, &lts, sizeof(lts));
829 			if (error != 0)
830 				return (error);
831 			error = linux_to_native_timespec(&fargs.kts, &lts);
832 			if (error != 0)
833 				return (error);
834 			fargs.ts = &fargs.kts;
835 		}
836 		break;
837 	default:
838 		fargs.ts = PTRIN(args->timeout);
839 	}
840 	return (linux_futex(td, &fargs));
841 }
842 
843 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
844 int
845 linux_sys_futex_time64(struct thread *td,
846     struct linux_sys_futex_time64_args *args)
847 {
848 	struct linux_futex_args fargs = {
849 		.uaddr = args->uaddr,
850 		.op = args->op,
851 		.val = args->val,
852 		.ts = NULL,
853 		.uaddr2 = args->uaddr2,
854 		.val3 = args->val3,
855 		.val3_compare = true,
856 	};
857 	struct l_timespec64 lts;
858 	int error;
859 
860 	switch (args->op & LINUX_FUTEX_CMD_MASK) {
861 	case LINUX_FUTEX_WAIT:
862 	case LINUX_FUTEX_WAIT_BITSET:
863 	case LINUX_FUTEX_LOCK_PI:
864 	case LINUX_FUTEX_LOCK_PI2:
865 		if (args->timeout != NULL) {
866 			error = copyin(args->timeout, &lts, sizeof(lts));
867 			if (error != 0)
868 				return (error);
869 			error = linux_to_native_timespec64(&fargs.kts, &lts);
870 			if (error != 0)
871 				return (error);
872 			fargs.ts = &fargs.kts;
873 		}
874 		break;
875 	default:
876 		fargs.ts = PTRIN(args->timeout);
877 	}
878 	return (linux_futex(td, &fargs));
879 }
880 #endif
881 
882 int
883 linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
884 {
885 	struct linux_emuldata *em;
886 
887 	if (args->len != sizeof(struct linux_robust_list_head))
888 		return (EINVAL);
889 
890 	em = em_find(td);
891 	em->robust_futexes = args->head;
892 
893 	return (0);
894 }
895 
896 int
897 linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
898 {
899 	struct linux_emuldata *em;
900 	struct linux_robust_list_head *head;
901 	l_size_t len;
902 	struct thread *td2;
903 	int error;
904 
905 	if (!args->pid) {
906 		em = em_find(td);
907 		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
908 		head = em->robust_futexes;
909 	} else {
910 		td2 = linux_tdfind(td, args->pid, -1);
911 		if (td2 == NULL)
912 			return (ESRCH);
913 		if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) {
914 			PROC_UNLOCK(td2->td_proc);
915 			return (EPERM);
916 		}
917 
918 		em = em_find(td2);
919 		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
920 		/* XXX: ptrace? */
921 		if (priv_check(td, PRIV_CRED_SETUID) ||
922 		    priv_check(td, PRIV_CRED_SETEUID) ||
923 		    p_candebug(td, td2->td_proc)) {
924 			PROC_UNLOCK(td2->td_proc);
925 			return (EPERM);
926 		}
927 		head = em->robust_futexes;
928 
929 		PROC_UNLOCK(td2->td_proc);
930 	}
931 
932 	len = sizeof(struct linux_robust_list_head);
933 	error = copyout(&len, args->len, sizeof(l_size_t));
934 	if (error != 0)
935 		return (EFAULT);
936 
937 	return (copyout(&head, args->head, sizeof(head)));
938 }
939 
940 static int
941 handle_futex_death(struct thread *td, struct linux_emuldata *em, uint32_t *uaddr,
942     unsigned int pi, bool pending_op)
943 {
944 	uint32_t uval, nval, mval;
945 	int error;
946 
947 retry:
948 	error = fueword32(uaddr, &uval);
949 	if (error != 0)
950 		return (EFAULT);
951 
952 	/*
953 	 * Special case for regular (non PI) futexes. The unlock path in
954 	 * user space has two race scenarios:
955 	 *
956 	 * 1. The unlock path releases the user space futex value and
957 	 *    before it can execute the futex() syscall to wake up
958 	 *    waiters it is killed.
959 	 *
960 	 * 2. A woken up waiter is killed before it can acquire the
961 	 *    futex in user space.
962 	 *
963 	 * In both cases the TID validation below prevents a wakeup of
964 	 * potential waiters which can cause these waiters to block
965 	 * forever.
966 	 *
967 	 * In both cases it is safe to attempt waking up a potential
968 	 * waiter without touching the user space futex value and trying
969 	 * to set the OWNER_DIED bit.
970 	 */
971 	if (pending_op && !pi && !uval) {
972 		(void)futex_wake(td, uaddr, 1, true);
973 		return (0);
974 	}
975 
976 	if ((uval & FUTEX_TID_MASK) == em->em_tid) {
977 		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
978 		error = casueword32(uaddr, uval, &nval, mval);
979 		if (error == -1)
980 			return (EFAULT);
981 		if (error == 1) {
982 			error = thread_check_susp(td, false);
983 			if (error != 0)
984 				return (error);
985 			goto retry;
986 		}
987 
988 		if (!pi && (uval & FUTEX_WAITERS)) {
989 			error = futex_wake(td, uaddr, 1, true);
990 			if (error != 0)
991 				return (error);
992 		} else if (pi && (uval & FUTEX_WAITERS)) {
993 			error = futex_wake_pi(td, uaddr, true);
994 			if (error != 0)
995 				return (error);
996 		}
997 	}
998 
999 	return (0);
1000 }
1001 
1002 static int
1003 fetch_robust_entry(struct linux_robust_list **entry,
1004     struct linux_robust_list **head, unsigned int *pi)
1005 {
1006 	l_ulong uentry;
1007 	int error;
1008 
1009 	error = copyin((const void *)head, &uentry, sizeof(uentry));
1010 	if (error != 0)
1011 		return (EFAULT);
1012 
1013 	*entry = (void *)(uentry & ~1UL);
1014 	*pi = uentry & 1;
1015 
1016 	return (0);
1017 }
1018 
/*
 * Values for the pending_op argument of handle_futex_death(): whether the
 * entry comes from the pending slot of the robust list or from the list
 * itself.
 */
#define	LINUX_HANDLE_DEATH_PENDING	true
#define	LINUX_HANDLE_DEATH_LIST		false
1021 
1022 /* This walks the list of robust futexes releasing them. */
1023 void
1024 release_futexes(struct thread *td, struct linux_emuldata *em)
1025 {
1026 	struct linux_robust_list_head *head;
1027 	struct linux_robust_list *entry, *next_entry, *pending;
1028 	unsigned int limit = 2048, pi, next_pi, pip;
1029 	uint32_t *uaddr;
1030 	l_long futex_offset;
1031 	int error;
1032 
1033 	head = em->robust_futexes;
1034 	if (head == NULL)
1035 		return;
1036 
1037 	if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi))
1038 		return;
1039 
1040 	error = copyin(&head->futex_offset, &futex_offset,
1041 	    sizeof(futex_offset));
1042 	if (error != 0)
1043 		return;
1044 
1045 	if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip))
1046 		return;
1047 
1048 	while (entry != &head->list) {
1049 		error = fetch_robust_entry(&next_entry, PTRIN(&entry->next),
1050 		    &next_pi);
1051 
1052 		/*
1053 		 * A pending lock might already be on the list, so
1054 		 * don't process it twice.
1055 		 */
1056 		if (entry != pending) {
1057 			uaddr = (uint32_t *)((caddr_t)entry + futex_offset);
1058 			if (handle_futex_death(td, em, uaddr, pi,
1059 			    LINUX_HANDLE_DEATH_LIST))
1060 				return;
1061 		}
1062 		if (error != 0)
1063 			return;
1064 
1065 		entry = next_entry;
1066 		pi = next_pi;
1067 
1068 		if (!--limit)
1069 			break;
1070 
1071 		sched_relinquish(curthread);
1072 	}
1073 
1074 	if (pending) {
1075 		uaddr = (uint32_t *)((caddr_t)pending + futex_offset);
1076 		(void)handle_futex_death(td, em, uaddr, pip,
1077 		    LINUX_HANDLE_DEATH_PENDING);
1078 	}
1079 }
1080