xref: /freebsd/sys/compat/linux/linux_futex.c (revision 80336636b6b9f7a3bdad007c400e85eae017d2a2)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2009-2021 Dmitry Chagin <dchagin@FreeBSD.org>
5  * Copyright (c) 2008 Roman Divacky
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/param.h>
30 #include <sys/imgact.h>
31 #include <sys/imgact_elf.h>
32 #include <sys/ktr.h>
33 #include <sys/lock.h>
34 #include <sys/mutex.h>
35 #include <sys/priv.h>
36 #include <sys/proc.h>
37 #include <sys/sched.h>
38 #include <sys/sysent.h>
39 #include <sys/vnode.h>
40 #include <sys/umtxvar.h>
41 
42 #ifdef COMPAT_LINUX32
43 #include <machine/../linux32/linux.h>
44 #include <machine/../linux32/linux32_proto.h>
45 #else
46 #include <machine/../linux/linux.h>
47 #include <machine/../linux/linux_proto.h>
48 #endif
49 #include <compat/linux/linux_emul.h>
50 #include <compat/linux/linux_futex.h>
51 #include <compat/linux/linux_misc.h>
52 #include <compat/linux/linux_time.h>
53 #include <compat/linux/linux_util.h>
54 
#define	FUTEX_SHARED	0x8     /* shared futex */
#define	FUTEX_UNOWNED	0	/* futex word value: nobody holds the lock */

/*
 * Map the FUTEX_SHARED bit of a struct linux_futex_args pointer to the
 * sharing mode handed to the umtx key lookup.  The argument and the whole
 * expansion are parenthesized so the macro can be used with any pointer
 * expression and inside a larger expression.
 */
#define	GET_SHARED(a)							\
	((((a)->flags & FUTEX_SHARED) != 0) ? AUTO_SHARE : THREAD_SHARE)
59 
/* Helpers for FUTEX_WAKE_OP and for robust list processing at thread exit. */
static int futex_atomic_op(struct thread *td, int encoded_op,
    uint32_t *uaddr, int *res);
static int handle_futex_death(struct thread *td, struct linux_emuldata *em,
    uint32_t *uaddr, unsigned int pi, bool pending_op);
static int fetch_robust_entry(struct linux_robust_list **entry,
    struct linux_robust_list **head, unsigned int *pi);
65 
/*
 * Decoded arguments of a futex system call, shared by all operation
 * handlers in this file.
 */
struct linux_futex_args {
	uint32_t	*uaddr;		/* user address of the futex word */
	int32_t		op;		/* operation; option bits are stripped
					   by linux_futex() */
	uint32_t	flags;		/* FUTEX_SHARED or 0 */
	bool		clockrt;	/* FUTEX_CLOCK_REALTIME was requested */
	uint32_t	val;		/* expected value or wake count */
	struct timespec	*ts;		/* timeout; the requeue and wake_op
					   handlers read it as an integer
					   count instead */
	uint32_t	*uaddr2;	/* second futex word, if any */
	uint32_t	val3;		/* bitset, compare value or encoded
					   wake_op operation */
	bool		val3_compare;	/* requeue compares *uaddr to val3 */
	struct timespec	kts;		/* kernel copy of the user timeout */
};
78 
/* Key lookup, timeout setup and the per-operation futex handlers. */
static inline int futex_key_get(const void *uaddr, int type, int share,
	    struct umtx_key *key);
static void linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo,
	    struct linux_futex_args *args);
static int linux_futex(struct thread *td, struct linux_futex_args *args);
static int linux_futex_wait(struct thread *td, struct linux_futex_args *args);
static int linux_futex_wake(struct thread *td, struct linux_futex_args *args);
static int linux_futex_requeue(struct thread *td,
	    struct linux_futex_args *args);
static int linux_futex_wakeop(struct thread *td,
	    struct linux_futex_args *args);
static int linux_futex_lock_pi(struct thread *td, bool try,
	    struct linux_futex_args *args);
static int linux_futex_unlock_pi(struct thread *td, bool rb,
	    struct linux_futex_args *args);
static int futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared);
91 
/*
 * Look up the umtx key for the futex word at uaddr.
 *
 * Returns EINVAL when uaddr is not aligned to a 32-bit boundary, otherwise
 * whatever umtx_key_get() returns.
 */
static int
futex_key_get(const void *uaddr, int type, int share, struct umtx_key *key)
{

	/* Only naturally aligned 32-bit futex words are accepted. */
	if (__is_aligned(uaddr, sizeof(uint32_t)))
		return (umtx_key_get(uaddr, type, share, key));
	return (EINVAL);
}
101 
102 int
futex_wake(struct thread * td,uint32_t * uaddr,int val,bool shared)103 futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared)
104 {
105 	struct linux_futex_args args;
106 
107 	bzero(&args, sizeof(args));
108 	args.op = LINUX_FUTEX_WAKE;
109 	args.uaddr = uaddr;
110 	args.flags = shared == true ? FUTEX_SHARED : 0;
111 	args.val = val;
112 	args.val3 = FUTEX_BITSET_MATCH_ANY;
113 
114 	return (linux_futex_wake(td, &args));
115 }
116 
117 static int
futex_wake_pi(struct thread * td,uint32_t * uaddr,bool shared)118 futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared)
119 {
120 	struct linux_futex_args args;
121 
122 	bzero(&args, sizeof(args));
123 	args.op = LINUX_FUTEX_UNLOCK_PI;
124 	args.uaddr = uaddr;
125 	args.flags = shared == true ? FUTEX_SHARED : 0;
126 
127 	return (linux_futex_unlock_pi(td, true, &args));
128 }
129 
130 static int
futex_atomic_op(struct thread * td,int encoded_op,uint32_t * uaddr,int * res)131 futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr,
132     int *res)
133 {
134 	int op = (encoded_op >> 28) & 7;
135 	int cmp = (encoded_op >> 24) & 15;
136 	int oparg = (encoded_op << 8) >> 20;
137 	int cmparg = (encoded_op << 20) >> 20;
138 	int oldval = 0, ret;
139 
140 	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
141 		oparg = 1 << oparg;
142 
143 	switch (op) {
144 	case FUTEX_OP_SET:
145 		ret = futex_xchgl(oparg, uaddr, &oldval);
146 		break;
147 	case FUTEX_OP_ADD:
148 		ret = futex_addl(oparg, uaddr, &oldval);
149 		break;
150 	case FUTEX_OP_OR:
151 		ret = futex_orl(oparg, uaddr, &oldval);
152 		break;
153 	case FUTEX_OP_ANDN:
154 		ret = futex_andl(~oparg, uaddr, &oldval);
155 		break;
156 	case FUTEX_OP_XOR:
157 		ret = futex_xorl(oparg, uaddr, &oldval);
158 		break;
159 	default:
160 		ret = ENOSYS;
161 		break;
162 	}
163 
164 	if (ret != 0)
165 		return (ret);
166 
167 	switch (cmp) {
168 	case FUTEX_OP_CMP_EQ:
169 		*res = (oldval == cmparg);
170 		break;
171 	case FUTEX_OP_CMP_NE:
172 		*res = (oldval != cmparg);
173 		break;
174 	case FUTEX_OP_CMP_LT:
175 		*res = (oldval < cmparg);
176 		break;
177 	case FUTEX_OP_CMP_GE:
178 		*res = (oldval >= cmparg);
179 		break;
180 	case FUTEX_OP_CMP_LE:
181 		*res = (oldval <= cmparg);
182 		break;
183 	case FUTEX_OP_CMP_GT:
184 		*res = (oldval > cmparg);
185 		break;
186 	default:
187 		ret = ENOSYS;
188 	}
189 
190 	return (ret);
191 }
192 
193 static int
linux_futex(struct thread * td,struct linux_futex_args * args)194 linux_futex(struct thread *td, struct linux_futex_args *args)
195 {
196 	struct linux_pemuldata *pem;
197 	struct proc *p;
198 
199 	if (args->op & LINUX_FUTEX_PRIVATE_FLAG) {
200 		args->flags = 0;
201 		args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
202 	} else
203 		args->flags = FUTEX_SHARED;
204 
205 	args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
206 	args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;
207 
208 	if (args->clockrt &&
209 	    args->op != LINUX_FUTEX_WAIT_BITSET &&
210 	    args->op != LINUX_FUTEX_WAIT_REQUEUE_PI &&
211 	    args->op != LINUX_FUTEX_LOCK_PI2)
212 		return (ENOSYS);
213 
214 	switch (args->op) {
215 	case LINUX_FUTEX_WAIT:
216 		args->val3 = FUTEX_BITSET_MATCH_ANY;
217 		/* FALLTHROUGH */
218 
219 	case LINUX_FUTEX_WAIT_BITSET:
220 		LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x",
221 		    args->uaddr, args->val, args->val3);
222 
223 		return (linux_futex_wait(td, args));
224 
225 	case LINUX_FUTEX_WAKE:
226 		args->val3 = FUTEX_BITSET_MATCH_ANY;
227 		/* FALLTHROUGH */
228 
229 	case LINUX_FUTEX_WAKE_BITSET:
230 		LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x",
231 		    args->uaddr, args->val, args->val3);
232 
233 		return (linux_futex_wake(td, args));
234 
235 	case LINUX_FUTEX_REQUEUE:
236 		/*
237 		 * Glibc does not use this operation since version 2.3.3,
238 		 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation.
239 		 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when
240 		 * FUTEX_REQUEUE returned EINVAL.
241 		 */
242 		pem = pem_find(td->td_proc);
243 		if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
244 			linux_msg(td, "unsupported FUTEX_REQUEUE");
245 			pem->flags |= LINUX_XDEPR_REQUEUEOP;
246 		}
247 
248 		/*
249 		 * The above is true, however musl libc does make use of the
250 		 * futex requeue operation, allow operation for brands which
251 		 * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags.
252 		 */
253 		p = td->td_proc;
254 		const Elf_Brandinfo *bi = p->p_elf_brandinfo;
255 		if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0)
256 			return (EINVAL);
257 		args->val3_compare = false;
258 		/* FALLTHROUGH */
259 
260 	case LINUX_FUTEX_CMP_REQUEUE:
261 		LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
262 		    "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x",
263 		    args->uaddr, args->val, args->val3, args->uaddr2,
264 		    args->ts);
265 
266 		return (linux_futex_requeue(td, args));
267 
268 	case LINUX_FUTEX_WAKE_OP:
269 		LINUX_CTR5(sys_futex, "WAKE_OP "
270 		    "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x",
271 		    args->uaddr, args->val, args->uaddr2, args->val3,
272 		    args->ts);
273 
274 		return (linux_futex_wakeop(td, args));
275 
276 	case LINUX_FUTEX_LOCK_PI:
277 		args->clockrt = true;
278 		/* FALLTHROUGH */
279 
280 	case LINUX_FUTEX_LOCK_PI2:
281 		LINUX_CTR2(sys_futex, "LOCKPI uaddr %p val 0x%x",
282 		    args->uaddr, args->val);
283 
284 		return (linux_futex_lock_pi(td, false, args));
285 
286 	case LINUX_FUTEX_UNLOCK_PI:
287 		LINUX_CTR1(sys_futex, "UNLOCKPI uaddr %p",
288 		    args->uaddr);
289 
290 		return (linux_futex_unlock_pi(td, false, args));
291 
292 	case LINUX_FUTEX_TRYLOCK_PI:
293 		LINUX_CTR1(sys_futex, "TRYLOCKPI uaddr %p",
294 		    args->uaddr);
295 
296 		return (linux_futex_lock_pi(td, true, args));
297 
298 	/*
299 	 * Current implementation of FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI
300 	 * can't be used anymore to implement conditional variables.
301 	 * A detailed explanation can be found here:
302 	 *
303 	 * https://sourceware.org/bugzilla/show_bug.cgi?id=13165
304 	 * and here http://austingroupbugs.net/view.php?id=609
305 	 *
306 	 * And since commit
307 	 * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=ed19993b5b0d05d62cc883571519a67dae481a14
308 	 * glibc does not use them.
309 	 */
310 	case LINUX_FUTEX_WAIT_REQUEUE_PI:
311 		/* not yet implemented */
312 		pem = pem_find(td->td_proc);
313 		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
314 			linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI");
315 			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
316 		}
317 		return (ENOSYS);
318 
319 	case LINUX_FUTEX_CMP_REQUEUE_PI:
320 		/* not yet implemented */
321 		pem = pem_find(td->td_proc);
322 		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
323 			linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI");
324 			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
325 		}
326 		return (ENOSYS);
327 
328 	default:
329 		linux_msg(td, "unsupported futex op %d", args->op);
330 		return (ENOSYS);
331 	}
332 }
333 
/*
 * pi protocol:
 * - A futex word value of 0 means unlocked.
 * - A futex word value of TID means locked.
 * Userspace uses atomic ops to lock/unlock these futexes without entering the
 * kernel. If the lock-acquire fastpath fails (the transition from 0 to TID
 * fails), then FUTEX_LOCK_PI is called.
 * The kernel atomically sets the FUTEX_WAITERS bit in the futex word value.
 * If no other waiters exist, it looks up the thread that owns the futex (it
 * has put its own TID into the futex value) and makes this thread the owner
 * of the internal pi-aware lock object (mutex). Then the kernel tries to lock
 * the internal lock object, on which it blocks. Once it returns, it has the
 * mutex acquired, and it sets the futex value to its own TID and returns (the
 * futex value contains FUTEX_WAITERS|TID).
 * The unlock fastpath then fails (because the FUTEX_WAITERS bit is set) and
 * FUTEX_UNLOCK_PI is called.
 * If a futex is found to be held at exit time, the kernel sets the OWNER_DIED
 * bit of the futex word and wakes up the next futex waiter (if any); the
 * WAITERS bit is preserved (if set).
 * If the OWNER_DIED bit is set, the kernel sanity checks the futex word value
 * against the internal futex state and, if it is correct, acquires the futex.
 */
356 static int
linux_futex_lock_pi(struct thread * td,bool try,struct linux_futex_args * args)357 linux_futex_lock_pi(struct thread *td, bool try, struct linux_futex_args *args)
358 {
359 	struct umtx_abs_timeout timo;
360 	struct linux_emuldata *em;
361 	struct umtx_pi *pi, *new_pi;
362 	struct thread *td1;
363 	struct umtx_q *uq;
364 	int error, rv;
365 	uint32_t owner, old_owner;
366 
367 	em = em_find(td);
368 	uq = td->td_umtxq;
369 	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args),
370 	    &uq->uq_key);
371 	if (error != 0)
372 		return (error);
373 	if (args->ts != NULL)
374 		linux_umtx_abs_timeout_init(&timo, args);
375 
376 	umtxq_lock(&uq->uq_key);
377 	pi = umtx_pi_lookup(&uq->uq_key);
378 	if (pi == NULL) {
379 		new_pi = umtx_pi_alloc(M_NOWAIT);
380 		if (new_pi == NULL) {
381 			umtxq_unlock(&uq->uq_key);
382 			new_pi = umtx_pi_alloc(M_WAITOK);
383 			umtxq_lock(&uq->uq_key);
384 			pi = umtx_pi_lookup(&uq->uq_key);
385 			if (pi != NULL) {
386 				umtx_pi_free(new_pi);
387 				new_pi = NULL;
388 			}
389 		}
390 		if (new_pi != NULL) {
391 			new_pi->pi_key = uq->uq_key;
392 			umtx_pi_insert(new_pi);
393 			pi = new_pi;
394 		}
395 	}
396 	umtx_pi_ref(pi);
397 	umtxq_unlock(&uq->uq_key);
398 	for (;;) {
399 		/* Try uncontested case first. */
400 		rv = casueword32(args->uaddr, FUTEX_UNOWNED, &owner, em->em_tid);
401 		/* The acquire succeeded. */
402 		if (rv == 0) {
403 			error = 0;
404 			break;
405 		}
406 		if (rv == -1) {
407 			error = EFAULT;
408 			break;
409 		}
410 
411 		/*
412 		 * Nobody owns it, but the acquire failed. This can happen
413 		 * with ll/sc atomic.
414 		 */
415 		if (owner == FUTEX_UNOWNED) {
416 			error = thread_check_susp(td, true);
417 			if (error != 0)
418 				break;
419 			continue;
420 		}
421 
422 		/*
423 		 * Avoid overwriting a possible error from sleep due
424 		 * to the pending signal with suspension check result.
425 		 */
426 		if (error == 0) {
427 			error = thread_check_susp(td, true);
428 			if (error != 0)
429 				break;
430 		}
431 
432 		/* The futex word at *uaddr is already locked by the caller. */
433 		if ((owner & FUTEX_TID_MASK) == em->em_tid) {
434 			error = EDEADLK;
435 			break;
436 		}
437 
438 		/*
439 		 * Futex owner died, handle_futex_death() set the OWNER_DIED bit
440 		 * and clear tid. Try to acquire it.
441 		 */
442 		if ((owner & FUTEX_TID_MASK) == FUTEX_UNOWNED) {
443 			old_owner = owner;
444 			owner = owner & (FUTEX_WAITERS | FUTEX_OWNER_DIED);
445 			owner |= em->em_tid;
446 			rv = casueword32(args->uaddr, old_owner, &owner, owner);
447 			if (rv == -1) {
448 				error = EFAULT;
449 				break;
450 			}
451 			if (rv == 1) {
452 				if (error == 0) {
453 					error = thread_check_susp(td, true);
454 					if (error != 0)
455 						break;
456 				}
457 
458 				/*
459 				 * If this failed the lock could
460 				 * changed, restart.
461 				 */
462 				continue;
463 			}
464 
465 			umtxq_lock(&uq->uq_key);
466 			umtxq_busy(&uq->uq_key);
467 			error = umtx_pi_claim(pi, td);
468 			umtxq_unbusy(&uq->uq_key);
469 			umtxq_unlock(&uq->uq_key);
470 			if (error != 0) {
471 				/*
472 				 * Since we're going to return an
473 				 * error, restore the futex to its
474 				 * previous, unowned state to avoid
475 				 * compounding the problem.
476 				 */
477 				(void)casuword32(args->uaddr, owner, old_owner);
478 			}
479 			break;
480 		}
481 
482 		/*
483 		 * Inconsistent state: OWNER_DIED is set and tid is not 0.
484 		 * Linux does some checks of futex state, we return EINVAL,
485 		 * as the user space can take care of this.
486 		 */
487 		if ((owner & FUTEX_OWNER_DIED) != FUTEX_UNOWNED) {
488 			error = EINVAL;
489 			break;
490 		}
491 
492 		if (try != 0) {
493 			error = EBUSY;
494 			break;
495 		}
496 
497 		/*
498 		 * If we caught a signal, we have retried and now
499 		 * exit immediately.
500 		 */
501 		if (error != 0)
502 			break;
503 
504 		umtxq_busy_unlocked(&uq->uq_key);
505 
506 		/*
507 		 * Set the contested bit so that a release in user space knows
508 		 * to use the system call for unlock. If this fails either some
509 		 * one else has acquired the lock or it has been released.
510 		 */
511 		rv = casueword32(args->uaddr, owner, &owner,
512 		    owner | FUTEX_WAITERS);
513 		if (rv == -1) {
514 			umtxq_unbusy_unlocked(&uq->uq_key);
515 			error = EFAULT;
516 			break;
517 		}
518 		if (rv == 1) {
519 			umtxq_unbusy_unlocked(&uq->uq_key);
520 			error = thread_check_susp(td, true);
521 			if (error != 0)
522 				break;
523 
524 			/*
525 			 * The lock changed and we need to retry or we
526 			 * lost a race to the thread unlocking the umtx.
527 			 */
528 			continue;
529 		}
530 
531 		/*
532 		 * Substitute Linux thread id by native thread id to
533 		 * avoid refactoring code of umtxq_sleep_pi().
534 		 */
535 		td1 = linux_tdfind(td, owner & FUTEX_TID_MASK, -1);
536 		if (td1 != NULL) {
537 			owner = td1->td_tid;
538 			PROC_UNLOCK(td1->td_proc);
539 		} else {
540 			umtxq_unbusy_unlocked(&uq->uq_key);
541 			error = EINVAL;
542 			break;
543 		}
544 
545 		umtxq_lock(&uq->uq_key);
546 
547 		/* We set the contested bit, sleep. */
548 		error = umtxq_sleep_pi(uq, pi, owner, "futexp",
549 		    args->ts == NULL ? NULL : &timo,
550 		    (args->flags & FUTEX_SHARED) != 0);
551 		if (error != 0)
552 			continue;
553 
554 		error = thread_check_susp(td, false);
555 		if (error != 0)
556 			break;
557 	}
558 
559 	umtxq_lock(&uq->uq_key);
560 	umtx_pi_unref(pi);
561 	umtxq_unlock(&uq->uq_key);
562 	umtx_key_release(&uq->uq_key);
563 	return (error);
564 }
565 
566 static int
linux_futex_unlock_pi(struct thread * td,bool rb,struct linux_futex_args * args)567 linux_futex_unlock_pi(struct thread *td, bool rb, struct linux_futex_args *args)
568 {
569 	struct linux_emuldata *em;
570 	struct umtx_key key;
571 	uint32_t old, owner, new_owner;
572 	int count, error;
573 
574 	em = em_find(td);
575 
576 	/*
577 	 * Make sure we own this mtx.
578 	 */
579 	error = fueword32(args->uaddr, &owner);
580 	if (error == -1)
581 		return (EFAULT);
582 	if (!rb && (owner & FUTEX_TID_MASK) != em->em_tid)
583 		return (EPERM);
584 
585 	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), &key);
586 	if (error != 0)
587 		return (error);
588 	umtxq_lock(&key);
589 	umtxq_busy(&key);
590 	error = umtx_pi_drop(td, &key, rb, &count);
591 	if (error != 0 || rb) {
592 		umtxq_unbusy(&key);
593 		umtxq_unlock(&key);
594 		umtx_key_release(&key);
595 		return (error);
596 	}
597 	umtxq_unlock(&key);
598 
599 	/*
600 	 * When unlocking the futex, it must be marked as unowned if
601 	 * there is zero or one thread only waiting for it.
602 	 * Otherwise, it must be marked as contested.
603 	 */
604 	if (count > 1)
605 		new_owner = FUTEX_WAITERS;
606 	else
607 		new_owner = FUTEX_UNOWNED;
608 
609 again:
610 	error = casueword32(args->uaddr, owner, &old, new_owner);
611 	if (error == 1) {
612 		error = thread_check_susp(td, false);
613 		if (error == 0)
614 			goto again;
615 	}
616 	umtxq_unbusy_unlocked(&key);
617 	umtx_key_release(&key);
618 	if (error == -1)
619 		return (EFAULT);
620 	if (error == 0 && old != owner)
621 		return (EINVAL);
622 	return (error);
623 }
624 
625 static int
linux_futex_wakeop(struct thread * td,struct linux_futex_args * args)626 linux_futex_wakeop(struct thread *td, struct linux_futex_args *args)
627 {
628 	struct umtx_key key, key2;
629 	int nrwake, op_ret, ret;
630 	int error, count;
631 
632 	if (args->uaddr == args->uaddr2)
633 		return (EINVAL);
634 
635 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
636 	if (error != 0)
637 		return (error);
638 	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
639 	if (error != 0) {
640 		umtx_key_release(&key);
641 		return (error);
642 	}
643 	umtxq_busy_unlocked(&key);
644 	error = futex_atomic_op(td, args->val3, args->uaddr2, &op_ret);
645 	umtxq_lock(&key);
646 	umtxq_unbusy(&key);
647 	if (error != 0)
648 		goto out;
649 	ret = umtxq_signal_mask(&key, args->val, args->val3);
650 	if (op_ret > 0) {
651 		nrwake = (int)(unsigned long)args->ts;
652 		umtxq_lock(&key2);
653 		count = umtxq_count(&key2);
654 		if (count > 0)
655 			ret += umtxq_signal_mask(&key2, nrwake, args->val3);
656 		else
657 			ret += umtxq_signal_mask(&key, nrwake, args->val3);
658 		umtxq_unlock(&key2);
659 	}
660 	td->td_retval[0] = ret;
661 out:
662 	umtxq_unlock(&key);
663 	umtx_key_release(&key2);
664 	umtx_key_release(&key);
665 	return (error);
666 }
667 
668 static int
linux_futex_requeue(struct thread * td,struct linux_futex_args * args)669 linux_futex_requeue(struct thread *td, struct linux_futex_args *args)
670 {
671 	int nrwake, nrrequeue;
672 	struct umtx_key key, key2;
673 	int error;
674 	uint32_t uval;
675 
676 	/*
677 	 * Linux allows this, we would not, it is an incorrect
678 	 * usage of declared ABI, so return EINVAL.
679 	 */
680 	if (args->uaddr == args->uaddr2)
681 		return (EINVAL);
682 
683 	nrrequeue = (int)(unsigned long)args->ts;
684 	nrwake = args->val;
685 	/*
686 	 * Sanity check to prevent signed integer overflow,
687 	 * see Linux CVE-2018-6927
688 	 */
689 	if (nrwake < 0 || nrrequeue < 0)
690 		return (EINVAL);
691 
692 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
693 	if (error != 0)
694 		return (error);
695 	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
696 	if (error != 0) {
697 		umtx_key_release(&key);
698 		return (error);
699 	}
700 	umtxq_busy_unlocked(&key);
701 	error = fueword32(args->uaddr, &uval);
702 	if (error != 0)
703 		error = EFAULT;
704 	else if (args->val3_compare == true && uval != args->val3)
705 		error = EWOULDBLOCK;
706 	umtxq_lock(&key);
707 	umtxq_unbusy(&key);
708 	if (error == 0) {
709 		umtxq_lock(&key2);
710 		td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue);
711 		umtxq_unlock(&key2);
712 	}
713 	umtxq_unlock(&key);
714 	umtx_key_release(&key2);
715 	umtx_key_release(&key);
716 	return (error);
717 }
718 
719 static int
linux_futex_wake(struct thread * td,struct linux_futex_args * args)720 linux_futex_wake(struct thread *td, struct linux_futex_args *args)
721 {
722 	struct umtx_key key;
723 	int error;
724 
725 	if (args->val3 == 0)
726 		return (EINVAL);
727 
728 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
729 	if (error != 0)
730 		return (error);
731 	umtxq_lock(&key);
732 	td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3);
733 	umtxq_unlock(&key);
734 	umtx_key_release(&key);
735 	return (0);
736 }
737 
738 static int
linux_futex_wait(struct thread * td,struct linux_futex_args * args)739 linux_futex_wait(struct thread *td, struct linux_futex_args *args)
740 {
741 	struct umtx_abs_timeout timo;
742 	struct umtx_q *uq;
743 	uint32_t uval;
744 	int error;
745 
746 	if (args->val3 == 0)
747 		error = EINVAL;
748 
749 	uq = td->td_umtxq;
750 	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args),
751 	    &uq->uq_key);
752 	if (error != 0)
753 		return (error);
754 	if (args->ts != NULL)
755 		linux_umtx_abs_timeout_init(&timo, args);
756 	umtxq_lock(&uq->uq_key);
757 	umtxq_busy(&uq->uq_key);
758 	uq->uq_bitset = args->val3;
759 	umtxq_insert(uq);
760 	umtxq_unlock(&uq->uq_key);
761 	error = fueword32(args->uaddr, &uval);
762 	if (error != 0)
763 		error = EFAULT;
764 	else if (uval != args->val)
765 		error = EWOULDBLOCK;
766 	umtxq_lock(&uq->uq_key);
767 	umtxq_unbusy(&uq->uq_key);
768 	if (error == 0) {
769 		error = umtxq_sleep(uq, "futex",
770 		    args->ts == NULL ? NULL : &timo);
771 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
772 			error = 0;
773 		else
774 			umtxq_remove(uq);
775 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
776 		umtxq_remove(uq);
777 	}
778 	umtxq_unlock(&uq->uq_key);
779 	umtx_key_release(&uq->uq_key);
780 	if (error == ERESTART)
781 		error = EINTR;
782 	return (error);
783 }
784 
785 static void
linux_umtx_abs_timeout_init(struct umtx_abs_timeout * timo,struct linux_futex_args * args)786 linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo,
787     struct linux_futex_args *args)
788 {
789 	int clockid, absolute;
790 
791 	/*
792 	 * The FUTEX_CLOCK_REALTIME option bit can be employed only with the
793 	 * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI, FUTEX_LOCK_PI2.
794 	 * For FUTEX_WAIT, timeout is interpreted as a relative value, for other
795 	 * futex operations timeout is interpreted as an absolute value.
796 	 * If FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel measures
797 	 * the timeout against the CLOCK_REALTIME clock, otherwise the kernel
798 	 * measures the timeout against the CLOCK_MONOTONIC clock.
799 	 */
800 	clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC;
801 	absolute = args->op == LINUX_FUTEX_WAIT ? false : true;
802 	umtx_abs_timeout_init(timo, clockid, absolute, args->ts);
803 }
804 
805 int
linux_sys_futex(struct thread * td,struct linux_sys_futex_args * args)806 linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
807 {
808 	struct linux_futex_args fargs = {
809 		.uaddr = args->uaddr,
810 		.op = args->op,
811 		.val = args->val,
812 		.ts = NULL,
813 		.uaddr2 = args->uaddr2,
814 		.val3 = args->val3,
815 		.val3_compare = true,
816 	};
817 	int error;
818 
819 	switch (args->op & LINUX_FUTEX_CMD_MASK) {
820 	case LINUX_FUTEX_WAIT:
821 	case LINUX_FUTEX_WAIT_BITSET:
822 	case LINUX_FUTEX_LOCK_PI:
823 	case LINUX_FUTEX_LOCK_PI2:
824 		if (args->timeout != NULL) {
825 			error = linux_get_timespec(&fargs.kts, args->timeout);
826 			if (error != 0)
827 				return (error);
828 			fargs.ts = &fargs.kts;
829 		}
830 		break;
831 	default:
832 		fargs.ts = PTRIN(args->timeout);
833 	}
834 	return (linux_futex(td, &fargs));
835 }
836 
837 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
838 int
linux_sys_futex_time64(struct thread * td,struct linux_sys_futex_time64_args * args)839 linux_sys_futex_time64(struct thread *td,
840     struct linux_sys_futex_time64_args *args)
841 {
842 	struct linux_futex_args fargs = {
843 		.uaddr = args->uaddr,
844 		.op = args->op,
845 		.val = args->val,
846 		.ts = NULL,
847 		.uaddr2 = args->uaddr2,
848 		.val3 = args->val3,
849 		.val3_compare = true,
850 	};
851 	int error;
852 
853 	switch (args->op & LINUX_FUTEX_CMD_MASK) {
854 	case LINUX_FUTEX_WAIT:
855 	case LINUX_FUTEX_WAIT_BITSET:
856 	case LINUX_FUTEX_LOCK_PI:
857 	case LINUX_FUTEX_LOCK_PI2:
858 		if (args->timeout != NULL) {
859 			error = linux_get_timespec64(&fargs.kts, args->timeout);
860 			if (error != 0)
861 				return (error);
862 			fargs.ts = &fargs.kts;
863 		}
864 		break;
865 	default:
866 		fargs.ts = PTRIN(args->timeout);
867 	}
868 	return (linux_futex(td, &fargs));
869 }
870 #endif
871 
872 int
linux_set_robust_list(struct thread * td,struct linux_set_robust_list_args * args)873 linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
874 {
875 	struct linux_emuldata *em;
876 
877 	if (args->len != sizeof(struct linux_robust_list_head))
878 		return (EINVAL);
879 
880 	em = em_find(td);
881 	em->robust_futexes = args->head;
882 
883 	return (0);
884 }
885 
886 int
linux_get_robust_list(struct thread * td,struct linux_get_robust_list_args * args)887 linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
888 {
889 	struct linux_emuldata *em;
890 	struct linux_robust_list_head *head;
891 	l_size_t len;
892 	struct thread *td2;
893 	int error;
894 
895 	if (!args->pid) {
896 		em = em_find(td);
897 		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
898 		head = em->robust_futexes;
899 	} else {
900 		td2 = linux_tdfind(td, args->pid, -1);
901 		if (td2 == NULL)
902 			return (ESRCH);
903 		if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) {
904 			PROC_UNLOCK(td2->td_proc);
905 			return (EPERM);
906 		}
907 
908 		em = em_find(td2);
909 		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
910 		/* XXX: ptrace? */
911 		if (priv_check(td, PRIV_CRED_SETUID) ||
912 		    priv_check(td, PRIV_CRED_SETEUID) ||
913 		    p_candebug(td, td2->td_proc)) {
914 			PROC_UNLOCK(td2->td_proc);
915 			return (EPERM);
916 		}
917 		head = em->robust_futexes;
918 
919 		PROC_UNLOCK(td2->td_proc);
920 	}
921 
922 	len = sizeof(struct linux_robust_list_head);
923 	error = copyout(&len, args->len, sizeof(l_size_t));
924 	if (error != 0)
925 		return (EFAULT);
926 
927 	return (copyout(&head, args->head, sizeof(l_uintptr_t)));
928 }
929 
930 static int
handle_futex_death(struct thread * td,struct linux_emuldata * em,uint32_t * uaddr,unsigned int pi,bool pending_op)931 handle_futex_death(struct thread *td, struct linux_emuldata *em, uint32_t *uaddr,
932     unsigned int pi, bool pending_op)
933 {
934 	uint32_t uval, nval, mval;
935 	int error;
936 
937 retry:
938 	error = fueword32(uaddr, &uval);
939 	if (error != 0)
940 		return (EFAULT);
941 
942 	/*
943 	 * Special case for regular (non PI) futexes. The unlock path in
944 	 * user space has two race scenarios:
945 	 *
946 	 * 1. The unlock path releases the user space futex value and
947 	 *    before it can execute the futex() syscall to wake up
948 	 *    waiters it is killed.
949 	 *
950 	 * 2. A woken up waiter is killed before it can acquire the
951 	 *    futex in user space.
952 	 *
953 	 * In both cases the TID validation below prevents a wakeup of
954 	 * potential waiters which can cause these waiters to block
955 	 * forever.
956 	 *
957 	 * In both cases it is safe to attempt waking up a potential
958 	 * waiter without touching the user space futex value and trying
959 	 * to set the OWNER_DIED bit.
960 	 */
961 	if (pending_op && !pi && !uval) {
962 		(void)futex_wake(td, uaddr, 1, true);
963 		return (0);
964 	}
965 
966 	if ((uval & FUTEX_TID_MASK) == em->em_tid) {
967 		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
968 		error = casueword32(uaddr, uval, &nval, mval);
969 		if (error == -1)
970 			return (EFAULT);
971 		if (error == 1) {
972 			error = thread_check_susp(td, false);
973 			if (error != 0)
974 				return (error);
975 			goto retry;
976 		}
977 
978 		if (!pi && (uval & FUTEX_WAITERS)) {
979 			error = futex_wake(td, uaddr, 1, true);
980 			if (error != 0)
981 				return (error);
982 		} else if (pi && (uval & FUTEX_WAITERS)) {
983 			error = futex_wake_pi(td, uaddr, true);
984 			if (error != 0)
985 				return (error);
986 		}
987 	}
988 
989 	return (0);
990 }
991 
992 static int
fetch_robust_entry(struct linux_robust_list ** entry,struct linux_robust_list ** head,unsigned int * pi)993 fetch_robust_entry(struct linux_robust_list **entry,
994     struct linux_robust_list **head, unsigned int *pi)
995 {
996 	l_ulong uentry;
997 	int error;
998 
999 	error = copyin((const void *)head, &uentry, sizeof(uentry));
1000 	if (error != 0)
1001 		return (EFAULT);
1002 
1003 	*entry = (void *)(uentry & ~1UL);
1004 	*pi = uentry & 1;
1005 
1006 	return (0);
1007 }
1008 
1009 #define	LINUX_HANDLE_DEATH_PENDING	true
1010 #define	LINUX_HANDLE_DEATH_LIST		false
1011 
1012 /* This walks the list of robust futexes releasing them. */
1013 void
release_futexes(struct thread * td,struct linux_emuldata * em)1014 release_futexes(struct thread *td, struct linux_emuldata *em)
1015 {
1016 	struct linux_robust_list_head *head;
1017 	struct linux_robust_list *entry, *next_entry, *pending;
1018 	unsigned int limit = 2048, pi, next_pi, pip;
1019 	uint32_t *uaddr;
1020 	l_long futex_offset;
1021 	int error;
1022 
1023 	head = em->robust_futexes;
1024 	if (head == NULL)
1025 		return;
1026 
1027 	if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi))
1028 		return;
1029 
1030 	error = copyin(&head->futex_offset, &futex_offset,
1031 	    sizeof(futex_offset));
1032 	if (error != 0)
1033 		return;
1034 
1035 	if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip))
1036 		return;
1037 
1038 	while (entry != &head->list) {
1039 		error = fetch_robust_entry(&next_entry, PTRIN(&entry->next),
1040 		    &next_pi);
1041 
1042 		/*
1043 		 * A pending lock might already be on the list, so
1044 		 * don't process it twice.
1045 		 */
1046 		if (entry != pending) {
1047 			uaddr = (uint32_t *)((caddr_t)entry + futex_offset);
1048 			if (handle_futex_death(td, em, uaddr, pi,
1049 			    LINUX_HANDLE_DEATH_LIST))
1050 				return;
1051 		}
1052 		if (error != 0)
1053 			return;
1054 
1055 		entry = next_entry;
1056 		pi = next_pi;
1057 
1058 		if (!--limit)
1059 			break;
1060 
1061 		sched_relinquish(curthread);
1062 	}
1063 
1064 	if (pending) {
1065 		uaddr = (uint32_t *)((caddr_t)pending + futex_offset);
1066 		(void)handle_futex_death(td, em, uaddr, pip,
1067 		    LINUX_HANDLE_DEATH_PENDING);
1068 	}
1069 }
1070