1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2009-2021 Dmitry Chagin <dchagin@FreeBSD.org>
5 * Copyright (c) 2008 Roman Divacky
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/param.h>
30 #include <sys/imgact.h>
31 #include <sys/imgact_elf.h>
32 #include <sys/ktr.h>
33 #include <sys/lock.h>
34 #include <sys/mutex.h>
35 #include <sys/priv.h>
36 #include <sys/proc.h>
37 #include <sys/sched.h>
38 #include <sys/sysent.h>
39 #include <sys/vnode.h>
40 #include <sys/umtxvar.h>
41
42 #ifdef COMPAT_LINUX32
43 #include <machine/../linux32/linux.h>
44 #include <machine/../linux32/linux32_proto.h>
45 #else
46 #include <machine/../linux/linux.h>
47 #include <machine/../linux/linux_proto.h>
48 #endif
49 #include <compat/linux/linux_emul.h>
50 #include <compat/linux/linux_futex.h>
51 #include <compat/linux/linux_misc.h>
52 #include <compat/linux/linux_time.h>
53 #include <compat/linux/linux_util.h>
54
55 #define FUTEX_SHARED 0x8 /* shared futex */
56 #define FUTEX_UNOWNED 0
57
58 #define GET_SHARED(a) (a->flags & FUTEX_SHARED) ? AUTO_SHARE : THREAD_SHARE
59
60 static int futex_atomic_op(struct thread *, int, uint32_t *, int *);
61 static int handle_futex_death(struct thread *td, struct linux_emuldata *,
62 uint32_t *, unsigned int, bool);
63 static int fetch_robust_entry(struct linux_robust_list **,
64 struct linux_robust_list **, unsigned int *);
65
/*
 * Decoded argument bundle for one futex(2) operation.  Filled in by the
 * system call wrappers (linux_sys_futex()/linux_sys_futex_time64()) and
 * further massaged by linux_futex() before being handed to the
 * per-operation handlers below.
 */
struct linux_futex_args {
	uint32_t	*uaddr;		/* userspace futex word */
	int32_t		op;		/* futex command, modifier flags stripped */
	uint32_t	flags;		/* FUTEX_SHARED or 0 (private) */
	bool		clockrt;	/* timeout measured on CLOCK_REALTIME */
	uint32_t	val;		/* op-dependent value (expected word / nr to wake) */
	struct timespec	*ts;		/* timeout; some ops reuse this slot as an integer */
	uint32_t	*uaddr2;	/* second futex word (requeue / wake-op) */
	uint32_t	val3;		/* bitset or secondary op value */
	bool		val3_compare;	/* requeue: compare *uaddr against val3 first */
	struct timespec	kts;		/* kernel copy backing 'ts' when copied in */
};
78
79 static inline int futex_key_get(const void *, int, int, struct umtx_key *);
80 static void linux_umtx_abs_timeout_init(struct umtx_abs_timeout *,
81 struct linux_futex_args *);
82 static int linux_futex(struct thread *, struct linux_futex_args *);
83 static int linux_futex_wait(struct thread *, struct linux_futex_args *);
84 static int linux_futex_wake(struct thread *, struct linux_futex_args *);
85 static int linux_futex_requeue(struct thread *, struct linux_futex_args *);
86 static int linux_futex_wakeop(struct thread *, struct linux_futex_args *);
87 static int linux_futex_lock_pi(struct thread *, bool, struct linux_futex_args *);
88 static int linux_futex_unlock_pi(struct thread *, bool,
89 struct linux_futex_args *);
90 static int futex_wake_pi(struct thread *, uint32_t *, bool);
91
92 static int
futex_key_get(const void * uaddr,int type,int share,struct umtx_key * key)93 futex_key_get(const void *uaddr, int type, int share, struct umtx_key *key)
94 {
95
96 /* Check that futex address is a 32bit aligned. */
97 if (!__is_aligned(uaddr, sizeof(uint32_t)))
98 return (EINVAL);
99 return (umtx_key_get(uaddr, type, share, key));
100 }
101
102 int
futex_wake(struct thread * td,uint32_t * uaddr,int val,bool shared)103 futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared)
104 {
105 struct linux_futex_args args;
106
107 bzero(&args, sizeof(args));
108 args.op = LINUX_FUTEX_WAKE;
109 args.uaddr = uaddr;
110 args.flags = shared == true ? FUTEX_SHARED : 0;
111 args.val = val;
112 args.val3 = FUTEX_BITSET_MATCH_ANY;
113
114 return (linux_futex_wake(td, &args));
115 }
116
117 static int
futex_wake_pi(struct thread * td,uint32_t * uaddr,bool shared)118 futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared)
119 {
120 struct linux_futex_args args;
121
122 bzero(&args, sizeof(args));
123 args.op = LINUX_FUTEX_UNLOCK_PI;
124 args.uaddr = uaddr;
125 args.flags = shared == true ? FUTEX_SHARED : 0;
126
127 return (linux_futex_unlock_pi(td, true, &args));
128 }
129
130 static int
futex_atomic_op(struct thread * td,int encoded_op,uint32_t * uaddr,int * res)131 futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr,
132 int *res)
133 {
134 int op = (encoded_op >> 28) & 7;
135 int cmp = (encoded_op >> 24) & 15;
136 int oparg = (encoded_op << 8) >> 20;
137 int cmparg = (encoded_op << 20) >> 20;
138 int oldval = 0, ret;
139
140 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
141 oparg = 1 << oparg;
142
143 switch (op) {
144 case FUTEX_OP_SET:
145 ret = futex_xchgl(oparg, uaddr, &oldval);
146 break;
147 case FUTEX_OP_ADD:
148 ret = futex_addl(oparg, uaddr, &oldval);
149 break;
150 case FUTEX_OP_OR:
151 ret = futex_orl(oparg, uaddr, &oldval);
152 break;
153 case FUTEX_OP_ANDN:
154 ret = futex_andl(~oparg, uaddr, &oldval);
155 break;
156 case FUTEX_OP_XOR:
157 ret = futex_xorl(oparg, uaddr, &oldval);
158 break;
159 default:
160 ret = ENOSYS;
161 break;
162 }
163
164 if (ret != 0)
165 return (ret);
166
167 switch (cmp) {
168 case FUTEX_OP_CMP_EQ:
169 *res = (oldval == cmparg);
170 break;
171 case FUTEX_OP_CMP_NE:
172 *res = (oldval != cmparg);
173 break;
174 case FUTEX_OP_CMP_LT:
175 *res = (oldval < cmparg);
176 break;
177 case FUTEX_OP_CMP_GE:
178 *res = (oldval >= cmparg);
179 break;
180 case FUTEX_OP_CMP_LE:
181 *res = (oldval <= cmparg);
182 break;
183 case FUTEX_OP_CMP_GT:
184 *res = (oldval > cmparg);
185 break;
186 default:
187 ret = ENOSYS;
188 }
189
190 return (ret);
191 }
192
/*
 * Central dispatcher for all futex(2) operations: strips the modifier
 * flags out of args->op and forwards to the per-operation handler.
 */
static int
linux_futex(struct thread *td, struct linux_futex_args *args)
{
	struct linux_pemuldata *pem;
	struct proc *p;

	/*
	 * PRIVATE means the futex is process-private, so the cheaper
	 * non-shared umtx key can be used.
	 */
	if (args->op & LINUX_FUTEX_PRIVATE_FLAG) {
		args->flags = 0;
		args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
	} else
		args->flags = FUTEX_SHARED;

	args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
	args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;

	/*
	 * As on Linux, FUTEX_CLOCK_REALTIME is accepted only for the
	 * three operations listed below.
	 */
	if (args->clockrt &&
	    args->op != LINUX_FUTEX_WAIT_BITSET &&
	    args->op != LINUX_FUTEX_WAIT_REQUEUE_PI &&
	    args->op != LINUX_FUTEX_LOCK_PI2)
		return (ENOSYS);

	switch (args->op) {
	case LINUX_FUTEX_WAIT:
		/* Plain WAIT is WAIT_BITSET with an all-ones bitset. */
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAIT_BITSET:
		LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wait(td, args));

	case LINUX_FUTEX_WAKE:
		/* Plain WAKE is WAKE_BITSET with an all-ones bitset. */
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAKE_BITSET:
		LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wake(td, args));

	case LINUX_FUTEX_REQUEUE:
		/*
		 * Glibc does not use this operation since version 2.3.3,
		 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation.
		 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when
		 * FUTEX_REQUEUE returned EINVAL.
		 */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
			/* Warn once per process about the deprecated op. */
			linux_msg(td, "unsupported FUTEX_REQUEUE");
			pem->flags |= LINUX_XDEPR_REQUEUEOP;
		}

		/*
		 * The above is true, however musl libc does make use of the
		 * futex requeue operation, allow operation for brands which
		 * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags.
		 */
		p = td->td_proc;
		Elf_Brandinfo *bi = p->p_elf_brandinfo;
		if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0)
			return (EINVAL);
		/* Unlike CMP_REQUEUE, REQUEUE does not compare the word. */
		args->val3_compare = false;
		/* FALLTHROUGH */

	case LINUX_FUTEX_CMP_REQUEUE:
		LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
		    "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x",
		    args->uaddr, args->val, args->val3, args->uaddr2,
		    args->ts);

		return (linux_futex_requeue(td, args));

	case LINUX_FUTEX_WAKE_OP:
		LINUX_CTR5(sys_futex, "WAKE_OP "
		    "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x",
		    args->uaddr, args->val, args->uaddr2, args->val3,
		    args->ts);

		return (linux_futex_wakeop(td, args));

	case LINUX_FUTEX_LOCK_PI:
		/* LOCK_PI always measures its timeout on CLOCK_REALTIME. */
		args->clockrt = true;
		/* FALLTHROUGH */

	case LINUX_FUTEX_LOCK_PI2:
		LINUX_CTR2(sys_futex, "LOCKPI uaddr %p val 0x%x",
		    args->uaddr, args->val);

		return (linux_futex_lock_pi(td, false, args));

	case LINUX_FUTEX_UNLOCK_PI:
		LINUX_CTR1(sys_futex, "UNLOCKPI uaddr %p",
		    args->uaddr);

		return (linux_futex_unlock_pi(td, false, args));

	case LINUX_FUTEX_TRYLOCK_PI:
		LINUX_CTR1(sys_futex, "TRYLOCKPI uaddr %p",
		    args->uaddr);

		/* try == true: fail with EBUSY instead of sleeping. */
		return (linux_futex_lock_pi(td, true, args));

	/*
	 * Current implementation of FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI
	 * can't be used anymore to implement conditional variables.
	 * A detailed explanation can be found here:
	 *
	 * https://sourceware.org/bugzilla/show_bug.cgi?id=13165
	 * and here http://austingroupbugs.net/view.php?id=609
	 *
	 * And since commit
	 * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=ed19993b5b0d05d62cc883571519a67dae481a14
	 * glibc does not use them.
	 */
	case LINUX_FUTEX_WAIT_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	case LINUX_FUTEX_CMP_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	default:
		linux_msg(td, "unsupported futex op %d", args->op);
		return (ENOSYS);
	}
}
333
334 /*
335 * pi protocol:
336 * - 0 futex word value means unlocked.
337 * - TID futex word value means locked.
338 * Userspace uses atomic ops to lock/unlock these futexes without entering the
339 * kernel. If the lock-acquire fastpath fails, (transition from 0 to TID fails),
340 * then FUTEX_LOCK_PI is called.
 * The kernel atomically sets the FUTEX_WAITERS bit in the futex word and,
 * if there are no other waiters, looks up the thread that owns the futex
 * (the one that stored its own TID into the futex word) and makes that
 * thread the owner of the internal pi-aware lock object (mutex). The kernel
 * then tries to lock the internal lock object, on which it blocks. Once it
 * returns, it has the mutex acquired, and it sets the futex value to its own
 * TID and returns (futex value contains FUTEX_WAITERS|TID).
348 * The unlock fastpath would fail (because the FUTEX_WAITERS bit is set) and
349 * FUTEX_UNLOCK_PI will be called.
350 * If a futex is found to be held at exit time, the kernel sets the OWNER_DIED
351 * bit of the futex word and wakes up the next futex waiter (if any), WAITERS
352 * bit is preserved (if any).
 * If the OWNER_DIED bit is set, the kernel sanity checks the futex word value
 * against the internal futex state and, if they are consistent, acquires the
 * futex.
355 */
/*
 * Acquire a PI futex (see the pi protocol description above).  With
 * 'try' set the function never sleeps and returns EBUSY when the futex
 * is held by another thread.
 */
static int
linux_futex_lock_pi(struct thread *td, bool try, struct linux_futex_args *args)
{
	struct umtx_abs_timeout timo;
	struct linux_emuldata *em;
	struct umtx_pi *pi, *new_pi;
	struct thread *td1;
	struct umtx_q *uq;
	int error, rv;
	uint32_t owner, old_owner;

	em = em_find(td);
	uq = td->td_umtxq;
	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args),
	    &uq->uq_key);
	if (error != 0)
		return (error);
	if (args->ts != NULL)
		linux_umtx_abs_timeout_init(&timo, args);

	/*
	 * Find or create the kernel-side PI object for this key.  The
	 * M_NOWAIT attempt avoids dropping the queue lock; if it fails
	 * we allocate with M_WAITOK and re-check for a racing insert.
	 */
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);
	for (;;) {
		/* Try uncontested case first. */
		rv = casueword32(args->uaddr, FUTEX_UNOWNED, &owner, em->em_tid);
		/* The acquire succeeded. */
		if (rv == 0) {
			error = 0;
			break;
		}
		if (rv == -1) {
			error = EFAULT;
			break;
		}

		/*
		 * Nobody owns it, but the acquire failed. This can happen
		 * with ll/sc atomic.
		 */
		if (owner == FUTEX_UNOWNED) {
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
			continue;
		}

		/*
		 * Avoid overwriting a possible error from sleep due
		 * to the pending signal with suspension check result.
		 */
		if (error == 0) {
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
		}

		/* The futex word at *uaddr is already locked by the caller. */
		if ((owner & FUTEX_TID_MASK) == em->em_tid) {
			error = EDEADLK;
			break;
		}

		/*
		 * Futex owner died, handle_futex_death() set the OWNER_DIED bit
		 * and clear tid. Try to acquire it.
		 */
		if ((owner & FUTEX_TID_MASK) == FUTEX_UNOWNED) {
			/* Keep WAITERS/OWNER_DIED, install our own TID. */
			old_owner = owner;
			owner = owner & (FUTEX_WAITERS | FUTEX_OWNER_DIED);
			owner |= em->em_tid;
			rv = casueword32(args->uaddr, old_owner, &owner, owner);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
			if (rv == 1) {
				if (error == 0) {
					error = thread_check_susp(td, true);
					if (error != 0)
						break;
				}

				/*
				 * If this failed the lock could
				 * changed, restart.
				 */
				continue;
			}

			/* Take over the dead owner's PI state. */
			umtxq_lock(&uq->uq_key);
			umtxq_busy(&uq->uq_key);
			error = umtx_pi_claim(pi, td);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			if (error != 0) {
				/*
				 * Since we're going to return an
				 * error, restore the futex to its
				 * previous, unowned state to avoid
				 * compounding the problem.
				 */
				(void)casuword32(args->uaddr, owner, old_owner);
			}
			break;
		}

		/*
		 * Inconsistent state: OWNER_DIED is set and tid is not 0.
		 * Linux does some checks of futex state, we return EINVAL,
		 * as the user space can take care of this.
		 */
		if ((owner & FUTEX_OWNER_DIED) != FUTEX_UNOWNED) {
			error = EINVAL;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space knows
		 * to use the system call for unlock. If this fails either some
		 * one else has acquired the lock or it has been released.
		 */
		rv = casueword32(args->uaddr, owner, &owner,
		    owner | FUTEX_WAITERS);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		if (rv == 1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = thread_check_susp(td, true);
			if (error != 0)
				break;

			/*
			 * The lock changed and we need to retry or we
			 * lost a race to the thread unlocking the umtx.
			 */
			continue;
		}

		/*
		 * Substitute Linux thread id by native thread id to
		 * avoid refactoring code of umtxq_sleep_pi().
		 */
		td1 = linux_tdfind(td, owner & FUTEX_TID_MASK, -1);
		if (td1 != NULL) {
			owner = td1->td_tid;
			PROC_UNLOCK(td1->td_proc);
		} else {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EINVAL;
			break;
		}

		umtxq_lock(&uq->uq_key);

		/* We set the contested bit, sleep. */
		error = umtxq_sleep_pi(uq, pi, owner, "futexp",
		    args->ts == NULL ? NULL : &timo,
		    (args->flags & FUTEX_SHARED) != 0);
		if (error != 0)
			continue;

		error = thread_check_susp(td, false);
		if (error != 0)
			break;
	}

	/* Drop our reference on the PI object and release the key. */
	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
567
/*
 * Release a PI futex.  With 'rb' set the call comes from the robust-list
 * exit path: the TID ownership check is skipped and only the kernel-side
 * PI state is dropped; the futex word itself is left to the caller.
 */
static int
linux_futex_unlock_pi(struct thread *td, bool rb, struct linux_futex_args *args)
{
	struct linux_emuldata *em;
	struct umtx_key key;
	uint32_t old, owner, new_owner;
	int count, error;

	em = em_find(td);

	/*
	 * Make sure we own this mtx.
	 */
	error = fueword32(args->uaddr, &owner);
	if (error == -1)
		return (EFAULT);
	if (!rb && (owner & FUTEX_TID_MASK) != em->em_tid)
		return (EPERM);

	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	/* Drop kernel PI state; 'count' is the number of remaining waiters. */
	error = umtx_pi_drop(td, &key, rb, &count);
	if (error != 0 || rb) {
		umtxq_unbusy(&key);
		umtxq_unlock(&key);
		umtx_key_release(&key);
		return (error);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the futex, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	if (count > 1)
		new_owner = FUTEX_WAITERS;
	else
		new_owner = FUTEX_UNOWNED;

again:
	/* CAS in the new owner; rv 1 means the CAS must be retried. */
	error = casueword32(args->uaddr, owner, &old, new_owner);
	if (error == 1) {
		error = thread_check_susp(td, false);
		if (error == 0)
			goto again;
	}
	umtxq_unbusy_unlocked(&key);
	umtx_key_release(&key);
	if (error == -1)
		return (EFAULT);
	/* The word changed under us: userspace raced this unlock. */
	if (error == 0 && old != owner)
		return (EINVAL);
	return (error);
}
626
/*
 * FUTEX_WAKE_OP: atomically apply the operation encoded in val3 to the
 * futex word at uaddr2, wake up to 'val' waiters on uaddr, and if the
 * encoded comparison on the old uaddr2 value succeeded, wake additional
 * waiters (count reused from the 'ts' argument slot).
 */
static int
linux_futex_wakeop(struct thread *td, struct linux_futex_args *args)
{
	struct umtx_key key, key2;
	int nrwake, op_ret, ret;
	int error, count;

	if (args->uaddr == args->uaddr2)
		return (EINVAL);

	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
	if (error != 0) {
		umtx_key_release(&key);
		return (error);
	}
	/* Hold off concurrent wakers while the op runs on uaddr2. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	error = futex_atomic_op(td, args->val3, args->uaddr2, &op_ret);
	umtxq_lock(&key);
	umtxq_unbusy(&key);
	if (error != 0)
		goto out;
	ret = umtxq_signal_mask(&key, args->val, args->val3);
	if (op_ret > 0) {
		/* Second wake count travels in the timeout argument slot. */
		nrwake = (int)(unsigned long)args->ts;
		umtxq_lock(&key2);
		count = umtxq_count(&key2);
		if (count > 0)
			ret += umtxq_signal_mask(&key2, nrwake, args->val3);
		else
			ret += umtxq_signal_mask(&key, nrwake, args->val3);
		umtxq_unlock(&key2);
	}
	/* Return the total number of threads woken. */
	td->td_retval[0] = ret;
out:
	umtxq_unlock(&key);
	umtx_key_release(&key2);
	umtx_key_release(&key);
	return (error);
}
671
/*
 * FUTEX_REQUEUE / FUTEX_CMP_REQUEUE: wake up to 'val' waiters on uaddr
 * and move up to nrrequeue others to uaddr2.  For CMP_REQUEUE
 * (val3_compare == true) the futex word must still equal val3.
 */
static int
linux_futex_requeue(struct thread *td, struct linux_futex_args *args)
{
	int nrwake, nrrequeue;
	struct umtx_key key, key2;
	int error;
	uint32_t uval;

	/*
	 * Linux allows this, we would not, it is an incorrect
	 * usage of declared ABI, so return EINVAL.
	 */
	if (args->uaddr == args->uaddr2)
		return (EINVAL);

	/* The requeue count travels in the timeout argument slot. */
	nrrequeue = (int)(unsigned long)args->ts;
	nrwake = args->val;
	/*
	 * Sanity check to prevent signed integer overflow,
	 * see Linux CVE-2018-6927
	 */
	if (nrwake < 0 || nrrequeue < 0)
		return (EINVAL);

	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
	if (error != 0) {
		umtx_key_release(&key);
		return (error);
	}
	/* Hold off wakers while we validate the futex word. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	error = fueword32(args->uaddr, &uval);
	if (error != 0)
		error = EFAULT;
	else if (args->val3_compare == true && uval != args->val3)
		error = EWOULDBLOCK;
	umtxq_lock(&key);
	umtxq_unbusy(&key);
	if (error == 0) {
		umtxq_lock(&key2);
		/* Returns the number of threads woken plus requeued. */
		td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue);
		umtxq_unlock(&key2);
	}
	umtxq_unlock(&key);
	umtx_key_release(&key2);
	umtx_key_release(&key);
	return (error);
}
724
725 static int
linux_futex_wake(struct thread * td,struct linux_futex_args * args)726 linux_futex_wake(struct thread *td, struct linux_futex_args *args)
727 {
728 struct umtx_key key;
729 int error;
730
731 if (args->val3 == 0)
732 return (EINVAL);
733
734 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
735 if (error != 0)
736 return (error);
737 umtxq_lock(&key);
738 td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3);
739 umtxq_unlock(&key);
740 umtx_key_release(&key);
741 return (0);
742 }
743
744 static int
linux_futex_wait(struct thread * td,struct linux_futex_args * args)745 linux_futex_wait(struct thread *td, struct linux_futex_args *args)
746 {
747 struct umtx_abs_timeout timo;
748 struct umtx_q *uq;
749 uint32_t uval;
750 int error;
751
752 if (args->val3 == 0)
753 error = EINVAL;
754
755 uq = td->td_umtxq;
756 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args),
757 &uq->uq_key);
758 if (error != 0)
759 return (error);
760 if (args->ts != NULL)
761 linux_umtx_abs_timeout_init(&timo, args);
762 umtxq_lock(&uq->uq_key);
763 umtxq_busy(&uq->uq_key);
764 uq->uq_bitset = args->val3;
765 umtxq_insert(uq);
766 umtxq_unlock(&uq->uq_key);
767 error = fueword32(args->uaddr, &uval);
768 if (error != 0)
769 error = EFAULT;
770 else if (uval != args->val)
771 error = EWOULDBLOCK;
772 umtxq_lock(&uq->uq_key);
773 umtxq_unbusy(&uq->uq_key);
774 if (error == 0) {
775 error = umtxq_sleep(uq, "futex",
776 args->ts == NULL ? NULL : &timo);
777 if ((uq->uq_flags & UQF_UMTXQ) == 0)
778 error = 0;
779 else
780 umtxq_remove(uq);
781 } else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
782 umtxq_remove(uq);
783 }
784 umtxq_unlock(&uq->uq_key);
785 umtx_key_release(&uq->uq_key);
786 if (error == ERESTART)
787 error = EINTR;
788 return (error);
789 }
790
791 static void
linux_umtx_abs_timeout_init(struct umtx_abs_timeout * timo,struct linux_futex_args * args)792 linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo,
793 struct linux_futex_args *args)
794 {
795 int clockid, absolute;
796
797 /*
798 * The FUTEX_CLOCK_REALTIME option bit can be employed only with the
799 * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI, FUTEX_LOCK_PI2.
800 * For FUTEX_WAIT, timeout is interpreted as a relative value, for other
801 * futex operations timeout is interpreted as an absolute value.
802 * If FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel measures
803 * the timeout against the CLOCK_REALTIME clock, otherwise the kernel
804 * measures the timeout against the CLOCK_MONOTONIC clock.
805 */
806 clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC;
807 absolute = args->op == LINUX_FUTEX_WAIT ? false : true;
808 umtx_abs_timeout_init(timo, clockid, absolute, args->ts);
809 }
810
811 int
linux_sys_futex(struct thread * td,struct linux_sys_futex_args * args)812 linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
813 {
814 struct linux_futex_args fargs = {
815 .uaddr = args->uaddr,
816 .op = args->op,
817 .val = args->val,
818 .ts = NULL,
819 .uaddr2 = args->uaddr2,
820 .val3 = args->val3,
821 .val3_compare = true,
822 };
823 int error;
824
825 switch (args->op & LINUX_FUTEX_CMD_MASK) {
826 case LINUX_FUTEX_WAIT:
827 case LINUX_FUTEX_WAIT_BITSET:
828 case LINUX_FUTEX_LOCK_PI:
829 case LINUX_FUTEX_LOCK_PI2:
830 if (args->timeout != NULL) {
831 error = linux_get_timespec(&fargs.kts, args->timeout);
832 if (error != 0)
833 return (error);
834 fargs.ts = &fargs.kts;
835 }
836 break;
837 default:
838 fargs.ts = PTRIN(args->timeout);
839 }
840 return (linux_futex(td, &fargs));
841 }
842
843 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
844 int
linux_sys_futex_time64(struct thread * td,struct linux_sys_futex_time64_args * args)845 linux_sys_futex_time64(struct thread *td,
846 struct linux_sys_futex_time64_args *args)
847 {
848 struct linux_futex_args fargs = {
849 .uaddr = args->uaddr,
850 .op = args->op,
851 .val = args->val,
852 .ts = NULL,
853 .uaddr2 = args->uaddr2,
854 .val3 = args->val3,
855 .val3_compare = true,
856 };
857 int error;
858
859 switch (args->op & LINUX_FUTEX_CMD_MASK) {
860 case LINUX_FUTEX_WAIT:
861 case LINUX_FUTEX_WAIT_BITSET:
862 case LINUX_FUTEX_LOCK_PI:
863 case LINUX_FUTEX_LOCK_PI2:
864 if (args->timeout != NULL) {
865 error = linux_get_timespec64(&fargs.kts, args->timeout);
866 if (error != 0)
867 return (error);
868 fargs.ts = &fargs.kts;
869 }
870 break;
871 default:
872 fargs.ts = PTRIN(args->timeout);
873 }
874 return (linux_futex(td, &fargs));
875 }
876 #endif
877
878 int
linux_set_robust_list(struct thread * td,struct linux_set_robust_list_args * args)879 linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
880 {
881 struct linux_emuldata *em;
882
883 if (args->len != sizeof(struct linux_robust_list_head))
884 return (EINVAL);
885
886 em = em_find(td);
887 em->robust_futexes = args->head;
888
889 return (0);
890 }
891
/*
 * get_robust_list(2): fetch the registered robust-list head of the
 * calling thread (pid == 0) or of another Linux thread, subject to the
 * privilege checks below.
 */
int
linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
{
	struct linux_emuldata *em;
	struct linux_robust_list_head *head;
	l_size_t len;
	struct thread *td2;
	int error;

	if (!args->pid) {
		em = em_find(td);
		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
		head = em->robust_futexes;
	} else {
		/* linux_tdfind() returns with the target proc locked. */
		td2 = linux_tdfind(td, args->pid, -1);
		if (td2 == NULL)
			return (ESRCH);
		if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) {
			PROC_UNLOCK(td2->td_proc);
			return (EPERM);
		}

		em = em_find(td2);
		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
		/* XXX: ptrace? */
		if (priv_check(td, PRIV_CRED_SETUID) ||
		    priv_check(td, PRIV_CRED_SETEUID) ||
		    p_candebug(td, td2->td_proc)) {
			PROC_UNLOCK(td2->td_proc);
			return (EPERM);
		}
		head = em->robust_futexes;

		PROC_UNLOCK(td2->td_proc);
	}

	/* Copy the fixed head size and the head pointer out to userspace. */
	len = sizeof(struct linux_robust_list_head);
	error = copyout(&len, args->len, sizeof(l_size_t));
	if (error != 0)
		return (EFAULT);

	return (copyout(&head, args->head, sizeof(l_uintptr_t)));
}
935
/*
 * Release one robust futex on behalf of the exiting thread 'em': if the
 * word is owned by em->em_tid, set OWNER_DIED (preserving WAITERS) and
 * wake one waiter if any.  'pending_op' marks the pending_list entry,
 * which gets the special zero-value treatment described below.
 */
static int
handle_futex_death(struct thread *td, struct linux_emuldata *em, uint32_t *uaddr,
    unsigned int pi, bool pending_op)
{
	uint32_t uval, nval, mval;
	int error;

retry:
	error = fueword32(uaddr, &uval);
	if (error != 0)
		return (EFAULT);

	/*
	 * Special case for regular (non PI) futexes. The unlock path in
	 * user space has two race scenarios:
	 *
	 * 1. The unlock path releases the user space futex value and
	 *    before it can execute the futex() syscall to wake up
	 *    waiters it is killed.
	 *
	 * 2. A woken up waiter is killed before it can acquire the
	 *    futex in user space.
	 *
	 * In both cases the TID validation below prevents a wakeup of
	 * potential waiters which can cause these waiters to block
	 * forever.
	 *
	 * In both cases it is safe to attempt waking up a potential
	 * waiter without touching the user space futex value and trying
	 * to set the OWNER_DIED bit.
	 */
	if (pending_op && !pi && !uval) {
		(void)futex_wake(td, uaddr, 1, true);
		return (0);
	}

	if ((uval & FUTEX_TID_MASK) == em->em_tid) {
		/* Mark the owner dead, keep the WAITERS bit, clear the TID. */
		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
		error = casueword32(uaddr, uval, &nval, mval);
		if (error == -1)
			return (EFAULT);
		if (error == 1) {
			/* CAS did not take; check suspension and retry. */
			error = thread_check_susp(td, false);
			if (error != 0)
				return (error);
			goto retry;
		}

		if (!pi && (uval & FUTEX_WAITERS)) {
			error = futex_wake(td, uaddr, 1, true);
			if (error != 0)
				return (error);
		} else if (pi && (uval & FUTEX_WAITERS)) {
			error = futex_wake_pi(td, uaddr, true);
			if (error != 0)
				return (error);
		}
	}

	return (0);
}
997
998 static int
fetch_robust_entry(struct linux_robust_list ** entry,struct linux_robust_list ** head,unsigned int * pi)999 fetch_robust_entry(struct linux_robust_list **entry,
1000 struct linux_robust_list **head, unsigned int *pi)
1001 {
1002 l_ulong uentry;
1003 int error;
1004
1005 error = copyin((const void *)head, &uentry, sizeof(uentry));
1006 if (error != 0)
1007 return (EFAULT);
1008
1009 *entry = (void *)(uentry & ~1UL);
1010 *pi = uentry & 1;
1011
1012 return (0);
1013 }
1014
1015 #define LINUX_HANDLE_DEATH_PENDING true
1016 #define LINUX_HANDLE_DEATH_LIST false
1017
1018 /* This walks the list of robust futexes releasing them. */
1019 void
release_futexes(struct thread * td,struct linux_emuldata * em)1020 release_futexes(struct thread *td, struct linux_emuldata *em)
1021 {
1022 struct linux_robust_list_head *head;
1023 struct linux_robust_list *entry, *next_entry, *pending;
1024 unsigned int limit = 2048, pi, next_pi, pip;
1025 uint32_t *uaddr;
1026 l_long futex_offset;
1027 int error;
1028
1029 head = em->robust_futexes;
1030 if (head == NULL)
1031 return;
1032
1033 if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi))
1034 return;
1035
1036 error = copyin(&head->futex_offset, &futex_offset,
1037 sizeof(futex_offset));
1038 if (error != 0)
1039 return;
1040
1041 if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip))
1042 return;
1043
1044 while (entry != &head->list) {
1045 error = fetch_robust_entry(&next_entry, PTRIN(&entry->next),
1046 &next_pi);
1047
1048 /*
1049 * A pending lock might already be on the list, so
1050 * don't process it twice.
1051 */
1052 if (entry != pending) {
1053 uaddr = (uint32_t *)((caddr_t)entry + futex_offset);
1054 if (handle_futex_death(td, em, uaddr, pi,
1055 LINUX_HANDLE_DEATH_LIST))
1056 return;
1057 }
1058 if (error != 0)
1059 return;
1060
1061 entry = next_entry;
1062 pi = next_pi;
1063
1064 if (!--limit)
1065 break;
1066
1067 sched_relinquish(curthread);
1068 }
1069
1070 if (pending) {
1071 uaddr = (uint32_t *)((caddr_t)pending + futex_offset);
1072 (void)handle_futex_death(td, em, uaddr, pip,
1073 LINUX_HANDLE_DEATH_PENDING);
1074 }
1075 }
1076