/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2009-2021 Dmitry Chagin <dchagin@FreeBSD.org>
 * Copyright (c) 2008 Roman Divacky
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/imgact.h>
#include <sys/imgact_elf.h>
#include <sys/ktr.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/umtxvar.h>

#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_emul.h>
#include <compat/linux/linux_futex.h>
#include <compat/linux/linux_misc.h>
#include <compat/linux/linux_timer.h>
#include <compat/linux/linux_util.h>

#define FUTEX_SHARED    0x8     /* shared futex */

#define GET_SHARED(a)   (a->flags & FUTEX_SHARED) ? AUTO_SHARE : THREAD_SHARE

static int futex_atomic_op(struct thread *, int, uint32_t *, int *);
static int handle_futex_death(struct thread *td, struct linux_emuldata *,
    uint32_t *, unsigned int, bool);
static int fetch_robust_entry(struct linux_robust_list **,
    struct linux_robust_list **, unsigned int *);

struct linux_futex_args {
        uint32_t        *uaddr;
        int32_t         op;
        uint32_t        flags;
        bool            clockrt;
        uint32_t        val;
        struct timespec *ts;
        uint32_t        *uaddr2;
        uint32_t        val3;
        bool            val3_compare;
        struct timespec kts;
};

static inline int futex_key_get(const void *, int, int, struct umtx_key *);
static void linux_umtx_abs_timeout_init(struct umtx_abs_timeout *,
    struct linux_futex_args *);
static int linux_futex(struct thread *, struct linux_futex_args *);
static int linux_futex_wait(struct thread *, struct linux_futex_args *);
static int linux_futex_wake(struct thread *, struct linux_futex_args *);
static int linux_futex_requeue(struct thread *, struct linux_futex_args *);
static int linux_futex_wakeop(struct thread *, struct linux_futex_args *);
static int linux_futex_lock_pi(struct thread *, bool, struct linux_futex_args *);
static int linux_futex_unlock_pi(struct thread *, bool,
    struct linux_futex_args *);
static int futex_wake_pi(struct thread *, uint32_t *, bool);

static int
futex_key_get(const void *uaddr, int type, int share, struct umtx_key *key)
{

        /* Check that the futex address is 32-bit aligned. */
        if (!__is_aligned(uaddr, sizeof(uint32_t)))
                return (EINVAL);
        return (umtx_key_get(uaddr, type, share, key));
}

int
futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared)
{
        struct linux_futex_args args;

        bzero(&args, sizeof(args));
        args.op = LINUX_FUTEX_WAKE;
        args.uaddr = uaddr;
        args.flags = shared == true ? FUTEX_SHARED : 0;
        args.val = val;
        args.val3 = FUTEX_BITSET_MATCH_ANY;

        return (linux_futex_wake(td, &args));
}

static int
futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared)
{
        struct linux_futex_args args;

        bzero(&args, sizeof(args));
        args.op = LINUX_FUTEX_UNLOCK_PI;
        args.uaddr = uaddr;
        args.flags = shared == true ? FUTEX_SHARED : 0;

        return (linux_futex_unlock_pi(td, true, &args));
}

static int
futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr,
    int *res)
{
        int op = (encoded_op >> 28) & 7;
        int cmp = (encoded_op >> 24) & 15;
        int oparg = (encoded_op << 8) >> 20;
        int cmparg = (encoded_op << 20) >> 20;
        int oldval = 0, ret;

        if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
                oparg = 1 << oparg;

        switch (op) {
        case FUTEX_OP_SET:
                ret = futex_xchgl(oparg, uaddr, &oldval);
                break;
        case FUTEX_OP_ADD:
                ret = futex_addl(oparg, uaddr, &oldval);
                break;
        case FUTEX_OP_OR:
                ret = futex_orl(oparg, uaddr, &oldval);
                break;
        case FUTEX_OP_ANDN:
                ret = futex_andl(~oparg, uaddr, &oldval);
                break;
        case FUTEX_OP_XOR:
                ret = futex_xorl(oparg, uaddr, &oldval);
                break;
        default:
                ret = ENOSYS;
                break;
        }

        if (ret != 0)
                return (ret);

        switch (cmp) {
        case FUTEX_OP_CMP_EQ:
                *res = (oldval == cmparg);
                break;
        case FUTEX_OP_CMP_NE:
                *res = (oldval != cmparg);
                break;
        case FUTEX_OP_CMP_LT:
                *res = (oldval < cmparg);
                break;
        case FUTEX_OP_CMP_GE:
                *res = (oldval >= cmparg);
                break;
        case FUTEX_OP_CMP_LE:
                *res = (oldval <= cmparg);
                break;
        case FUTEX_OP_CMP_GT:
                *res = (oldval > cmparg);
                break;
        default:
                ret = ENOSYS;
        }

        return (ret);
}

static int
linux_futex(struct thread *td, struct linux_futex_args *args)
{
        struct linux_pemuldata *pem;
        struct proc *p;

        if (args->op & LINUX_FUTEX_PRIVATE_FLAG) {
                args->flags = 0;
                args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
        } else
                args->flags = FUTEX_SHARED;

        args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
        args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;

        if (args->clockrt &&
            args->op != LINUX_FUTEX_WAIT_BITSET &&
            args->op != LINUX_FUTEX_WAIT_REQUEUE_PI &&
            args->op != LINUX_FUTEX_LOCK_PI2)
                return (ENOSYS);

        switch (args->op) {
        case LINUX_FUTEX_WAIT:
                args->val3 = FUTEX_BITSET_MATCH_ANY;
                /* FALLTHROUGH */

        case LINUX_FUTEX_WAIT_BITSET:
                LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x",
                    args->uaddr, args->val, args->val3);

                return (linux_futex_wait(td, args));

        case LINUX_FUTEX_WAKE:
                args->val3 = FUTEX_BITSET_MATCH_ANY;
                /* FALLTHROUGH */

        case LINUX_FUTEX_WAKE_BITSET:
                LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x",
                    args->uaddr, args->val, args->val3);

                return (linux_futex_wake(td, args));

        case LINUX_FUTEX_REQUEUE:
                /*
                 * Glibc does not use this operation since version 2.3.3,
                 * as it is racy and was replaced by the FUTEX_CMP_REQUEUE
                 * operation.  Glibc versions prior to 2.3.3 fall back to
                 * FUTEX_WAKE when FUTEX_REQUEUE returns EINVAL.
                 */
                pem = pem_find(td->td_proc);
                if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
                        linux_msg(td, "unsupported FUTEX_REQUEUE");
                        pem->flags |= LINUX_XDEPR_REQUEUEOP;
                }

                /*
                 * The above is true; however, musl libc does make use of
                 * the futex requeue operation, so allow it for brands which
                 * set the LINUX_BI_FUTEX_REQUEUE bit in their Brandinfo
                 * flags.
                 */
                p = td->td_proc;
                Elf_Brandinfo *bi = p->p_elf_brandinfo;
                if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0)
                        return (EINVAL);
                args->val3_compare = false;
                /* FALLTHROUGH */

        case LINUX_FUTEX_CMP_REQUEUE:
                LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
                    "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x",
                    args->uaddr, args->val, args->val3, args->uaddr2,
                    args->ts);

                return (linux_futex_requeue(td, args));

        case LINUX_FUTEX_WAKE_OP:
                LINUX_CTR5(sys_futex, "WAKE_OP "
                    "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x",
                    args->uaddr, args->val, args->uaddr2, args->val3,
                    args->ts);

                return (linux_futex_wakeop(td, args));

        case LINUX_FUTEX_LOCK_PI:
                args->clockrt = true;
                /* FALLTHROUGH */

        case LINUX_FUTEX_LOCK_PI2:
                LINUX_CTR2(sys_futex, "LOCKPI uaddr %p val 0x%x",
                    args->uaddr, args->val);

                return (linux_futex_lock_pi(td, false, args));

        case LINUX_FUTEX_UNLOCK_PI:
                LINUX_CTR1(sys_futex, "UNLOCKPI uaddr %p",
                    args->uaddr);

                return (linux_futex_unlock_pi(td, false, args));

        case LINUX_FUTEX_TRYLOCK_PI:
                LINUX_CTR1(sys_futex, "TRYLOCKPI uaddr %p",
                    args->uaddr);

                return (linux_futex_lock_pi(td, true, args));

        /*
         * The current implementation of FUTEX_WAIT_REQUEUE_PI and
         * FUTEX_CMP_REQUEUE_PI can no longer be used to implement
         * condition variables.  A detailed explanation can be found here:
         *
         * https://sourceware.org/bugzilla/show_bug.cgi?id=13165
         * and here http://austingroupbugs.net/view.php?id=609
         *
         * And since commit
         * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=ed19993b5b0d05d62cc883571519a67dae481a14
         * glibc does not use them.
         */
        case LINUX_FUTEX_WAIT_REQUEUE_PI:
                /* not yet implemented */
                pem = pem_find(td->td_proc);
                if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
                        linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI");
                        pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
                }
                return (ENOSYS);

        case LINUX_FUTEX_CMP_REQUEUE_PI:
                /* not yet implemented */
                pem = pem_find(td->td_proc);
                if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
                        linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI");
                        pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
                }
                return (ENOSYS);

        default:
                linux_msg(td, "unsupported futex op %d", args->op);
                return (ENOSYS);
        }
}

/*
 * PI futex protocol:
 * - a futex word value of 0 means unlocked;
 * - a futex word value of TID means locked by that thread.
 * User space uses atomic ops to lock/unlock these futexes without entering
 * the kernel.  If the lock-acquire fastpath fails (the transition from 0 to
 * TID fails), then FUTEX_LOCK_PI is called.
 * The kernel atomically sets the FUTEX_WAITERS bit in the futex word; if
 * there are no other waiters it looks up the thread that owns the futex
 * (the one that put its own TID into the futex word) and makes that thread
 * the owner of the internal PI-aware lock object (mutex).  The kernel then
 * tries to lock that internal lock object, on which it blocks.  Once it
 * returns, it has acquired the mutex; it sets the futex word to its own TID
 * and returns (the futex word now contains FUTEX_WAITERS | TID).
 * The unlock fastpath then fails (because the FUTEX_WAITERS bit is set) and
 * FUTEX_UNLOCK_PI is called.
 * If a futex is found to be held at exit time, the kernel sets the
 * OWNER_DIED bit of the futex word and wakes up the next futex waiter (if
 * any); the WAITERS bit is preserved.
 * If the OWNER_DIED bit is set, the kernel sanity-checks the futex word
 * value against the internal futex state and, if consistent, acquires the
 * futex.
 */
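
/*
 * Illustrative sketch of the user space fastpath described above.  This is
 * an assumption about a typical caller, shown only for clarity; it is not
 * code used by this file:
 *
 *      uint32_t expected = 0;
 *      if (!atomic_compare_exchange_strong(&futex_word, &expected, tid))
 *              futex(&futex_word, FUTEX_LOCK_PI, ...);    // slow path below
 *      ... critical section ...
 *      expected = tid;
 *      if (!atomic_compare_exchange_strong(&futex_word, &expected, 0))
 *              futex(&futex_word, FUTEX_UNLOCK_PI, ...);  // waiters exist
 */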
static int
linux_futex_lock_pi(struct thread *td, bool try, struct linux_futex_args *args)
{
        struct umtx_abs_timeout timo;
        struct linux_emuldata *em;
        struct umtx_pi *pi, *new_pi;
        struct thread *td1;
        struct umtx_q *uq;
        int error, rv;
        uint32_t owner, old_owner;

        em = em_find(td);
        uq = td->td_umtxq;
        error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args),
            &uq->uq_key);
        if (error != 0)
                return (error);
        if (args->ts != NULL)
                linux_umtx_abs_timeout_init(&timo, args);

        umtxq_lock(&uq->uq_key);
        pi = umtx_pi_lookup(&uq->uq_key);
        if (pi == NULL) {
                new_pi = umtx_pi_alloc(M_NOWAIT);
                if (new_pi == NULL) {
                        umtxq_unlock(&uq->uq_key);
                        new_pi = umtx_pi_alloc(M_WAITOK);
                        umtxq_lock(&uq->uq_key);
                        pi = umtx_pi_lookup(&uq->uq_key);
                        if (pi != NULL) {
                                umtx_pi_free(new_pi);
                                new_pi = NULL;
                        }
                }
                if (new_pi != NULL) {
                        new_pi->pi_key = uq->uq_key;
                        umtx_pi_insert(new_pi);
                        pi = new_pi;
                }
        }
        umtx_pi_ref(pi);
        umtxq_unlock(&uq->uq_key);
        for (;;) {
                /* Try the uncontested case first. */
                rv = casueword32(args->uaddr, 0, &owner, em->em_tid);
                /* The acquire succeeded. */
                if (rv == 0) {
                        error = 0;
                        break;
                }
                if (rv == -1) {
                        error = EFAULT;
                        break;
                }

                /*
                 * Nobody owns it, but the acquire failed.  This can happen
                 * with ll/sc atomics.
                 */
                if (owner == 0) {
                        error = thread_check_susp(td, true);
                        if (error != 0)
                                break;
                        continue;
                }

                /*
                 * Avoid overwriting a possible error from sleep due
                 * to the pending signal with the suspension check result.
                 */
                if (error == 0) {
                        error = thread_check_susp(td, true);
                        if (error != 0)
                                break;
                }

                /* The futex word at *uaddr is already locked by the caller. */
                if ((owner & FUTEX_TID_MASK) == em->em_tid) {
                        error = EDEADLK;
                        break;
                }

                /*
                 * The futex owner died: handle_futex_death() set the
                 * OWNER_DIED bit and cleared the TID.  Try to acquire it.
                 */
                if ((owner & FUTEX_TID_MASK) == 0) {
                        old_owner = owner;
                        owner = owner & (FUTEX_WAITERS | FUTEX_OWNER_DIED);
                        owner |= em->em_tid;
                        rv = casueword32(args->uaddr, old_owner, &owner, owner);
                        if (rv == -1) {
                                error = EFAULT;
                                break;
                        }
                        if (rv == 1) {
                                if (error == 0) {
                                        error = thread_check_susp(td, true);
                                        if (error != 0)
                                                break;
                                }

                                /*
                                 * If this failed, the lock could have
                                 * changed; restart.
                                 */
                                continue;
                        }

                        umtxq_lock(&uq->uq_key);
                        umtxq_busy(&uq->uq_key);
                        error = umtx_pi_claim(pi, td);
                        umtxq_unbusy(&uq->uq_key);
                        umtxq_unlock(&uq->uq_key);
                        if (error != 0) {
                                /*
                                 * Since we're going to return an
                                 * error, restore the futex to its
                                 * previous, unowned state to avoid
                                 * compounding the problem.
                                 */
                                (void)casuword32(args->uaddr, owner, old_owner);
                        }
                        break;
                }

                /*
                 * Inconsistent state: OWNER_DIED is set and the TID is not 0.
                 * Linux does some checks of the futex state; we return
                 * EINVAL, as user space can take care of this.
                 */
                if ((owner & FUTEX_OWNER_DIED) != 0) {
                        error = EINVAL;
                        break;
                }

                if (try != 0) {
                        error = EBUSY;
                        break;
                }

                /*
                 * If we caught a signal, we have retried and now
                 * exit immediately.
                 */
                if (error != 0)
                        break;

                umtxq_lock(&uq->uq_key);
                umtxq_busy(&uq->uq_key);
                umtxq_unlock(&uq->uq_key);

                /*
                 * Set the contested bit so that a release in user space
                 * knows to use the system call for unlock.  If this fails,
                 * either someone else has acquired the lock or it has been
                 * released.
                 */
                rv = casueword32(args->uaddr, owner, &owner,
                    owner | FUTEX_WAITERS);
                if (rv == -1) {
                        umtxq_unbusy_unlocked(&uq->uq_key);
                        error = EFAULT;
                        break;
                }
                if (rv == 1) {
                        umtxq_unbusy_unlocked(&uq->uq_key);
                        error = thread_check_susp(td, true);
                        if (error != 0)
                                break;

                        /*
                         * The lock changed and we need to retry, or we
                         * lost a race to the thread unlocking the umtx.
                         */
                        continue;
                }

                /*
                 * Substitute the Linux thread ID with the native thread ID
                 * to avoid refactoring the code of umtxq_sleep_pi().
                 */
                td1 = linux_tdfind(td, owner & FUTEX_TID_MASK, -1);
                if (td1 != NULL) {
                        owner = td1->td_tid;
                        PROC_UNLOCK(td1->td_proc);
                } else {
                        umtxq_unbusy_unlocked(&uq->uq_key);
                        error = EINVAL;
                        break;
                }

                umtxq_lock(&uq->uq_key);

                /* We set the contested bit, sleep. */
                error = umtxq_sleep_pi(uq, pi, owner, "futexp",
                    args->ts == NULL ? NULL : &timo,
                    (args->flags & FUTEX_SHARED) != 0);
                if (error != 0)
                        continue;

                error = thread_check_susp(td, false);
                if (error != 0)
                        break;
        }

        umtxq_lock(&uq->uq_key);
        umtx_pi_unref(pi);
        umtxq_unlock(&uq->uq_key);
        umtx_key_release(&uq->uq_key);
        return (error);
}

static int
linux_futex_unlock_pi(struct thread *td, bool rb, struct linux_futex_args *args)
{
        struct linux_emuldata *em;
        struct umtx_key key;
        uint32_t old, owner, new_owner;
        int count, error;

        em = em_find(td);

        /*
         * Make sure we own this mtx.
         */
        error = fueword32(args->uaddr, &owner);
        if (error == -1)
                return (EFAULT);
        if (!rb && (owner & FUTEX_TID_MASK) != em->em_tid)
                return (EPERM);

        error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), &key);
        if (error != 0)
                return (error);
        umtxq_lock(&key);
        umtxq_busy(&key);
        error = umtx_pi_drop(td, &key, rb, &count);
        if (error != 0 || rb) {
                umtxq_unbusy(&key);
                umtxq_unlock(&key);
                umtx_key_release(&key);
                return (error);
        }
        umtxq_unlock(&key);

        /*
         * When unlocking the futex, it must be marked as unowned if
         * there is zero or one thread waiting for it; otherwise it
         * must be marked as contested.
         */
        if (count > 1)
                new_owner = FUTEX_WAITERS;
        else
                new_owner = 0;

again:
        error = casueword32(args->uaddr, owner, &old, new_owner);
        if (error == 1) {
                error = thread_check_susp(td, false);
                if (error == 0)
                        goto again;
        }
        umtxq_unbusy_unlocked(&key);
        umtx_key_release(&key);
        if (error == -1)
                return (EFAULT);
        if (error == 0 && old != owner)
                return (EINVAL);
        return (error);
}
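
/*
 * Illustrative note on the FUTEX_WAKE_OP operand (args->val3 below), which
 * futex_atomic_op() above decodes.  This is a sketch of the Linux uapi
 * encoding, reproduced here only for clarity:
 *
 *      FUTEX_OP(op, oparg, cmp, cmparg) =
 *          ((op & 0xf) << 28) | ((cmp & 0xf) << 24) |
 *          ((oparg & 0xfff) << 12) | (cmparg & 0xfff)
 *
 * e.g. FUTEX_OP(FUTEX_OP_SET, 0, FUTEX_OP_CMP_GT, 1) atomically stores 0
 * to *uaddr2 and additionally wakes waiters on uaddr2 only if the old
 * value was greater than 1.
 */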

static int
linux_futex_wakeop(struct thread *td, struct linux_futex_args *args)
{
        struct umtx_key key, key2;
        int nrwake, op_ret, ret;
        int error, count;

        if (args->uaddr == args->uaddr2)
                return (EINVAL);

        error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
        if (error != 0)
                return (error);
        error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
        if (error != 0) {
                umtx_key_release(&key);
                return (error);
        }
        umtxq_lock(&key);
        umtxq_busy(&key);
        umtxq_unlock(&key);
        error = futex_atomic_op(td, args->val3, args->uaddr2, &op_ret);
        umtxq_lock(&key);
        umtxq_unbusy(&key);
        if (error != 0)
                goto out;
        ret = umtxq_signal_mask(&key, args->val, args->val3);
        if (op_ret > 0) {
                nrwake = (int)(unsigned long)args->ts;
                umtxq_lock(&key2);
                count = umtxq_count(&key2);
                if (count > 0)
                        ret += umtxq_signal_mask(&key2, nrwake, args->val3);
                else
                        ret += umtxq_signal_mask(&key, nrwake, args->val3);
                umtxq_unlock(&key2);
        }
        td->td_retval[0] = ret;
out:
        umtxq_unlock(&key);
        umtx_key_release(&key2);
        umtx_key_release(&key);
        return (error);
}

static int
linux_futex_requeue(struct thread *td, struct linux_futex_args *args)
{
        int nrwake, nrrequeue;
        struct umtx_key key, key2;
        int error;
        uint32_t uval;

        /*
         * Linux allows this, we would not; it is an incorrect usage of the
         * declared ABI, so return EINVAL.
         */
        if (args->uaddr == args->uaddr2)
                return (EINVAL);

        nrrequeue = (int)(unsigned long)args->ts;
        nrwake = args->val;
        /*
         * Sanity check to prevent signed integer overflow;
         * see Linux CVE-2018-6927.
         */
        if (nrwake < 0 || nrrequeue < 0)
                return (EINVAL);

        error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
        if (error != 0)
                return (error);
        error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
        if (error != 0) {
                umtx_key_release(&key);
                return (error);
        }
        umtxq_lock(&key);
        umtxq_busy(&key);
        umtxq_unlock(&key);
        error = fueword32(args->uaddr, &uval);
        if (error != 0)
                error = EFAULT;
        else if (args->val3_compare == true && uval != args->val3)
                error = EWOULDBLOCK;
        umtxq_lock(&key);
        umtxq_unbusy(&key);
        if (error == 0) {
                umtxq_lock(&key2);
                td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue);
                umtxq_unlock(&key2);
        }
        umtxq_unlock(&key);
        umtx_key_release(&key2);
        umtx_key_release(&key);
        return (error);
}

static int
linux_futex_wake(struct thread *td, struct linux_futex_args *args)
{
        struct umtx_key key;
        int error;

        if (args->val3 == 0)
                return (EINVAL);

        error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
        if (error != 0)
                return (error);
        umtxq_lock(&key);
        td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3);
        umtxq_unlock(&key);
        umtx_key_release(&key);
        return (0);
}

static int
linux_futex_wait(struct thread *td, struct linux_futex_args *args)
{
        struct umtx_abs_timeout timo;
        struct umtx_q *uq;
        uint32_t uval;
        int error;

        if (args->val3 == 0)
                return (EINVAL);

        uq = td->td_umtxq;
        error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args),
            &uq->uq_key);
        if (error != 0)
                return (error);
        if (args->ts != NULL)
                linux_umtx_abs_timeout_init(&timo, args);
        umtxq_lock(&uq->uq_key);
        umtxq_busy(&uq->uq_key);
        uq->uq_bitset = args->val3;
        umtxq_insert(uq);
        umtxq_unlock(&uq->uq_key);
        error = fueword32(args->uaddr, &uval);
        if (error != 0)
                error = EFAULT;
        else if (uval != args->val)
                error = EWOULDBLOCK;
        umtxq_lock(&uq->uq_key);
        umtxq_unbusy(&uq->uq_key);
        if (error == 0) {
                error = umtxq_sleep(uq, "futex",
                    args->ts == NULL ? NULL : &timo);
                if ((uq->uq_flags & UQF_UMTXQ) == 0)
                        error = 0;
                else
                        umtxq_remove(uq);
        } else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
                umtxq_remove(uq);
        }
        umtxq_unlock(&uq->uq_key);
        umtx_key_release(&uq->uq_key);
        return (error);
}

static void
linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo,
    struct linux_futex_args *args)
{
        int clockid, absolute;

        /*
         * The FUTEX_CLOCK_REALTIME option bit can be employed only with the
         * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI and FUTEX_LOCK_PI2
         * operations.
         * For FUTEX_WAIT the timeout is interpreted as a relative value;
         * for the other futex operations it is interpreted as an absolute
         * value.
         * If the FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel
         * measures the timeout against the CLOCK_REALTIME clock; otherwise
         * it measures the timeout against the CLOCK_MONOTONIC clock.
         */
        clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC;
        absolute = args->op == LINUX_FUTEX_WAIT ? false : true;
        umtx_abs_timeout_init(timo, clockid, absolute, args->ts);
}
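
/*
 * Example of the timeout rules above (illustrative only, not used by the
 * code): for FUTEX_WAIT a ts of { 0, 500000000 } means "wait at most half a
 * second", measured on CLOCK_MONOTONIC, while for FUTEX_WAIT_BITSET the
 * same ts is an absolute deadline on CLOCK_MONOTONIC (or CLOCK_REALTIME if
 * FUTEX_CLOCK_REALTIME was passed), which on a long-running system lies in
 * the past and makes the wait time out immediately.
 */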

int
linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
{
        struct linux_futex_args fargs = {
                .uaddr = args->uaddr,
                .op = args->op,
                .val = args->val,
                .ts = NULL,
                .uaddr2 = args->uaddr2,
                .val3 = args->val3,
                .val3_compare = true,
        };
        int error;

        switch (args->op & LINUX_FUTEX_CMD_MASK) {
        case LINUX_FUTEX_WAIT:
        case LINUX_FUTEX_WAIT_BITSET:
        case LINUX_FUTEX_LOCK_PI:
        case LINUX_FUTEX_LOCK_PI2:
                if (args->timeout != NULL) {
                        error = linux_get_timespec(&fargs.kts, args->timeout);
                        if (error != 0)
                                return (error);
                        fargs.ts = &fargs.kts;
                }
                break;
        default:
                fargs.ts = PTRIN(args->timeout);
        }
        return (linux_futex(td, &fargs));
}

#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
int
linux_sys_futex_time64(struct thread *td,
    struct linux_sys_futex_time64_args *args)
{
        struct linux_futex_args fargs = {
                .uaddr = args->uaddr,
                .op = args->op,
                .val = args->val,
                .ts = NULL,
                .uaddr2 = args->uaddr2,
                .val3 = args->val3,
                .val3_compare = true,
        };
        int error;

        switch (args->op & LINUX_FUTEX_CMD_MASK) {
        case LINUX_FUTEX_WAIT:
        case LINUX_FUTEX_WAIT_BITSET:
        case LINUX_FUTEX_LOCK_PI:
        case LINUX_FUTEX_LOCK_PI2:
                if (args->timeout != NULL) {
                        error = linux_get_timespec64(&fargs.kts, args->timeout);
                        if (error != 0)
                                return (error);
                        fargs.ts = &fargs.kts;
                }
                break;
        default:
                fargs.ts = PTRIN(args->timeout);
        }
        return (linux_futex(td, &fargs));
}
#endif

int
linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
{
        struct linux_emuldata *em;

        if (args->len != sizeof(struct linux_robust_list_head))
                return (EINVAL);

        em = em_find(td);
        em->robust_futexes = args->head;

        return (0);
}

int
linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
{
        struct linux_emuldata *em;
        struct linux_robust_list_head *head;
        l_size_t len;
        struct thread *td2;
        int error;

        if (!args->pid) {
                em = em_find(td);
                KASSERT(em != NULL, ("get_robust_list: emuldata not found.\n"));
                head = em->robust_futexes;
        } else {
                td2 = linux_tdfind(td, args->pid, -1);
                if (td2 == NULL)
                        return (ESRCH);
                if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) {
                        PROC_UNLOCK(td2->td_proc);
                        return (EPERM);
                }

                em = em_find(td2);
                KASSERT(em != NULL, ("get_robust_list: emuldata not found.\n"));
                /* XXX: ptrace? */
                if (priv_check(td, PRIV_CRED_SETUID) ||
                    priv_check(td, PRIV_CRED_SETEUID) ||
                    p_candebug(td, td2->td_proc)) {
                        PROC_UNLOCK(td2->td_proc);
                        return (EPERM);
                }
                head = em->robust_futexes;

                PROC_UNLOCK(td2->td_proc);
        }

        len = sizeof(struct linux_robust_list_head);
        error = copyout(&len, args->len, sizeof(l_size_t));
        if (error != 0)
                return (EFAULT);

        return (copyout(&head, args->head, sizeof(head)));
}

static int
handle_futex_death(struct thread *td, struct linux_emuldata *em, uint32_t *uaddr,
    unsigned int pi, bool pending_op)
{
        uint32_t uval, nval, mval;
        int error;

retry:
        error = fueword32(uaddr, &uval);
        if (error != 0)
                return (EFAULT);

        /*
         * Special case for regular (non-PI) futexes.  The unlock path in
         * user space has two race scenarios:
         *
         * 1. The unlock path releases the user space futex value and
         *    before it can execute the futex() syscall to wake up
         *    waiters it is killed.
         *
         * 2. A woken up waiter is killed before it can acquire the
         *    futex in user space.
         *
         * In both cases the TID validation below prevents a wakeup of
         * potential waiters, which can cause these waiters to block
         * forever.
         *
         * In both cases it is safe to attempt waking up a potential
         * waiter without touching the user space futex value and trying
         * to set the OWNER_DIED bit.
         */
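        /*
         * Illustrative value transition handled further below (a sketch for
         * clarity, not extra logic): a robust futex still owned by the dead
         * thread holds "TID | FUTEX_WAITERS"; the CAS below rewrites it to
         * "FUTEX_OWNER_DIED | FUTEX_WAITERS" so a woken waiter can see that
         * the previous owner exited without unlocking.
         */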
        if (pending_op && !pi && !uval) {
                (void)futex_wake(td, uaddr, 1, true);
                return (0);
        }

        if ((uval & FUTEX_TID_MASK) == em->em_tid) {
                mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
                error = casueword32(uaddr, uval, &nval, mval);
                if (error == -1)
                        return (EFAULT);
                if (error == 1) {
                        error = thread_check_susp(td, false);
                        if (error != 0)
                                return (error);
                        goto retry;
                }

                if (!pi && (uval & FUTEX_WAITERS)) {
                        error = futex_wake(td, uaddr, 1, true);
                        if (error != 0)
                                return (error);
                } else if (pi && (uval & FUTEX_WAITERS)) {
                        error = futex_wake_pi(td, uaddr, true);
                        if (error != 0)
                                return (error);
                }
        }

        return (0);
}

static int
fetch_robust_entry(struct linux_robust_list **entry,
    struct linux_robust_list **head, unsigned int *pi)
{
        l_ulong uentry;
        int error;

        error = copyin((const void *)head, &uentry, sizeof(uentry));
        if (error != 0)
                return (EFAULT);

        *entry = (void *)(uentry & ~1UL);
        *pi = uentry & 1;

        return (0);
}

#define LINUX_HANDLE_DEATH_PENDING      true
#define LINUX_HANDLE_DEATH_LIST         false

/* This walks the list of robust futexes, releasing them. */
void
release_futexes(struct thread *td, struct linux_emuldata *em)
{
        struct linux_robust_list_head *head;
        struct linux_robust_list *entry, *next_entry, *pending;
        unsigned int limit = 2048, pi, next_pi, pip;
        uint32_t *uaddr;
        l_long futex_offset;
        int error;

        head = em->robust_futexes;
        if (head == NULL)
                return;

        if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi))
                return;

        error = copyin(&head->futex_offset, &futex_offset,
            sizeof(futex_offset));
        if (error != 0)
                return;

        if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip))
                return;

        while (entry != &head->list) {
                error = fetch_robust_entry(&next_entry, PTRIN(&entry->next),
                    &next_pi);

                /*
                 * A pending lock might already be on the list, so
                 * don't process it twice.
                 */
                if (entry != pending) {
                        uaddr = (uint32_t *)((caddr_t)entry + futex_offset);
                        if (handle_futex_death(td, em, uaddr, pi,
                            LINUX_HANDLE_DEATH_LIST))
                                return;
                }
                if (error != 0)
                        return;

                entry = next_entry;
                pi = next_pi;

                if (!--limit)
                        break;

                sched_relinquish(curthread);
        }

        if (pending) {
                uaddr = (uint32_t *)((caddr_t)pending + futex_offset);
                (void)handle_futex_death(td, em, uaddr, pip,
                    LINUX_HANDLE_DEATH_PENDING);
        }
}