1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2009-2021 Dmitry Chagin <dchagin@FreeBSD.org> 5 * Copyright (c) 2008 Roman Divacky 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 
27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/imgact.h> 34 #include <sys/imgact_elf.h> 35 #include <sys/ktr.h> 36 #include <sys/lock.h> 37 #include <sys/mutex.h> 38 #include <sys/priv.h> 39 #include <sys/proc.h> 40 #include <sys/sched.h> 41 #include <sys/sysent.h> 42 #include <sys/vnode.h> 43 #include <sys/umtxvar.h> 44 45 #ifdef COMPAT_LINUX32 46 #include <machine/../linux32/linux.h> 47 #include <machine/../linux32/linux32_proto.h> 48 #else 49 #include <machine/../linux/linux.h> 50 #include <machine/../linux/linux_proto.h> 51 #endif 52 #include <compat/linux/linux_emul.h> 53 #include <compat/linux/linux_futex.h> 54 #include <compat/linux/linux_misc.h> 55 #include <compat/linux/linux_time.h> 56 #include <compat/linux/linux_util.h> 57 58 #define FUTEX_SHARED 0x8 /* shared futex */ 59 #define FUTEX_UNOWNED 0 60 61 #define GET_SHARED(a) (a->flags & FUTEX_SHARED) ? AUTO_SHARE : THREAD_SHARE 62 63 static int futex_atomic_op(struct thread *, int, uint32_t *, int *); 64 static int handle_futex_death(struct thread *td, struct linux_emuldata *, 65 uint32_t *, unsigned int, bool); 66 static int fetch_robust_entry(struct linux_robust_list **, 67 struct linux_robust_list **, unsigned int *); 68 69 struct linux_futex_args { 70 uint32_t *uaddr; 71 int32_t op; 72 uint32_t flags; 73 bool clockrt; 74 uint32_t val; 75 struct timespec *ts; 76 uint32_t *uaddr2; 77 uint32_t val3; 78 bool val3_compare; 79 struct timespec kts; 80 }; 81 82 static inline int futex_key_get(const void *, int, int, struct umtx_key *); 83 static void linux_umtx_abs_timeout_init(struct umtx_abs_timeout *, 84 struct linux_futex_args *); 85 static int linux_futex(struct thread *, struct linux_futex_args *); 86 static int linux_futex_wait(struct thread *, struct linux_futex_args *); 87 static int linux_futex_wake(struct thread *, struct linux_futex_args *); 88 static int linux_futex_requeue(struct thread *, struct linux_futex_args *); 89 static int 
linux_futex_wakeop(struct thread *, struct linux_futex_args *);
static int linux_futex_lock_pi(struct thread *, bool, struct linux_futex_args *);
static int linux_futex_unlock_pi(struct thread *, bool,
	    struct linux_futex_args *);
static int futex_wake_pi(struct thread *, uint32_t *, bool);

/*
 * Resolve a userspace futex address to a umtx key, enforcing the
 * 32-bit alignment the futex ABI requires.
 */
static int
futex_key_get(const void *uaddr, int type, int share, struct umtx_key *key)
{

	/* Check that the futex address is 32bit aligned. */
	if (!__is_aligned(uaddr, sizeof(uint32_t)))
		return (EINVAL);
	return (umtx_key_get(uaddr, type, share, key));
}

/*
 * External helper: wake up to 'val' waiters on a plain (non-PI) futex.
 * Used by handle_futex_death() and other kernel callers.
 */
int
futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared)
{
	struct linux_futex_args args;

	bzero(&args, sizeof(args));
	args.op = LINUX_FUTEX_WAKE;
	args.uaddr = uaddr;
	args.flags = shared == true ? FUTEX_SHARED : 0;
	args.val = val;
	/* Match any waiter regardless of its wait bitset. */
	args.val3 = FUTEX_BITSET_MATCH_ANY;

	return (linux_futex_wake(td, &args));
}

/*
 * Wake the next waiter on a PI futex whose owner died; invoked from the
 * robust-list death handling path (rb == true on the unlock side).
 */
static int
futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared)
{
	struct linux_futex_args args;

	bzero(&args, sizeof(args));
	args.op = LINUX_FUTEX_UNLOCK_PI;
	args.uaddr = uaddr;
	args.flags = shared == true ? FUTEX_SHARED : 0;

	return (linux_futex_unlock_pi(td, true, &args));
}

/*
 * Perform the FUTEX_WAKE_OP encoded operation on *uaddr and evaluate the
 * encoded comparison against the old value, storing the boolean result
 * in *res.  Layout of encoded_op (per the Linux ABI):
 *   bits 28-31 op, 24-27 cmp, 12-23 oparg, 0-11 cmparg.
 */
static int
futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr,
    int *res)
{
	/*
	 * NOTE(review): the shift pairs below sign-extend the 12-bit
	 * oparg/cmparg fields; this relies on arithmetic right shift of
	 * signed int, which is implementation-defined (but guaranteed by
	 * the compilers FreeBSD supports).
	 */
	int op = (encoded_op >> 28) & 7;
	int cmp = (encoded_op >> 24) & 15;
	int oparg = (encoded_op << 8) >> 20;
	int cmparg = (encoded_op << 20) >> 20;
	int oldval = 0, ret;

	/* FUTEX_OP_OPARG_SHIFT: treat oparg as a shift count. */
	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
		oparg = 1 << oparg;

	/* Apply the operation atomically to the user word, saving old value. */
	switch (op) {
	case FUTEX_OP_SET:
		ret = futex_xchgl(oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_ADD:
		ret = futex_addl(oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_OR:
		ret = futex_orl(oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_ANDN:
		ret = futex_andl(~oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_XOR:
		ret = futex_xorl(oparg, uaddr, &oldval);
		break;
	default:
		ret = ENOSYS;
		break;
	}

	if (ret != 0)
		return (ret);

	/* Evaluate the encoded comparison against the pre-op value. */
	switch (cmp) {
	case FUTEX_OP_CMP_EQ:
		*res = (oldval == cmparg);
		break;
	case FUTEX_OP_CMP_NE:
		*res = (oldval != cmparg);
		break;
	case FUTEX_OP_CMP_LT:
		*res = (oldval < cmparg);
		break;
	case FUTEX_OP_CMP_GE:
		*res = (oldval >= cmparg);
		break;
	case FUTEX_OP_CMP_LE:
		*res = (oldval <= cmparg);
		break;
	case FUTEX_OP_CMP_GT:
		*res = (oldval > cmparg);
		break;
	default:
		ret = ENOSYS;
	}

	return (ret);
}

/*
 * Central futex dispatcher: strip the PRIVATE and CLOCK_REALTIME option
 * bits from args->op, validate their use, and route to the per-op
 * implementation.  Called from the linux_sys_futex* syscall wrappers.
 */
static int
linux_futex(struct thread *td, struct linux_futex_args *args)
{
	struct linux_pemuldata *pem;
	struct proc *p;

	/* PRIVATE futexes are process-local; otherwise share across procs. */
	if (args->op & LINUX_FUTEX_PRIVATE_FLAG) {
		args->flags = 0;
		args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
	} else
		args->flags = FUTEX_SHARED;

	args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
	args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;

	/*
	 * Linux permits FUTEX_CLOCK_REALTIME only with these three
	 * operations; anything else is rejected.
	 */
	if (args->clockrt &&
	    args->op != LINUX_FUTEX_WAIT_BITSET &&
	    args->op != LINUX_FUTEX_WAIT_REQUEUE_PI &&
	    args->op != LINUX_FUTEX_LOCK_PI2)
		return (ENOSYS);

	switch (args->op) {
	case LINUX_FUTEX_WAIT:
		/* Plain WAIT is WAIT_BITSET with an all-ones bitset. */
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAIT_BITSET:
		LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wait(td, args));

	case LINUX_FUTEX_WAKE:
		/* Plain WAKE is WAKE_BITSET with an all-ones bitset. */
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAKE_BITSET:
		LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wake(td, args));

	case LINUX_FUTEX_REQUEUE:
		/*
		 * Glibc does not use this operation since version 2.3.3,
		 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation.
		 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when
		 * FUTEX_REQUEUE returned EINVAL.
		 */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
			linux_msg(td, "unsupported FUTEX_REQUEUE");
			pem->flags |= LINUX_XDEPR_REQUEUEOP;
		}

		/*
		 * The above is true, however musl libc does make use of the
		 * futex requeue operation, allow operation for brands which
		 * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags.
		 */
		p = td->td_proc;
		Elf_Brandinfo *bi = p->p_elf_brandinfo;
		if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0)
			return (EINVAL);
		/* Legacy REQUEUE skips the val3 == *uaddr comparison. */
		args->val3_compare = false;
		/* FALLTHROUGH */

	case LINUX_FUTEX_CMP_REQUEUE:
		LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
		    "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x",
		    args->uaddr, args->val, args->val3, args->uaddr2,
		    args->ts);

		return (linux_futex_requeue(td, args));

	case LINUX_FUTEX_WAKE_OP:
		LINUX_CTR5(sys_futex, "WAKE_OP "
		    "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x",
		    args->uaddr, args->val, args->uaddr2, args->val3,
		    args->ts);

		return (linux_futex_wakeop(td, args));

	case LINUX_FUTEX_LOCK_PI:
		/* LOCK_PI always measures its timeout against CLOCK_REALTIME. */
		args->clockrt = true;
		/* FALLTHROUGH */

	case LINUX_FUTEX_LOCK_PI2:
		LINUX_CTR2(sys_futex, "LOCKPI uaddr %p val 0x%x",
		    args->uaddr, args->val);

		return (linux_futex_lock_pi(td, false, args));

	case LINUX_FUTEX_UNLOCK_PI:
		LINUX_CTR1(sys_futex, "UNLOCKPI uaddr %p",
		    args->uaddr);

		return (linux_futex_unlock_pi(td, false, args));

	case LINUX_FUTEX_TRYLOCK_PI:
		LINUX_CTR1(sys_futex, "TRYLOCKPI uaddr %p",
		    args->uaddr);

		return (linux_futex_lock_pi(td, true, args));

	/*
	 * Current implementation of FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI
	 * can't be used anymore to implement conditional variables.
	 * A detailed explanation can be found here:
	 *
	 * https://sourceware.org/bugzilla/show_bug.cgi?id=13165
	 * and here http://austingroupbugs.net/view.php?id=609
	 *
	 * And since commit
	 * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=ed19993b5b0d05d62cc883571519a67dae481a14
	 * glibc does not use them.
	 */
	case LINUX_FUTEX_WAIT_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	case LINUX_FUTEX_CMP_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	default:
		linux_msg(td, "unsupported futex op %d", args->op);
		return (ENOSYS);
	}
}

/*
 * pi protocol:
 * - 0 futex word value means unlocked.
 * - TID futex word value means locked.
 * Userspace uses atomic ops to lock/unlock these futexes without entering the
 * kernel. If the lock-acquire fastpath fails, (transition from 0 to TID fails),
 * then FUTEX_LOCK_PI is called.
 * The kernel atomically set FUTEX_WAITERS bit in the futex word value, if no
 * other waiters exists looks up the thread that owns the futex (it has put its
 * own TID into the futex value) and made this thread the owner of the internal
 * pi-aware lock object (mutex). Then the kernel tries to lock the internal lock
 * object, on which it blocks. Once it returns, it has the mutex acquired, and it
 * sets the futex value to its own TID and returns (futex value contains
 * FUTEX_WAITERS|TID).
 * The unlock fastpath would fail (because the FUTEX_WAITERS bit is set) and
 * FUTEX_UNLOCK_PI will be called.
 * If a futex is found to be held at exit time, the kernel sets the OWNER_DIED
 * bit of the futex word and wakes up the next futex waiter (if any), WAITERS
 * bit is preserved (if any).
 * If OWNER_DIED bit is set the kernel sanity checks the futex word value against
 * the internal futex state and if correct, acquire futex.
358 */ 359 static int 360 linux_futex_lock_pi(struct thread *td, bool try, struct linux_futex_args *args) 361 { 362 struct umtx_abs_timeout timo; 363 struct linux_emuldata *em; 364 struct umtx_pi *pi, *new_pi; 365 struct thread *td1; 366 struct umtx_q *uq; 367 int error, rv; 368 uint32_t owner, old_owner; 369 370 em = em_find(td); 371 uq = td->td_umtxq; 372 error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), 373 &uq->uq_key); 374 if (error != 0) 375 return (error); 376 if (args->ts != NULL) 377 linux_umtx_abs_timeout_init(&timo, args); 378 379 umtxq_lock(&uq->uq_key); 380 pi = umtx_pi_lookup(&uq->uq_key); 381 if (pi == NULL) { 382 new_pi = umtx_pi_alloc(M_NOWAIT); 383 if (new_pi == NULL) { 384 umtxq_unlock(&uq->uq_key); 385 new_pi = umtx_pi_alloc(M_WAITOK); 386 umtxq_lock(&uq->uq_key); 387 pi = umtx_pi_lookup(&uq->uq_key); 388 if (pi != NULL) { 389 umtx_pi_free(new_pi); 390 new_pi = NULL; 391 } 392 } 393 if (new_pi != NULL) { 394 new_pi->pi_key = uq->uq_key; 395 umtx_pi_insert(new_pi); 396 pi = new_pi; 397 } 398 } 399 umtx_pi_ref(pi); 400 umtxq_unlock(&uq->uq_key); 401 for (;;) { 402 /* Try uncontested case first. */ 403 rv = casueword32(args->uaddr, FUTEX_UNOWNED, &owner, em->em_tid); 404 /* The acquire succeeded. */ 405 if (rv == 0) { 406 error = 0; 407 break; 408 } 409 if (rv == -1) { 410 error = EFAULT; 411 break; 412 } 413 414 /* 415 * Nobody owns it, but the acquire failed. This can happen 416 * with ll/sc atomic. 417 */ 418 if (owner == FUTEX_UNOWNED) { 419 error = thread_check_susp(td, true); 420 if (error != 0) 421 break; 422 continue; 423 } 424 425 /* 426 * Avoid overwriting a possible error from sleep due 427 * to the pending signal with suspension check result. 428 */ 429 if (error == 0) { 430 error = thread_check_susp(td, true); 431 if (error != 0) 432 break; 433 } 434 435 /* The futex word at *uaddr is already locked by the caller. 
*/ 436 if ((owner & FUTEX_TID_MASK) == em->em_tid) { 437 error = EDEADLK; 438 break; 439 } 440 441 /* 442 * Futex owner died, handle_futex_death() set the OWNER_DIED bit 443 * and clear tid. Try to acquire it. 444 */ 445 if ((owner & FUTEX_TID_MASK) == FUTEX_UNOWNED) { 446 old_owner = owner; 447 owner = owner & (FUTEX_WAITERS | FUTEX_OWNER_DIED); 448 owner |= em->em_tid; 449 rv = casueword32(args->uaddr, old_owner, &owner, owner); 450 if (rv == -1) { 451 error = EFAULT; 452 break; 453 } 454 if (rv == 1) { 455 if (error == 0) { 456 error = thread_check_susp(td, true); 457 if (error != 0) 458 break; 459 } 460 461 /* 462 * If this failed the lock could 463 * changed, restart. 464 */ 465 continue; 466 } 467 468 umtxq_lock(&uq->uq_key); 469 umtxq_busy(&uq->uq_key); 470 error = umtx_pi_claim(pi, td); 471 umtxq_unbusy(&uq->uq_key); 472 umtxq_unlock(&uq->uq_key); 473 if (error != 0) { 474 /* 475 * Since we're going to return an 476 * error, restore the futex to its 477 * previous, unowned state to avoid 478 * compounding the problem. 479 */ 480 (void)casuword32(args->uaddr, owner, old_owner); 481 } 482 break; 483 } 484 485 /* 486 * Inconsistent state: OWNER_DIED is set and tid is not 0. 487 * Linux does some checks of futex state, we return EINVAL, 488 * as the user space can take care of this. 489 */ 490 if ((owner & FUTEX_OWNER_DIED) != FUTEX_UNOWNED) { 491 error = EINVAL; 492 break; 493 } 494 495 if (try != 0) { 496 error = EBUSY; 497 break; 498 } 499 500 /* 501 * If we caught a signal, we have retried and now 502 * exit immediately. 503 */ 504 if (error != 0) 505 break; 506 507 umtxq_lock(&uq->uq_key); 508 umtxq_busy(&uq->uq_key); 509 umtxq_unlock(&uq->uq_key); 510 511 /* 512 * Set the contested bit so that a release in user space knows 513 * to use the system call for unlock. If this fails either some 514 * one else has acquired the lock or it has been released. 
515 */ 516 rv = casueword32(args->uaddr, owner, &owner, 517 owner | FUTEX_WAITERS); 518 if (rv == -1) { 519 umtxq_unbusy_unlocked(&uq->uq_key); 520 error = EFAULT; 521 break; 522 } 523 if (rv == 1) { 524 umtxq_unbusy_unlocked(&uq->uq_key); 525 error = thread_check_susp(td, true); 526 if (error != 0) 527 break; 528 529 /* 530 * The lock changed and we need to retry or we 531 * lost a race to the thread unlocking the umtx. 532 */ 533 continue; 534 } 535 536 /* 537 * Substitute Linux thread id by native thread id to 538 * avoid refactoring code of umtxq_sleep_pi(). 539 */ 540 td1 = linux_tdfind(td, owner & FUTEX_TID_MASK, -1); 541 if (td1 != NULL) { 542 owner = td1->td_tid; 543 PROC_UNLOCK(td1->td_proc); 544 } else { 545 umtxq_unbusy_unlocked(&uq->uq_key); 546 error = EINVAL; 547 break; 548 } 549 550 umtxq_lock(&uq->uq_key); 551 552 /* We set the contested bit, sleep. */ 553 error = umtxq_sleep_pi(uq, pi, owner, "futexp", 554 args->ts == NULL ? NULL : &timo, 555 (args->flags & FUTEX_SHARED) != 0); 556 if (error != 0) 557 continue; 558 559 error = thread_check_susp(td, false); 560 if (error != 0) 561 break; 562 } 563 564 umtxq_lock(&uq->uq_key); 565 umtx_pi_unref(pi); 566 umtxq_unlock(&uq->uq_key); 567 umtx_key_release(&uq->uq_key); 568 return (error); 569 } 570 571 static int 572 linux_futex_unlock_pi(struct thread *td, bool rb, struct linux_futex_args *args) 573 { 574 struct linux_emuldata *em; 575 struct umtx_key key; 576 uint32_t old, owner, new_owner; 577 int count, error; 578 579 em = em_find(td); 580 581 /* 582 * Make sure we own this mtx. 
583 */ 584 error = fueword32(args->uaddr, &owner); 585 if (error == -1) 586 return (EFAULT); 587 if (!rb && (owner & FUTEX_TID_MASK) != em->em_tid) 588 return (EPERM); 589 590 error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), &key); 591 if (error != 0) 592 return (error); 593 umtxq_lock(&key); 594 umtxq_busy(&key); 595 error = umtx_pi_drop(td, &key, rb, &count); 596 if (error != 0 || rb) { 597 umtxq_unbusy(&key); 598 umtxq_unlock(&key); 599 umtx_key_release(&key); 600 return (error); 601 } 602 umtxq_unlock(&key); 603 604 /* 605 * When unlocking the futex, it must be marked as unowned if 606 * there is zero or one thread only waiting for it. 607 * Otherwise, it must be marked as contested. 608 */ 609 if (count > 1) 610 new_owner = FUTEX_WAITERS; 611 else 612 new_owner = FUTEX_UNOWNED; 613 614 again: 615 error = casueword32(args->uaddr, owner, &old, new_owner); 616 if (error == 1) { 617 error = thread_check_susp(td, false); 618 if (error == 0) 619 goto again; 620 } 621 umtxq_unbusy_unlocked(&key); 622 umtx_key_release(&key); 623 if (error == -1) 624 return (EFAULT); 625 if (error == 0 && old != owner) 626 return (EINVAL); 627 return (error); 628 } 629 630 static int 631 linux_futex_wakeop(struct thread *td, struct linux_futex_args *args) 632 { 633 struct umtx_key key, key2; 634 int nrwake, op_ret, ret; 635 int error, count; 636 637 if (args->uaddr == args->uaddr2) 638 return (EINVAL); 639 640 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); 641 if (error != 0) 642 return (error); 643 error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2); 644 if (error != 0) { 645 umtx_key_release(&key); 646 return (error); 647 } 648 umtxq_lock(&key); 649 umtxq_busy(&key); 650 umtxq_unlock(&key); 651 error = futex_atomic_op(td, args->val3, args->uaddr2, &op_ret); 652 umtxq_lock(&key); 653 umtxq_unbusy(&key); 654 if (error != 0) 655 goto out; 656 ret = umtxq_signal_mask(&key, args->val, args->val3); 657 if (op_ret > 0) { 658 
nrwake = (int)(unsigned long)args->ts; 659 umtxq_lock(&key2); 660 count = umtxq_count(&key2); 661 if (count > 0) 662 ret += umtxq_signal_mask(&key2, nrwake, args->val3); 663 else 664 ret += umtxq_signal_mask(&key, nrwake, args->val3); 665 umtxq_unlock(&key2); 666 } 667 td->td_retval[0] = ret; 668 out: 669 umtxq_unlock(&key); 670 umtx_key_release(&key2); 671 umtx_key_release(&key); 672 return (error); 673 } 674 675 static int 676 linux_futex_requeue(struct thread *td, struct linux_futex_args *args) 677 { 678 int nrwake, nrrequeue; 679 struct umtx_key key, key2; 680 int error; 681 uint32_t uval; 682 683 /* 684 * Linux allows this, we would not, it is an incorrect 685 * usage of declared ABI, so return EINVAL. 686 */ 687 if (args->uaddr == args->uaddr2) 688 return (EINVAL); 689 690 nrrequeue = (int)(unsigned long)args->ts; 691 nrwake = args->val; 692 /* 693 * Sanity check to prevent signed integer overflow, 694 * see Linux CVE-2018-6927 695 */ 696 if (nrwake < 0 || nrrequeue < 0) 697 return (EINVAL); 698 699 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); 700 if (error != 0) 701 return (error); 702 error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2); 703 if (error != 0) { 704 umtx_key_release(&key); 705 return (error); 706 } 707 umtxq_lock(&key); 708 umtxq_busy(&key); 709 umtxq_unlock(&key); 710 error = fueword32(args->uaddr, &uval); 711 if (error != 0) 712 error = EFAULT; 713 else if (args->val3_compare == true && uval != args->val3) 714 error = EWOULDBLOCK; 715 umtxq_lock(&key); 716 umtxq_unbusy(&key); 717 if (error == 0) { 718 umtxq_lock(&key2); 719 td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue); 720 umtxq_unlock(&key2); 721 } 722 umtxq_unlock(&key); 723 umtx_key_release(&key2); 724 umtx_key_release(&key); 725 return (error); 726 } 727 728 static int 729 linux_futex_wake(struct thread *td, struct linux_futex_args *args) 730 { 731 struct umtx_key key; 732 int error; 733 734 if (args->val3 == 0) 735 
return (EINVAL); 736 737 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); 738 if (error != 0) 739 return (error); 740 umtxq_lock(&key); 741 td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3); 742 umtxq_unlock(&key); 743 umtx_key_release(&key); 744 return (0); 745 } 746 747 static int 748 linux_futex_wait(struct thread *td, struct linux_futex_args *args) 749 { 750 struct umtx_abs_timeout timo; 751 struct umtx_q *uq; 752 uint32_t uval; 753 int error; 754 755 if (args->val3 == 0) 756 error = EINVAL; 757 758 uq = td->td_umtxq; 759 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), 760 &uq->uq_key); 761 if (error != 0) 762 return (error); 763 if (args->ts != NULL) 764 linux_umtx_abs_timeout_init(&timo, args); 765 umtxq_lock(&uq->uq_key); 766 umtxq_busy(&uq->uq_key); 767 uq->uq_bitset = args->val3; 768 umtxq_insert(uq); 769 umtxq_unlock(&uq->uq_key); 770 error = fueword32(args->uaddr, &uval); 771 if (error != 0) 772 error = EFAULT; 773 else if (uval != args->val) 774 error = EWOULDBLOCK; 775 umtxq_lock(&uq->uq_key); 776 umtxq_unbusy(&uq->uq_key); 777 if (error == 0) { 778 error = umtxq_sleep(uq, "futex", 779 args->ts == NULL ? NULL : &timo); 780 if ((uq->uq_flags & UQF_UMTXQ) == 0) 781 error = 0; 782 else 783 umtxq_remove(uq); 784 } else if ((uq->uq_flags & UQF_UMTXQ) != 0) { 785 umtxq_remove(uq); 786 } 787 umtxq_unlock(&uq->uq_key); 788 umtx_key_release(&uq->uq_key); 789 return (error); 790 } 791 792 static void 793 linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo, 794 struct linux_futex_args *args) 795 { 796 int clockid, absolute; 797 798 /* 799 * The FUTEX_CLOCK_REALTIME option bit can be employed only with the 800 * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI, FUTEX_LOCK_PI2. 801 * For FUTEX_WAIT, timeout is interpreted as a relative value, for other 802 * futex operations timeout is interpreted as an absolute value. 
	 * If FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel measures
	 * the timeout against the CLOCK_REALTIME clock, otherwise the kernel
	 * measures the timeout against the CLOCK_MONOTONIC clock.
	 */
	clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC;
	absolute = args->op == LINUX_FUTEX_WAIT ? false : true;
	umtx_abs_timeout_init(timo, clockid, absolute, args->ts);
}

/*
 * futex(2) syscall entry: copy in the timespec for the operations that
 * take a real timeout; for the others the "timeout" argument actually
 * carries an integer (nrequeue/nrwake2) and is passed through as-is.
 */
int
linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
{
	struct linux_futex_args fargs = {
		.uaddr = args->uaddr,
		.op = args->op,
		.val = args->val,
		.ts = NULL,
		.uaddr2 = args->uaddr2,
		.val3 = args->val3,
		.val3_compare = true,
	};
	int error;

	switch (args->op & LINUX_FUTEX_CMD_MASK) {
	case LINUX_FUTEX_WAIT:
	case LINUX_FUTEX_WAIT_BITSET:
	case LINUX_FUTEX_LOCK_PI:
	case LINUX_FUTEX_LOCK_PI2:
		if (args->timeout != NULL) {
			error = linux_get_timespec(&fargs.kts, args->timeout);
			if (error != 0)
				return (error);
			fargs.ts = &fargs.kts;
		}
		break;
	default:
		/* Not a timeout: an integer argument smuggled in 'timeout'. */
		fargs.ts = PTRIN(args->timeout);
	}
	return (linux_futex(td, &fargs));
}

#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * futex_time64(2): identical to linux_sys_futex() except the timespec
 * uses 64-bit time (only present on 32-bit time_t ABIs).
 */
int
linux_sys_futex_time64(struct thread *td,
    struct linux_sys_futex_time64_args *args)
{
	struct linux_futex_args fargs = {
		.uaddr = args->uaddr,
		.op = args->op,
		.val = args->val,
		.ts = NULL,
		.uaddr2 = args->uaddr2,
		.val3 = args->val3,
		.val3_compare = true,
	};
	int error;

	switch (args->op & LINUX_FUTEX_CMD_MASK) {
	case LINUX_FUTEX_WAIT:
	case LINUX_FUTEX_WAIT_BITSET:
	case LINUX_FUTEX_LOCK_PI:
	case LINUX_FUTEX_LOCK_PI2:
		if (args->timeout != NULL) {
			error = linux_get_timespec64(&fargs.kts, args->timeout);
			if (error != 0)
				return (error);
			fargs.ts = &fargs.kts;
		}
		break;
	default:
		/* Not a timeout: an integer argument smuggled in 'timeout'. */
		fargs.ts = PTRIN(args->timeout);
	}
	return
	    (linux_futex(td, &fargs));
}
#endif

/*
 * set_robust_list(2): record the userspace robust futex list head for
 * this thread; it is walked at thread exit by release_futexes().
 */
int
linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
{
	struct linux_emuldata *em;

	if (args->len != sizeof(struct linux_robust_list_head))
		return (EINVAL);

	em = em_find(td);
	em->robust_futexes = args->head;

	return (0);
}

/*
 * get_robust_list(2): return the robust list head (and its size) of the
 * calling thread, or of the thread identified by pid subject to a
 * credential check.
 */
int
linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
{
	struct linux_emuldata *em;
	struct linux_robust_list_head *head;
	l_size_t len;
	struct thread *td2;
	int error;

	if (!args->pid) {
		em = em_find(td);
		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
		head = em->robust_futexes;
	} else {
		td2 = linux_tdfind(td, args->pid, -1);
		if (td2 == NULL)
			return (ESRCH);
		if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) {
			PROC_UNLOCK(td2->td_proc);
			return (EPERM);
		}

		em = em_find(td2);
		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
		/* XXX: ptrace? */
		/*
		 * NOTE(review): gate on setuid/seteuid privilege plus
		 * p_candebug(); presumably meant to mirror Linux's
		 * ptrace_may_access() check — verify the intended policy.
		 */
		if (priv_check(td, PRIV_CRED_SETUID) ||
		    priv_check(td, PRIV_CRED_SETEUID) ||
		    p_candebug(td, td2->td_proc)) {
			PROC_UNLOCK(td2->td_proc);
			return (EPERM);
		}
		head = em->robust_futexes;

		PROC_UNLOCK(td2->td_proc);
	}

	len = sizeof(struct linux_robust_list_head);
	error = copyout(&len, args->len, sizeof(l_size_t));
	if (error != 0)
		return (EFAULT);

	/*
	 * NOTE(review): copies the first sizeof(l_uintptr_t) bytes of the
	 * kernel pointer — correct only on little-endian when l_uintptr_t
	 * is narrower than a kernel pointer (COMPAT_LINUX32); confirm.
	 */
	return (copyout(&head, args->head, sizeof(l_uintptr_t)));
}

/*
 * Mark the futex word at uaddr as owned by a dead thread (OWNER_DIED)
 * and wake one waiter, following the Linux robust-futex ABI.  'pi'
 * selects the PI wakeup path; 'pending_op' is true for the entry the
 * dying thread was operating on when it exited.
 */
static int
handle_futex_death(struct thread *td, struct linux_emuldata *em, uint32_t *uaddr,
    unsigned int pi, bool pending_op)
{
	uint32_t uval, nval, mval;
	int error;

retry:
	error = fueword32(uaddr, &uval);
	if (error != 0)
		return (EFAULT);

	/*
	 * Special case for regular (non PI) futexes. The unlock path in
	 * user space has two race scenarios:
	 *
	 * 1. The unlock path releases the user space futex value and
	 *    before it can execute the futex() syscall to wake up
	 *    waiters it is killed.
	 *
	 * 2. A woken up waiter is killed before it can acquire the
	 *    futex in user space.
	 *
	 * In both cases the TID validation below prevents a wakeup of
	 * potential waiters which can cause these waiters to block
	 * forever.
	 *
	 * In both cases it is safe to attempt waking up a potential
	 * waiter without touching the user space futex value and trying
	 * to set the OWNER_DIED bit.
	 */
	if (pending_op && !pi && !uval) {
		(void)futex_wake(td, uaddr, 1, true);
		return (0);
	}

	/* Only act if the dead thread actually owns this futex. */
	if ((uval & FUTEX_TID_MASK) == em->em_tid) {
		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
		error = casueword32(uaddr, uval, &nval, mval);
		if (error == -1)
			return (EFAULT);
		if (error == 1) {
			/* Word changed under us; recheck and retry. */
			error = thread_check_susp(td, false);
			if (error != 0)
				return (error);
			goto retry;
		}

		if (!pi && (uval & FUTEX_WAITERS)) {
			error = futex_wake(td, uaddr, 1, true);
			if (error != 0)
				return (error);
		} else if (pi && (uval & FUTEX_WAITERS)) {
			error = futex_wake_pi(td, uaddr, true);
			if (error != 0)
				return (error);
		}
	}

	return (0);
}

/*
 * Copy one robust-list link word from userspace.  The low bit of the
 * stored value flags a PI futex (returned in *pi); the rest is the
 * entry pointer.
 */
static int
fetch_robust_entry(struct linux_robust_list **entry,
    struct linux_robust_list **head, unsigned int *pi)
{
	l_ulong uentry;
	int error;

	error = copyin((const void *)head, &uentry, sizeof(uentry));
	if (error != 0)
		return (EFAULT);

	*entry = (void *)(uentry & ~1UL);
	*pi = uentry & 1;

	return (0);
}

#define	LINUX_HANDLE_DEATH_PENDING	true
#define	LINUX_HANDLE_DEATH_LIST		false

/*
 * This walks the list of robust futexes releasing them.  Called at
 * thread exit; bounded at 2048 entries to defend against a corrupted
 * or malicious circular user list.
 */
void
release_futexes(struct thread *td, struct linux_emuldata *em)
{
	struct linux_robust_list_head *head;
	struct linux_robust_list *entry, *next_entry, *pending;
	unsigned int limit = 2048, pi, next_pi, pip;
	uint32_t *uaddr;
	l_long futex_offset;
	int error;

	head = em->robust_futexes;
	if (head == NULL)
		return;

	if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi))
		return;

	error = copyin(&head->futex_offset, &futex_offset,
	    sizeof(futex_offset));
	if (error != 0)
		return;

	if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip))
		return;

	while (entry != &head->list) {
		/* Fetch the next link before touching the current entry. */
		error = fetch_robust_entry(&next_entry, PTRIN(&entry->next),
		    &next_pi);

		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice.
		 */
		if (entry != pending) {
			uaddr = (uint32_t *)((caddr_t)entry + futex_offset);
			if (handle_futex_death(td, em, uaddr, pi,
			    LINUX_HANDLE_DEATH_LIST))
				return;
		}
		if (error != 0)
			return;

		entry = next_entry;
		pi = next_pi;

		if (!--limit)
			break;

		/* Yield between entries to bound scheduling latency. */
		sched_relinquish(curthread);
	}

	if (pending) {
		uaddr = (uint32_t *)((caddr_t)pending + futex_offset);
		(void)handle_futex_death(td, em, uaddr, pip,
		    LINUX_HANDLE_DEATH_PENDING);
	}
}