/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2009-2021 Dmitry Chagin <dchagin@FreeBSD.org>
 * Copyright (c) 2008 Roman Divacky
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/imgact.h>
#include <sys/imgact_elf.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/umtxvar.h>

#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_emul.h>
#include <compat/linux/linux_futex.h>
#include <compat/linux/linux_misc.h>
#include <compat/linux/linux_time.h>
#include <compat/linux/linux_util.h>

#define	FUTEX_SHARED	0x8	/* shared futex */
#define	FUTEX_UNOWNED	0

#define	GET_SHARED(a)	(a->flags & FUTEX_SHARED) ? AUTO_SHARE : THREAD_SHARE
static int futex_atomic_op(struct thread *, int, uint32_t *, int *);
static int handle_futex_death(struct thread *td, struct linux_emuldata *,
    uint32_t *, unsigned int, bool);
static int fetch_robust_entry(struct linux_robust_list **,
    struct linux_robust_list **, unsigned int *);

struct linux_futex_args {
	uint32_t	*uaddr;
	int32_t		op;
	uint32_t	flags;
	bool		clockrt;
	uint32_t	val;
	struct timespec	*ts;
	uint32_t	*uaddr2;
	uint32_t	val3;
	bool		val3_compare;
	struct timespec	kts;
};

static inline int futex_key_get(const void *, int, int, struct umtx_key *);
static void linux_umtx_abs_timeout_init(struct umtx_abs_timeout *,
    struct linux_futex_args *);
static int linux_futex(struct thread *, struct linux_futex_args *);
static int linux_futex_wait(struct thread *, struct linux_futex_args *);
static int linux_futex_wake(struct thread *, struct linux_futex_args *);
static int linux_futex_requeue(struct thread *, struct linux_futex_args *);
static int linux_futex_wakeop(struct thread *, struct linux_futex_args *);
static int linux_futex_lock_pi(struct thread *, bool, struct linux_futex_args *);
static int linux_futex_unlock_pi(struct thread *, bool,
    struct linux_futex_args *);
static int futex_wake_pi(struct thread *, uint32_t *, bool);

static int
futex_key_get(const void *uaddr, int type, int share, struct umtx_key *key)
{

	/* Check that the futex address is 32-bit aligned. */
	if (!__is_aligned(uaddr, sizeof(uint32_t)))
		return (EINVAL);
	return (umtx_key_get(uaddr, type, share, key));
}

int
futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared)
{
	struct linux_futex_args args;

	bzero(&args, sizeof(args));
	args.op = LINUX_FUTEX_WAKE;
	args.uaddr = uaddr;
	args.flags = shared == true ? FUTEX_SHARED : 0;
	args.val = val;
	args.val3 = FUTEX_BITSET_MATCH_ANY;

	return (linux_futex_wake(td, &args));
}

static int
futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared)
{
	struct linux_futex_args args;

	bzero(&args, sizeof(args));
	args.op = LINUX_FUTEX_UNLOCK_PI;
	args.uaddr = uaddr;
	args.flags = shared == true ? FUTEX_SHARED : 0;

	return (linux_futex_unlock_pi(td, true, &args));
}
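/*
 * Decode and apply a FUTEX_WAKE_OP operation word.  The encoding matches
 * the shifts performed below (and the Linux FUTEX_OP() macro layout): the
 * operation sits in bits 28-30, with bit 31 being FUTEX_OP_OPARG_SHIFT,
 * the comparison in bits 24-27, the sign-extended operation argument in
 * bits 12-23, and the comparison argument in bits 0-11.
 */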
static int
futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr,
    int *res)
{
	int op = (encoded_op >> 28) & 7;
	int cmp = (encoded_op >> 24) & 15;
	int oparg = (encoded_op << 8) >> 20;
	int cmparg = (encoded_op << 20) >> 20;
	int oldval = 0, ret;

	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
		oparg = 1 << oparg;

	switch (op) {
	case FUTEX_OP_SET:
		ret = futex_xchgl(oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_ADD:
		ret = futex_addl(oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_OR:
		ret = futex_orl(oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_ANDN:
		ret = futex_andl(~oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_XOR:
		ret = futex_xorl(oparg, uaddr, &oldval);
		break;
	default:
		ret = ENOSYS;
		break;
	}

	if (ret != 0)
		return (ret);

	switch (cmp) {
	case FUTEX_OP_CMP_EQ:
		*res = (oldval == cmparg);
		break;
	case FUTEX_OP_CMP_NE:
		*res = (oldval != cmparg);
		break;
	case FUTEX_OP_CMP_LT:
		*res = (oldval < cmparg);
		break;
	case FUTEX_OP_CMP_GE:
		*res = (oldval >= cmparg);
		break;
	case FUTEX_OP_CMP_LE:
		*res = (oldval <= cmparg);
		break;
	case FUTEX_OP_CMP_GT:
		*res = (oldval > cmparg);
		break;
	default:
		ret = ENOSYS;
	}

	return (ret);
}

static int
linux_futex(struct thread *td, struct linux_futex_args *args)
{
	struct linux_pemuldata *pem;
	struct proc *p;

	if (args->op & LINUX_FUTEX_PRIVATE_FLAG) {
		args->flags = 0;
		args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
	} else
		args->flags = FUTEX_SHARED;

	args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
	args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;

	if (args->clockrt &&
	    args->op != LINUX_FUTEX_WAIT_BITSET &&
	    args->op != LINUX_FUTEX_WAIT_REQUEUE_PI &&
	    args->op != LINUX_FUTEX_LOCK_PI2)
		return (ENOSYS);

	switch (args->op) {
	case LINUX_FUTEX_WAIT:
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAIT_BITSET:
		LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wait(td, args));

	case LINUX_FUTEX_WAKE:
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAKE_BITSET:
		LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wake(td, args));

	case LINUX_FUTEX_REQUEUE:
		/*
		 * Glibc does not use this operation since version 2.3.3,
		 * as it is racy and was replaced by the FUTEX_CMP_REQUEUE
		 * operation.  Glibc versions prior to 2.3.3 fall back to
		 * FUTEX_WAKE when FUTEX_REQUEUE returns EINVAL.
		 */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
			linux_msg(td, "unsupported FUTEX_REQUEUE");
			pem->flags |= LINUX_XDEPR_REQUEUEOP;
		}

		/*
		 * The above is true; however, musl libc does make use of
		 * the futex requeue operation, so allow it for brands which
		 * set the LINUX_BI_FUTEX_REQUEUE bit in their Brandinfo flags.
		 */
		p = td->td_proc;
		Elf_Brandinfo *bi = p->p_elf_brandinfo;
		if (bi == NULL || (bi->flags & LINUX_BI_FUTEX_REQUEUE) == 0)
			return (EINVAL);
		args->val3_compare = false;
		/* FALLTHROUGH */

	case LINUX_FUTEX_CMP_REQUEUE:
		LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
		    "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x",
		    args->uaddr, args->val, args->val3, args->uaddr2,
		    args->ts);

		return (linux_futex_requeue(td, args));

	case LINUX_FUTEX_WAKE_OP:
		LINUX_CTR5(sys_futex, "WAKE_OP "
		    "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x",
		    args->uaddr, args->val, args->uaddr2, args->val3,
		    args->ts);

		return (linux_futex_wakeop(td, args));

	case LINUX_FUTEX_LOCK_PI:
		args->clockrt = true;
		/* FALLTHROUGH */

	case LINUX_FUTEX_LOCK_PI2:
		LINUX_CTR2(sys_futex, "LOCKPI uaddr %p val 0x%x",
		    args->uaddr, args->val);

		return (linux_futex_lock_pi(td, false, args));

	case LINUX_FUTEX_UNLOCK_PI:
		LINUX_CTR1(sys_futex, "UNLOCKPI uaddr %p",
		    args->uaddr);

		return (linux_futex_unlock_pi(td, false, args));

	case LINUX_FUTEX_TRYLOCK_PI:
		LINUX_CTR1(sys_futex, "TRYLOCKPI uaddr %p",
		    args->uaddr);

		return (linux_futex_lock_pi(td, true, args));

	/*
	 * The current implementations of FUTEX_WAIT_REQUEUE_PI and
	 * FUTEX_CMP_REQUEUE_PI can no longer be used to implement condition
	 * variables.  A detailed explanation can be found here:
	 *
	 * https://sourceware.org/bugzilla/show_bug.cgi?id=13165
	 * and here http://austingroupbugs.net/view.php?id=609
	 *
	 * And since commit
	 * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=ed19993b5b0d05d62cc883571519a67dae481a14
	 * glibc does not use them.
	 */
	case LINUX_FUTEX_WAIT_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	case LINUX_FUTEX_CMP_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	default:
		linux_msg(td, "unsupported futex op %d", args->op);
		return (ENOSYS);
	}
}

/*
 * PI futex protocol:
 * - A futex word value of 0 means unlocked.
 * - A futex word value of TID means locked by that thread.
 * Userspace uses atomic ops to lock/unlock these futexes without entering
 * the kernel.  If the lock-acquire fastpath fails (the transition from 0 to
 * TID fails), then FUTEX_LOCK_PI is called.
 * The kernel atomically sets the FUTEX_WAITERS bit in the futex word value;
 * if no other waiters exist, it looks up the thread that owns the futex (the
 * owner has put its own TID into the futex value) and makes that thread the
 * owner of the internal pi-aware lock object (mutex).  Then the kernel tries
 * to lock the internal lock object, on which it blocks.  Once it returns, it
 * has the mutex acquired, and it sets the futex value to its own TID and
 * returns (the futex value then contains FUTEX_WAITERS|TID).
 * The unlock fastpath fails (because the FUTEX_WAITERS bit is set) and
 * FUTEX_UNLOCK_PI is called.
 * If a futex is found to be held at exit time, the kernel sets the OWNER_DIED
 * bit of the futex word and wakes up the next futex waiter (if any); the
 * WAITERS bit is preserved.
 * If the OWNER_DIED bit is set, the kernel sanity checks the futex word value
 * against the internal futex state and, if correct, acquires the futex.
 */
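/*
 * Illustrative userspace fast path for the protocol above (a sketch only,
 * not part of this file; "futex_word" and "tid" are placeholder names):
 *
 *	lock:	if an atomic compare-and-swap of futex_word from FUTEX_UNOWNED
 *		to tid succeeds, the lock is taken without a syscall;
 *		otherwise call futex(FUTEX_LOCK_PI).
 *	unlock:	if an atomic compare-and-swap of futex_word from tid back to
 *		FUTEX_UNOWNED succeeds, there were no waiters and no syscall
 *		is needed; otherwise call futex(FUTEX_UNLOCK_PI).
 */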
static int
linux_futex_lock_pi(struct thread *td, bool try, struct linux_futex_args *args)
{
	struct umtx_abs_timeout timo;
	struct linux_emuldata *em;
	struct umtx_pi *pi, *new_pi;
	struct thread *td1;
	struct umtx_q *uq;
	int error, rv;
	uint32_t owner, old_owner;

	em = em_find(td);
	uq = td->td_umtxq;
	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args),
	    &uq->uq_key);
	if (error != 0)
		return (error);
	if (args->ts != NULL)
		linux_umtx_abs_timeout_init(&timo, args);

	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);
	for (;;) {
		/* Try the uncontested case first. */
		rv = casueword32(args->uaddr, FUTEX_UNOWNED, &owner, em->em_tid);
		/* The acquire succeeded. */
		if (rv == 0) {
			error = 0;
			break;
		}
		if (rv == -1) {
			error = EFAULT;
			break;
		}

		/*
		 * Nobody owns it, but the acquire failed.  This can happen
		 * with ll/sc atomics.
		 */
		if (owner == FUTEX_UNOWNED) {
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
			continue;
		}

		/*
		 * Avoid overwriting a possible error from sleep due
		 * to the pending signal with the suspension check result.
		 */
		if (error == 0) {
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
		}

		/* The futex word at *uaddr is already locked by the caller. */
		if ((owner & FUTEX_TID_MASK) == em->em_tid) {
			error = EDEADLK;
			break;
		}

		/*
		 * The futex owner died, handle_futex_death() set the
		 * OWNER_DIED bit and cleared the tid.  Try to acquire it.
		 */
		if ((owner & FUTEX_TID_MASK) == FUTEX_UNOWNED) {
			old_owner = owner;
			owner = owner & (FUTEX_WAITERS | FUTEX_OWNER_DIED);
			owner |= em->em_tid;
			rv = casueword32(args->uaddr, old_owner, &owner, owner);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
			if (rv == 1) {
				if (error == 0) {
					error = thread_check_susp(td, true);
					if (error != 0)
						break;
				}

				/*
				 * If this failed, the lock could have
				 * changed; restart.
				 */
				continue;
			}

			umtxq_lock(&uq->uq_key);
			umtxq_busy(&uq->uq_key);
			error = umtx_pi_claim(pi, td);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			if (error != 0) {
				/*
				 * Since we're going to return an
				 * error, restore the futex to its
				 * previous, unowned state to avoid
				 * compounding the problem.
				 */
				(void)casuword32(args->uaddr, owner, old_owner);
			}
			break;
		}

		/*
		 * Inconsistent state: OWNER_DIED is set and the tid is not 0.
		 * Linux does some checks of the futex state; we return EINVAL,
		 * as userspace can take care of this.
		 */
		if ((owner & FUTEX_OWNER_DIED) != FUTEX_UNOWNED) {
			error = EINVAL;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_busy_unlocked(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space knows
		 * to use the system call for unlock.  If this fails either
		 * someone else has acquired the lock or it has been released.
		 */
		rv = casueword32(args->uaddr, owner, &owner,
		    owner | FUTEX_WAITERS);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		if (rv == 1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = thread_check_susp(td, true);
			if (error != 0)
				break;

			/*
			 * The lock changed and we need to retry or we
			 * lost a race to the thread unlocking the umtx.
			 */
			continue;
		}

		/*
		 * Substitute the Linux thread id with the native thread id to
		 * avoid refactoring the code of umtxq_sleep_pi().
		 */
		td1 = linux_tdfind(td, owner & FUTEX_TID_MASK, -1);
		if (td1 != NULL) {
			owner = td1->td_tid;
			PROC_UNLOCK(td1->td_proc);
		} else {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EINVAL;
			break;
		}

		umtxq_lock(&uq->uq_key);

		/* We set the contested bit, sleep. */
		error = umtxq_sleep_pi(uq, pi, owner, "futexp",
		    args->ts == NULL ? NULL : &timo,
		    (args->flags & FUTEX_SHARED) != 0);
		if (error != 0)
			continue;

		error = thread_check_susp(td, false);
		if (error != 0)
			break;
	}

	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}

static int
linux_futex_unlock_pi(struct thread *td, bool rb, struct linux_futex_args *args)
{
	struct linux_emuldata *em;
	struct umtx_key key;
	uint32_t old, owner, new_owner;
	int count, error;

	em = em_find(td);

	/*
	 * Make sure we own this mtx.
	 */
	error = fueword32(args->uaddr, &owner);
	if (error == -1)
		return (EFAULT);
	if (!rb && (owner & FUTEX_TID_MASK) != em->em_tid)
		return (EPERM);

	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	error = umtx_pi_drop(td, &key, rb, &count);
	if (error != 0 || rb) {
		umtxq_unbusy(&key);
		umtxq_unlock(&key);
		umtx_key_release(&key);
		return (error);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the futex, it must be marked as unowned if
	 * there are zero or one threads waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	if (count > 1)
		new_owner = FUTEX_WAITERS;
	else
		new_owner = FUTEX_UNOWNED;

again:
	error = casueword32(args->uaddr, owner, &old, new_owner);
	if (error == 1) {
		error = thread_check_susp(td, false);
		if (error == 0)
			goto again;
	}
	umtxq_unbusy_unlocked(&key);
	umtx_key_release(&key);
	if (error == -1)
		return (EFAULT);
	if (error == 0 && old != owner)
		return (EINVAL);
	return (error);
}
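/*
 * FUTEX_WAKE_OP: wake up to args->val waiters on args->uaddr, atomically
 * apply the operation encoded in args->val3 to the word at args->uaddr2,
 * and, if the encoded comparison against the old value succeeds, wake up
 * to nrwake2 (passed through args->ts) additional waiters, on uaddr2 if
 * any are queued there, otherwise on uaddr.
 */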
static int
linux_futex_wakeop(struct thread *td, struct linux_futex_args *args)
{
	struct umtx_key key, key2;
	int nrwake, op_ret, ret;
	int error, count;

	if (args->uaddr == args->uaddr2)
		return (EINVAL);

	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
	if (error != 0) {
		umtx_key_release(&key);
		return (error);
	}
	umtxq_busy_unlocked(&key);
	error = futex_atomic_op(td, args->val3, args->uaddr2, &op_ret);
	umtxq_lock(&key);
	umtxq_unbusy(&key);
	if (error != 0)
		goto out;
	ret = umtxq_signal_mask(&key, args->val, args->val3);
	if (op_ret > 0) {
		nrwake = (int)(unsigned long)args->ts;
		umtxq_lock(&key2);
		count = umtxq_count(&key2);
		if (count > 0)
			ret += umtxq_signal_mask(&key2, nrwake, args->val3);
		else
			ret += umtxq_signal_mask(&key, nrwake, args->val3);
		umtxq_unlock(&key2);
	}
	td->td_retval[0] = ret;
out:
	umtxq_unlock(&key);
	umtx_key_release(&key2);
	umtx_key_release(&key);
	return (error);
}

static int
linux_futex_requeue(struct thread *td, struct linux_futex_args *args)
{
	int nrwake, nrrequeue;
	struct umtx_key key, key2;
	int error;
	uint32_t uval;

	/*
	 * Linux allows this; we do not, as it is incorrect usage of the
	 * declared ABI, so return EINVAL.
	 */
	if (args->uaddr == args->uaddr2)
		return (EINVAL);

	nrrequeue = (int)(unsigned long)args->ts;
	nrwake = args->val;
	/*
	 * Sanity check to prevent signed integer overflow,
	 * see Linux CVE-2018-6927.
	 */
	if (nrwake < 0 || nrrequeue < 0)
		return (EINVAL);

	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
	if (error != 0) {
		umtx_key_release(&key);
		return (error);
	}
	umtxq_busy_unlocked(&key);
	error = fueword32(args->uaddr, &uval);
	if (error != 0)
		error = EFAULT;
	else if (args->val3_compare == true && uval != args->val3)
		error = EWOULDBLOCK;
	umtxq_lock(&key);
	umtxq_unbusy(&key);
	if (error == 0) {
		umtxq_lock(&key2);
		td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue);
		umtxq_unlock(&key2);
	}
	umtxq_unlock(&key);
	umtx_key_release(&key2);
	umtx_key_release(&key);
	return (error);
}

static int
linux_futex_wake(struct thread *td, struct linux_futex_args *args)
{
	struct umtx_key key;
	int error;

	if (args->val3 == 0)
		return (EINVAL);

	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	umtxq_lock(&key);
	td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}

static int
linux_futex_wait(struct thread *td, struct linux_futex_args *args)
{
	struct umtx_abs_timeout timo;
	struct umtx_q *uq;
	uint32_t uval;
	int error;

	if (args->val3 == 0)
		return (EINVAL);

	uq = td->td_umtxq;
	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args),
	    &uq->uq_key);
	if (error != 0)
		return (error);
	if (args->ts != NULL)
		linux_umtx_abs_timeout_init(&timo, args);
	umtxq_lock(&uq->uq_key);
	umtxq_busy(&uq->uq_key);
	uq->uq_bitset = args->val3;
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	error = fueword32(args->uaddr, &uval);
	if (error != 0)
		error = EFAULT;
	else if (uval != args->val)
		error = EWOULDBLOCK;
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	if (error == 0) {
		error = umtxq_sleep(uq, "futex",
		    args->ts == NULL ? NULL : &timo);
		if ((uq->uq_flags & UQF_UMTXQ) == 0)
			error = 0;
		else
			umtxq_remove(uq);
	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
		umtxq_remove(uq);
	}
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}

static void
linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo,
    struct linux_futex_args *args)
{
	int clockid, absolute;

	/*
	 * The FUTEX_CLOCK_REALTIME option bit can be employed only with the
	 * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI and FUTEX_LOCK_PI2
	 * operations.
	 * For FUTEX_WAIT the timeout is interpreted as a relative value; for
	 * the other futex operations the timeout is interpreted as an
	 * absolute value.
	 * If the FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel
	 * measures the timeout against the CLOCK_REALTIME clock; otherwise
	 * the kernel measures the timeout against the CLOCK_MONOTONIC clock.
	 */
	clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC;
	absolute = args->op == LINUX_FUTEX_WAIT ? false : true;
	umtx_abs_timeout_init(timo, clockid, absolute, args->ts);
}
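/*
 * Note that the timeout argument does double duty: for the WAIT and LOCK_PI
 * families it is copied in as a struct timespec, while for the REQUEUE and
 * WAKE_OP operations the raw pointer value is reused as the nrequeue/nrwake2
 * count (see linux_futex_requeue() and linux_futex_wakeop() above).
 */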
int
linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
{
	struct linux_futex_args fargs = {
		.uaddr = args->uaddr,
		.op = args->op,
		.val = args->val,
		.ts = NULL,
		.uaddr2 = args->uaddr2,
		.val3 = args->val3,
		.val3_compare = true,
	};
	int error;

	switch (args->op & LINUX_FUTEX_CMD_MASK) {
	case LINUX_FUTEX_WAIT:
	case LINUX_FUTEX_WAIT_BITSET:
	case LINUX_FUTEX_LOCK_PI:
	case LINUX_FUTEX_LOCK_PI2:
		if (args->timeout != NULL) {
			error = linux_get_timespec(&fargs.kts, args->timeout);
			if (error != 0)
				return (error);
			fargs.ts = &fargs.kts;
		}
		break;
	default:
		fargs.ts = PTRIN(args->timeout);
	}
	return (linux_futex(td, &fargs));
}

#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
int
linux_sys_futex_time64(struct thread *td,
    struct linux_sys_futex_time64_args *args)
{
	struct linux_futex_args fargs = {
		.uaddr = args->uaddr,
		.op = args->op,
		.val = args->val,
		.ts = NULL,
		.uaddr2 = args->uaddr2,
		.val3 = args->val3,
		.val3_compare = true,
	};
	int error;

	switch (args->op & LINUX_FUTEX_CMD_MASK) {
	case LINUX_FUTEX_WAIT:
	case LINUX_FUTEX_WAIT_BITSET:
	case LINUX_FUTEX_LOCK_PI:
	case LINUX_FUTEX_LOCK_PI2:
		if (args->timeout != NULL) {
			error = linux_get_timespec64(&fargs.kts, args->timeout);
			if (error != 0)
				return (error);
			fargs.ts = &fargs.kts;
		}
		break;
	default:
		fargs.ts = PTRIN(args->timeout);
	}
	return (linux_futex(td, &fargs));
}
#endif

int
linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
{
	struct linux_emuldata *em;

	if (args->len != sizeof(struct linux_robust_list_head))
		return (EINVAL);

	em = em_find(td);
	em->robust_futexes = args->head;

	return (0);
}

int
linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
{
	struct linux_emuldata *em;
	struct linux_robust_list_head *head;
	l_size_t len;
	struct thread *td2;
	int error;

	if (!args->pid) {
		em = em_find(td);
		KASSERT(em != NULL, ("get_robust_list: emuldata not found.\n"));
		head = em->robust_futexes;
	} else {
		td2 = linux_tdfind(td, args->pid, -1);
		if (td2 == NULL)
			return (ESRCH);
		if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) {
			PROC_UNLOCK(td2->td_proc);
			return (EPERM);
		}

		em = em_find(td2);
		KASSERT(em != NULL, ("get_robust_list: emuldata not found.\n"));
		/* XXX: ptrace? */
		if (priv_check(td, PRIV_CRED_SETUID) ||
		    priv_check(td, PRIV_CRED_SETEUID) ||
		    p_candebug(td, td2->td_proc)) {
			PROC_UNLOCK(td2->td_proc);
			return (EPERM);
		}
		head = em->robust_futexes;

		PROC_UNLOCK(td2->td_proc);
	}

	len = sizeof(struct linux_robust_list_head);
	error = copyout(&len, args->len, sizeof(l_size_t));
	if (error != 0)
		return (EFAULT);

	return (copyout(&head, args->head, sizeof(l_uintptr_t)));
}
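/*
 * Robust futex cleanup: on thread exit, release_futexes() below walks the
 * userspace robust list registered via set_robust_list().  Each list entry,
 * offset by futex_offset, points at a futex word; handle_futex_death() marks
 * a word still owned by the exiting thread with FUTEX_OWNER_DIED and wakes
 * one waiter so the lock is not lost.
 */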
static int
handle_futex_death(struct thread *td, struct linux_emuldata *em, uint32_t *uaddr,
    unsigned int pi, bool pending_op)
{
	uint32_t uval, nval, mval;
	int error;

retry:
	error = fueword32(uaddr, &uval);
	if (error != 0)
		return (EFAULT);

	/*
	 * Special case for regular (non-PI) futexes.  The unlock path in
	 * user space has two race scenarios:
	 *
	 * 1. The unlock path releases the user space futex value and
	 *    before it can execute the futex() syscall to wake up
	 *    waiters it is killed.
	 *
	 * 2. A woken up waiter is killed before it can acquire the
	 *    futex in user space.
	 *
	 * In both cases the TID validation below prevents a wakeup of
	 * potential waiters which can cause these waiters to block
	 * forever.
	 *
	 * In both cases it is safe to attempt waking up a potential
	 * waiter without touching the user space futex value and trying
	 * to set the OWNER_DIED bit.
	 */
	if (pending_op && !pi && !uval) {
		(void)futex_wake(td, uaddr, 1, true);
		return (0);
	}

	if ((uval & FUTEX_TID_MASK) == em->em_tid) {
		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
		error = casueword32(uaddr, uval, &nval, mval);
		if (error == -1)
			return (EFAULT);
		if (error == 1) {
			error = thread_check_susp(td, false);
			if (error != 0)
				return (error);
			goto retry;
		}

		if (!pi && (uval & FUTEX_WAITERS)) {
			error = futex_wake(td, uaddr, 1, true);
			if (error != 0)
				return (error);
		} else if (pi && (uval & FUTEX_WAITERS)) {
			error = futex_wake_pi(td, uaddr, true);
			if (error != 0)
				return (error);
		}
	}

	return (0);
}

static int
fetch_robust_entry(struct linux_robust_list **entry,
    struct linux_robust_list **head, unsigned int *pi)
{
	l_ulong uentry;
	int error;

	error = copyin((const void *)head, &uentry, sizeof(uentry));
	if (error != 0)
		return (EFAULT);

	*entry = (void *)(uentry & ~1UL);
	*pi = uentry & 1;

	return (0);
}

#define	LINUX_HANDLE_DEATH_PENDING	true
#define	LINUX_HANDLE_DEATH_LIST		false

/* This walks the list of robust futexes, releasing them. */
void
release_futexes(struct thread *td, struct linux_emuldata *em)
{
	struct linux_robust_list_head *head;
	struct linux_robust_list *entry, *next_entry, *pending;
	unsigned int limit = 2048, pi, next_pi, pip;
	uint32_t *uaddr;
	l_long futex_offset;
	int error;

	head = em->robust_futexes;
	if (head == NULL)
		return;

	if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi))
		return;

	error = copyin(&head->futex_offset, &futex_offset,
	    sizeof(futex_offset));
	if (error != 0)
		return;

	if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip))
		return;

	while (entry != &head->list) {
		error = fetch_robust_entry(&next_entry, PTRIN(&entry->next),
		    &next_pi);

		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice.
		 */
		if (entry != pending) {
			uaddr = (uint32_t *)((caddr_t)entry + futex_offset);
			if (handle_futex_death(td, em, uaddr, pi,
			    LINUX_HANDLE_DEATH_LIST))
				return;
		}
		if (error != 0)
			return;

		entry = next_entry;
		pi = next_pi;

		if (!--limit)
			break;

		sched_relinquish(curthread);
	}

	if (pending) {
		uaddr = (uint32_t *)((caddr_t)pending + futex_offset);
		(void)handle_futex_death(td, em, uaddr, pip,
		    LINUX_HANDLE_DEATH_PENDING);
	}
}