1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2009-2021 Dmitry Chagin <dchagin@FreeBSD.org> 5 * Copyright (c) 2008 Roman Divacky 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include "opt_compat.h" 33 34 #include <sys/param.h> 35 #include <sys/systm.h> 36 #include <sys/imgact.h> 37 #include <sys/imgact_elf.h> 38 #include <sys/ktr.h> 39 #include <sys/mutex.h> 40 #include <sys/priv.h> 41 #include <sys/proc.h> 42 #include <sys/sched.h> 43 #include <sys/umtxvar.h> 44 45 #ifdef COMPAT_LINUX32 46 #include <machine/../linux32/linux.h> 47 #include <machine/../linux32/linux32_proto.h> 48 #else 49 #include <machine/../linux/linux.h> 50 #include <machine/../linux/linux_proto.h> 51 #endif 52 #include <compat/linux/linux_emul.h> 53 #include <compat/linux/linux_futex.h> 54 #include <compat/linux/linux_misc.h> 55 #include <compat/linux/linux_timer.h> 56 #include <compat/linux/linux_util.h> 57 58 #define FUTEX_SHARED 0x8 /* shared futex */ 59 60 #define GET_SHARED(a) (a->flags & FUTEX_SHARED) ? AUTO_SHARE : THREAD_SHARE 61 62 static int futex_atomic_op(struct thread *, int, uint32_t *); 63 static int handle_futex_death(struct thread *td, struct linux_emuldata *, 64 uint32_t *, unsigned int, bool); 65 static int fetch_robust_entry(struct linux_robust_list **, 66 struct linux_robust_list **, unsigned int *); 67 68 struct linux_futex_args { 69 uint32_t *uaddr; 70 int32_t op; 71 uint32_t flags; 72 bool clockrt; 73 uint32_t val; 74 struct timespec *ts; 75 uint32_t *uaddr2; 76 uint32_t val3; 77 bool val3_compare; 78 struct timespec kts; 79 }; 80 81 static inline int futex_key_get(const void *, int, int, struct umtx_key *); 82 static void linux_umtx_abs_timeout_init(struct umtx_abs_timeout *, 83 struct linux_futex_args *); 84 static int linux_futex(struct thread *, struct linux_futex_args *); 85 static int linux_futex_wait(struct thread *, struct linux_futex_args *); 86 static int linux_futex_wake(struct thread *, struct linux_futex_args *); 87 static int linux_futex_requeue(struct thread *, struct linux_futex_args *); 88 static int linux_futex_wakeop(struct thread *, struct linux_futex_args *); 89 static int linux_futex_lock_pi(struct thread *, bool, struct linux_futex_args *); 90 static int linux_futex_unlock_pi(struct thread *, bool, 91 struct linux_futex_args *); 92 static int futex_wake_pi(struct thread *, uint32_t *, bool); 93 94 static int 95 futex_key_get(const void *uaddr, int type, int share, struct umtx_key *key) 96 { 97 98 /* Check that futex address is a 32bit aligned. */ 99 if (!__is_aligned(uaddr, sizeof(uint32_t))) 100 return (EINVAL); 101 return (umtx_key_get(uaddr, type, share, key)); 102 } 103 104 int 105 futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared) 106 { 107 struct linux_futex_args args; 108 109 bzero(&args, sizeof(args)); 110 args.op = LINUX_FUTEX_WAKE; 111 args.uaddr = uaddr; 112 args.flags = shared == true ? FUTEX_SHARED : 0; 113 args.val = val; 114 args.val3 = FUTEX_BITSET_MATCH_ANY; 115 116 return (linux_futex_wake(td, &args)); 117 } 118 119 static int 120 futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared) 121 { 122 struct linux_futex_args args; 123 124 bzero(&args, sizeof(args)); 125 args.op = LINUX_FUTEX_UNLOCK_PI; 126 args.uaddr = uaddr; 127 args.flags = shared == true ? FUTEX_SHARED : 0; 128 129 return (linux_futex_unlock_pi(td, true, &args)); 130 } 131 132 static int 133 futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr) 134 { 135 int op = (encoded_op >> 28) & 7; 136 int cmp = (encoded_op >> 24) & 15; 137 int oparg = (encoded_op << 8) >> 20; 138 int cmparg = (encoded_op << 20) >> 20; 139 int oldval = 0, ret; 140 141 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 142 oparg = 1 << oparg; 143 144 switch (op) { 145 case FUTEX_OP_SET: 146 ret = futex_xchgl(oparg, uaddr, &oldval); 147 break; 148 case FUTEX_OP_ADD: 149 ret = futex_addl(oparg, uaddr, &oldval); 150 break; 151 case FUTEX_OP_OR: 152 ret = futex_orl(oparg, uaddr, &oldval); 153 break; 154 case FUTEX_OP_ANDN: 155 ret = futex_andl(~oparg, uaddr, &oldval); 156 break; 157 case FUTEX_OP_XOR: 158 ret = futex_xorl(oparg, uaddr, &oldval); 159 break; 160 default: 161 ret = -ENOSYS; 162 break; 163 } 164 165 if (ret) 166 return (ret); 167 168 switch (cmp) { 169 case FUTEX_OP_CMP_EQ: 170 ret = (oldval == cmparg); 171 break; 172 case FUTEX_OP_CMP_NE: 173 ret = (oldval != cmparg); 174 break; 175 case FUTEX_OP_CMP_LT: 176 ret = (oldval < cmparg); 177 break; 178 case FUTEX_OP_CMP_GE: 179 ret = (oldval >= cmparg); 180 break; 181 case FUTEX_OP_CMP_LE: 182 ret = (oldval <= cmparg); 183 break; 184 case FUTEX_OP_CMP_GT: 185 ret = (oldval > cmparg); 186 break; 187 default: 188 ret = -ENOSYS; 189 } 190 191 return (ret); 192 } 193 194 static int 195 linux_futex(struct thread *td, struct linux_futex_args *args) 196 { 197 struct linux_pemuldata *pem; 198 struct proc *p; 199 200 if (args->op & LINUX_FUTEX_PRIVATE_FLAG) { 201 args->flags = 0; 202 args->op &= ~LINUX_FUTEX_PRIVATE_FLAG; 203 } else 204 args->flags = FUTEX_SHARED; 205 206 args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME; 207 args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME; 208 209 if (args->clockrt && 210 args->op != LINUX_FUTEX_WAIT_BITSET && 211 args->op != LINUX_FUTEX_WAIT_REQUEUE_PI && 212 args->op != LINUX_FUTEX_LOCK_PI2) 213 return (ENOSYS); 214 215 switch (args->op) { 216 case LINUX_FUTEX_WAIT: 217 args->val3 = FUTEX_BITSET_MATCH_ANY; 218 /* FALLTHROUGH */ 219 220 case LINUX_FUTEX_WAIT_BITSET: 221 LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x", 222 args->uaddr, args->val, args->val3); 223 224 return (linux_futex_wait(td, args)); 225 226 case LINUX_FUTEX_WAKE: 227 args->val3 = FUTEX_BITSET_MATCH_ANY; 228 /* FALLTHROUGH */ 229 230 case LINUX_FUTEX_WAKE_BITSET: 231 LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x", 232 args->uaddr, args->val, args->val3); 233 234 return (linux_futex_wake(td, args)); 235 236 case LINUX_FUTEX_REQUEUE: 237 /* 238 * Glibc does not use this operation since version 2.3.3, 239 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation. 240 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when 241 * FUTEX_REQUEUE returned EINVAL. 242 */ 243 pem = pem_find(td->td_proc); 244 if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) { 245 linux_msg(td, "unsupported FUTEX_REQUEUE"); 246 pem->flags |= LINUX_XDEPR_REQUEUEOP; 247 } 248 249 /* 250 * The above is true, however musl libc does make use of the 251 * futex requeue operation, allow operation for brands which 252 * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags. 253 */ 254 p = td->td_proc; 255 Elf_Brandinfo *bi = p->p_elf_brandinfo; 256 if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0) 257 return (EINVAL); 258 args->val3_compare = false; 259 /* FALLTHROUGH */ 260 261 case LINUX_FUTEX_CMP_REQUEUE: 262 LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p " 263 "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x", 264 args->uaddr, args->val, args->val3, args->uaddr2, 265 args->ts); 266 267 return (linux_futex_requeue(td, args)); 268 269 case LINUX_FUTEX_WAKE_OP: 270 LINUX_CTR5(sys_futex, "WAKE_OP " 271 "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x", 272 args->uaddr, args->val, args->uaddr2, args->val3, 273 args->ts); 274 275 return (linux_futex_wakeop(td, args)); 276 277 case LINUX_FUTEX_LOCK_PI: 278 args->clockrt = true; 279 /* FALLTHROUGH */ 280 281 case LINUX_FUTEX_LOCK_PI2: 282 LINUX_CTR2(sys_futex, "LOCKPI uaddr %p val 0x%x", 283 args->uaddr, args->val); 284 285 return (linux_futex_lock_pi(td, false, args)); 286 287 case LINUX_FUTEX_UNLOCK_PI: 288 LINUX_CTR1(sys_futex, "UNLOCKPI uaddr %p", 289 args->uaddr); 290 291 return (linux_futex_unlock_pi(td, false, args)); 292 293 case LINUX_FUTEX_TRYLOCK_PI: 294 LINUX_CTR1(sys_futex, "TRYLOCKPI uaddr %p", 295 args->uaddr); 296 297 return (linux_futex_lock_pi(td, true, args)); 298 299 /* 300 * Current implementation of FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI 301 * can't be used anymore to implement conditional variables. 302 * A detailed explanation can be found here: 303 * 304 * https://sourceware.org/bugzilla/show_bug.cgi?id=13165 305 * and here http://austingroupbugs.net/view.php?id=609 306 * 307 * And since commit 308 * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=ed19993b5b0d05d62cc883571519a67dae481a14 309 * glibc does not use them. 310 */ 311 case LINUX_FUTEX_WAIT_REQUEUE_PI: 312 /* not yet implemented */ 313 pem = pem_find(td->td_proc); 314 if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { 315 linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI"); 316 pem->flags |= LINUX_XUNSUP_FUTEXPIOP; 317 } 318 return (ENOSYS); 319 320 case LINUX_FUTEX_CMP_REQUEUE_PI: 321 /* not yet implemented */ 322 pem = pem_find(td->td_proc); 323 if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { 324 linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI"); 325 pem->flags |= LINUX_XUNSUP_FUTEXPIOP; 326 } 327 return (ENOSYS); 328 329 default: 330 linux_msg(td, "unsupported futex op %d", args->op); 331 return (ENOSYS); 332 } 333 } 334 335 /* 336 * pi protocol: 337 * - 0 futex word value means unlocked. 338 * - TID futex word value means locked. 339 * Userspace uses atomic ops to lock/unlock these futexes without entering the 340 * kernel. If the lock-acquire fastpath fails, (transition from 0 to TID fails), 341 * then FUTEX_LOCK_PI is called. 342 * The kernel atomically set FUTEX_WAITERS bit in the futex word value, if no 343 * other waiters exists looks up the thread that owns the futex (it has put its 344 * own TID into the futex value) and made this thread the owner of the internal 345 * pi-aware lock object (mutex). Then the kernel tries to lock the internal lock 346 * object, on which it blocks. Once it returns, it has the mutex acquired, and it 347 * sets the futex value to its own TID and returns (futex value contains 348 * FUTEX_WAITERS|TID). 349 * The unlock fastpath would fail (because the FUTEX_WAITERS bit is set) and 350 * FUTEX_UNLOCK_PI will be called. 351 * If a futex is found to be held at exit time, the kernel sets the OWNER_DIED 352 * bit of the futex word and wakes up the next futex waiter (if any), WAITERS 353 * bit is preserved (if any). 354 * If OWNER_DIED bit is set the kernel sanity checks the futex word value against 355 * the internal futex state and if correct, acquire futex. 356 */ 357 static int 358 linux_futex_lock_pi(struct thread *td, bool try, struct linux_futex_args *args) 359 { 360 struct umtx_abs_timeout timo; 361 struct linux_emuldata *em; 362 struct umtx_pi *pi, *new_pi; 363 struct thread *td1; 364 struct umtx_q *uq; 365 int error, rv; 366 uint32_t owner, old_owner; 367 368 em = em_find(td); 369 uq = td->td_umtxq; 370 error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), 371 &uq->uq_key); 372 if (error != 0) 373 return (error); 374 if (args->ts != NULL) 375 linux_umtx_abs_timeout_init(&timo, args); 376 377 umtxq_lock(&uq->uq_key); 378 pi = umtx_pi_lookup(&uq->uq_key); 379 if (pi == NULL) { 380 new_pi = umtx_pi_alloc(M_NOWAIT); 381 if (new_pi == NULL) { 382 umtxq_unlock(&uq->uq_key); 383 new_pi = umtx_pi_alloc(M_WAITOK); 384 umtxq_lock(&uq->uq_key); 385 pi = umtx_pi_lookup(&uq->uq_key); 386 if (pi != NULL) { 387 umtx_pi_free(new_pi); 388 new_pi = NULL; 389 } 390 } 391 if (new_pi != NULL) { 392 new_pi->pi_key = uq->uq_key; 393 umtx_pi_insert(new_pi); 394 pi = new_pi; 395 } 396 } 397 umtx_pi_ref(pi); 398 umtxq_unlock(&uq->uq_key); 399 for (;;) { 400 /* Try uncontested case first. */ 401 rv = casueword32(args->uaddr, 0, &owner, em->em_tid); 402 /* The acquire succeeded. */ 403 if (rv == 0) { 404 error = 0; 405 break; 406 } 407 if (rv == -1) { 408 error = EFAULT; 409 break; 410 } 411 412 /* 413 * Avoid overwriting a possible error from sleep due 414 * to the pending signal with suspension check result. 415 */ 416 if (error == 0) { 417 error = thread_check_susp(td, true); 418 if (error != 0) 419 break; 420 } 421 422 /* The futex word at *uaddr is already locked by the caller. */ 423 if ((owner & FUTEX_TID_MASK) == em->em_tid) { 424 error = EDEADLK; 425 break; 426 } 427 428 /* 429 * Futex owner died, handle_futex_death() set the OWNER_DIED bit 430 * and clear tid. Try to acquire it. 431 */ 432 if ((owner & FUTEX_TID_MASK) == 0) { 433 old_owner = owner; 434 owner = owner & (FUTEX_WAITERS | FUTEX_OWNER_DIED); 435 owner |= em->em_tid; 436 rv = casueword32(args->uaddr, old_owner, &owner, owner); 437 if (rv == -1) { 438 error = EFAULT; 439 break; 440 } 441 if (rv == 1) { 442 if (error == 0) { 443 error = thread_check_susp(td, true); 444 if (error != 0) 445 break; 446 } 447 448 /* 449 * If this failed the lock could 450 * changed, restart. 451 */ 452 continue; 453 } 454 455 umtxq_lock(&uq->uq_key); 456 umtxq_busy(&uq->uq_key); 457 error = umtx_pi_claim(pi, td); 458 umtxq_unbusy(&uq->uq_key); 459 umtxq_unlock(&uq->uq_key); 460 if (error != 0) { 461 /* 462 * Since we're going to return an 463 * error, restore the futex to its 464 * previous, unowned state to avoid 465 * compounding the problem. 466 */ 467 (void)casuword32(args->uaddr, owner, old_owner); 468 } 469 break; 470 } 471 472 /* 473 * Inconsistent state: OWNER_DIED is set and tid is not 0. 474 * Linux does some checks of futex state, we return EINVAL, 475 * as the user space can take care of this. 476 */ 477 if ((owner & FUTEX_OWNER_DIED) != 0) { 478 error = EINVAL; 479 break; 480 } 481 482 if (try != 0) { 483 error = EBUSY; 484 break; 485 } 486 487 /* 488 * If we caught a signal, we have retried and now 489 * exit immediately. 490 */ 491 if (error != 0) 492 break; 493 494 umtxq_lock(&uq->uq_key); 495 umtxq_busy(&uq->uq_key); 496 umtxq_unlock(&uq->uq_key); 497 498 /* 499 * Set the contested bit so that a release in user space knows 500 * to use the system call for unlock. If this fails either some 501 * one else has acquired the lock or it has been released. 502 */ 503 rv = casueword32(args->uaddr, owner, &owner, 504 owner | FUTEX_WAITERS); 505 if (rv == -1) { 506 umtxq_unbusy_unlocked(&uq->uq_key); 507 error = EFAULT; 508 break; 509 } 510 if (rv == 1) { 511 umtxq_unbusy_unlocked(&uq->uq_key); 512 error = thread_check_susp(td, true); 513 if (error != 0) 514 break; 515 516 /* 517 * The lock changed and we need to retry or we 518 * lost a race to the thread unlocking the umtx. 519 */ 520 continue; 521 } 522 523 /* 524 * Substitute Linux thread id by native thread id to 525 * avoid refactoring code of umtxq_sleep_pi(). 526 */ 527 td1 = linux_tdfind(td, owner & FUTEX_TID_MASK, -1); 528 if (td1 != NULL) { 529 owner = td1->td_tid; 530 PROC_UNLOCK(td1->td_proc); 531 } else { 532 umtxq_unbusy_unlocked(&uq->uq_key); 533 error = EINVAL; 534 break; 535 } 536 537 umtxq_lock(&uq->uq_key); 538 539 /* We set the contested bit, sleep. */ 540 error = umtxq_sleep_pi(uq, pi, owner, "futexp", 541 args->ts == NULL ? NULL : &timo, 542 (args->flags & FUTEX_SHARED) != 0); 543 if (error != 0) 544 continue; 545 546 error = thread_check_susp(td, false); 547 if (error != 0) 548 break; 549 } 550 551 umtxq_lock(&uq->uq_key); 552 umtx_pi_unref(pi); 553 umtxq_unlock(&uq->uq_key); 554 umtx_key_release(&uq->uq_key); 555 return (error); 556 } 557 558 static int 559 linux_futex_unlock_pi(struct thread *td, bool rb, struct linux_futex_args *args) 560 { 561 struct linux_emuldata *em; 562 struct umtx_key key; 563 uint32_t old, owner, new_owner; 564 int count, error; 565 566 em = em_find(td); 567 568 /* 569 * Make sure we own this mtx. 570 */ 571 error = fueword32(args->uaddr, &owner); 572 if (error == -1) 573 return (EFAULT); 574 if (!rb && (owner & FUTEX_TID_MASK) != em->em_tid) 575 return (EPERM); 576 577 error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), &key); 578 if (error != 0) 579 return (error); 580 umtxq_lock(&key); 581 umtxq_busy(&key); 582 error = umtx_pi_drop(td, &key, rb, &count); 583 if (error != 0 || rb) { 584 umtxq_unbusy(&key); 585 umtxq_unlock(&key); 586 umtx_key_release(&key); 587 return (error); 588 } 589 umtxq_unlock(&key); 590 591 /* 592 * When unlocking the futex, it must be marked as unowned if 593 * there is zero or one thread only waiting for it. 594 * Otherwise, it must be marked as contested. 595 */ 596 if (count > 1) 597 new_owner = FUTEX_WAITERS; 598 else 599 new_owner = 0; 600 601 again: 602 error = casueword32(args->uaddr, owner, &old, new_owner); 603 if (error == 1) { 604 error = thread_check_susp(td, false); 605 if (error == 0) 606 goto again; 607 } 608 umtxq_unbusy_unlocked(&key); 609 umtx_key_release(&key); 610 if (error == -1) 611 return (EFAULT); 612 if (error == 0 && old != owner) 613 return (EINVAL); 614 return (error); 615 } 616 617 static int 618 linux_futex_wakeop(struct thread *td, struct linux_futex_args *args) 619 { 620 struct umtx_key key, key2; 621 int nrwake, op_ret, ret; 622 int error, count; 623 624 if (args->uaddr == args->uaddr2) 625 return (EINVAL); 626 627 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); 628 if (error != 0) 629 return (error); 630 error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2); 631 if (error != 0) { 632 umtx_key_release(&key); 633 return (error); 634 } 635 umtxq_lock(&key); 636 umtxq_busy(&key); 637 umtxq_unlock(&key); 638 op_ret = futex_atomic_op(td, args->val3, args->uaddr2); 639 if (op_ret < 0) { 640 if (op_ret == -ENOSYS) 641 error = ENOSYS; 642 else 643 error = EFAULT; 644 } 645 umtxq_lock(&key); 646 umtxq_unbusy(&key); 647 if (error != 0) 648 goto out; 649 ret = umtxq_signal_mask(&key, args->val, args->val3); 650 if (op_ret > 0) { 651 nrwake = (int)(unsigned long)args->ts; 652 umtxq_lock(&key2); 653 count = umtxq_count(&key2); 654 if (count > 0) 655 ret += umtxq_signal_mask(&key2, nrwake, args->val3); 656 else 657 ret += umtxq_signal_mask(&key, nrwake, args->val3); 658 umtxq_unlock(&key2); 659 } 660 td->td_retval[0] = ret; 661 out: 662 umtxq_unlock(&key); 663 umtx_key_release(&key2); 664 umtx_key_release(&key); 665 return (error); 666 } 667 668 static int 669 linux_futex_requeue(struct thread *td, struct linux_futex_args *args) 670 { 671 int nrwake, nrrequeue; 672 struct umtx_key key, key2; 673 int error; 674 uint32_t uval; 675 676 /* 677 * Linux allows this, we would not, it is an incorrect 678 * usage of declared ABI, so return EINVAL. 679 */ 680 if (args->uaddr == args->uaddr2) 681 return (EINVAL); 682 683 nrrequeue = (int)(unsigned long)args->ts; 684 nrwake = args->val; 685 /* 686 * Sanity check to prevent signed integer overflow, 687 * see Linux CVE-2018-6927 688 */ 689 if (nrwake < 0 || nrrequeue < 0) 690 return (EINVAL); 691 692 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); 693 if (error != 0) 694 return (error); 695 error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2); 696 if (error != 0) { 697 umtx_key_release(&key); 698 return (error); 699 } 700 umtxq_lock(&key); 701 umtxq_busy(&key); 702 umtxq_unlock(&key); 703 error = fueword32(args->uaddr, &uval); 704 if (error != 0) 705 error = EFAULT; 706 else if (args->val3_compare == true && uval != args->val3) 707 error = EWOULDBLOCK; 708 umtxq_lock(&key); 709 umtxq_unbusy(&key); 710 if (error == 0) { 711 umtxq_lock(&key2); 712 td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue); 713 umtxq_unlock(&key2); 714 } 715 umtxq_unlock(&key); 716 umtx_key_release(&key2); 717 umtx_key_release(&key); 718 return (error); 719 } 720 721 static int 722 linux_futex_wake(struct thread *td, struct linux_futex_args *args) 723 { 724 struct umtx_key key; 725 int error; 726 727 if (args->val3 == 0) 728 return (EINVAL); 729 730 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); 731 if (error != 0) 732 return (error); 733 umtxq_lock(&key); 734 td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3); 735 umtxq_unlock(&key); 736 umtx_key_release(&key); 737 return (0); 738 } 739 740 static int 741 linux_futex_wait(struct thread *td, struct linux_futex_args *args) 742 { 743 struct umtx_abs_timeout timo; 744 struct umtx_q *uq; 745 uint32_t uval; 746 int error; 747 748 if (args->val3 == 0) 749 error = EINVAL; 750 751 uq = td->td_umtxq; 752 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), 753 &uq->uq_key); 754 if (error != 0) 755 return (error); 756 if (args->ts != NULL) 757 linux_umtx_abs_timeout_init(&timo, args); 758 umtxq_lock(&uq->uq_key); 759 umtxq_busy(&uq->uq_key); 760 uq->uq_bitset = args->val3; 761 umtxq_insert(uq); 762 umtxq_unlock(&uq->uq_key); 763 error = fueword32(args->uaddr, &uval); 764 if (error != 0) 765 error = EFAULT; 766 else if (uval != args->val) 767 error = EWOULDBLOCK; 768 umtxq_lock(&uq->uq_key); 769 umtxq_unbusy(&uq->uq_key); 770 if (error == 0) { 771 error = umtxq_sleep(uq, "futex", 772 args->ts == NULL ? NULL : &timo); 773 if ((uq->uq_flags & UQF_UMTXQ) == 0) 774 error = 0; 775 else 776 umtxq_remove(uq); 777 } else if ((uq->uq_flags & UQF_UMTXQ) != 0) { 778 umtxq_remove(uq); 779 } 780 umtxq_unlock(&uq->uq_key); 781 umtx_key_release(&uq->uq_key); 782 if (error == ERESTART) 783 error = EINTR; 784 return (error); 785 } 786 787 static void 788 linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo, 789 struct linux_futex_args *args) 790 { 791 int clockid, absolute; 792 793 /* 794 * The FUTEX_CLOCK_REALTIME option bit can be employed only with the 795 * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI, FUTEX_LOCK_PI2. 796 * For FUTEX_WAIT, timeout is interpreted as a relative value, for other 797 * futex operations timeout is interpreted as an absolute value. 798 * If FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel measures 799 * the timeout against the CLOCK_REALTIME clock, otherwise the kernel 800 * measures the timeout against the CLOCK_MONOTONIC clock. 801 */ 802 clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC; 803 absolute = args->op == LINUX_FUTEX_WAIT ? false : true; 804 umtx_abs_timeout_init(timo, clockid, absolute, args->ts); 805 } 806 807 int 808 linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args) 809 { 810 struct linux_futex_args fargs = { 811 .uaddr = args->uaddr, 812 .op = args->op, 813 .val = args->val, 814 .ts = NULL, 815 .uaddr2 = args->uaddr2, 816 .val3 = args->val3, 817 .val3_compare = true, 818 }; 819 struct l_timespec lts; 820 int error; 821 822 switch (args->op & LINUX_FUTEX_CMD_MASK) { 823 case LINUX_FUTEX_WAIT: 824 case LINUX_FUTEX_WAIT_BITSET: 825 case LINUX_FUTEX_LOCK_PI: 826 case LINUX_FUTEX_LOCK_PI2: 827 if (args->timeout != NULL) { 828 error = copyin(args->timeout, <s, sizeof(lts)); 829 if (error != 0) 830 return (error); 831 error = linux_to_native_timespec(&fargs.kts, <s); 832 if (error != 0) 833 return (error); 834 fargs.ts = &fargs.kts; 835 } 836 break; 837 default: 838 fargs.ts = PTRIN(args->timeout); 839 } 840 return (linux_futex(td, &fargs)); 841 } 842 843 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) 844 int 845 linux_sys_futex_time64(struct thread *td, 846 struct linux_sys_futex_time64_args *args) 847 { 848 struct linux_futex_args fargs = { 849 .uaddr = args->uaddr, 850 .op = args->op, 851 .val = args->val, 852 .ts = NULL, 853 .uaddr2 = args->uaddr2, 854 .val3 = args->val3, 855 .val3_compare = true, 856 }; 857 struct l_timespec64 lts; 858 int error; 859 860 switch (args->op & LINUX_FUTEX_CMD_MASK) { 861 case LINUX_FUTEX_WAIT: 862 case LINUX_FUTEX_WAIT_BITSET: 863 case LINUX_FUTEX_LOCK_PI: 864 case LINUX_FUTEX_LOCK_PI2: 865 if (args->timeout != NULL) { 866 error = copyin(args->timeout, <s, sizeof(lts)); 867 if (error != 0) 868 return (error); 869 error = linux_to_native_timespec64(&fargs.kts, <s); 870 if (error != 0) 871 return (error); 872 fargs.ts = &fargs.kts; 873 } 874 break; 875 default: 876 fargs.ts = PTRIN(args->timeout); 877 } 878 return (linux_futex(td, &fargs)); 879 } 880 #endif 881 882 int 883 linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args) 884 { 885 struct linux_emuldata *em; 886 887 if (args->len != sizeof(struct linux_robust_list_head)) 888 return (EINVAL); 889 890 em = em_find(td); 891 em->robust_futexes = args->head; 892 893 return (0); 894 } 895 896 int 897 linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args) 898 { 899 struct linux_emuldata *em; 900 struct linux_robust_list_head *head; 901 l_size_t len; 902 struct thread *td2; 903 int error; 904 905 if (!args->pid) { 906 em = em_find(td); 907 KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); 908 head = em->robust_futexes; 909 } else { 910 td2 = linux_tdfind(td, args->pid, -1); 911 if (td2 == NULL) 912 return (ESRCH); 913 if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) { 914 PROC_UNLOCK(td2->td_proc); 915 return (EPERM); 916 } 917 918 em = em_find(td2); 919 KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); 920 /* XXX: ptrace? */ 921 if (priv_check(td, PRIV_CRED_SETUID) || 922 priv_check(td, PRIV_CRED_SETEUID) || 923 p_candebug(td, td2->td_proc)) { 924 PROC_UNLOCK(td2->td_proc); 925 return (EPERM); 926 } 927 head = em->robust_futexes; 928 929 PROC_UNLOCK(td2->td_proc); 930 } 931 932 len = sizeof(struct linux_robust_list_head); 933 error = copyout(&len, args->len, sizeof(l_size_t)); 934 if (error != 0) 935 return (EFAULT); 936 937 return (copyout(&head, args->head, sizeof(head))); 938 } 939 940 static int 941 handle_futex_death(struct thread *td, struct linux_emuldata *em, uint32_t *uaddr, 942 unsigned int pi, bool pending_op) 943 { 944 uint32_t uval, nval, mval; 945 int error; 946 947 retry: 948 error = fueword32(uaddr, &uval); 949 if (error != 0) 950 return (EFAULT); 951 952 /* 953 * Special case for regular (non PI) futexes. The unlock path in 954 * user space has two race scenarios: 955 * 956 * 1. The unlock path releases the user space futex value and 957 * before it can execute the futex() syscall to wake up 958 * waiters it is killed. 959 * 960 * 2. A woken up waiter is killed before it can acquire the 961 * futex in user space. 962 * 963 * In both cases the TID validation below prevents a wakeup of 964 * potential waiters which can cause these waiters to block 965 * forever. 966 * 967 * In both cases it is safe to attempt waking up a potential 968 * waiter without touching the user space futex value and trying 969 * to set the OWNER_DIED bit. 970 */ 971 if (pending_op && !pi && !uval) { 972 (void)futex_wake(td, uaddr, 1, true); 973 return (0); 974 } 975 976 if ((uval & FUTEX_TID_MASK) == em->em_tid) { 977 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 978 error = casueword32(uaddr, uval, &nval, mval); 979 if (error == -1) 980 return (EFAULT); 981 if (error == 1) { 982 error = thread_check_susp(td, false); 983 if (error != 0) 984 return (error); 985 goto retry; 986 } 987 988 if (!pi && (uval & FUTEX_WAITERS)) { 989 error = futex_wake(td, uaddr, 1, true); 990 if (error != 0) 991 return (error); 992 } else if (pi && (uval & FUTEX_WAITERS)) { 993 error = futex_wake_pi(td, uaddr, true); 994 if (error != 0) 995 return (error); 996 } 997 } 998 999 return (0); 1000 } 1001 1002 static int 1003 fetch_robust_entry(struct linux_robust_list **entry, 1004 struct linux_robust_list **head, unsigned int *pi) 1005 { 1006 l_ulong uentry; 1007 int error; 1008 1009 error = copyin((const void *)head, &uentry, sizeof(uentry)); 1010 if (error != 0) 1011 return (EFAULT); 1012 1013 *entry = (void *)(uentry & ~1UL); 1014 *pi = uentry & 1; 1015 1016 return (0); 1017 } 1018 1019 #define LINUX_HANDLE_DEATH_PENDING true 1020 #define LINUX_HANDLE_DEATH_LIST false 1021 1022 /* This walks the list of robust futexes releasing them. */ 1023 void 1024 release_futexes(struct thread *td, struct linux_emuldata *em) 1025 { 1026 struct linux_robust_list_head *head; 1027 struct linux_robust_list *entry, *next_entry, *pending; 1028 unsigned int limit = 2048, pi, next_pi, pip; 1029 uint32_t *uaddr; 1030 l_long futex_offset; 1031 int error; 1032 1033 head = em->robust_futexes; 1034 if (head == NULL) 1035 return; 1036 1037 if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi)) 1038 return; 1039 1040 error = copyin(&head->futex_offset, &futex_offset, 1041 sizeof(futex_offset)); 1042 if (error != 0) 1043 return; 1044 1045 if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip)) 1046 return; 1047 1048 while (entry != &head->list) { 1049 error = fetch_robust_entry(&next_entry, PTRIN(&entry->next), 1050 &next_pi); 1051 1052 /* 1053 * A pending lock might already be on the list, so 1054 * don't process it twice. 1055 */ 1056 if (entry != pending) { 1057 uaddr = (uint32_t *)((caddr_t)entry + futex_offset); 1058 if (handle_futex_death(td, em, uaddr, pi, 1059 LINUX_HANDLE_DEATH_LIST)) 1060 return; 1061 } 1062 if (error != 0) 1063 return; 1064 1065 entry = next_entry; 1066 pi = next_pi; 1067 1068 if (!--limit) 1069 break; 1070 1071 sched_relinquish(curthread); 1072 } 1073 1074 if (pending) { 1075 uaddr = (uint32_t *)((caddr_t)pending + futex_offset); 1076 (void)handle_futex_death(td, em, uaddr, pip, 1077 LINUX_HANDLE_DEATH_PENDING); 1078 } 1079 } 1080