1 /* $NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */ 2 3 /*- 4 * SPDX-License-Identifier: BSD-4-Clause 5 * 6 * Copyright (c) 2005 Emmanuel Dreyfus 7 * All rights reserved. 8 * Copyright (c) 2009-2016 Dmitry Chagin <dchagin@FreeBSD.org> 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by Emmanuel Dreyfus 21 * 4. The name of the author may not be used to endorse or promote 22 * products derived from this software without specific prior written 23 * permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS'' 26 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 27 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS 29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 35 * POSSIBILITY OF SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#if 0
__KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $");
#endif

#include "opt_compat.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/imgact.h>
#include <sys/imgact_elf.h>
#include <sys/ktr.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/umtxvar.h>

#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_emul.h>
#include <compat/linux/linux_futex.h>
#include <compat/linux/linux_misc.h>
#include <compat/linux/linux_timer.h>
#include <compat/linux/linux_util.h>

#define	FUTEX_SHARED	0x8	/* shared futex */

/* Map our FUTEX_SHARED flag onto the umtx key sharing mode. */
#define	GET_SHARED(a)	(a->flags & FUTEX_SHARED) ? AUTO_SHARE : THREAD_SHARE

static int futex_atomic_op(struct thread *, int, uint32_t *);
static int handle_futex_death(struct thread *td, struct linux_emuldata *,
    uint32_t *, unsigned int, bool);
static int fetch_robust_entry(struct linux_robust_list **,
    struct linux_robust_list **, unsigned int *);

/*
 * Decoded arguments of a futex(2) call, shared by all operation
 * handlers below.  For the requeue/wakeop operations the Linux ABI
 * reuses the timeout slot as an integer, so ts may carry a count
 * rather than a pointer (see linux_sys_futex()).
 */
struct linux_futex_args {
	uint32_t	*uaddr;		/* primary futex word (user space) */
	int32_t		op;		/* operation, option bits stripped off */
	uint32_t	flags;		/* FUTEX_SHARED or 0 */
	bool		clockrt;	/* FUTEX_CLOCK_REALTIME was set */
	uint32_t	val;		/* operation-specific value */
	struct timespec	*ts;		/* timeout, or reused as an int count */
	uint32_t	*uaddr2;	/* secondary futex word (user space) */
	uint32_t	val3;		/* bitset / expected value / encoded op */
	bool		val3_compare;	/* compare *uaddr against val3 on requeue */
	struct timespec	kts;		/* kernel copy of the user timeout */
};

static inline int futex_key_get(const void *, int, int, struct umtx_key *);
static void linux_umtx_abs_timeout_init(struct umtx_abs_timeout *,
    struct linux_futex_args *);
static int linux_futex(struct thread *, struct linux_futex_args *);
static int linux_futex_wait(struct thread *, struct linux_futex_args *);
static int linux_futex_wake(struct thread *, struct linux_futex_args *);
static int linux_futex_requeue(struct thread *, struct linux_futex_args *);
static int linux_futex_wakeop(struct thread *, struct linux_futex_args *);
static int linux_futex_lock_pi(struct thread *, bool, struct linux_futex_args *);
static int linux_futex_unlock_pi(struct thread *, bool,
    struct linux_futex_args *);
static int futex_wake_pi(struct thread *, uint32_t *, bool);

/*
 * Resolve a futex address into an umtx key, rejecting addresses that
 * are not 32-bit aligned (EINVAL), as Linux does.
 */
static int
futex_key_get(const void *uaddr, int type, int share, struct umtx_key *key)
{

	/* Check that futex address is a 32bit aligned. */
	if (!__is_aligned(uaddr, sizeof(uint32_t)))
		return (EINVAL);
	return (umtx_key_get(uaddr, type, share, key));
}

/*
 * Wake up to val waiters on a regular (non-PI) futex; exported helper
 * used by handle_futex_death() for robust-list cleanup.
 */
int
futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared)
{
	struct linux_futex_args args;

	bzero(&args, sizeof(args));
	args.op = LINUX_FUTEX_WAKE;
	args.uaddr = uaddr;
	args.flags = shared == true ? FUTEX_SHARED : 0;
	args.val = val;
	args.val3 = FUTEX_BITSET_MATCH_ANY;

	return (linux_futex_wake(td, &args));
}

/*
 * Release a PI futex on behalf of a dead owner (robust-list cleanup);
 * calls linux_futex_unlock_pi() with rb (owner-died) set to true.
 */
static int
futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared)
{
	struct linux_futex_args args;

	bzero(&args, sizeof(args));
	args.op = LINUX_FUTEX_UNLOCK_PI;
	args.uaddr = uaddr;
	args.flags = shared == true ? FUTEX_SHARED : 0;

	return (linux_futex_unlock_pi(td, true, &args));
}

/*
 * Execute the FUTEX_WAKE_OP encoded operation on *uaddr: apply the
 * arithmetic op atomically (returning the old value in oldval), then
 * evaluate the encoded comparison against cmparg.  Returns the boolean
 * comparison result, or a negative errno (-ENOSYS / fault) on failure.
 */
static int
futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr)
{
	int op = (encoded_op >> 28) & 7;
	int cmp = (encoded_op >> 24) & 15;
	int oparg = (encoded_op << 8) >> 20;
	int cmparg = (encoded_op << 20) >> 20;
	int oldval = 0, ret;

	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
		oparg = 1 << oparg;

	switch (op) {
	case FUTEX_OP_SET:
		ret = futex_xchgl(oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_ADD:
		ret = futex_addl(oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_OR:
		ret = futex_orl(oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_ANDN:
		ret = futex_andl(~oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_XOR:
		ret = futex_xorl(oparg, uaddr, &oldval);
		break;
	default:
		ret = -ENOSYS;
		break;
	}

	if (ret)
		return (ret);

	switch (cmp) {
	case FUTEX_OP_CMP_EQ:
		ret = (oldval == cmparg);
		break;
	case FUTEX_OP_CMP_NE:
		ret = (oldval != cmparg);
		break;
	case FUTEX_OP_CMP_LT:
		ret = (oldval < cmparg);
		break;
	case FUTEX_OP_CMP_GE:
		ret = (oldval >= cmparg);
		break;
	case FUTEX_OP_CMP_LE:
		ret = (oldval <= cmparg);
		break;
	case FUTEX_OP_CMP_GT:
		ret = (oldval > cmparg);
		break;
	default:
		ret = -ENOSYS;
	}

	return (ret);
}

/*
 * Central futex dispatcher: strip the PRIVATE and CLOCK_REALTIME option
 * bits off args->op, validate the CLOCK_REALTIME combination, and hand
 * off to the per-operation handler.  Unsupported operations return
 * ENOSYS after a one-time diagnostic message per process.
 */
static int
linux_futex(struct thread *td, struct linux_futex_args *args)
{
	struct linux_pemuldata *pem;
	struct proc *p;

	if (args->op & LINUX_FUTEX_PRIVATE_FLAG) {
		args->flags = 0;
		args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
	} else
		args->flags = FUTEX_SHARED;

	args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
	args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;

	/*
	 * FUTEX_CLOCK_REALTIME is only valid for these three operations;
	 * Linux rejects other combinations with ENOSYS.
	 */
	if (args->clockrt &&
	    args->op != LINUX_FUTEX_WAIT_BITSET &&
	    args->op != LINUX_FUTEX_WAIT_REQUEUE_PI &&
	    args->op != LINUX_FUTEX_LOCK_PI2)
		return (ENOSYS);

	switch (args->op) {
	case LINUX_FUTEX_WAIT:
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAIT_BITSET:
		LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wait(td, args));

	case LINUX_FUTEX_WAKE:
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAKE_BITSET:
		LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wake(td, args));

	case LINUX_FUTEX_REQUEUE:
		/*
		 * Glibc does not use this operation since version 2.3.3,
		 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation.
		 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when
		 * FUTEX_REQUEUE returned EINVAL.
		 */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
			linux_msg(td, "unsupported FUTEX_REQUEUE");
			pem->flags |= LINUX_XDEPR_REQUEUEOP;
		}

		/*
		 * The above is true, however musl libc does make use of the
		 * futex requeue operation, allow operation for brands which
		 * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags.
		 */
		p = td->td_proc;
		Elf_Brandinfo *bi = p->p_elf_brandinfo;
		if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0)
			return (EINVAL);
		args->val3_compare = false;
		/* FALLTHROUGH */

	case LINUX_FUTEX_CMP_REQUEUE:
		LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
		    "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x",
		    args->uaddr, args->val, args->val3, args->uaddr2,
		    args->ts);

		return (linux_futex_requeue(td, args));

	case LINUX_FUTEX_WAKE_OP:
		LINUX_CTR5(sys_futex, "WAKE_OP "
		    "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x",
		    args->uaddr, args->val, args->uaddr2, args->val3,
		    args->ts);

		return (linux_futex_wakeop(td, args));

	case LINUX_FUTEX_LOCK_PI:
		args->clockrt = true;
		/* FALLTHROUGH */

	case LINUX_FUTEX_LOCK_PI2:
		LINUX_CTR2(sys_futex, "LOCKPI uaddr %p val 0x%x",
		    args->uaddr, args->val);

		return (linux_futex_lock_pi(td, false, args));

	case LINUX_FUTEX_UNLOCK_PI:
		LINUX_CTR1(sys_futex, "UNLOCKPI uaddr %p",
		    args->uaddr);

		return (linux_futex_unlock_pi(td, false, args));

	case LINUX_FUTEX_TRYLOCK_PI:
		LINUX_CTR1(sys_futex, "TRYLOCKPI uaddr %p",
		    args->uaddr);

		return (linux_futex_lock_pi(td, true, args));

	/*
	 * Current implementation of FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI
	 * can't be used anymore to implement conditional variables.
	 * A detailed explanation can be found here:
	 *
	 * https://sourceware.org/bugzilla/show_bug.cgi?id=13165
	 * and here http://austingroupbugs.net/view.php?id=609
	 *
	 * And since commit
	 * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=ed19993b5b0d05d62cc883571519a67dae481a14
	 * glibc does not uses it.
	 */
	case LINUX_FUTEX_WAIT_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	case LINUX_FUTEX_CMP_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	default:
		linux_msg(td, "unsupported futex op %d", args->op);
		return (ENOSYS);
	}
}

/*
 * pi protocol:
 * - 0 futex word value means unlocked.
 * - TID futex word value means locked.
 * Userspace uses atomic ops to lock/unlock these futexes without entering the
 * kernel. If the lock-acquire fastpath fails, (transition from 0 to TID fails),
 * then FUTEX_LOCK_PI is called.
 * The kernel atomically set FUTEX_WAITERS bit in the futex word value, if no
 * other waiters exists looks up the thread that owns the futex (it has put its
 * own TID into the futex value) and made this thread the owner of the internal
 * pi-aware lock object (mutex). Then the kernel tries to lock the internal lock
 * object, on which it blocks. Once it returns, it has the mutex acquired, and it
 * sets the futex value to its own TID and returns (futex value contains
 * FUTEX_WAITERS|TID).
 * The unlock fastpath would fail (because the FUTEX_WAITERS bit is set) and
 * FUTEX_UNLOCK_PI will be called.
 * If a futex is found to be held at exit time, the kernel sets the OWNER_DIED
 * bit of the futex word and wakes up the next futex waiter (if any), WAITERS
 * bit is preserved (if any).
 * If OWNER_DIED bit is set the kernel sanity checks the futex word value against
 * the internal futex state and if correct, acquire futex.
368 */ 369 static int 370 linux_futex_lock_pi(struct thread *td, bool try, struct linux_futex_args *args) 371 { 372 struct umtx_abs_timeout timo; 373 struct linux_emuldata *em; 374 struct umtx_pi *pi, *new_pi; 375 struct thread *td1; 376 struct umtx_q *uq; 377 int error, rv; 378 uint32_t owner, old_owner; 379 380 em = em_find(td); 381 uq = td->td_umtxq; 382 error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), 383 &uq->uq_key); 384 if (error != 0) 385 return (error); 386 if (args->ts != NULL) 387 linux_umtx_abs_timeout_init(&timo, args); 388 389 umtxq_lock(&uq->uq_key); 390 pi = umtx_pi_lookup(&uq->uq_key); 391 if (pi == NULL) { 392 new_pi = umtx_pi_alloc(M_NOWAIT); 393 if (new_pi == NULL) { 394 umtxq_unlock(&uq->uq_key); 395 new_pi = umtx_pi_alloc(M_WAITOK); 396 umtxq_lock(&uq->uq_key); 397 pi = umtx_pi_lookup(&uq->uq_key); 398 if (pi != NULL) { 399 umtx_pi_free(new_pi); 400 new_pi = NULL; 401 } 402 } 403 if (new_pi != NULL) { 404 new_pi->pi_key = uq->uq_key; 405 umtx_pi_insert(new_pi); 406 pi = new_pi; 407 } 408 } 409 umtx_pi_ref(pi); 410 umtxq_unlock(&uq->uq_key); 411 for (;;) { 412 /* Try uncontested case first. */ 413 rv = casueword32(args->uaddr, 0, &owner, em->em_tid); 414 /* The acquire succeeded. */ 415 if (rv == 0) { 416 error = 0; 417 break; 418 } 419 if (rv == -1) { 420 error = EFAULT; 421 break; 422 } 423 424 /* 425 * Avoid overwriting a possible error from sleep due 426 * to the pending signal with suspension check result. 427 */ 428 if (error == 0) { 429 error = thread_check_susp(td, true); 430 if (error != 0) 431 break; 432 } 433 434 /* The futex word at *uaddr is already locked by the caller. */ 435 if ((owner & FUTEX_TID_MASK) == em->em_tid) { 436 error = EDEADLK; 437 break; 438 } 439 440 /* 441 * Futex owner died, handle_futex_death() set the OWNER_DIED bit 442 * and clear tid. Try to acquire it. 
443 */ 444 if ((owner & FUTEX_TID_MASK) == 0) { 445 old_owner = owner; 446 owner = owner & (FUTEX_WAITERS | FUTEX_OWNER_DIED); 447 owner |= em->em_tid; 448 rv = casueword32(args->uaddr, old_owner, &owner, owner); 449 if (rv == -1) { 450 error = EFAULT; 451 break; 452 } 453 if (rv == 1) { 454 if (error == 0) { 455 error = thread_check_susp(td, true); 456 if (error != 0) 457 break; 458 } 459 460 /* 461 * If this failed the lock could 462 * changed, restart. 463 */ 464 continue; 465 } 466 467 umtxq_lock(&uq->uq_key); 468 umtxq_busy(&uq->uq_key); 469 error = umtx_pi_claim(pi, td); 470 umtxq_unbusy(&uq->uq_key); 471 umtxq_unlock(&uq->uq_key); 472 if (error != 0) { 473 /* 474 * Since we're going to return an 475 * error, restore the futex to its 476 * previous, unowned state to avoid 477 * compounding the problem. 478 */ 479 (void)casuword32(args->uaddr, owner, old_owner); 480 } 481 break; 482 } 483 484 /* 485 * Inconsistent state: OWNER_DIED is set and tid is not 0. 486 * Linux does some checks of futex state, we return EINVAL, 487 * as the user space can take care of this. 488 */ 489 if ((owner & FUTEX_OWNER_DIED) != 0) { 490 error = EINVAL; 491 break; 492 } 493 494 if (try != 0) { 495 error = EBUSY; 496 break; 497 } 498 499 /* 500 * If we caught a signal, we have retried and now 501 * exit immediately. 502 */ 503 if (error != 0) 504 break; 505 506 umtxq_lock(&uq->uq_key); 507 umtxq_busy(&uq->uq_key); 508 umtxq_unlock(&uq->uq_key); 509 510 /* 511 * Set the contested bit so that a release in user space knows 512 * to use the system call for unlock. If this fails either some 513 * one else has acquired the lock or it has been released. 
514 */ 515 rv = casueword32(args->uaddr, owner, &owner, 516 owner | FUTEX_WAITERS); 517 if (rv == -1) { 518 umtxq_unbusy_unlocked(&uq->uq_key); 519 error = EFAULT; 520 break; 521 } 522 if (rv == 1) { 523 umtxq_unbusy_unlocked(&uq->uq_key); 524 error = thread_check_susp(td, true); 525 if (error != 0) 526 break; 527 528 /* 529 * The lock changed and we need to retry or we 530 * lost a race to the thread unlocking the umtx. 531 */ 532 continue; 533 } 534 535 /* 536 * Substitute Linux thread id by native thread id to 537 * avoid refactoring code of umtxq_sleep_pi(). 538 */ 539 td1 = linux_tdfind(td, owner & FUTEX_TID_MASK, -1); 540 if (td1 != NULL) { 541 owner = td1->td_tid; 542 PROC_UNLOCK(td1->td_proc); 543 } else { 544 umtxq_unbusy_unlocked(&uq->uq_key); 545 error = EINVAL; 546 break; 547 } 548 549 umtxq_lock(&uq->uq_key); 550 551 /* We set the contested bit, sleep. */ 552 error = umtxq_sleep_pi(uq, pi, owner, "futexp", 553 args->ts == NULL ? NULL : &timo, 554 (args->flags & FUTEX_SHARED) != 0); 555 if (error != 0) 556 continue; 557 558 error = thread_check_susp(td, false); 559 if (error != 0) 560 break; 561 } 562 563 umtxq_lock(&uq->uq_key); 564 umtx_pi_unref(pi); 565 umtxq_unlock(&uq->uq_key); 566 umtx_key_release(&uq->uq_key); 567 return (error); 568 } 569 570 static int 571 linux_futex_unlock_pi(struct thread *td, bool rb, struct linux_futex_args *args) 572 { 573 struct linux_emuldata *em; 574 struct umtx_key key; 575 uint32_t old, owner, new_owner; 576 int count, error; 577 578 em = em_find(td); 579 580 /* 581 * Make sure we own this mtx. 
582 */ 583 error = fueword32(args->uaddr, &owner); 584 if (error == -1) 585 return (EFAULT); 586 if (!rb && (owner & FUTEX_TID_MASK) != em->em_tid) 587 return (EPERM); 588 589 error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), &key); 590 if (error != 0) 591 return (error); 592 umtxq_lock(&key); 593 umtxq_busy(&key); 594 error = umtx_pi_drop(td, &key, rb, &count); 595 if (error != 0 || rb) { 596 umtxq_unbusy(&key); 597 umtxq_unlock(&key); 598 umtx_key_release(&key); 599 return (error); 600 } 601 umtxq_unlock(&key); 602 603 /* 604 * When unlocking the futex, it must be marked as unowned if 605 * there is zero or one thread only waiting for it. 606 * Otherwise, it must be marked as contested. 607 */ 608 if (count > 1) 609 new_owner = FUTEX_WAITERS; 610 else 611 new_owner = 0; 612 613 again: 614 error = casueword32(args->uaddr, owner, &old, new_owner); 615 if (error == 1) { 616 error = thread_check_susp(td, false); 617 if (error == 0) 618 goto again; 619 } 620 umtxq_unbusy_unlocked(&key); 621 umtx_key_release(&key); 622 if (error == -1) 623 return (EFAULT); 624 if (error == 0 && old != owner) 625 return (EINVAL); 626 return (error); 627 } 628 629 static int 630 linux_futex_wakeop(struct thread *td, struct linux_futex_args *args) 631 { 632 struct umtx_key key, key2; 633 int nrwake, op_ret, ret; 634 int error, count; 635 636 if (args->uaddr == args->uaddr2) 637 return (EINVAL); 638 639 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); 640 if (error != 0) 641 return (error); 642 error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2); 643 if (error != 0) { 644 umtx_key_release(&key); 645 return (error); 646 } 647 umtxq_lock(&key); 648 umtxq_busy(&key); 649 umtxq_unlock(&key); 650 op_ret = futex_atomic_op(td, args->val3, args->uaddr2); 651 if (op_ret < 0) { 652 if (op_ret == -ENOSYS) 653 error = ENOSYS; 654 else 655 error = EFAULT; 656 } 657 umtxq_lock(&key); 658 umtxq_unbusy(&key); 659 if (error != 0) 660 goto out; 
661 ret = umtxq_signal_mask(&key, args->val, args->val3); 662 if (op_ret > 0) { 663 nrwake = (int)(unsigned long)args->ts; 664 umtxq_lock(&key2); 665 count = umtxq_count(&key2); 666 if (count > 0) 667 ret += umtxq_signal_mask(&key2, nrwake, args->val3); 668 else 669 ret += umtxq_signal_mask(&key, nrwake, args->val3); 670 umtxq_unlock(&key2); 671 } 672 td->td_retval[0] = ret; 673 out: 674 umtxq_unlock(&key); 675 umtx_key_release(&key2); 676 umtx_key_release(&key); 677 return (error); 678 } 679 680 static int 681 linux_futex_requeue(struct thread *td, struct linux_futex_args *args) 682 { 683 int nrwake, nrrequeue; 684 struct umtx_key key, key2; 685 int error; 686 uint32_t uval; 687 688 /* 689 * Linux allows this, we would not, it is an incorrect 690 * usage of declared ABI, so return EINVAL. 691 */ 692 if (args->uaddr == args->uaddr2) 693 return (EINVAL); 694 695 nrrequeue = (int)(unsigned long)args->ts; 696 nrwake = args->val; 697 /* 698 * Sanity check to prevent signed integer overflow, 699 * see Linux CVE-2018-6927 700 */ 701 if (nrwake < 0 || nrrequeue < 0) 702 return (EINVAL); 703 704 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); 705 if (error != 0) 706 return (error); 707 error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2); 708 if (error != 0) { 709 umtx_key_release(&key); 710 return (error); 711 } 712 umtxq_lock(&key); 713 umtxq_busy(&key); 714 umtxq_unlock(&key); 715 error = fueword32(args->uaddr, &uval); 716 if (error != 0) 717 error = EFAULT; 718 else if (args->val3_compare == true && uval != args->val3) 719 error = EWOULDBLOCK; 720 umtxq_lock(&key); 721 umtxq_unbusy(&key); 722 if (error == 0) { 723 umtxq_lock(&key2); 724 td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue); 725 umtxq_unlock(&key2); 726 } 727 umtxq_unlock(&key); 728 umtx_key_release(&key2); 729 umtx_key_release(&key); 730 return (error); 731 } 732 733 static int 734 linux_futex_wake(struct thread *td, struct linux_futex_args *args) 
735 { 736 struct umtx_key key; 737 int error; 738 739 if (args->val3 == 0) 740 return (EINVAL); 741 742 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); 743 if (error != 0) 744 return (error); 745 umtxq_lock(&key); 746 td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3); 747 umtxq_unlock(&key); 748 umtx_key_release(&key); 749 return (0); 750 } 751 752 static int 753 linux_futex_wait(struct thread *td, struct linux_futex_args *args) 754 { 755 struct umtx_abs_timeout timo; 756 struct umtx_q *uq; 757 uint32_t uval; 758 int error; 759 760 if (args->val3 == 0) 761 error = EINVAL; 762 763 uq = td->td_umtxq; 764 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), 765 &uq->uq_key); 766 if (error != 0) 767 return (error); 768 if (args->ts != NULL) 769 linux_umtx_abs_timeout_init(&timo, args); 770 umtxq_lock(&uq->uq_key); 771 umtxq_busy(&uq->uq_key); 772 uq->uq_bitset = args->val3; 773 umtxq_insert(uq); 774 umtxq_unlock(&uq->uq_key); 775 error = fueword32(args->uaddr, &uval); 776 if (error != 0) 777 error = EFAULT; 778 else if (uval != args->val) 779 error = EWOULDBLOCK; 780 umtxq_lock(&uq->uq_key); 781 umtxq_unbusy(&uq->uq_key); 782 if (error == 0) { 783 error = umtxq_sleep(uq, "futex", 784 args->ts == NULL ? NULL : &timo); 785 if ((uq->uq_flags & UQF_UMTXQ) == 0) 786 error = 0; 787 else 788 umtxq_remove(uq); 789 } else if ((uq->uq_flags & UQF_UMTXQ) != 0) { 790 umtxq_remove(uq); 791 } 792 umtxq_unlock(&uq->uq_key); 793 umtx_key_release(&uq->uq_key); 794 if (error == ERESTART) 795 error = EINTR; 796 return (error); 797 } 798 799 static void 800 linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo, 801 struct linux_futex_args *args) 802 { 803 int clockid, absolute; 804 805 /* 806 * The FUTEX_CLOCK_REALTIME option bit can be employed only with the 807 * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI, FUTEX_LOCK_PI2. 
808 * For FUTEX_WAIT, timeout is interpreted as a relative value, for other 809 * futex operations timeout is interpreted as an absolute value. 810 * If FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel measures 811 * the timeout against the CLOCK_REALTIME clock, otherwise the kernel 812 * measures the timeout against the CLOCK_MONOTONIC clock. 813 */ 814 clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC; 815 absolute = args->op == LINUX_FUTEX_WAIT ? false : true; 816 umtx_abs_timeout_init(timo, clockid, absolute, args->ts); 817 } 818 819 int 820 linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args) 821 { 822 struct linux_futex_args fargs = { 823 .uaddr = args->uaddr, 824 .op = args->op, 825 .val = args->val, 826 .ts = NULL, 827 .uaddr2 = args->uaddr2, 828 .val3 = args->val3, 829 .val3_compare = true, 830 }; 831 struct l_timespec lts; 832 int error; 833 834 switch (args->op & LINUX_FUTEX_CMD_MASK) { 835 case LINUX_FUTEX_WAIT: 836 case LINUX_FUTEX_WAIT_BITSET: 837 case LINUX_FUTEX_LOCK_PI: 838 case LINUX_FUTEX_LOCK_PI2: 839 if (args->timeout != NULL) { 840 error = copyin(args->timeout, <s, sizeof(lts)); 841 if (error != 0) 842 return (error); 843 error = linux_to_native_timespec(&fargs.kts, <s); 844 if (error != 0) 845 return (error); 846 fargs.ts = &fargs.kts; 847 } 848 break; 849 default: 850 fargs.ts = PTRIN(args->timeout); 851 } 852 return (linux_futex(td, &fargs)); 853 } 854 855 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) 856 int 857 linux_sys_futex_time64(struct thread *td, 858 struct linux_sys_futex_time64_args *args) 859 { 860 struct linux_futex_args fargs = { 861 .uaddr = args->uaddr, 862 .op = args->op, 863 .val = args->val, 864 .ts = NULL, 865 .uaddr2 = args->uaddr2, 866 .val3 = args->val3, 867 .val3_compare = true, 868 }; 869 struct l_timespec64 lts; 870 int error; 871 872 switch (args->op & LINUX_FUTEX_CMD_MASK) { 873 case LINUX_FUTEX_WAIT: 874 case LINUX_FUTEX_WAIT_BITSET: 875 case 
LINUX_FUTEX_LOCK_PI: 876 case LINUX_FUTEX_LOCK_PI2: 877 if (args->timeout != NULL) { 878 error = copyin(args->timeout, <s, sizeof(lts)); 879 if (error != 0) 880 return (error); 881 error = linux_to_native_timespec64(&fargs.kts, <s); 882 if (error != 0) 883 return (error); 884 fargs.ts = &fargs.kts; 885 } 886 break; 887 default: 888 fargs.ts = PTRIN(args->timeout); 889 } 890 return (linux_futex(td, &fargs)); 891 } 892 #endif 893 894 int 895 linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args) 896 { 897 struct linux_emuldata *em; 898 899 if (args->len != sizeof(struct linux_robust_list_head)) 900 return (EINVAL); 901 902 em = em_find(td); 903 em->robust_futexes = args->head; 904 905 return (0); 906 } 907 908 int 909 linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args) 910 { 911 struct linux_emuldata *em; 912 struct linux_robust_list_head *head; 913 l_size_t len; 914 struct thread *td2; 915 int error; 916 917 if (!args->pid) { 918 em = em_find(td); 919 KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); 920 head = em->robust_futexes; 921 } else { 922 td2 = linux_tdfind(td, args->pid, -1); 923 if (td2 == NULL) 924 return (ESRCH); 925 if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) { 926 PROC_UNLOCK(td2->td_proc); 927 return (EPERM); 928 } 929 930 em = em_find(td2); 931 KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); 932 /* XXX: ptrace? 
*/ 933 if (priv_check(td, PRIV_CRED_SETUID) || 934 priv_check(td, PRIV_CRED_SETEUID) || 935 p_candebug(td, td2->td_proc)) { 936 PROC_UNLOCK(td2->td_proc); 937 return (EPERM); 938 } 939 head = em->robust_futexes; 940 941 PROC_UNLOCK(td2->td_proc); 942 } 943 944 len = sizeof(struct linux_robust_list_head); 945 error = copyout(&len, args->len, sizeof(l_size_t)); 946 if (error != 0) 947 return (EFAULT); 948 949 return (copyout(&head, args->head, sizeof(head))); 950 } 951 952 static int 953 handle_futex_death(struct thread *td, struct linux_emuldata *em, uint32_t *uaddr, 954 unsigned int pi, bool pending_op) 955 { 956 uint32_t uval, nval, mval; 957 int error; 958 959 retry: 960 error = fueword32(uaddr, &uval); 961 if (error != 0) 962 return (EFAULT); 963 964 /* 965 * Special case for regular (non PI) futexes. The unlock path in 966 * user space has two race scenarios: 967 * 968 * 1. The unlock path releases the user space futex value and 969 * before it can execute the futex() syscall to wake up 970 * waiters it is killed. 971 * 972 * 2. A woken up waiter is killed before it can acquire the 973 * futex in user space. 974 * 975 * In both cases the TID validation below prevents a wakeup of 976 * potential waiters which can cause these waiters to block 977 * forever. 978 * 979 * In both cases it is safe to attempt waking up a potential 980 * waiter without touching the user space futex value and trying 981 * to set the OWNER_DIED bit. 
982 */ 983 if (pending_op && !pi && !uval) { 984 (void)futex_wake(td, uaddr, 1, true); 985 return (0); 986 } 987 988 if ((uval & FUTEX_TID_MASK) == em->em_tid) { 989 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 990 error = casueword32(uaddr, uval, &nval, mval); 991 if (error == -1) 992 return (EFAULT); 993 if (error == 1) { 994 error = thread_check_susp(td, false); 995 if (error != 0) 996 return (error); 997 goto retry; 998 } 999 1000 if (!pi && (uval & FUTEX_WAITERS)) { 1001 error = futex_wake(td, uaddr, 1, true); 1002 if (error != 0) 1003 return (error); 1004 } else if (pi && (uval & FUTEX_WAITERS)) { 1005 error = futex_wake_pi(td, uaddr, true); 1006 if (error != 0) 1007 return (error); 1008 } 1009 } 1010 1011 return (0); 1012 } 1013 1014 static int 1015 fetch_robust_entry(struct linux_robust_list **entry, 1016 struct linux_robust_list **head, unsigned int *pi) 1017 { 1018 l_ulong uentry; 1019 int error; 1020 1021 error = copyin((const void *)head, &uentry, sizeof(uentry)); 1022 if (error != 0) 1023 return (EFAULT); 1024 1025 *entry = (void *)(uentry & ~1UL); 1026 *pi = uentry & 1; 1027 1028 return (0); 1029 } 1030 1031 #define LINUX_HANDLE_DEATH_PENDING true 1032 #define LINUX_HANDLE_DEATH_LIST false 1033 1034 /* This walks the list of robust futexes releasing them. 
*/ 1035 void 1036 release_futexes(struct thread *td, struct linux_emuldata *em) 1037 { 1038 struct linux_robust_list_head *head; 1039 struct linux_robust_list *entry, *next_entry, *pending; 1040 unsigned int limit = 2048, pi, next_pi, pip; 1041 uint32_t *uaddr; 1042 l_long futex_offset; 1043 int error; 1044 1045 head = em->robust_futexes; 1046 if (head == NULL) 1047 return; 1048 1049 if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi)) 1050 return; 1051 1052 error = copyin(&head->futex_offset, &futex_offset, 1053 sizeof(futex_offset)); 1054 if (error != 0) 1055 return; 1056 1057 if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip)) 1058 return; 1059 1060 while (entry != &head->list) { 1061 error = fetch_robust_entry(&next_entry, PTRIN(&entry->next), 1062 &next_pi); 1063 1064 /* 1065 * A pending lock might already be on the list, so 1066 * don't process it twice. 1067 */ 1068 if (entry != pending) { 1069 uaddr = (uint32_t *)((caddr_t)entry + futex_offset); 1070 if (handle_futex_death(td, em, uaddr, pi, 1071 LINUX_HANDLE_DEATH_LIST)) 1072 return; 1073 } 1074 if (error != 0) 1075 return; 1076 1077 entry = next_entry; 1078 pi = next_pi; 1079 1080 if (!--limit) 1081 break; 1082 1083 sched_relinquish(curthread); 1084 } 1085 1086 if (pending) { 1087 uaddr = (uint32_t *)((caddr_t)pending + futex_offset); 1088 (void)handle_futex_death(td, em, uaddr, pip, 1089 LINUX_HANDLE_DEATH_PENDING); 1090 } 1091 } 1092