1 /* $NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */ 2 3 /*- 4 * SPDX-License-Identifier: BSD-4-Clause 5 * 6 * Copyright (c) 2005 Emmanuel Dreyfus 7 * All rights reserved. 8 * Copyright (c) 2009-2016 Dmitry Chagin <dchagin@FreeBSD.org> 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by Emmanuel Dreyfus 21 * 4. The name of the author may not be used to endorse or promote 22 * products derived from this software without specific prior written 23 * permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS'' 26 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 27 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS 29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 35 * POSSIBILITY OF SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#if 0
__KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $");
#endif

#include "opt_compat.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/imgact.h>
#include <sys/imgact_elf.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/umtxvar.h>

#include <vm/vm_extern.h>

#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_emul.h>
#include <compat/linux/linux_futex.h>
#include <compat/linux/linux_misc.h>
#include <compat/linux/linux_timer.h>
#include <compat/linux/linux_util.h>

#define	FUTEX_SHARED	0x8	/* shared futex */

/* Translate our FUTEX_SHARED flag into the umtx sharing mode. */
#define	GET_SHARED(a)	(a->flags & FUTEX_SHARED) ? AUTO_SHARE : THREAD_SHARE

static int futex_atomic_op(struct thread *, int, uint32_t *);
static int handle_futex_death(struct thread *td, struct linux_emuldata *,
    uint32_t *, unsigned int, bool);
static int fetch_robust_entry(struct linux_robust_list **,
    struct linux_robust_list **, unsigned int *);

/*
 * Decoded futex(2) arguments, built by the syscall entry points and
 * consumed by the per-operation handlers.
 */
struct linux_futex_args {
	uint32_t	*uaddr;		/* primary futex word (user address) */
	int32_t		op;		/* LINUX_FUTEX_* command */
	uint32_t	flags;		/* FUTEX_SHARED or 0 */
	bool		clockrt;	/* timeout against CLOCK_REALTIME */
	uint32_t	val;
	struct timespec	*ts;		/* wait/lock ops: points at kts below;
					 * requeue/wake-op: raw timeout slot
					 * reused as an integer count (val2) */
	uint32_t	*uaddr2;	/* secondary futex word (user address) */
	uint32_t	val3;
	bool		val3_compare;	/* CMP_REQUEUE: compare *uaddr to val3 */
	struct timespec	kts;		/* kernel copy of the user timeout */
};

static inline int futex_key_get(const void *, int, int, struct umtx_key *);
static void linux_umtx_abs_timeout_init(struct umtx_abs_timeout *,
    struct linux_futex_args *);
static int linux_futex(struct thread *, struct linux_futex_args *);
static int linux_futex_wait(struct thread *, struct linux_futex_args *);
static int linux_futex_wake(struct thread *, struct linux_futex_args *);
static int linux_futex_requeue(struct thread *, struct linux_futex_args *);
static int linux_futex_wakeop(struct thread *, struct linux_futex_args *);
static int linux_futex_lock_pi(struct thread *, bool, struct linux_futex_args *);
static int linux_futex_unlock_pi(struct thread *, bool,
    struct linux_futex_args *);
static int futex_wake_pi(struct thread *, uint32_t *, bool);

/*
 * Validate alignment of the futex word and translate the user address
 * into an umtx key.
 */
static int
futex_key_get(const void *uaddr, int type, int share, struct umtx_key *key)
{

	/* Check that futex address is a 32bit aligned. */
	if (!__is_aligned(uaddr, sizeof(uint32_t)))
		return (EINVAL);
	return (umtx_key_get(uaddr, type, share, key));
}

/*
 * Wake up to 'val' waiters on the futex word at 'uaddr'.  External
 * interface used elsewhere in the Linuxulator (e.g. robust futex
 * death handling below).
 */
int
futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared)
{
	struct linux_futex_args args;

	bzero(&args, sizeof(args));
	args.op = LINUX_FUTEX_WAKE;
	args.uaddr = uaddr;
	args.flags = shared == true ? FUTEX_SHARED : 0;
	args.val = val;
	args.val3 = FUTEX_BITSET_MATCH_ANY;

	return (linux_futex_wake(td, &args));
}

/*
 * Wake the next waiter of a PI futex.  Only called from the robust
 * futex death handling, hence the rb == true unlock below.
 */
static int
futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared)
{
	struct linux_futex_args args;

	bzero(&args, sizeof(args));
	args.op = LINUX_FUTEX_UNLOCK_PI;
	args.uaddr = uaddr;
	args.flags = shared == true ? FUTEX_SHARED : 0;

	return (linux_futex_unlock_pi(td, true, &args));
}

/*
 * Execute the FUTEX_WAKE_OP encoded operation on *uaddr and evaluate
 * the encoded comparison against the previous value of the word.
 * Returns a negative errno on failure, otherwise the 0/1 comparison
 * result.  The field extraction below relies on arithmetic right
 * shift to sign-extend oparg/cmparg, matching the Linux encoding —
 * NOTE(review): implementation-defined for signed ints, but standard
 * kernel practice.
 */
static int
futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr)
{
	int op = (encoded_op >> 28) & 7;
	int cmp = (encoded_op >> 24) & 15;
	int oparg = (encoded_op << 8) >> 20;
	int cmparg = (encoded_op << 20) >> 20;
	int oldval = 0, ret;

	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
		oparg = 1 << oparg;

	switch (op) {
	case FUTEX_OP_SET:
		ret = futex_xchgl(oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_ADD:
		ret = futex_addl(oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_OR:
		ret = futex_orl(oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_ANDN:
		ret = futex_andl(~oparg, uaddr, &oldval);
		break;
	case FUTEX_OP_XOR:
		ret = futex_xorl(oparg, uaddr, &oldval);
		break;
	default:
		ret = -ENOSYS;
		break;
	}

	if (ret)
		return (ret);

	switch (cmp) {
	case FUTEX_OP_CMP_EQ:
		ret = (oldval == cmparg);
		break;
	case FUTEX_OP_CMP_NE:
		ret = (oldval != cmparg);
		break;
	case FUTEX_OP_CMP_LT:
		ret = (oldval < cmparg);
		break;
	case FUTEX_OP_CMP_GE:
		ret = (oldval >= cmparg);
		break;
	case FUTEX_OP_CMP_LE:
		ret = (oldval <= cmparg);
		break;
	case FUTEX_OP_CMP_GT:
		ret = (oldval > cmparg);
		break;
	default:
		ret = -ENOSYS;
	}

	return (ret);
}

/*
 * Decode the shared/clock flags out of the op word and dispatch to
 * the per-operation handler.
 */
static int
linux_futex(struct thread *td, struct linux_futex_args *args)
{
	struct linux_pemuldata *pem;
	struct proc *p;

	if (args->op &
LINUX_FUTEX_PRIVATE_FLAG) {
		args->flags = 0;
		args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
	} else
		args->flags = FUTEX_SHARED;

	args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
	args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;

	/* CLOCK_REALTIME timeouts are only valid for these three ops. */
	if (args->clockrt &&
	    args->op != LINUX_FUTEX_WAIT_BITSET &&
	    args->op != LINUX_FUTEX_WAIT_REQUEUE_PI &&
	    args->op != LINUX_FUTEX_LOCK_PI2)
		return (ENOSYS);

	switch (args->op) {
	case LINUX_FUTEX_WAIT:
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAIT_BITSET:
		LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wait(td, args));

	case LINUX_FUTEX_WAKE:
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAKE_BITSET:
		LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wake(td, args));

	case LINUX_FUTEX_REQUEUE:
		/*
		 * Glibc does not use this operation since version 2.3.3,
		 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation.
		 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when
		 * FUTEX_REQUEUE returned EINVAL.
		 */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
			linux_msg(td, "unsupported FUTEX_REQUEUE");
			pem->flags |= LINUX_XDEPR_REQUEUEOP;
		}

		/*
		 * The above is true, however musl libc does make use of the
		 * futex requeue operation, allow operation for brands which
		 * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags.
		 */
		p = td->td_proc;
		Elf_Brandinfo *bi = p->p_elf_brandinfo;
		if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0)
			return (EINVAL);
		/* Plain REQUEUE skips the *uaddr == val3 comparison. */
		args->val3_compare = false;
		/* FALLTHROUGH */

	case LINUX_FUTEX_CMP_REQUEUE:
		LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
		    "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x",
		    args->uaddr, args->val, args->val3, args->uaddr2,
		    args->ts);

		return (linux_futex_requeue(td, args));

	case LINUX_FUTEX_WAKE_OP:
		LINUX_CTR5(sys_futex, "WAKE_OP "
		    "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x",
		    args->uaddr, args->val, args->uaddr2, args->val3,
		    args->ts);

		return (linux_futex_wakeop(td, args));

	case LINUX_FUTEX_LOCK_PI:
		args->clockrt = true;
		/* FALLTHROUGH */

	case LINUX_FUTEX_LOCK_PI2:
		LINUX_CTR2(sys_futex, "LOCKPI uaddr %p val 0x%x",
		    args->uaddr, args->val);

		return (linux_futex_lock_pi(td, false, args));

	case LINUX_FUTEX_UNLOCK_PI:
		LINUX_CTR1(sys_futex, "UNLOCKPI uaddr %p",
		    args->uaddr);

		return (linux_futex_unlock_pi(td, false, args));

	case LINUX_FUTEX_TRYLOCK_PI:
		LINUX_CTR1(sys_futex, "TRYLOCKPI uaddr %p",
		    args->uaddr);

		return (linux_futex_lock_pi(td, true, args));

	/*
	 * Current implementation of FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI
	 * can't be used anymore to implement conditional variables.
	 * A detailed explanation can be found here:
	 *
	 * https://sourceware.org/bugzilla/show_bug.cgi?id=13165
	 * and here http://austingroupbugs.net/view.php?id=609
	 *
	 * And since commit
	 * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=ed19993b5b0d05d62cc883571519a67dae481a14
	 * glibc does not use it.
	 */
	case LINUX_FUTEX_WAIT_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	case LINUX_FUTEX_CMP_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	default:
		linux_msg(td, "unsupported futex op %d", args->op);
		return (ENOSYS);
	}
}

/*
 * pi protocol:
 * - 0 futex word value means unlocked.
 * - TID futex word value means locked.
 * Userspace uses atomic ops to lock/unlock these futexes without entering the
 * kernel. If the lock-acquire fastpath fails, (transition from 0 to TID fails),
 * then FUTEX_LOCK_PI is called.
 * The kernel atomically sets the FUTEX_WAITERS bit in the futex word value; if
 * no other waiters exist it looks up the thread that owns the futex (it has put
 * its own TID into the futex value) and makes this thread the owner of the
 * internal pi-aware lock object (mutex). Then the kernel tries to lock the
 * internal lock object, on which it blocks. Once it returns, it has the mutex
 * acquired, and it sets the futex value to its own TID and returns (futex value
 * contains FUTEX_WAITERS|TID).
 * The unlock fastpath would fail (because the FUTEX_WAITERS bit is set) and
 * FUTEX_UNLOCK_PI will be called.
 * If a futex is found to be held at exit time, the kernel sets the OWNER_DIED
 * bit of the futex word and wakes up the next futex waiter (if any), WAITERS
 * bit is preserved (if any).
 * If OWNER_DIED bit is set the kernel sanity checks the futex word value against
 * the internal futex state and if correct, acquire futex.
 */
static int
linux_futex_lock_pi(struct thread *td, bool try, struct linux_futex_args *args)
{
	struct umtx_abs_timeout timo;
	struct linux_emuldata *em;
	struct umtx_pi *pi, *new_pi;
	struct thread *td1;
	struct umtx_q *uq;
	int error, rv;
	uint32_t owner, old_owner;

	em = em_find(td);
	uq = td->td_umtxq;
	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args),
	    &uq->uq_key);
	if (error != 0)
		return (error);
	if (args->ts != NULL)
		linux_umtx_abs_timeout_init(&timo, args);

	/*
	 * Find or create the kernel-side PI object for this key.  The
	 * M_NOWAIT / M_WAITOK dance avoids sleeping with the umtxq
	 * lock held and re-checks for a racing insert after sleeping.
	 */
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);
	for (;;) {
		/* Try uncontested case first. */
		rv = casueword32(args->uaddr, 0, &owner, em->em_tid);
		/* The acquire succeeded. */
		if (rv == 0) {
			error = 0;
			break;
		}
		if (rv == -1) {
			error = EFAULT;
			break;
		}

		/*
		 * Avoid overwriting a possible error from sleep due
		 * to the pending signal with suspension check result.
		 */
		if (error == 0) {
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
		}

		/* The futex word at *uaddr is already locked by the caller. */
		if ((owner & FUTEX_TID_MASK) == em->em_tid) {
			error = EDEADLK;
			break;
		}

		/*
		 * Futex owner died, handle_futex_death() set the OWNER_DIED bit
		 * and cleared the tid. Try to acquire it.
		 */
		if ((owner & FUTEX_TID_MASK) == 0) {
			old_owner = owner;
			owner = owner & (FUTEX_WAITERS | FUTEX_OWNER_DIED);
			owner |= em->em_tid;
			rv = casueword32(args->uaddr, old_owner, &owner, owner);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
			if (rv == 1) {
				if (error == 0) {
					error = thread_check_susp(td, true);
					if (error != 0)
						break;
				}

				/*
				 * If this failed the lock could have
				 * changed, restart.
				 */
				continue;
			}

			umtxq_lock(&uq->uq_key);
			umtxq_busy(&uq->uq_key);
			error = umtx_pi_claim(pi, td);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			if (error != 0) {
				/*
				 * Since we're going to return an
				 * error, restore the futex to its
				 * previous, unowned state to avoid
				 * compounding the problem.
				 */
				(void)casuword32(args->uaddr, owner, old_owner);
			}
			break;
		}

		/*
		 * Inconsistent state: OWNER_DIED is set and tid is not 0.
		 * Linux does some checks of futex state, we return EINVAL,
		 * as the user space can take care of this.
		 */
		if ((owner & FUTEX_OWNER_DIED) != 0) {
			error = EINVAL;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space knows
		 * to use the system call for unlock. If this fails either some
		 * one else has acquired the lock or it has been released.
		 */
		rv = casueword32(args->uaddr, owner, &owner,
		    owner | FUTEX_WAITERS);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		if (rv == 1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = thread_check_susp(td, true);
			if (error != 0)
				break;

			/*
			 * The lock changed and we need to retry or we
			 * lost a race to the thread unlocking the umtx.
			 */
			continue;
		}

		/*
		 * Substitute Linux thread id by native thread id to
		 * avoid refactoring code of umtxq_sleep_pi().
		 */
		td1 = linux_tdfind(td, owner & FUTEX_TID_MASK, -1);
		if (td1 != NULL) {
			owner = td1->td_tid;
			/* linux_tdfind() returns with the proc locked. */
			PROC_UNLOCK(td1->td_proc);
		} else {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EINVAL;
			break;
		}

		umtxq_lock(&uq->uq_key);

		/* We set the contested bit, sleep. */
		error = umtxq_sleep_pi(uq, pi, owner, "futexp",
		    args->ts == NULL ? NULL : &timo,
		    (args->flags & FUTEX_SHARED) != 0);
		if (error != 0)
			continue;

		error = thread_check_susp(td, false);
		if (error != 0)
			break;
	}

	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * FUTEX_UNLOCK_PI: release a PI futex held by the caller.  With
 * rb == true the ownership check is skipped and only the kernel-side
 * PI state is dropped (robust futex death path).
 */
static int
linux_futex_unlock_pi(struct thread *td, bool rb, struct linux_futex_args *args)
{
	struct linux_emuldata *em;
	struct umtx_key key;
	uint32_t old, owner, new_owner;
	int count, error;

	em = em_find(td);

	/*
	 * Make sure we own this mtx.
	 */
	error = fueword32(args->uaddr, &owner);
	if (error == -1)
		return (EFAULT);
	if (!rb && (owner & FUTEX_TID_MASK) != em->em_tid)
		return (EPERM);

	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	error = umtx_pi_drop(td, &key, rb, &count);
	if (error != 0 || rb) {
		umtxq_unbusy(&key);
		umtxq_unlock(&key);
		umtx_key_release(&key);
		return (error);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the futex, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	if (count > 1)
		new_owner = FUTEX_WAITERS;
	else
		new_owner = 0;

again:
	error = casueword32(args->uaddr, owner, &old, new_owner);
	if (error == 1) {
		error = thread_check_susp(td, false);
		if (error == 0)
			goto again;
	}
	umtxq_unbusy_unlocked(&key);
	umtx_key_release(&key);
	if (error == -1)
		return (EFAULT);
	if (error == 0 && old != owner)
		return (EINVAL);
	return (error);
}

/*
 * FUTEX_WAKE_OP: atomically modify *uaddr2 with the operation encoded
 * in val3, wake up to args->val waiters on uaddr, and, if the encoded
 * comparison on the old *uaddr2 value holds, wake additional waiters
 * (count smuggled in the timeout argument slot).
 */
static int
linux_futex_wakeop(struct thread *td, struct linux_futex_args *args)
{
	struct umtx_key key, key2;
	int nrwake, op_ret, ret;
	int error, count;

	if (args->uaddr == args->uaddr2)
		return (EINVAL);

	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
	if (error != 0) {
		umtx_key_release(&key);
		return (error);
	}
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	op_ret = futex_atomic_op(td, args->val3, args->uaddr2);
	if (op_ret < 0) {
		if (op_ret == -ENOSYS)
			error = ENOSYS;
		else
			error = EFAULT;
	}
	umtxq_lock(&key);
	umtxq_unbusy(&key);
	if (error != 0)
		goto out;
	ret = umtxq_signal_mask(&key, args->val, args->val3);
	if (op_ret > 0) {
		/* The timeout argument slot carries the second wake count. */
		nrwake = (int)(unsigned long)args->ts;
		umtxq_lock(&key2);
		count = umtxq_count(&key2);
		if (count > 0)
			ret += umtxq_signal_mask(&key2, nrwake, args->val3);
		else
			ret += umtxq_signal_mask(&key, nrwake, args->val3);
		umtxq_unlock(&key2);
	}
	td->td_retval[0] = ret;
out:
	umtxq_unlock(&key);
	umtx_key_release(&key2);
	umtx_key_release(&key);
	return (error);
}

/*
 * FUTEX_REQUEUE / FUTEX_CMP_REQUEUE: wake up to nrwake waiters on
 * uaddr and requeue up to nrrequeue more onto uaddr2.  For
 * CMP_REQUEUE the operation aborts with EWOULDBLOCK if *uaddr no
 * longer equals val3.
 */
static int
linux_futex_requeue(struct thread *td, struct linux_futex_args *args)
{
	int nrwake, nrrequeue;
	struct umtx_key key, key2;
	int error;
	uint32_t uval;

	/*
	 * Linux allows this, we would not, it is an incorrect
	 * usage of declared ABI, so return EINVAL.
	 */
	if (args->uaddr == args->uaddr2)
		return (EINVAL);

	/* The timeout argument slot carries the requeue count. */
	nrrequeue = (int)(unsigned long)args->ts;
	nrwake = args->val;
	/*
	 * Sanity check to prevent signed integer overflow,
	 * see Linux CVE-2018-6927
	 */
	if (nrwake < 0 || nrrequeue < 0)
		return (EINVAL);

	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
	if (error != 0) {
		umtx_key_release(&key);
		return (error);
	}
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	error = fueword32(args->uaddr, &uval);
	if (error != 0)
		error = EFAULT;
	else if (args->val3_compare == true && uval != args->val3)
		error = EWOULDBLOCK;
	umtxq_lock(&key);
	umtxq_unbusy(&key);
	if (error == 0) {
		umtxq_lock(&key2);
		td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue);
		umtxq_unlock(&key2);
	}
	umtxq_unlock(&key);
	umtx_key_release(&key2);
	umtx_key_release(&key);
	return (error);
}

/*
 * FUTEX_WAKE / FUTEX_WAKE_BITSET: wake up to args->val waiters whose
 * wait bitset intersects args->val3.
 */
static int
linux_futex_wake(struct thread *td, struct linux_futex_args *args)
741 { 742 struct umtx_key key; 743 int error; 744 745 if (args->val3 == 0) 746 return (EINVAL); 747 748 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); 749 if (error != 0) 750 return (error); 751 umtxq_lock(&key); 752 td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3); 753 umtxq_unlock(&key); 754 umtx_key_release(&key); 755 return (0); 756 } 757 758 static int 759 linux_futex_wait(struct thread *td, struct linux_futex_args *args) 760 { 761 struct umtx_abs_timeout timo; 762 struct umtx_q *uq; 763 uint32_t uval; 764 int error; 765 766 if (args->val3 == 0) 767 error = EINVAL; 768 769 uq = td->td_umtxq; 770 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), 771 &uq->uq_key); 772 if (error != 0) 773 return (error); 774 if (args->ts != NULL) 775 linux_umtx_abs_timeout_init(&timo, args); 776 umtxq_lock(&uq->uq_key); 777 umtxq_busy(&uq->uq_key); 778 uq->uq_bitset = args->val3; 779 umtxq_insert(uq); 780 umtxq_unlock(&uq->uq_key); 781 error = fueword32(args->uaddr, &uval); 782 if (error != 0) 783 error = EFAULT; 784 else if (uval != args->val) 785 error = EWOULDBLOCK; 786 umtxq_lock(&uq->uq_key); 787 umtxq_unbusy(&uq->uq_key); 788 if (error == 0) { 789 error = umtxq_sleep(uq, "futex", 790 args->ts == NULL ? NULL : &timo); 791 if ((uq->uq_flags & UQF_UMTXQ) == 0) 792 error = 0; 793 else 794 umtxq_remove(uq); 795 } else if ((uq->uq_flags & UQF_UMTXQ) != 0) { 796 umtxq_remove(uq); 797 } 798 umtxq_unlock(&uq->uq_key); 799 umtx_key_release(&uq->uq_key); 800 if (error == ERESTART) 801 error = EINTR; 802 return (error); 803 } 804 805 static void 806 linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo, 807 struct linux_futex_args *args) 808 { 809 int clockid, absolute; 810 811 /* 812 * The FUTEX_CLOCK_REALTIME option bit can be employed only with the 813 * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI, FUTEX_LOCK_PI2. 
	 * For FUTEX_WAIT, timeout is interpreted as a relative value, for other
	 * futex operations timeout is interpreted as an absolute value.
	 * If FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel measures
	 * the timeout against the CLOCK_REALTIME clock, otherwise the kernel
	 * measures the timeout against the CLOCK_MONOTONIC clock.
	 */
	clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC;
	absolute = args->op == LINUX_FUTEX_WAIT ? false : true;
	umtx_abs_timeout_init(timo, clockid, absolute, args->ts);
}

/*
 * futex(2) entry point.  For the waiting/locking operations the
 * timeout is copied in and converted; for the other operations the
 * timeout argument slot carries an integer count (Linux val2) and is
 * passed through untouched.
 */
int
linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
{
	struct linux_futex_args fargs = {
		.uaddr = args->uaddr,
		.op = args->op,
		.val = args->val,
		.ts = NULL,
		.uaddr2 = args->uaddr2,
		.val3 = args->val3,
		.val3_compare = true,
	};
	struct l_timespec lts;
	int error;

	switch (args->op & LINUX_FUTEX_CMD_MASK) {
	case LINUX_FUTEX_WAIT:
	case LINUX_FUTEX_WAIT_BITSET:
	case LINUX_FUTEX_LOCK_PI:
	case LINUX_FUTEX_LOCK_PI2:
		if (args->timeout != NULL) {
			error = copyin(args->timeout, &lts, sizeof(lts));
			if (error != 0)
				return (error);
			error = linux_to_native_timespec(&fargs.kts, &lts);
			if (error != 0)
				return (error);
			fargs.ts = &fargs.kts;
		}
		break;
	default:
		fargs.ts = PTRIN(args->timeout);
	}
	return (linux_futex(td, &fargs));
}

#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * futex_time64(2) entry point: same as above, but the timeout is a
 * 64-bit time_t timespec (32-bit ABIs only).
 */
int
linux_sys_futex_time64(struct thread *td,
    struct linux_sys_futex_time64_args *args)
{
	struct linux_futex_args fargs = {
		.uaddr = args->uaddr,
		.op = args->op,
		.val = args->val,
		.ts = NULL,
		.uaddr2 = args->uaddr2,
		.val3 = args->val3,
		.val3_compare = true,
	};
	struct l_timespec64 lts;
	int error;

	switch (args->op & LINUX_FUTEX_CMD_MASK) {
	case LINUX_FUTEX_WAIT:
	case LINUX_FUTEX_WAIT_BITSET:
	case LINUX_FUTEX_LOCK_PI:
	case LINUX_FUTEX_LOCK_PI2:
		if (args->timeout != NULL) {
			error = copyin(args->timeout, &lts, sizeof(lts));
			if (error != 0)
				return (error);
			error = linux_to_native_timespec64(&fargs.kts, &lts);
			if (error != 0)
				return (error);
			fargs.ts = &fargs.kts;
		}
		break;
	default:
		fargs.ts = PTRIN(args->timeout);
	}
	return (linux_futex(td, &fargs));
}
#endif

/*
 * set_robust_list(2): record the user-space robust list head in the
 * per-thread emulation data; walked at thread exit by
 * release_futexes() below.
 */
int
linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
{
	struct linux_emuldata *em;

	if (args->len != sizeof(struct linux_robust_list_head))
		return (EINVAL);

	em = em_find(td);
	em->robust_futexes = args->head;

	return (0);
}

/*
 * get_robust_list(2): return the robust list head of the calling
 * thread, or of another Linux thread subject to privilege checks.
 */
int
linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
{
	struct linux_emuldata *em;
	struct linux_robust_list_head *head;
	l_size_t len;
	struct thread *td2;
	int error;

	if (!args->pid) {
		em = em_find(td);
		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
		head = em->robust_futexes;
	} else {
		td2 = linux_tdfind(td, args->pid, -1);
		if (td2 == NULL)
			return (ESRCH);
		if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) {
			PROC_UNLOCK(td2->td_proc);
			return (EPERM);
		}

		em = em_find(td2);
		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
		/* XXX: ptrace? */
		if (priv_check(td, PRIV_CRED_SETUID) ||
		    priv_check(td, PRIV_CRED_SETEUID) ||
		    p_candebug(td, td2->td_proc)) {
			PROC_UNLOCK(td2->td_proc);
			return (EPERM);
		}
		head = em->robust_futexes;

		PROC_UNLOCK(td2->td_proc);
	}

	len = sizeof(struct linux_robust_list_head);
	error = copyout(&len, args->len, sizeof(l_size_t));
	if (error != 0)
		return (EFAULT);

	/* Copy out the pointer value itself, not what it points at. */
	return (copyout(&head, args->head, sizeof(head)));
}

/*
 * Release one robust futex held by the exiting thread: mark the futex
 * word OWNER_DIED (preserving WAITERS) and wake the next waiter.
 */
static int
handle_futex_death(struct thread *td, struct linux_emuldata *em, uint32_t *uaddr,
    unsigned int pi, bool pending_op)
{
	uint32_t uval, nval, mval;
	int error;

retry:
	error = fueword32(uaddr, &uval);
	if (error != 0)
		return (EFAULT);

	/*
	 * Special case for regular (non PI) futexes. The unlock path in
	 * user space has two race scenarios:
	 *
	 * 1. The unlock path releases the user space futex value and
	 *    before it can execute the futex() syscall to wake up
	 *    waiters it is killed.
	 *
	 * 2. A woken up waiter is killed before it can acquire the
	 *    futex in user space.
	 *
	 * In both cases the TID validation below prevents a wakeup of
	 * potential waiters which can cause these waiters to block
	 * forever.
	 *
	 * In both cases it is safe to attempt waking up a potential
	 * waiter without touching the user space futex value and trying
	 * to set the OWNER_DIED bit.
	 */
	if (pending_op && !pi && !uval) {
		(void)futex_wake(td, uaddr, 1, true);
		return (0);
	}

	if ((uval & FUTEX_TID_MASK) == em->em_tid) {
		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
		error = casueword32(uaddr, uval, &nval, mval);
		if (error == -1)
			return (EFAULT);
		if (error == 1) {
			/* CAS raced with another update; re-read and retry. */
			error = thread_check_susp(td, false);
			if (error != 0)
				return (error);
			goto retry;
		}

		if (!pi && (uval & FUTEX_WAITERS)) {
			error = futex_wake(td, uaddr, 1, true);
			if (error != 0)
				return (error);
		} else if (pi && (uval & FUTEX_WAITERS)) {
			error = futex_wake_pi(td, uaddr, true);
			if (error != 0)
				return (error);
		}
	}

	return (0);
}

/*
 * Fetch one robust-list entry pointer from user space.  The low bit
 * of the stored value flags a PI futex and is returned separately.
 */
static int
fetch_robust_entry(struct linux_robust_list **entry,
    struct linux_robust_list **head, unsigned int *pi)
{
	l_ulong uentry;
	int error;

	error = copyin((const void *)head, &uentry, sizeof(uentry));
	if (error != 0)
		return (EFAULT);

	*entry = (void *)(uentry & ~1UL);
	*pi = uentry & 1;

	return (0);
}

#define	LINUX_HANDLE_DEATH_PENDING	true
#define	LINUX_HANDLE_DEATH_LIST		false

/* This walks the list of robust futexes releasing them. */
void
release_futexes(struct thread *td, struct linux_emuldata *em)
{
	struct linux_robust_list_head *head;
	struct linux_robust_list *entry, *next_entry, *pending;
	unsigned int limit = 2048, pi, next_pi, pip;
	uint32_t *uaddr;
	l_long futex_offset;
	int error;

	head = em->robust_futexes;
	if (head == NULL)
		return;

	if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi))
		return;

	error = copyin(&head->futex_offset, &futex_offset,
	    sizeof(futex_offset));
	if (error != 0)
		return;

	if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip))
		return;

	while (entry != &head->list) {
		error = fetch_robust_entry(&next_entry, PTRIN(&entry->next),
		    &next_pi);

		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice.
		 */
		if (entry != pending) {
			uaddr = (uint32_t *)((caddr_t)entry + futex_offset);
			if (handle_futex_death(td, em, uaddr, pi,
			    LINUX_HANDLE_DEATH_LIST))
				return;
		}
		if (error != 0)
			return;

		entry = next_entry;
		pi = next_pi;

		/* Bound the walk; the user list could be cyclic. */
		if (!--limit)
			break;

		sched_relinquish(curthread);
	}

	if (pending) {
		uaddr = (uint32_t *)((caddr_t)pending + futex_offset);
		(void)handle_futex_death(td, em, uaddr, pip,
		    LINUX_HANDLE_DEATH_PENDING);
	}
}