/*	$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */

/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 2005 Emmanuel Dreyfus
 * All rights reserved.
 * Copyright (c) 2009-2016 Dmitry Chagin <dchagin@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Emmanuel Dreyfus
 * 4. The name of the author may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS''
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
36 */ 37 38 #include <sys/cdefs.h> 39 __FBSDID("$FreeBSD$"); 40 #if 0 41 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $"); 42 #endif 43 44 #include "opt_compat.h" 45 46 #include <sys/param.h> 47 #include <sys/systm.h> 48 #include <sys/imgact.h> 49 #include <sys/imgact_elf.h> 50 #include <sys/kernel.h> 51 #include <sys/ktr.h> 52 #include <sys/lock.h> 53 #include <sys/malloc.h> 54 #include <sys/mutex.h> 55 #include <sys/priv.h> 56 #include <sys/proc.h> 57 #include <sys/queue.h> 58 #include <sys/sched.h> 59 #include <sys/umtxvar.h> 60 61 #include <vm/vm_extern.h> 62 63 #ifdef COMPAT_LINUX32 64 #include <machine/../linux32/linux.h> 65 #include <machine/../linux32/linux32_proto.h> 66 #else 67 #include <machine/../linux/linux.h> 68 #include <machine/../linux/linux_proto.h> 69 #endif 70 #include <compat/linux/linux_emul.h> 71 #include <compat/linux/linux_futex.h> 72 #include <compat/linux/linux_misc.h> 73 #include <compat/linux/linux_timer.h> 74 #include <compat/linux/linux_util.h> 75 76 #define FUTEX_SHARED 0x8 /* shared futex */ 77 78 #define GET_SHARED(a) (a->flags & FUTEX_SHARED) ? 
AUTO_SHARE : THREAD_SHARE 79 80 static int futex_atomic_op(struct thread *, int, uint32_t *); 81 static int handle_futex_death(struct thread *td, struct linux_emuldata *, 82 uint32_t *, unsigned int, bool); 83 static int fetch_robust_entry(struct linux_robust_list **, 84 struct linux_robust_list **, unsigned int *); 85 86 struct linux_futex_args { 87 uint32_t *uaddr; 88 int32_t op; 89 uint32_t flags; 90 bool clockrt; 91 uint32_t val; 92 struct timespec *ts; 93 uint32_t *uaddr2; 94 uint32_t val3; 95 bool val3_compare; 96 struct timespec kts; 97 }; 98 99 static inline int futex_key_get(const void *, int, int, struct umtx_key *); 100 static void linux_umtx_abs_timeout_init(struct umtx_abs_timeout *, 101 struct linux_futex_args *); 102 static int linux_futex(struct thread *, struct linux_futex_args *); 103 static int linux_futex_wait(struct thread *, struct linux_futex_args *); 104 static int linux_futex_wake(struct thread *, struct linux_futex_args *); 105 static int linux_futex_requeue(struct thread *, struct linux_futex_args *); 106 static int linux_futex_wakeop(struct thread *, struct linux_futex_args *); 107 static int linux_futex_lock_pi(struct thread *, bool, struct linux_futex_args *); 108 static int linux_futex_unlock_pi(struct thread *, bool, 109 struct linux_futex_args *); 110 static int futex_wake_pi(struct thread *, uint32_t *, bool); 111 112 static int 113 futex_key_get(const void *uaddr, int type, int share, struct umtx_key *key) 114 { 115 116 /* Check that futex address is a 32bit aligned. */ 117 if (!__is_aligned(uaddr, sizeof(uint32_t))) 118 return (EINVAL); 119 return (umtx_key_get(uaddr, type, share, key)); 120 } 121 122 int 123 futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared) 124 { 125 struct linux_futex_args args; 126 127 bzero(&args, sizeof(args)); 128 args.op = LINUX_FUTEX_WAKE; 129 args.uaddr = uaddr; 130 args.flags = shared == true ? 
FUTEX_SHARED : 0; 131 args.val = val; 132 args.val3 = FUTEX_BITSET_MATCH_ANY; 133 134 return (linux_futex_wake(td, &args)); 135 } 136 137 static int 138 futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared) 139 { 140 struct linux_futex_args args; 141 142 bzero(&args, sizeof(args)); 143 args.op = LINUX_FUTEX_UNLOCK_PI; 144 args.uaddr = uaddr; 145 args.flags = shared == true ? FUTEX_SHARED : 0; 146 147 return (linux_futex_unlock_pi(td, true, &args)); 148 } 149 150 static int 151 futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr) 152 { 153 int op = (encoded_op >> 28) & 7; 154 int cmp = (encoded_op >> 24) & 15; 155 int oparg = (encoded_op << 8) >> 20; 156 int cmparg = (encoded_op << 20) >> 20; 157 int oldval = 0, ret; 158 159 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 160 oparg = 1 << oparg; 161 162 switch (op) { 163 case FUTEX_OP_SET: 164 ret = futex_xchgl(oparg, uaddr, &oldval); 165 break; 166 case FUTEX_OP_ADD: 167 ret = futex_addl(oparg, uaddr, &oldval); 168 break; 169 case FUTEX_OP_OR: 170 ret = futex_orl(oparg, uaddr, &oldval); 171 break; 172 case FUTEX_OP_ANDN: 173 ret = futex_andl(~oparg, uaddr, &oldval); 174 break; 175 case FUTEX_OP_XOR: 176 ret = futex_xorl(oparg, uaddr, &oldval); 177 break; 178 default: 179 ret = -ENOSYS; 180 break; 181 } 182 183 if (ret) 184 return (ret); 185 186 switch (cmp) { 187 case FUTEX_OP_CMP_EQ: 188 ret = (oldval == cmparg); 189 break; 190 case FUTEX_OP_CMP_NE: 191 ret = (oldval != cmparg); 192 break; 193 case FUTEX_OP_CMP_LT: 194 ret = (oldval < cmparg); 195 break; 196 case FUTEX_OP_CMP_GE: 197 ret = (oldval >= cmparg); 198 break; 199 case FUTEX_OP_CMP_LE: 200 ret = (oldval <= cmparg); 201 break; 202 case FUTEX_OP_CMP_GT: 203 ret = (oldval > cmparg); 204 break; 205 default: 206 ret = -ENOSYS; 207 } 208 209 return (ret); 210 } 211 212 static int 213 linux_futex(struct thread *td, struct linux_futex_args *args) 214 { 215 struct linux_pemuldata *pem; 216 struct proc *p; 217 218 if (args->op & 
LINUX_FUTEX_PRIVATE_FLAG) { 219 args->flags = 0; 220 args->op &= ~LINUX_FUTEX_PRIVATE_FLAG; 221 } else 222 args->flags = FUTEX_SHARED; 223 224 args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME; 225 args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME; 226 227 switch (args->op) { 228 case LINUX_FUTEX_WAIT: 229 args->val3 = FUTEX_BITSET_MATCH_ANY; 230 /* FALLTHROUGH */ 231 232 case LINUX_FUTEX_WAIT_BITSET: 233 LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x", 234 args->uaddr, args->val, args->val3); 235 236 return (linux_futex_wait(td, args)); 237 238 case LINUX_FUTEX_WAKE: 239 args->val3 = FUTEX_BITSET_MATCH_ANY; 240 /* FALLTHROUGH */ 241 242 case LINUX_FUTEX_WAKE_BITSET: 243 LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x", 244 args->uaddr, args->val, args->val3); 245 246 return (linux_futex_wake(td, args)); 247 248 case LINUX_FUTEX_REQUEUE: 249 /* 250 * Glibc does not use this operation since version 2.3.3, 251 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation. 252 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when 253 * FUTEX_REQUEUE returned EINVAL. 254 */ 255 pem = pem_find(td->td_proc); 256 if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) { 257 linux_msg(td, "unsupported FUTEX_REQUEUE"); 258 pem->flags |= LINUX_XDEPR_REQUEUEOP; 259 } 260 261 /* 262 * The above is true, however musl libc does make use of the 263 * futex requeue operation, allow operation for brands which 264 * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags. 
265 */ 266 p = td->td_proc; 267 Elf_Brandinfo *bi = p->p_elf_brandinfo; 268 if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0) 269 return (EINVAL); 270 args->val3_compare = false; 271 /* FALLTHROUGH */ 272 273 case LINUX_FUTEX_CMP_REQUEUE: 274 LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p " 275 "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x", 276 args->uaddr, args->val, args->val3, args->uaddr2, 277 args->ts); 278 279 return (linux_futex_requeue(td, args)); 280 281 case LINUX_FUTEX_WAKE_OP: 282 LINUX_CTR5(sys_futex, "WAKE_OP " 283 "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x", 284 args->uaddr, args->val, args->uaddr2, args->val3, 285 args->ts); 286 287 return (linux_futex_wakeop(td, args)); 288 289 case LINUX_FUTEX_LOCK_PI: 290 args->clockrt = true; 291 LINUX_CTR2(sys_futex, "LOCKPI uaddr %p val 0x%x", 292 args->uaddr, args->val); 293 294 return (linux_futex_lock_pi(td, false, args)); 295 296 case LINUX_FUTEX_UNLOCK_PI: 297 LINUX_CTR1(sys_futex, "UNLOCKPI uaddr %p", 298 args->uaddr); 299 300 return (linux_futex_unlock_pi(td, false, args)); 301 302 case LINUX_FUTEX_TRYLOCK_PI: 303 LINUX_CTR1(sys_futex, "TRYLOCKPI uaddr %p", 304 args->uaddr); 305 306 return (linux_futex_lock_pi(td, true, args)); 307 308 case LINUX_FUTEX_WAIT_REQUEUE_PI: 309 /* not yet implemented */ 310 pem = pem_find(td->td_proc); 311 if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { 312 linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI"); 313 pem->flags |= LINUX_XUNSUP_FUTEXPIOP; 314 } 315 return (ENOSYS); 316 317 case LINUX_FUTEX_CMP_REQUEUE_PI: 318 /* not yet implemented */ 319 pem = pem_find(td->td_proc); 320 if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { 321 linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI"); 322 pem->flags |= LINUX_XUNSUP_FUTEXPIOP; 323 } 324 return (ENOSYS); 325 326 default: 327 linux_msg(td, "unsupported futex op %d", args->op); 328 return (ENOSYS); 329 } 330 } 331 332 /* 333 * pi protocol: 334 * - 0 futex word value means unlocked. 
 * - TID futex word value means locked.
 * Userspace uses atomic ops to lock/unlock these futexes without entering the
 * kernel. If the lock-acquire fastpath fails (the transition from 0 to TID
 * fails), then FUTEX_LOCK_PI is called.
 * The kernel atomically sets the FUTEX_WAITERS bit in the futex word value and,
 * if no other waiters exist, looks up the thread that owns the futex (it has
 * put its own TID into the futex value) and makes this thread the owner of the
 * internal pi-aware lock object (mutex). Then the kernel tries to lock the
 * internal lock object, on which it blocks. Once it returns, it has the mutex
 * acquired, and it sets the futex value to its own TID and returns (the futex
 * value contains FUTEX_WAITERS|TID).
 * The unlock fastpath would fail (because the FUTEX_WAITERS bit is set) and
 * FUTEX_UNLOCK_PI will be called.
 * If a futex is found to be held at exit time, the kernel sets the OWNER_DIED
 * bit of the futex word and wakes up the next futex waiter (if any); the
 * WAITERS bit is preserved (if any).
 * If the OWNER_DIED bit is set, the kernel sanity checks the futex word value
 * against the internal futex state and, if correct, acquires the futex.
353 */ 354 static int 355 linux_futex_lock_pi(struct thread *td, bool try, struct linux_futex_args *args) 356 { 357 struct umtx_abs_timeout timo; 358 struct linux_emuldata *em; 359 struct umtx_pi *pi, *new_pi; 360 struct thread *td1; 361 struct umtx_q *uq; 362 int error, rv; 363 uint32_t owner, old_owner; 364 365 em = em_find(td); 366 uq = td->td_umtxq; 367 error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), 368 &uq->uq_key); 369 if (error != 0) 370 return (error); 371 if (args->ts != NULL) 372 linux_umtx_abs_timeout_init(&timo, args); 373 374 umtxq_lock(&uq->uq_key); 375 pi = umtx_pi_lookup(&uq->uq_key); 376 if (pi == NULL) { 377 new_pi = umtx_pi_alloc(M_NOWAIT); 378 if (new_pi == NULL) { 379 umtxq_unlock(&uq->uq_key); 380 new_pi = umtx_pi_alloc(M_WAITOK); 381 umtxq_lock(&uq->uq_key); 382 pi = umtx_pi_lookup(&uq->uq_key); 383 if (pi != NULL) { 384 umtx_pi_free(new_pi); 385 new_pi = NULL; 386 } 387 } 388 if (new_pi != NULL) { 389 new_pi->pi_key = uq->uq_key; 390 umtx_pi_insert(new_pi); 391 pi = new_pi; 392 } 393 } 394 umtx_pi_ref(pi); 395 umtxq_unlock(&uq->uq_key); 396 for (;;) { 397 /* Try uncontested case first. */ 398 rv = casueword32(args->uaddr, 0, &owner, em->em_tid); 399 /* The acquire succeeded. */ 400 if (rv == 0) { 401 error = 0; 402 break; 403 } 404 if (rv == -1) { 405 error = EFAULT; 406 break; 407 } 408 409 /* 410 * Avoid overwriting a possible error from sleep due 411 * to the pending signal with suspension check result. 412 */ 413 if (error == 0) { 414 error = thread_check_susp(td, true); 415 if (error != 0) 416 break; 417 } 418 419 /* The futex word at *uaddr is already locked by the caller. */ 420 if ((owner & FUTEX_TID_MASK) == em->em_tid) { 421 error = EDEADLK; 422 break; 423 } 424 425 /* 426 * Futex owner died, handle_futex_death() set the OWNER_DIED bit 427 * and clear tid. Try to acquire it. 
428 */ 429 if ((owner & FUTEX_TID_MASK) == 0) { 430 old_owner = owner; 431 owner = owner & (FUTEX_WAITERS | FUTEX_OWNER_DIED); 432 owner |= em->em_tid; 433 rv = casueword32(args->uaddr, old_owner, &owner, owner); 434 if (rv == -1) { 435 error = EFAULT; 436 break; 437 } 438 if (rv == 1) { 439 if (error == 0) { 440 error = thread_check_susp(td, true); 441 if (error != 0) 442 break; 443 } 444 445 /* 446 * If this failed the lock could 447 * changed, restart. 448 */ 449 continue; 450 } 451 452 umtxq_lock(&uq->uq_key); 453 umtxq_busy(&uq->uq_key); 454 error = umtx_pi_claim(pi, td); 455 umtxq_unbusy(&uq->uq_key); 456 umtxq_unlock(&uq->uq_key); 457 if (error != 0) { 458 /* 459 * Since we're going to return an 460 * error, restore the futex to its 461 * previous, unowned state to avoid 462 * compounding the problem. 463 */ 464 (void)casuword32(args->uaddr, owner, old_owner); 465 } 466 break; 467 } 468 469 /* 470 * Inconsistent state: OWNER_DIED is set and tid is not 0. 471 * Linux does some checks of futex state, we return EINVAL, 472 * as the user space can take care of this. 473 */ 474 if ((owner & FUTEX_OWNER_DIED) != 0) { 475 error = EINVAL; 476 break; 477 } 478 479 if (try != 0) { 480 error = EBUSY; 481 break; 482 } 483 484 /* 485 * If we caught a signal, we have retried and now 486 * exit immediately. 487 */ 488 if (error != 0) 489 break; 490 491 umtxq_lock(&uq->uq_key); 492 umtxq_busy(&uq->uq_key); 493 umtxq_unlock(&uq->uq_key); 494 495 /* 496 * Set the contested bit so that a release in user space knows 497 * to use the system call for unlock. If this fails either some 498 * one else has acquired the lock or it has been released. 
499 */ 500 rv = casueword32(args->uaddr, owner, &owner, 501 owner | FUTEX_WAITERS); 502 if (rv == -1) { 503 umtxq_unbusy_unlocked(&uq->uq_key); 504 error = EFAULT; 505 break; 506 } 507 if (rv == 1) { 508 umtxq_unbusy_unlocked(&uq->uq_key); 509 error = thread_check_susp(td, true); 510 if (error != 0) 511 break; 512 513 /* 514 * The lock changed and we need to retry or we 515 * lost a race to the thread unlocking the umtx. 516 */ 517 continue; 518 } 519 520 /* 521 * Substitute Linux thread id by native thread id to 522 * avoid refactoring code of umtxq_sleep_pi(). 523 */ 524 td1 = linux_tdfind(td, owner & FUTEX_TID_MASK, -1); 525 if (td1 != NULL) { 526 owner = td1->td_tid; 527 PROC_UNLOCK(td1->td_proc); 528 } else { 529 umtxq_unbusy_unlocked(&uq->uq_key); 530 error = EINVAL; 531 break; 532 } 533 534 umtxq_lock(&uq->uq_key); 535 536 /* We set the contested bit, sleep. */ 537 error = umtxq_sleep_pi(uq, pi, owner, "futexp", 538 args->ts == NULL ? NULL : &timo, 539 (args->flags & FUTEX_SHARED) != 0); 540 if (error != 0) 541 continue; 542 543 error = thread_check_susp(td, false); 544 if (error != 0) 545 break; 546 } 547 548 umtxq_lock(&uq->uq_key); 549 umtx_pi_unref(pi); 550 umtxq_unlock(&uq->uq_key); 551 umtx_key_release(&uq->uq_key); 552 return (error); 553 } 554 555 static int 556 linux_futex_unlock_pi(struct thread *td, bool rb, struct linux_futex_args *args) 557 { 558 struct linux_emuldata *em; 559 struct umtx_key key; 560 uint32_t old, owner, new_owner; 561 int count, error; 562 563 em = em_find(td); 564 565 /* 566 * Make sure we own this mtx. 
567 */ 568 error = fueword32(args->uaddr, &owner); 569 if (error == -1) 570 return (EFAULT); 571 if (!rb && (owner & FUTEX_TID_MASK) != em->em_tid) 572 return (EPERM); 573 574 error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), &key); 575 if (error != 0) 576 return (error); 577 umtxq_lock(&key); 578 umtxq_busy(&key); 579 error = umtx_pi_drop(td, &key, rb, &count); 580 if (error != 0 || rb) { 581 umtxq_unbusy(&key); 582 umtxq_unlock(&key); 583 umtx_key_release(&key); 584 return (error); 585 } 586 umtxq_unlock(&key); 587 588 /* 589 * When unlocking the futex, it must be marked as unowned if 590 * there is zero or one thread only waiting for it. 591 * Otherwise, it must be marked as contested. 592 */ 593 if (count > 1) 594 new_owner = FUTEX_WAITERS; 595 else 596 new_owner = 0; 597 598 again: 599 error = casueword32(args->uaddr, owner, &old, new_owner); 600 if (error == 1) { 601 error = thread_check_susp(td, false); 602 if (error == 0) 603 goto again; 604 } 605 umtxq_unbusy_unlocked(&key); 606 umtx_key_release(&key); 607 if (error == -1) 608 return (EFAULT); 609 if (error == 0 && old != owner) 610 return (EINVAL); 611 return (error); 612 } 613 614 static int 615 linux_futex_wakeop(struct thread *td, struct linux_futex_args *args) 616 { 617 struct umtx_key key, key2; 618 int nrwake, op_ret, ret; 619 int error, count; 620 621 if (args->uaddr == args->uaddr2) 622 return (EINVAL); 623 624 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); 625 if (error != 0) 626 return (error); 627 error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2); 628 if (error != 0) { 629 umtx_key_release(&key); 630 return (error); 631 } 632 umtxq_lock(&key); 633 umtxq_busy(&key); 634 umtxq_unlock(&key); 635 op_ret = futex_atomic_op(td, args->val3, args->uaddr2); 636 if (op_ret < 0) { 637 if (op_ret == -ENOSYS) 638 error = ENOSYS; 639 else 640 error = EFAULT; 641 } 642 umtxq_lock(&key); 643 umtxq_unbusy(&key); 644 if (error != 0) 645 goto out; 
646 ret = umtxq_signal_mask(&key, args->val, args->val3); 647 if (op_ret > 0) { 648 nrwake = (int)(unsigned long)args->ts; 649 umtxq_lock(&key2); 650 count = umtxq_count(&key2); 651 if (count > 0) 652 ret += umtxq_signal_mask(&key2, nrwake, args->val3); 653 else 654 ret += umtxq_signal_mask(&key, nrwake, args->val3); 655 umtxq_unlock(&key2); 656 } 657 td->td_retval[0] = ret; 658 out: 659 umtxq_unlock(&key); 660 umtx_key_release(&key2); 661 umtx_key_release(&key); 662 return (error); 663 } 664 665 static int 666 linux_futex_requeue(struct thread *td, struct linux_futex_args *args) 667 { 668 int nrwake, nrrequeue; 669 struct umtx_key key, key2; 670 int error; 671 uint32_t uval; 672 673 /* 674 * Linux allows this, we would not, it is an incorrect 675 * usage of declared ABI, so return EINVAL. 676 */ 677 if (args->uaddr == args->uaddr2) 678 return (EINVAL); 679 680 nrrequeue = (int)(unsigned long)args->ts; 681 nrwake = args->val; 682 /* 683 * Sanity check to prevent signed integer overflow, 684 * see Linux CVE-2018-6927 685 */ 686 if (nrwake < 0 || nrrequeue < 0) 687 return (EINVAL); 688 689 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); 690 if (error != 0) 691 return (error); 692 error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2); 693 if (error != 0) { 694 umtx_key_release(&key); 695 return (error); 696 } 697 umtxq_lock(&key); 698 umtxq_busy(&key); 699 umtxq_unlock(&key); 700 error = fueword32(args->uaddr, &uval); 701 if (error != 0) 702 error = EFAULT; 703 else if (args->val3_compare == true && uval != args->val3) 704 error = EWOULDBLOCK; 705 umtxq_lock(&key); 706 umtxq_unbusy(&key); 707 if (error == 0) { 708 umtxq_lock(&key2); 709 td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue); 710 umtxq_unlock(&key2); 711 } 712 umtxq_unlock(&key); 713 umtx_key_release(&key2); 714 umtx_key_release(&key); 715 return (error); 716 } 717 718 static int 719 linux_futex_wake(struct thread *td, struct linux_futex_args *args) 
720 { 721 struct umtx_key key; 722 int error; 723 724 if (args->val3 == 0) 725 return (EINVAL); 726 727 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); 728 if (error != 0) 729 return (error); 730 umtxq_lock(&key); 731 td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3); 732 umtxq_unlock(&key); 733 umtx_key_release(&key); 734 return (0); 735 } 736 737 static int 738 linux_futex_wait(struct thread *td, struct linux_futex_args *args) 739 { 740 struct umtx_abs_timeout timo; 741 struct umtx_q *uq; 742 uint32_t uval; 743 int error; 744 745 if (args->val3 == 0) 746 error = EINVAL; 747 748 uq = td->td_umtxq; 749 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), 750 &uq->uq_key); 751 if (error != 0) 752 return (error); 753 if (args->ts != NULL) 754 linux_umtx_abs_timeout_init(&timo, args); 755 umtxq_lock(&uq->uq_key); 756 umtxq_busy(&uq->uq_key); 757 uq->uq_bitset = args->val3; 758 umtxq_insert(uq); 759 umtxq_unlock(&uq->uq_key); 760 error = fueword32(args->uaddr, &uval); 761 if (error != 0) 762 error = EFAULT; 763 else if (uval != args->val) 764 error = EWOULDBLOCK; 765 umtxq_lock(&uq->uq_key); 766 umtxq_unbusy(&uq->uq_key); 767 if (error == 0) { 768 error = umtxq_sleep(uq, "futex", 769 args->ts == NULL ? NULL : &timo); 770 if ((uq->uq_flags & UQF_UMTXQ) == 0) 771 error = 0; 772 else 773 umtxq_remove(uq); 774 } else if ((uq->uq_flags & UQF_UMTXQ) != 0) { 775 umtxq_remove(uq); 776 } 777 umtxq_unlock(&uq->uq_key); 778 umtx_key_release(&uq->uq_key); 779 if (error == ERESTART) 780 error = EINTR; 781 return (error); 782 } 783 784 static void 785 linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo, 786 struct linux_futex_args *args) 787 { 788 int clockid, absolute; 789 790 /* 791 * The FUTEX_CLOCK_REALTIME option bit can be employed only with the 792 * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI. 
793 * For FUTEX_WAIT, timeout is interpreted as a relative value, for other 794 * futex operations timeout is interpreted as an absolute value. 795 * If FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel measures 796 * the timeout against the CLOCK_REALTIME clock, otherwise the kernel 797 * measures the timeout against the CLOCK_MONOTONIC clock. 798 */ 799 clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC; 800 absolute = args->op == LINUX_FUTEX_WAIT ? false : true; 801 umtx_abs_timeout_init(timo, clockid, absolute, args->ts); 802 } 803 804 int 805 linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args) 806 { 807 struct linux_futex_args fargs = { 808 .uaddr = args->uaddr, 809 .op = args->op, 810 .val = args->val, 811 .ts = NULL, 812 .uaddr2 = args->uaddr2, 813 .val3 = args->val3, 814 .val3_compare = true, 815 }; 816 struct l_timespec lts; 817 int error; 818 819 switch (args->op & LINUX_FUTEX_CMD_MASK) { 820 case LINUX_FUTEX_WAIT: 821 case LINUX_FUTEX_WAIT_BITSET: 822 case LINUX_FUTEX_LOCK_PI: 823 if (args->timeout != NULL) { 824 error = copyin(args->timeout, <s, sizeof(lts)); 825 if (error != 0) 826 return (error); 827 error = linux_to_native_timespec(&fargs.kts, <s); 828 if (error != 0) 829 return (error); 830 fargs.ts = &fargs.kts; 831 } 832 break; 833 default: 834 fargs.ts = PTRIN(args->timeout); 835 } 836 return (linux_futex(td, &fargs)); 837 } 838 839 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) 840 int 841 linux_sys_futex_time64(struct thread *td, 842 struct linux_sys_futex_time64_args *args) 843 { 844 struct linux_futex_args fargs = { 845 .uaddr = args->uaddr, 846 .op = args->op, 847 .val = args->val, 848 .ts = NULL, 849 .uaddr2 = args->uaddr2, 850 .val3 = args->val3, 851 .val3_compare = true, 852 }; 853 struct l_timespec64 lts; 854 int error; 855 856 switch (args->op & LINUX_FUTEX_CMD_MASK) { 857 case LINUX_FUTEX_WAIT: 858 case LINUX_FUTEX_WAIT_BITSET: 859 case LINUX_FUTEX_LOCK_PI: 860 if 
(args->timeout != NULL) { 861 error = copyin(args->timeout, <s, sizeof(lts)); 862 if (error != 0) 863 return (error); 864 error = linux_to_native_timespec64(&fargs.kts, <s); 865 if (error != 0) 866 return (error); 867 fargs.ts = &fargs.kts; 868 } 869 break; 870 default: 871 fargs.ts = PTRIN(args->timeout); 872 } 873 return (linux_futex(td, &fargs)); 874 } 875 #endif 876 877 int 878 linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args) 879 { 880 struct linux_emuldata *em; 881 882 if (args->len != sizeof(struct linux_robust_list_head)) 883 return (EINVAL); 884 885 em = em_find(td); 886 em->robust_futexes = args->head; 887 888 return (0); 889 } 890 891 int 892 linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args) 893 { 894 struct linux_emuldata *em; 895 struct linux_robust_list_head *head; 896 l_size_t len; 897 struct thread *td2; 898 int error; 899 900 if (!args->pid) { 901 em = em_find(td); 902 KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); 903 head = em->robust_futexes; 904 } else { 905 td2 = linux_tdfind(td, args->pid, -1); 906 if (td2 == NULL) 907 return (ESRCH); 908 if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) { 909 PROC_UNLOCK(td2->td_proc); 910 return (EPERM); 911 } 912 913 em = em_find(td2); 914 KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); 915 /* XXX: ptrace? 
*/ 916 if (priv_check(td, PRIV_CRED_SETUID) || 917 priv_check(td, PRIV_CRED_SETEUID) || 918 p_candebug(td, td2->td_proc)) { 919 PROC_UNLOCK(td2->td_proc); 920 return (EPERM); 921 } 922 head = em->robust_futexes; 923 924 PROC_UNLOCK(td2->td_proc); 925 } 926 927 len = sizeof(struct linux_robust_list_head); 928 error = copyout(&len, args->len, sizeof(l_size_t)); 929 if (error != 0) 930 return (EFAULT); 931 932 return (copyout(&head, args->head, sizeof(head))); 933 } 934 935 static int 936 handle_futex_death(struct thread *td, struct linux_emuldata *em, uint32_t *uaddr, 937 unsigned int pi, bool pending_op) 938 { 939 uint32_t uval, nval, mval; 940 int error; 941 942 retry: 943 error = fueword32(uaddr, &uval); 944 if (error != 0) 945 return (EFAULT); 946 947 /* 948 * Special case for regular (non PI) futexes. The unlock path in 949 * user space has two race scenarios: 950 * 951 * 1. The unlock path releases the user space futex value and 952 * before it can execute the futex() syscall to wake up 953 * waiters it is killed. 954 * 955 * 2. A woken up waiter is killed before it can acquire the 956 * futex in user space. 957 * 958 * In both cases the TID validation below prevents a wakeup of 959 * potential waiters which can cause these waiters to block 960 * forever. 961 * 962 * In both cases it is safe to attempt waking up a potential 963 * waiter without touching the user space futex value and trying 964 * to set the OWNER_DIED bit. 
965 */ 966 if (pending_op && !pi && !uval) { 967 (void)futex_wake(td, uaddr, 1, true); 968 return (0); 969 } 970 971 if ((uval & FUTEX_TID_MASK) == em->em_tid) { 972 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 973 error = casueword32(uaddr, uval, &nval, mval); 974 if (error == -1) 975 return (EFAULT); 976 if (error == 1) { 977 error = thread_check_susp(td, false); 978 if (error != 0) 979 return (error); 980 goto retry; 981 } 982 983 if (!pi && (uval & FUTEX_WAITERS)) { 984 error = futex_wake(td, uaddr, 1, true); 985 if (error != 0) 986 return (error); 987 } else if (pi && (uval & FUTEX_WAITERS)) { 988 error = futex_wake_pi(td, uaddr, true); 989 if (error != 0) 990 return (error); 991 } 992 } 993 994 return (0); 995 } 996 997 static int 998 fetch_robust_entry(struct linux_robust_list **entry, 999 struct linux_robust_list **head, unsigned int *pi) 1000 { 1001 l_ulong uentry; 1002 int error; 1003 1004 error = copyin((const void *)head, &uentry, sizeof(l_ulong)); 1005 if (error != 0) 1006 return (EFAULT); 1007 1008 *entry = (void *)(uentry & ~1UL); 1009 *pi = uentry & 1; 1010 1011 return (0); 1012 } 1013 1014 #define LINUX_HANDLE_DEATH_PENDING true 1015 #define LINUX_HANDLE_DEATH_LIST false 1016 1017 /* This walks the list of robust futexes releasing them. 
*/
/*
 * The list head is taken from em->robust_futexes; nothing is done when it
 * is NULL.  At most 2048 entries are visited and the CPU is relinquished
 * after each of them.  Each entry other than the pending one is handed to
 * handle_futex_death(); the walk stops on the first failure of either the
 * fetch of the next link or handle_futex_death().  The pending entry, if
 * any, is processed last and its result is ignored.
 */
void
release_futexes(struct thread *td, struct linux_emuldata *em)
{
	struct linux_robust_list_head *rhead;
	struct linux_robust_list *cur, *next, *pend;
	unsigned int budget, cur_pi, next_pi, pend_pi;
	uint32_t *futexp;
	l_long offset;
	int fetch_error;

	rhead = em->robust_futexes;
	if (rhead == NULL)
		return;

	/* Read the first link, the futex offset and the pending slot. */
	if (fetch_robust_entry(&cur, PTRIN(&rhead->list.next), &cur_pi) != 0)
		return;

	if (copyin(&rhead->futex_offset, &offset, sizeof(offset)) != 0)
		return;

	if (fetch_robust_entry(&pend, PTRIN(&rhead->pending_list),
	    &pend_pi) != 0)
		return;

	for (budget = 2048; cur != &rhead->list; ) {
		/*
		 * Fetch the successor before handling the current entry;
		 * a failed fetch is acted upon only after the current
		 * entry has been processed.
		 */
		fetch_error = fetch_robust_entry(&next, PTRIN(&cur->next),
		    &next_pi);

		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice.
		 */
		if (cur != pend) {
			futexp = (uint32_t *)((caddr_t)cur + offset);
			if (handle_futex_death(td, em, futexp, cur_pi,
			    LINUX_HANDLE_DEATH_LIST) != 0)
				return;
		}
		if (fetch_error != 0)
			return;

		cur = next;
		cur_pi = next_pi;

		/* Bound the walk; the list lives in user memory. */
		if (--budget == 0)
			break;

		sched_relinquish(curthread);
	}

	if (pend != NULL) {
		futexp = (uint32_t *)((caddr_t)pend + offset);
		(void)handle_futex_death(td, em, futexp, pend_pi,
		    LINUX_HANDLE_DEATH_PENDING);
	}
}