1 /* $NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */ 2 3 /*- 4 * SPDX-License-Identifier: BSD-4-Clause 5 * 6 * Copyright (c) 2005 Emmanuel Dreyfus 7 * All rights reserved. 8 * Copyright (c) 2009-2016 Dmitry Chagin <dchagin@FreeBSD.org> 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by Emmanuel Dreyfus 21 * 4. The name of the author may not be used to endorse or promote 22 * products derived from this software without specific prior written 23 * permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS'' 26 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 27 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS 29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 35 * POSSIBILITY OF SUCH DAMAGE. 36 */ 37 38 #include <sys/cdefs.h> 39 __FBSDID("$FreeBSD$"); 40 #if 0 41 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $"); 42 #endif 43 44 #include "opt_compat.h" 45 46 #include <sys/param.h> 47 #include <sys/systm.h> 48 #include <sys/imgact.h> 49 #include <sys/imgact_elf.h> 50 #include <sys/kernel.h> 51 #include <sys/ktr.h> 52 #include <sys/lock.h> 53 #include <sys/malloc.h> 54 #include <sys/mutex.h> 55 #include <sys/priv.h> 56 #include <sys/proc.h> 57 #include <sys/queue.h> 58 #include <sys/sched.h> 59 #include <sys/umtxvar.h> 60 61 #include <vm/vm_extern.h> 62 63 #ifdef COMPAT_LINUX32 64 #include <machine/../linux32/linux.h> 65 #include <machine/../linux32/linux32_proto.h> 66 #else 67 #include <machine/../linux/linux.h> 68 #include <machine/../linux/linux_proto.h> 69 #endif 70 #include <compat/linux/linux_emul.h> 71 #include <compat/linux/linux_futex.h> 72 #include <compat/linux/linux_misc.h> 73 #include <compat/linux/linux_timer.h> 74 #include <compat/linux/linux_util.h> 75 76 #define FUTEX_SHARED 0x8 /* shared futex */ 77 78 #define GET_SHARED(a) (a->flags & FUTEX_SHARED) ? AUTO_SHARE : THREAD_SHARE 79 80 static int futex_atomic_op(struct thread *, int, uint32_t *); 81 static int handle_futex_death(struct thread *td, struct linux_emuldata *, 82 uint32_t *, unsigned int, bool); 83 static int fetch_robust_entry(struct linux_robust_list **, 84 struct linux_robust_list **, unsigned int *); 85 86 struct linux_futex_args { 87 uint32_t *uaddr; 88 int32_t op; 89 uint32_t flags; 90 bool clockrt; 91 uint32_t val; 92 struct timespec *ts; 93 uint32_t *uaddr2; 94 uint32_t val3; 95 bool val3_compare; 96 struct timespec kts; 97 }; 98 99 static inline int futex_key_get(const void *, int, int, struct umtx_key *); 100 static void linux_umtx_abs_timeout_init(struct umtx_abs_timeout *, 101 struct linux_futex_args *); 102 static int linux_futex(struct thread *, struct linux_futex_args *); 103 static int linux_futex_wait(struct thread *, struct linux_futex_args *); 104 static int linux_futex_wake(struct thread *, struct linux_futex_args *); 105 static int linux_futex_requeue(struct thread *, struct linux_futex_args *); 106 static int linux_futex_wakeop(struct thread *, struct linux_futex_args *); 107 static int linux_futex_lock_pi(struct thread *, bool, struct linux_futex_args *); 108 static int linux_futex_unlock_pi(struct thread *, bool, 109 struct linux_futex_args *); 110 static int futex_wake_pi(struct thread *, uint32_t *, bool); 111 112 static int 113 futex_key_get(const void *uaddr, int type, int share, struct umtx_key *key) 114 { 115 116 /* Check that futex address is a 32bit aligned. */ 117 if (!__is_aligned(uaddr, sizeof(uint32_t))) 118 return (EINVAL); 119 return (umtx_key_get(uaddr, type, share, key)); 120 } 121 122 int 123 futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared) 124 { 125 struct linux_futex_args args; 126 127 bzero(&args, sizeof(args)); 128 args.op = LINUX_FUTEX_WAKE; 129 args.uaddr = uaddr; 130 args.flags = shared == true ? FUTEX_SHARED : 0; 131 args.val = val; 132 args.val3 = FUTEX_BITSET_MATCH_ANY; 133 134 return (linux_futex_wake(td, &args)); 135 } 136 137 static int 138 futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared) 139 { 140 struct linux_futex_args args; 141 142 bzero(&args, sizeof(args)); 143 args.op = LINUX_FUTEX_UNLOCK_PI; 144 args.uaddr = uaddr; 145 args.flags = shared == true ? FUTEX_SHARED : 0; 146 147 return (linux_futex_unlock_pi(td, true, &args)); 148 } 149 150 static int 151 futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr) 152 { 153 int op = (encoded_op >> 28) & 7; 154 int cmp = (encoded_op >> 24) & 15; 155 int oparg = (encoded_op << 8) >> 20; 156 int cmparg = (encoded_op << 20) >> 20; 157 int oldval = 0, ret; 158 159 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 160 oparg = 1 << oparg; 161 162 switch (op) { 163 case FUTEX_OP_SET: 164 ret = futex_xchgl(oparg, uaddr, &oldval); 165 break; 166 case FUTEX_OP_ADD: 167 ret = futex_addl(oparg, uaddr, &oldval); 168 break; 169 case FUTEX_OP_OR: 170 ret = futex_orl(oparg, uaddr, &oldval); 171 break; 172 case FUTEX_OP_ANDN: 173 ret = futex_andl(~oparg, uaddr, &oldval); 174 break; 175 case FUTEX_OP_XOR: 176 ret = futex_xorl(oparg, uaddr, &oldval); 177 break; 178 default: 179 ret = -ENOSYS; 180 break; 181 } 182 183 if (ret) 184 return (ret); 185 186 switch (cmp) { 187 case FUTEX_OP_CMP_EQ: 188 ret = (oldval == cmparg); 189 break; 190 case FUTEX_OP_CMP_NE: 191 ret = (oldval != cmparg); 192 break; 193 case FUTEX_OP_CMP_LT: 194 ret = (oldval < cmparg); 195 break; 196 case FUTEX_OP_CMP_GE: 197 ret = (oldval >= cmparg); 198 break; 199 case FUTEX_OP_CMP_LE: 200 ret = (oldval <= cmparg); 201 break; 202 case FUTEX_OP_CMP_GT: 203 ret = (oldval > cmparg); 204 break; 205 default: 206 ret = -ENOSYS; 207 } 208 209 return (ret); 210 } 211 212 static int 213 linux_futex(struct thread *td, struct linux_futex_args *args) 214 { 215 struct linux_pemuldata *pem; 216 struct proc *p; 217 218 if (args->op & LINUX_FUTEX_PRIVATE_FLAG) { 219 args->flags = 0; 220 args->op &= ~LINUX_FUTEX_PRIVATE_FLAG; 221 } else 222 args->flags = FUTEX_SHARED; 223 224 args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME; 225 args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME; 226 227 if (args->clockrt && 228 args->op != LINUX_FUTEX_WAIT_BITSET && 229 args->op != LINUX_FUTEX_WAIT_REQUEUE_PI && 230 args->op != LINUX_FUTEX_LOCK_PI2) 231 return (ENOSYS); 232 233 switch (args->op) { 234 case LINUX_FUTEX_WAIT: 235 args->val3 = FUTEX_BITSET_MATCH_ANY; 236 /* FALLTHROUGH */ 237 238 case LINUX_FUTEX_WAIT_BITSET: 239 LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x", 240 args->uaddr, args->val, args->val3); 241 242 return (linux_futex_wait(td, args)); 243 244 case LINUX_FUTEX_WAKE: 245 args->val3 = FUTEX_BITSET_MATCH_ANY; 246 /* FALLTHROUGH */ 247 248 case LINUX_FUTEX_WAKE_BITSET: 249 LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x", 250 args->uaddr, args->val, args->val3); 251 252 return (linux_futex_wake(td, args)); 253 254 case LINUX_FUTEX_REQUEUE: 255 /* 256 * Glibc does not use this operation since version 2.3.3, 257 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation. 258 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when 259 * FUTEX_REQUEUE returned EINVAL. 260 */ 261 pem = pem_find(td->td_proc); 262 if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) { 263 linux_msg(td, "unsupported FUTEX_REQUEUE"); 264 pem->flags |= LINUX_XDEPR_REQUEUEOP; 265 } 266 267 /* 268 * The above is true, however musl libc does make use of the 269 * futex requeue operation, allow operation for brands which 270 * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags. 271 */ 272 p = td->td_proc; 273 Elf_Brandinfo *bi = p->p_elf_brandinfo; 274 if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0) 275 return (EINVAL); 276 args->val3_compare = false; 277 /* FALLTHROUGH */ 278 279 case LINUX_FUTEX_CMP_REQUEUE: 280 LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p " 281 "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x", 282 args->uaddr, args->val, args->val3, args->uaddr2, 283 args->ts); 284 285 return (linux_futex_requeue(td, args)); 286 287 case LINUX_FUTEX_WAKE_OP: 288 LINUX_CTR5(sys_futex, "WAKE_OP " 289 "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x", 290 args->uaddr, args->val, args->uaddr2, args->val3, 291 args->ts); 292 293 return (linux_futex_wakeop(td, args)); 294 295 case LINUX_FUTEX_LOCK_PI: 296 args->clockrt = true; 297 /* FALLTHROUGH */ 298 299 case LINUX_FUTEX_LOCK_PI2: 300 LINUX_CTR2(sys_futex, "LOCKPI uaddr %p val 0x%x", 301 args->uaddr, args->val); 302 303 return (linux_futex_lock_pi(td, false, args)); 304 305 case LINUX_FUTEX_UNLOCK_PI: 306 LINUX_CTR1(sys_futex, "UNLOCKPI uaddr %p", 307 args->uaddr); 308 309 return (linux_futex_unlock_pi(td, false, args)); 310 311 case LINUX_FUTEX_TRYLOCK_PI: 312 LINUX_CTR1(sys_futex, "TRYLOCKPI uaddr %p", 313 args->uaddr); 314 315 return (linux_futex_lock_pi(td, true, args)); 316 317 case LINUX_FUTEX_WAIT_REQUEUE_PI: 318 /* not yet implemented */ 319 pem = pem_find(td->td_proc); 320 if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { 321 linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI"); 322 pem->flags |= LINUX_XUNSUP_FUTEXPIOP; 323 } 324 return (ENOSYS); 325 326 case LINUX_FUTEX_CMP_REQUEUE_PI: 327 /* not yet implemented */ 328 pem = pem_find(td->td_proc); 329 if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { 330 linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI"); 331 pem->flags |= LINUX_XUNSUP_FUTEXPIOP; 332 } 333 return (ENOSYS); 334 335 default: 336 linux_msg(td, "unsupported futex op %d", args->op); 337 return (ENOSYS); 338 } 339 } 340 341 /* 342 * pi protocol: 343 * - 0 futex word value means unlocked. 344 * - TID futex word value means locked. 345 * Userspace uses atomic ops to lock/unlock these futexes without entering the 346 * kernel. If the lock-acquire fastpath fails, (transition from 0 to TID fails), 347 * then FUTEX_LOCK_PI is called. 348 * The kernel atomically set FUTEX_WAITERS bit in the futex word value, if no 349 * other waiters exists looks up the thread that owns the futex (it has put its 350 * own TID into the futex value) and made this thread the owner of the internal 351 * pi-aware lock object (mutex). Then the kernel tries to lock the internal lock 352 * object, on which it blocks. Once it returns, it has the mutex acquired, and it 353 * sets the futex value to its own TID and returns (futex value contains 354 * FUTEX_WAITERS|TID). 355 * The unlock fastpath would fail (because the FUTEX_WAITERS bit is set) and 356 * FUTEX_UNLOCK_PI will be called. 357 * If a futex is found to be held at exit time, the kernel sets the OWNER_DIED 358 * bit of the futex word and wakes up the next futex waiter (if any), WAITERS 359 * bit is preserved (if any). 360 * If OWNER_DIED bit is set the kernel sanity checks the futex word value against 361 * the internal futex state and if correct, acquire futex. 362 */ 363 static int 364 linux_futex_lock_pi(struct thread *td, bool try, struct linux_futex_args *args) 365 { 366 struct umtx_abs_timeout timo; 367 struct linux_emuldata *em; 368 struct umtx_pi *pi, *new_pi; 369 struct thread *td1; 370 struct umtx_q *uq; 371 int error, rv; 372 uint32_t owner, old_owner; 373 374 em = em_find(td); 375 uq = td->td_umtxq; 376 error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), 377 &uq->uq_key); 378 if (error != 0) 379 return (error); 380 if (args->ts != NULL) 381 linux_umtx_abs_timeout_init(&timo, args); 382 383 umtxq_lock(&uq->uq_key); 384 pi = umtx_pi_lookup(&uq->uq_key); 385 if (pi == NULL) { 386 new_pi = umtx_pi_alloc(M_NOWAIT); 387 if (new_pi == NULL) { 388 umtxq_unlock(&uq->uq_key); 389 new_pi = umtx_pi_alloc(M_WAITOK); 390 umtxq_lock(&uq->uq_key); 391 pi = umtx_pi_lookup(&uq->uq_key); 392 if (pi != NULL) { 393 umtx_pi_free(new_pi); 394 new_pi = NULL; 395 } 396 } 397 if (new_pi != NULL) { 398 new_pi->pi_key = uq->uq_key; 399 umtx_pi_insert(new_pi); 400 pi = new_pi; 401 } 402 } 403 umtx_pi_ref(pi); 404 umtxq_unlock(&uq->uq_key); 405 for (;;) { 406 /* Try uncontested case first. */ 407 rv = casueword32(args->uaddr, 0, &owner, em->em_tid); 408 /* The acquire succeeded. */ 409 if (rv == 0) { 410 error = 0; 411 break; 412 } 413 if (rv == -1) { 414 error = EFAULT; 415 break; 416 } 417 418 /* 419 * Avoid overwriting a possible error from sleep due 420 * to the pending signal with suspension check result. 421 */ 422 if (error == 0) { 423 error = thread_check_susp(td, true); 424 if (error != 0) 425 break; 426 } 427 428 /* The futex word at *uaddr is already locked by the caller. */ 429 if ((owner & FUTEX_TID_MASK) == em->em_tid) { 430 error = EDEADLK; 431 break; 432 } 433 434 /* 435 * Futex owner died, handle_futex_death() set the OWNER_DIED bit 436 * and clear tid. Try to acquire it. 437 */ 438 if ((owner & FUTEX_TID_MASK) == 0) { 439 old_owner = owner; 440 owner = owner & (FUTEX_WAITERS | FUTEX_OWNER_DIED); 441 owner |= em->em_tid; 442 rv = casueword32(args->uaddr, old_owner, &owner, owner); 443 if (rv == -1) { 444 error = EFAULT; 445 break; 446 } 447 if (rv == 1) { 448 if (error == 0) { 449 error = thread_check_susp(td, true); 450 if (error != 0) 451 break; 452 } 453 454 /* 455 * If this failed the lock could 456 * changed, restart. 457 */ 458 continue; 459 } 460 461 umtxq_lock(&uq->uq_key); 462 umtxq_busy(&uq->uq_key); 463 error = umtx_pi_claim(pi, td); 464 umtxq_unbusy(&uq->uq_key); 465 umtxq_unlock(&uq->uq_key); 466 if (error != 0) { 467 /* 468 * Since we're going to return an 469 * error, restore the futex to its 470 * previous, unowned state to avoid 471 * compounding the problem. 472 */ 473 (void)casuword32(args->uaddr, owner, old_owner); 474 } 475 break; 476 } 477 478 /* 479 * Inconsistent state: OWNER_DIED is set and tid is not 0. 480 * Linux does some checks of futex state, we return EINVAL, 481 * as the user space can take care of this. 482 */ 483 if ((owner & FUTEX_OWNER_DIED) != 0) { 484 error = EINVAL; 485 break; 486 } 487 488 if (try != 0) { 489 error = EBUSY; 490 break; 491 } 492 493 /* 494 * If we caught a signal, we have retried and now 495 * exit immediately. 496 */ 497 if (error != 0) 498 break; 499 500 umtxq_lock(&uq->uq_key); 501 umtxq_busy(&uq->uq_key); 502 umtxq_unlock(&uq->uq_key); 503 504 /* 505 * Set the contested bit so that a release in user space knows 506 * to use the system call for unlock. If this fails either some 507 * one else has acquired the lock or it has been released. 508 */ 509 rv = casueword32(args->uaddr, owner, &owner, 510 owner | FUTEX_WAITERS); 511 if (rv == -1) { 512 umtxq_unbusy_unlocked(&uq->uq_key); 513 error = EFAULT; 514 break; 515 } 516 if (rv == 1) { 517 umtxq_unbusy_unlocked(&uq->uq_key); 518 error = thread_check_susp(td, true); 519 if (error != 0) 520 break; 521 522 /* 523 * The lock changed and we need to retry or we 524 * lost a race to the thread unlocking the umtx. 525 */ 526 continue; 527 } 528 529 /* 530 * Substitute Linux thread id by native thread id to 531 * avoid refactoring code of umtxq_sleep_pi(). 532 */ 533 td1 = linux_tdfind(td, owner & FUTEX_TID_MASK, -1); 534 if (td1 != NULL) { 535 owner = td1->td_tid; 536 PROC_UNLOCK(td1->td_proc); 537 } else { 538 umtxq_unbusy_unlocked(&uq->uq_key); 539 error = EINVAL; 540 break; 541 } 542 543 umtxq_lock(&uq->uq_key); 544 545 /* We set the contested bit, sleep. */ 546 error = umtxq_sleep_pi(uq, pi, owner, "futexp", 547 args->ts == NULL ? NULL : &timo, 548 (args->flags & FUTEX_SHARED) != 0); 549 if (error != 0) 550 continue; 551 552 error = thread_check_susp(td, false); 553 if (error != 0) 554 break; 555 } 556 557 umtxq_lock(&uq->uq_key); 558 umtx_pi_unref(pi); 559 umtxq_unlock(&uq->uq_key); 560 umtx_key_release(&uq->uq_key); 561 return (error); 562 } 563 564 static int 565 linux_futex_unlock_pi(struct thread *td, bool rb, struct linux_futex_args *args) 566 { 567 struct linux_emuldata *em; 568 struct umtx_key key; 569 uint32_t old, owner, new_owner; 570 int count, error; 571 572 em = em_find(td); 573 574 /* 575 * Make sure we own this mtx. 576 */ 577 error = fueword32(args->uaddr, &owner); 578 if (error == -1) 579 return (EFAULT); 580 if (!rb && (owner & FUTEX_TID_MASK) != em->em_tid) 581 return (EPERM); 582 583 error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), &key); 584 if (error != 0) 585 return (error); 586 umtxq_lock(&key); 587 umtxq_busy(&key); 588 error = umtx_pi_drop(td, &key, rb, &count); 589 if (error != 0 || rb) { 590 umtxq_unbusy(&key); 591 umtxq_unlock(&key); 592 umtx_key_release(&key); 593 return (error); 594 } 595 umtxq_unlock(&key); 596 597 /* 598 * When unlocking the futex, it must be marked as unowned if 599 * there is zero or one thread only waiting for it. 600 * Otherwise, it must be marked as contested. 601 */ 602 if (count > 1) 603 new_owner = FUTEX_WAITERS; 604 else 605 new_owner = 0; 606 607 again: 608 error = casueword32(args->uaddr, owner, &old, new_owner); 609 if (error == 1) { 610 error = thread_check_susp(td, false); 611 if (error == 0) 612 goto again; 613 } 614 umtxq_unbusy_unlocked(&key); 615 umtx_key_release(&key); 616 if (error == -1) 617 return (EFAULT); 618 if (error == 0 && old != owner) 619 return (EINVAL); 620 return (error); 621 } 622 623 static int 624 linux_futex_wakeop(struct thread *td, struct linux_futex_args *args) 625 { 626 struct umtx_key key, key2; 627 int nrwake, op_ret, ret; 628 int error, count; 629 630 if (args->uaddr == args->uaddr2) 631 return (EINVAL); 632 633 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); 634 if (error != 0) 635 return (error); 636 error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2); 637 if (error != 0) { 638 umtx_key_release(&key); 639 return (error); 640 } 641 umtxq_lock(&key); 642 umtxq_busy(&key); 643 umtxq_unlock(&key); 644 op_ret = futex_atomic_op(td, args->val3, args->uaddr2); 645 if (op_ret < 0) { 646 if (op_ret == -ENOSYS) 647 error = ENOSYS; 648 else 649 error = EFAULT; 650 } 651 umtxq_lock(&key); 652 umtxq_unbusy(&key); 653 if (error != 0) 654 goto out; 655 ret = umtxq_signal_mask(&key, args->val, args->val3); 656 if (op_ret > 0) { 657 nrwake = (int)(unsigned long)args->ts; 658 umtxq_lock(&key2); 659 count = umtxq_count(&key2); 660 if (count > 0) 661 ret += umtxq_signal_mask(&key2, nrwake, args->val3); 662 else 663 ret += umtxq_signal_mask(&key, nrwake, args->val3); 664 umtxq_unlock(&key2); 665 } 666 td->td_retval[0] = ret; 667 out: 668 umtxq_unlock(&key); 669 umtx_key_release(&key2); 670 umtx_key_release(&key); 671 return (error); 672 } 673 674 static int 675 linux_futex_requeue(struct thread *td, struct linux_futex_args *args) 676 { 677 int nrwake, nrrequeue; 678 struct umtx_key key, key2; 679 int error; 680 uint32_t uval; 681 682 /* 683 * Linux allows this, we would not, it is an incorrect 684 * usage of declared ABI, so return EINVAL. 685 */ 686 if (args->uaddr == args->uaddr2) 687 return (EINVAL); 688 689 nrrequeue = (int)(unsigned long)args->ts; 690 nrwake = args->val; 691 /* 692 * Sanity check to prevent signed integer overflow, 693 * see Linux CVE-2018-6927 694 */ 695 if (nrwake < 0 || nrrequeue < 0) 696 return (EINVAL); 697 698 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); 699 if (error != 0) 700 return (error); 701 error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2); 702 if (error != 0) { 703 umtx_key_release(&key); 704 return (error); 705 } 706 umtxq_lock(&key); 707 umtxq_busy(&key); 708 umtxq_unlock(&key); 709 error = fueword32(args->uaddr, &uval); 710 if (error != 0) 711 error = EFAULT; 712 else if (args->val3_compare == true && uval != args->val3) 713 error = EWOULDBLOCK; 714 umtxq_lock(&key); 715 umtxq_unbusy(&key); 716 if (error == 0) { 717 umtxq_lock(&key2); 718 td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue); 719 umtxq_unlock(&key2); 720 } 721 umtxq_unlock(&key); 722 umtx_key_release(&key2); 723 umtx_key_release(&key); 724 return (error); 725 } 726 727 static int 728 linux_futex_wake(struct thread *td, struct linux_futex_args *args) 729 { 730 struct umtx_key key; 731 int error; 732 733 if (args->val3 == 0) 734 return (EINVAL); 735 736 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); 737 if (error != 0) 738 return (error); 739 umtxq_lock(&key); 740 td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3); 741 umtxq_unlock(&key); 742 umtx_key_release(&key); 743 return (0); 744 } 745 746 static int 747 linux_futex_wait(struct thread *td, struct linux_futex_args *args) 748 { 749 struct umtx_abs_timeout timo; 750 struct umtx_q *uq; 751 uint32_t uval; 752 int error; 753 754 if (args->val3 == 0) 755 error = EINVAL; 756 757 uq = td->td_umtxq; 758 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), 759 &uq->uq_key); 760 if (error != 0) 761 return (error); 762 if (args->ts != NULL) 763 linux_umtx_abs_timeout_init(&timo, args); 764 umtxq_lock(&uq->uq_key); 765 umtxq_busy(&uq->uq_key); 766 uq->uq_bitset = args->val3; 767 umtxq_insert(uq); 768 umtxq_unlock(&uq->uq_key); 769 error = fueword32(args->uaddr, &uval); 770 if (error != 0) 771 error = EFAULT; 772 else if (uval != args->val) 773 error = EWOULDBLOCK; 774 umtxq_lock(&uq->uq_key); 775 umtxq_unbusy(&uq->uq_key); 776 if (error == 0) { 777 error = umtxq_sleep(uq, "futex", 778 args->ts == NULL ? NULL : &timo); 779 if ((uq->uq_flags & UQF_UMTXQ) == 0) 780 error = 0; 781 else 782 umtxq_remove(uq); 783 } else if ((uq->uq_flags & UQF_UMTXQ) != 0) { 784 umtxq_remove(uq); 785 } 786 umtxq_unlock(&uq->uq_key); 787 umtx_key_release(&uq->uq_key); 788 if (error == ERESTART) 789 error = EINTR; 790 return (error); 791 } 792 793 static void 794 linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo, 795 struct linux_futex_args *args) 796 { 797 int clockid, absolute; 798 799 /* 800 * The FUTEX_CLOCK_REALTIME option bit can be employed only with the 801 * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI, FUTEX_LOCK_PI2. 802 * For FUTEX_WAIT, timeout is interpreted as a relative value, for other 803 * futex operations timeout is interpreted as an absolute value. 804 * If FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel measures 805 * the timeout against the CLOCK_REALTIME clock, otherwise the kernel 806 * measures the timeout against the CLOCK_MONOTONIC clock. 807 */ 808 clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC; 809 absolute = args->op == LINUX_FUTEX_WAIT ? false : true; 810 umtx_abs_timeout_init(timo, clockid, absolute, args->ts); 811 } 812 813 int 814 linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args) 815 { 816 struct linux_futex_args fargs = { 817 .uaddr = args->uaddr, 818 .op = args->op, 819 .val = args->val, 820 .ts = NULL, 821 .uaddr2 = args->uaddr2, 822 .val3 = args->val3, 823 .val3_compare = true, 824 }; 825 struct l_timespec lts; 826 int error; 827 828 switch (args->op & LINUX_FUTEX_CMD_MASK) { 829 case LINUX_FUTEX_WAIT: 830 case LINUX_FUTEX_WAIT_BITSET: 831 case LINUX_FUTEX_LOCK_PI: 832 case LINUX_FUTEX_LOCK_PI2: 833 if (args->timeout != NULL) { 834 error = copyin(args->timeout, <s, sizeof(lts)); 835 if (error != 0) 836 return (error); 837 error = linux_to_native_timespec(&fargs.kts, <s); 838 if (error != 0) 839 return (error); 840 fargs.ts = &fargs.kts; 841 } 842 break; 843 default: 844 fargs.ts = PTRIN(args->timeout); 845 } 846 return (linux_futex(td, &fargs)); 847 } 848 849 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) 850 int 851 linux_sys_futex_time64(struct thread *td, 852 struct linux_sys_futex_time64_args *args) 853 { 854 struct linux_futex_args fargs = { 855 .uaddr = args->uaddr, 856 .op = args->op, 857 .val = args->val, 858 .ts = NULL, 859 .uaddr2 = args->uaddr2, 860 .val3 = args->val3, 861 .val3_compare = true, 862 }; 863 struct l_timespec64 lts; 864 int error; 865 866 switch (args->op & LINUX_FUTEX_CMD_MASK) { 867 case LINUX_FUTEX_WAIT: 868 case LINUX_FUTEX_WAIT_BITSET: 869 case LINUX_FUTEX_LOCK_PI: 870 case LINUX_FUTEX_LOCK_PI2: 871 if (args->timeout != NULL) { 872 error = copyin(args->timeout, <s, sizeof(lts)); 873 if (error != 0) 874 return (error); 875 error = linux_to_native_timespec64(&fargs.kts, <s); 876 if (error != 0) 877 return (error); 878 fargs.ts = &fargs.kts; 879 } 880 break; 881 default: 882 fargs.ts = PTRIN(args->timeout); 883 } 884 return (linux_futex(td, &fargs)); 885 } 886 #endif 887 888 int 889 linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args) 890 { 891 struct linux_emuldata *em; 892 893 if (args->len != sizeof(struct linux_robust_list_head)) 894 return (EINVAL); 895 896 em = em_find(td); 897 em->robust_futexes = args->head; 898 899 return (0); 900 } 901 902 int 903 linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args) 904 { 905 struct linux_emuldata *em; 906 struct linux_robust_list_head *head; 907 l_size_t len; 908 struct thread *td2; 909 int error; 910 911 if (!args->pid) { 912 em = em_find(td); 913 KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); 914 head = em->robust_futexes; 915 } else { 916 td2 = linux_tdfind(td, args->pid, -1); 917 if (td2 == NULL) 918 return (ESRCH); 919 if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) { 920 PROC_UNLOCK(td2->td_proc); 921 return (EPERM); 922 } 923 924 em = em_find(td2); 925 KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); 926 /* XXX: ptrace? */ 927 if (priv_check(td, PRIV_CRED_SETUID) || 928 priv_check(td, PRIV_CRED_SETEUID) || 929 p_candebug(td, td2->td_proc)) { 930 PROC_UNLOCK(td2->td_proc); 931 return (EPERM); 932 } 933 head = em->robust_futexes; 934 935 PROC_UNLOCK(td2->td_proc); 936 } 937 938 len = sizeof(struct linux_robust_list_head); 939 error = copyout(&len, args->len, sizeof(l_size_t)); 940 if (error != 0) 941 return (EFAULT); 942 943 return (copyout(&head, args->head, sizeof(head))); 944 } 945 946 static int 947 handle_futex_death(struct thread *td, struct linux_emuldata *em, uint32_t *uaddr, 948 unsigned int pi, bool pending_op) 949 { 950 uint32_t uval, nval, mval; 951 int error; 952 953 retry: 954 error = fueword32(uaddr, &uval); 955 if (error != 0) 956 return (EFAULT); 957 958 /* 959 * Special case for regular (non PI) futexes. The unlock path in 960 * user space has two race scenarios: 961 * 962 * 1. The unlock path releases the user space futex value and 963 * before it can execute the futex() syscall to wake up 964 * waiters it is killed. 965 * 966 * 2. A woken up waiter is killed before it can acquire the 967 * futex in user space. 968 * 969 * In both cases the TID validation below prevents a wakeup of 970 * potential waiters which can cause these waiters to block 971 * forever. 972 * 973 * In both cases it is safe to attempt waking up a potential 974 * waiter without touching the user space futex value and trying 975 * to set the OWNER_DIED bit. 976 */ 977 if (pending_op && !pi && !uval) { 978 (void)futex_wake(td, uaddr, 1, true); 979 return (0); 980 } 981 982 if ((uval & FUTEX_TID_MASK) == em->em_tid) { 983 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 984 error = casueword32(uaddr, uval, &nval, mval); 985 if (error == -1) 986 return (EFAULT); 987 if (error == 1) { 988 error = thread_check_susp(td, false); 989 if (error != 0) 990 return (error); 991 goto retry; 992 } 993 994 if (!pi && (uval & FUTEX_WAITERS)) { 995 error = futex_wake(td, uaddr, 1, true); 996 if (error != 0) 997 return (error); 998 } else if (pi && (uval & FUTEX_WAITERS)) { 999 error = futex_wake_pi(td, uaddr, true); 1000 if (error != 0) 1001 return (error); 1002 } 1003 } 1004 1005 return (0); 1006 } 1007 1008 static int 1009 fetch_robust_entry(struct linux_robust_list **entry, 1010 struct linux_robust_list **head, unsigned int *pi) 1011 { 1012 l_ulong uentry; 1013 int error; 1014 1015 error = copyin((const void *)head, &uentry, sizeof(uentry)); 1016 if (error != 0) 1017 return (EFAULT); 1018 1019 *entry = (void *)(uentry & ~1UL); 1020 *pi = uentry & 1; 1021 1022 return (0); 1023 } 1024 1025 #define LINUX_HANDLE_DEATH_PENDING true 1026 #define LINUX_HANDLE_DEATH_LIST false 1027 1028 /* This walks the list of robust futexes releasing them. */ 1029 void 1030 release_futexes(struct thread *td, struct linux_emuldata *em) 1031 { 1032 struct linux_robust_list_head *head; 1033 struct linux_robust_list *entry, *next_entry, *pending; 1034 unsigned int limit = 2048, pi, next_pi, pip; 1035 uint32_t *uaddr; 1036 l_long futex_offset; 1037 int error; 1038 1039 head = em->robust_futexes; 1040 if (head == NULL) 1041 return; 1042 1043 if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi)) 1044 return; 1045 1046 error = copyin(&head->futex_offset, &futex_offset, 1047 sizeof(futex_offset)); 1048 if (error != 0) 1049 return; 1050 1051 if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip)) 1052 return; 1053 1054 while (entry != &head->list) { 1055 error = fetch_robust_entry(&next_entry, PTRIN(&entry->next), 1056 &next_pi); 1057 1058 /* 1059 * A pending lock might already be on the list, so 1060 * don't process it twice. 1061 */ 1062 if (entry != pending) { 1063 uaddr = (uint32_t *)((caddr_t)entry + futex_offset); 1064 if (handle_futex_death(td, em, uaddr, pi, 1065 LINUX_HANDLE_DEATH_LIST)) 1066 return; 1067 } 1068 if (error != 0) 1069 return; 1070 1071 entry = next_entry; 1072 pi = next_pi; 1073 1074 if (!--limit) 1075 break; 1076 1077 sched_relinquish(curthread); 1078 } 1079 1080 if (pending) { 1081 uaddr = (uint32_t *)((caddr_t)pending + futex_offset); 1082 (void)handle_futex_death(td, em, uaddr, pip, 1083 LINUX_HANDLE_DEATH_PENDING); 1084 } 1085 } 1086