1 /* $NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */ 2 3 /*- 4 * Copyright (c) 2005 Emmanuel Dreyfus, all rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. All advertising materials mentioning features or use of this software 15 * must display the following acknowledgement: 16 * This product includes software developed by Emmanuel Dreyfus 17 * 4. The name of the author may not be used to endorse or promote 18 * products derived from this software without specific prior written 19 * permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS'' 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 23 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS 25 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <sys/cdefs.h> 35 __FBSDID("$FreeBSD$"); 36 #if 0 37 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $"); 38 #endif 39 40 #include "opt_compat.h" 41 42 #include <sys/param.h> 43 #include <sys/types.h> 44 #include <sys/time.h> 45 #include <sys/systm.h> 46 #include <sys/proc.h> 47 #include <sys/queue.h> 48 #include <sys/imgact.h> 49 #include <sys/lock.h> 50 #include <sys/mutex.h> 51 #include <sys/priv.h> 52 #include <sys/sched.h> 53 #include <sys/sx.h> 54 #include <sys/malloc.h> 55 56 #ifdef COMPAT_LINUX32 57 #include <machine/../linux32/linux.h> 58 #include <machine/../linux32/linux32_proto.h> 59 #else 60 #include <machine/../linux/linux.h> 61 #include <machine/../linux/linux_proto.h> 62 #endif 63 #include <compat/linux/linux_emul.h> 64 #include <compat/linux/linux_futex.h> 65 66 struct futex; 67 68 struct waiting_proc { 69 struct thread *wp_t; 70 struct futex *wp_new_futex; 71 TAILQ_ENTRY(waiting_proc) wp_list; 72 }; 73 struct futex { 74 void *f_uaddr; 75 int f_refcount; 76 LIST_ENTRY(futex) f_list; 77 TAILQ_HEAD(lf_waiting_proc, waiting_proc) f_waiting_proc; 78 }; 79 80 LIST_HEAD(futex_list, futex) futex_list; 81 struct sx futex_sx; /* this protects the LIST of futexes */ 82 83 #define FUTEX_LOCK sx_xlock(&futex_sx) 84 #define FUTEX_UNLOCK sx_xunlock(&futex_sx) 85 86 #define FUTEX_LOCKED 1 87 #define FUTEX_UNLOCKED 0 88 89 #define FUTEX_SYSTEM_LOCK mtx_lock(&Giant) 90 #define FUTEX_SYSTEM_UNLOCK mtx_unlock(&Giant) 91 92 static struct futex *futex_get(void *, int); 93 static void futex_put(struct futex *); 94 static int futex_sleep(struct futex *, struct thread *, unsigned long); 95 static int futex_wake(struct futex *, int, struct futex *, int); 96 static int futex_atomic_op(struct thread *td, int encoded_op, caddr_t uaddr); 97 98 /* support.s */ 99 int futex_xchgl(int oparg, caddr_t uaddr, int *oldval); 100 int futex_addl(int oparg, caddr_t uaddr, int *oldval); 101 int futex_orl(int oparg, caddr_t uaddr, int *oldval); 102 int futex_andl(int oparg, caddr_t uaddr, int *oldval); 103 int futex_xorl(int oparg, caddr_t uaddr, int *oldval); 104 105 int 106 linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args) 107 { 108 int val; 109 int ret; 110 struct l_timespec timeout = {0, 0}; 111 int error = 0; 112 struct futex *f; 113 struct futex *newf; 114 int timeout_hz; 115 struct timeval tv = {0, 0}; 116 struct futex *f2; 117 int op_ret; 118 119 #ifdef DEBUG 120 if (ldebug(sys_futex)) 121 printf(ARGS(futex, "%p, %i, %i, *, %p, %i"), args->uaddr, args->op, 122 args->val, args->uaddr2, args->val3); 123 #endif 124 125 /* 126 * Our implementation provides only privates futexes. Most of the apps 127 * should use private futexes but don't claim so. Therefore we treat 128 * all futexes as private by clearing the FUTEX_PRIVATE_FLAG. It works 129 * in most cases (ie. when futexes are not shared on file descriptor 130 * or between different processes.). 131 */ 132 args->op = (args->op & ~LINUX_FUTEX_PRIVATE_FLAG); 133 134 switch (args->op) { 135 case LINUX_FUTEX_WAIT: 136 FUTEX_SYSTEM_LOCK; 137 138 if ((error = copyin(args->uaddr, 139 &val, sizeof(val))) != 0) { 140 FUTEX_SYSTEM_UNLOCK; 141 return error; 142 } 143 144 if (val != args->val) { 145 FUTEX_SYSTEM_UNLOCK; 146 return EWOULDBLOCK; 147 } 148 149 if (args->timeout != NULL) { 150 if ((error = copyin(args->timeout, 151 &timeout, sizeof(timeout))) != 0) { 152 FUTEX_SYSTEM_UNLOCK; 153 return error; 154 } 155 } 156 157 #ifdef DEBUG 158 if (ldebug(sys_futex)) 159 printf("FUTEX_WAIT %d: val = %d, uaddr = %p, " 160 "*uaddr = %d, timeout = %d.%09lu\n", 161 td->td_proc->p_pid, args->val, 162 args->uaddr, val, timeout.tv_sec, 163 (unsigned long)timeout.tv_nsec); 164 #endif 165 tv.tv_usec = timeout.tv_sec * 1000000 + timeout.tv_nsec / 1000; 166 timeout_hz = tvtohz(&tv); 167 168 if (timeout.tv_sec == 0 && timeout.tv_nsec == 0) 169 timeout_hz = 0; 170 /* 171 * If the user process requests a non null timeout, 172 * make sure we do not turn it into an infinite 173 * timeout because timeout_hz gets null. 174 * 175 * We use a minimal timeout of 1/hz. Maybe it would 176 * make sense to just return ETIMEDOUT without sleeping. 177 */ 178 if (((timeout.tv_sec != 0) || (timeout.tv_nsec != 0)) && 179 (timeout_hz == 0)) 180 timeout_hz = 1; 181 182 183 f = futex_get(args->uaddr, FUTEX_UNLOCKED); 184 ret = futex_sleep(f, td, timeout_hz); 185 futex_put(f); 186 187 #ifdef DEBUG 188 if (ldebug(sys_futex)) 189 printf("FUTEX_WAIT %d: uaddr = %p, " 190 "ret = %d\n", td->td_proc->p_pid, args->uaddr, ret); 191 #endif 192 193 FUTEX_SYSTEM_UNLOCK; 194 switch (ret) { 195 case EWOULDBLOCK: /* timeout */ 196 return ETIMEDOUT; 197 break; 198 case EINTR: /* signal */ 199 return EINTR; 200 break; 201 case 0: /* FUTEX_WAKE received */ 202 #ifdef DEBUG 203 if (ldebug(sys_futex)) 204 printf("FUTEX_WAIT %d: uaddr = %p, " 205 "got FUTEX_WAKE\n", 206 td->td_proc->p_pid, args->uaddr); 207 #endif 208 return 0; 209 break; 210 default: 211 #ifdef DEBUG 212 if (ldebug(sys_futex)) 213 printf("FUTEX_WAIT: unexpected ret = %d\n", 214 ret); 215 #endif 216 break; 217 } 218 219 /* NOTREACHED */ 220 break; 221 222 case LINUX_FUTEX_WAKE: 223 FUTEX_SYSTEM_LOCK; 224 225 /* 226 * XXX: Linux is able to cope with different addresses 227 * corresponding to the same mapped memory in the sleeping 228 * and waker process(es). 229 */ 230 #ifdef DEBUG 231 if (ldebug(sys_futex)) 232 printf("FUTEX_WAKE %d: uaddr = %p, val = %d\n", 233 td->td_proc->p_pid, args->uaddr, args->val); 234 #endif 235 f = futex_get(args->uaddr, FUTEX_UNLOCKED); 236 td->td_retval[0] = futex_wake(f, args->val, NULL, 0); 237 futex_put(f); 238 239 FUTEX_SYSTEM_UNLOCK; 240 break; 241 242 case LINUX_FUTEX_CMP_REQUEUE: 243 FUTEX_SYSTEM_LOCK; 244 245 if ((error = copyin(args->uaddr, 246 &val, sizeof(val))) != 0) { 247 FUTEX_SYSTEM_UNLOCK; 248 return error; 249 } 250 251 if (val != args->val3) { 252 FUTEX_SYSTEM_UNLOCK; 253 return EAGAIN; 254 } 255 256 f = futex_get(args->uaddr, FUTEX_UNLOCKED); 257 newf = futex_get(args->uaddr2, FUTEX_UNLOCKED); 258 td->td_retval[0] = futex_wake(f, args->val, newf, 259 (int)(unsigned long)args->timeout); 260 futex_put(f); 261 futex_put(newf); 262 263 FUTEX_SYSTEM_UNLOCK; 264 break; 265 266 case LINUX_FUTEX_REQUEUE: 267 FUTEX_SYSTEM_LOCK; 268 269 f = futex_get(args->uaddr, FUTEX_UNLOCKED); 270 newf = futex_get(args->uaddr2, FUTEX_UNLOCKED); 271 td->td_retval[0] = futex_wake(f, args->val, newf, 272 (int)(unsigned long)args->timeout); 273 futex_put(f); 274 futex_put(newf); 275 276 FUTEX_SYSTEM_UNLOCK; 277 break; 278 279 case LINUX_FUTEX_FD: 280 #ifdef DEBUG 281 printf("linux_sys_futex: unimplemented op %d\n", 282 args->op); 283 #endif 284 return (ENOSYS); 285 286 case LINUX_FUTEX_WAKE_OP: 287 FUTEX_SYSTEM_LOCK; 288 #ifdef DEBUG 289 if (ldebug(sys_futex)) 290 printf("FUTEX_WAKE_OP: %d: uaddr = %p, op = %d, " 291 "val = %x, uaddr2 = %p, val3 = %x\n", 292 td->td_proc->p_pid, args->uaddr, args->op, 293 args->val, args->uaddr2, args->val3); 294 #endif 295 f = futex_get(args->uaddr, FUTEX_UNLOCKED); 296 f2 = futex_get(args->uaddr2, FUTEX_UNLOCKED); 297 298 /* 299 * This function returns positive number as results and 300 * negative as errors 301 */ 302 op_ret = futex_atomic_op(td, args->val3, args->uaddr2); 303 #ifdef DEBUG 304 if (ldebug(sys_futex)) 305 printf("futex_atomic_op ret %d\n", op_ret); 306 #endif 307 if (op_ret < 0) { 308 /* XXX: We don't handle the EFAULT yet. */ 309 if (op_ret != -EFAULT) { 310 futex_put(f); 311 futex_put(f2); 312 FUTEX_SYSTEM_UNLOCK; 313 return (-op_ret); 314 } 315 316 futex_put(f); 317 futex_put(f2); 318 319 FUTEX_SYSTEM_UNLOCK; 320 return (EFAULT); 321 } 322 323 ret = futex_wake(f, args->val, NULL, 0); 324 futex_put(f); 325 if (op_ret > 0) { 326 op_ret = 0; 327 /* 328 * Linux abuses the address of the timespec parameter 329 * as the number of retries. 330 */ 331 op_ret += futex_wake(f2, 332 (int)(unsigned long)args->timeout, NULL, 0); 333 ret += op_ret; 334 } 335 futex_put(f2); 336 td->td_retval[0] = ret; 337 338 FUTEX_SYSTEM_UNLOCK; 339 break; 340 341 case LINUX_FUTEX_LOCK_PI: 342 /* not yet implemented */ 343 return (ENOSYS); 344 345 case LINUX_FUTEX_UNLOCK_PI: 346 /* not yet implemented */ 347 return (ENOSYS); 348 349 case LINUX_FUTEX_TRYLOCK_PI: 350 /* not yet implemented */ 351 return (ENOSYS); 352 353 default: 354 printf("linux_sys_futex: unknown op %d\n", 355 args->op); 356 return (ENOSYS); 357 } 358 return (0); 359 } 360 361 static struct futex * 362 futex_get(void *uaddr, int locked) 363 { 364 struct futex *f; 365 366 if (locked == FUTEX_UNLOCKED) 367 FUTEX_LOCK; 368 LIST_FOREACH(f, &futex_list, f_list) { 369 if (f->f_uaddr == uaddr) { 370 f->f_refcount++; 371 if (locked == FUTEX_UNLOCKED) 372 FUTEX_UNLOCK; 373 return f; 374 } 375 } 376 377 f = malloc(sizeof(*f), M_LINUX, M_WAITOK); 378 f->f_uaddr = uaddr; 379 f->f_refcount = 1; 380 TAILQ_INIT(&f->f_waiting_proc); 381 LIST_INSERT_HEAD(&futex_list, f, f_list); 382 if (locked == FUTEX_UNLOCKED) 383 FUTEX_UNLOCK; 384 385 return f; 386 } 387 388 static void 389 futex_put(f) 390 struct futex *f; 391 { 392 FUTEX_LOCK; 393 f->f_refcount--; 394 if (f->f_refcount == 0) { 395 LIST_REMOVE(f, f_list); 396 free(f, M_LINUX); 397 } 398 FUTEX_UNLOCK; 399 400 return; 401 } 402 403 static int 404 futex_sleep(struct futex *f, struct thread *td, unsigned long timeout) 405 { 406 struct waiting_proc *wp; 407 int ret; 408 409 wp = malloc(sizeof(*wp), M_LINUX, M_WAITOK); 410 wp->wp_t = td; 411 wp->wp_new_futex = NULL; 412 FUTEX_LOCK; 413 TAILQ_INSERT_TAIL(&f->f_waiting_proc, wp, wp_list); 414 FUTEX_UNLOCK; 415 416 #ifdef DEBUG 417 if (ldebug(sys_futex)) 418 printf("FUTEX --> %d tlseep timeout = %ld\n", 419 td->td_proc->p_pid, timeout); 420 #endif 421 ret = tsleep(wp, PCATCH | PZERO, "linuxfutex", timeout); 422 #ifdef DEBUG 423 if (ldebug(sys_futex)) 424 printf("FUTEX -> %d tsleep returns %d\n", 425 td->td_proc->p_pid, ret); 426 #endif 427 428 FUTEX_LOCK; 429 TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); 430 FUTEX_UNLOCK; 431 432 /* if we got woken up in futex_wake */ 433 if ((ret == 0) && (wp->wp_new_futex != NULL)) { 434 /* suspend us on the new futex */ 435 ret = futex_sleep(wp->wp_new_futex, td, timeout); 436 /* and release the old one */ 437 futex_put(wp->wp_new_futex); 438 } 439 440 free(wp, M_LINUX); 441 442 return ret; 443 } 444 445 static int 446 futex_wake(struct futex *f, int n, struct futex *newf, int n2) 447 { 448 struct waiting_proc *wp; 449 int count; 450 451 /* 452 * Linux is very strange it wakes up N threads for 453 * all operations BUT requeue ones where its N+1 454 * mimic this. 455 */ 456 count = newf ? 0 : 1; 457 458 FUTEX_LOCK; 459 TAILQ_FOREACH(wp, &f->f_waiting_proc, wp_list) { 460 if (count <= n) { 461 wakeup_one(wp); 462 count++; 463 } else { 464 if (newf != NULL) { 465 /* futex_put called after tsleep */ 466 wp->wp_new_futex = futex_get(newf->f_uaddr, 467 FUTEX_LOCKED); 468 wakeup_one(wp); 469 if (count - n >= n2) 470 break; 471 } 472 } 473 } 474 FUTEX_UNLOCK; 475 476 return count; 477 } 478 479 static int 480 futex_atomic_op(struct thread *td, int encoded_op, caddr_t uaddr) 481 { 482 int op = (encoded_op >> 28) & 7; 483 int cmp = (encoded_op >> 24) & 15; 484 int oparg = (encoded_op << 8) >> 20; 485 int cmparg = (encoded_op << 20) >> 20; 486 int oldval = 0, ret; 487 488 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 489 oparg = 1 << oparg; 490 491 #ifdef DEBUG 492 if (ldebug(sys_futex)) 493 printf("futex_atomic_op: op = %d, cmp = %d, oparg = %x, " 494 "cmparg = %x, uaddr = %p\n", 495 op, cmp, oparg, cmparg, uaddr); 496 #endif 497 /* XXX: linux verifies access here and returns EFAULT */ 498 499 switch (op) { 500 case FUTEX_OP_SET: 501 ret = futex_xchgl(oparg, uaddr, &oldval); 502 break; 503 case FUTEX_OP_ADD: 504 ret = futex_addl(oparg, uaddr, &oldval); 505 break; 506 case FUTEX_OP_OR: 507 ret = futex_orl(oparg, uaddr, &oldval); 508 break; 509 case FUTEX_OP_ANDN: 510 ret = futex_andl(~oparg, uaddr, &oldval); 511 break; 512 case FUTEX_OP_XOR: 513 ret = futex_xorl(oparg, uaddr, &oldval); 514 break; 515 default: 516 ret = -ENOSYS; 517 break; 518 } 519 520 if (ret) 521 return (ret); 522 523 switch (cmp) { 524 case FUTEX_OP_CMP_EQ: 525 return (oldval == cmparg); 526 case FUTEX_OP_CMP_NE: 527 return (oldval != cmparg); 528 case FUTEX_OP_CMP_LT: 529 return (oldval < cmparg); 530 case FUTEX_OP_CMP_GE: 531 return (oldval >= cmparg); 532 case FUTEX_OP_CMP_LE: 533 return (oldval <= cmparg); 534 case FUTEX_OP_CMP_GT: 535 return (oldval > cmparg); 536 default: 537 return (-ENOSYS); 538 } 539 } 540 541 int 542 linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args) 543 { 544 struct linux_emuldata *em; 545 546 #ifdef DEBUG 547 if (ldebug(set_robust_list)) 548 printf(ARGS(set_robust_list, "")); 549 #endif 550 if (args->len != sizeof(struct linux_robust_list_head)) 551 return (EINVAL); 552 553 em = em_find(td->td_proc, EMUL_DOLOCK); 554 em->robust_futexes = args->head; 555 EMUL_UNLOCK(&emul_lock); 556 557 return (0); 558 } 559 560 int 561 linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args) 562 { 563 struct linux_emuldata *em; 564 struct linux_robust_list_head *head; 565 l_size_t len = sizeof(struct linux_robust_list_head); 566 int error = 0; 567 568 #ifdef DEBUG 569 if (ldebug(get_robust_list)) 570 printf(ARGS(get_robust_list, "")); 571 #endif 572 573 if (!args->pid) { 574 em = em_find(td->td_proc, EMUL_DONTLOCK); 575 head = em->robust_futexes; 576 } else { 577 struct proc *p; 578 579 p = pfind(args->pid); 580 if (p == NULL) 581 return (ESRCH); 582 583 em = em_find(p, EMUL_DONTLOCK); 584 /* XXX: ptrace? */ 585 if (priv_check(td, PRIV_CRED_SETUID) || 586 priv_check(td, PRIV_CRED_SETEUID) || 587 p_candebug(td, p)) 588 return (EPERM); 589 head = em->robust_futexes; 590 591 PROC_UNLOCK(p); 592 } 593 594 error = copyout(&len, args->len, sizeof(l_size_t)); 595 if (error) 596 return (EFAULT); 597 598 error = copyout(head, args->head, sizeof(struct linux_robust_list_head)); 599 600 return (error); 601 } 602 603 static int 604 handle_futex_death(void *uaddr, pid_t pid, int pi) 605 { 606 int uval, nval, mval; 607 struct futex *f; 608 609 retry: 610 if (copyin(uaddr, &uval, 4)) 611 return (EFAULT); 612 613 if ((uval & FUTEX_TID_MASK) == pid) { 614 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 615 nval = casuword32(uaddr, uval, mval); 616 617 if (nval == -1) 618 return (EFAULT); 619 620 if (nval != uval) 621 goto retry; 622 623 if (!pi && (uval & FUTEX_WAITERS)) { 624 f = futex_get(uaddr, FUTEX_UNLOCKED); 625 futex_wake(f, 1, NULL, 0); 626 } 627 } 628 629 return (0); 630 } 631 632 static int 633 fetch_robust_entry(struct linux_robust_list **entry, 634 struct linux_robust_list **head, int *pi) 635 { 636 l_ulong uentry; 637 638 if (copyin((const void *)head, &uentry, sizeof(l_ulong))) 639 return (EFAULT); 640 641 *entry = (void *)(uentry & ~1UL); 642 *pi = uentry & 1; 643 644 return (0); 645 } 646 647 /* This walks the list of robust futexes releasing them. */ 648 void 649 release_futexes(struct proc *p) 650 { 651 struct linux_robust_list_head *head = NULL; 652 struct linux_robust_list *entry, *next_entry, *pending; 653 unsigned int limit = 2048, pi, next_pi, pip; 654 struct linux_emuldata *em; 655 l_long futex_offset; 656 int rc; 657 658 em = em_find(p, EMUL_DONTLOCK); 659 head = em->robust_futexes; 660 661 if (head == NULL) 662 return; 663 664 if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi)) 665 return; 666 667 if (copyin(&head->futex_offset, &futex_offset, sizeof(futex_offset))) 668 return; 669 670 if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip)) 671 return; 672 673 while (entry != &head->list) { 674 rc = fetch_robust_entry(&next_entry, PTRIN(&entry->next), &next_pi); 675 676 if (entry != pending) 677 if (handle_futex_death((char *)entry + futex_offset, 678 p->p_pid, pi)) 679 return; 680 681 if (rc) 682 return; 683 684 entry = next_entry; 685 pi = next_pi; 686 687 if (!--limit) 688 break; 689 690 sched_relinquish(curthread); 691 } 692 693 if (pending) 694 handle_futex_death((char *) pending + futex_offset, 695 p->p_pid, pip); 696 } 697