1 /* $NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */ 2 3 /*- 4 * Copyright (c) 2005 Emmanuel Dreyfus, all rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 3. All advertising materials mentioning features or use of this software 15 * must display the following acknowledgement: 16 * This product includes software developed by Emmanuel Dreyfus 17 * 4. The name of the author may not be used to endorse or promote 18 * products derived from this software without specific prior written 19 * permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS'' 22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 23 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS 25 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include <sys/cdefs.h> 35 __FBSDID("$FreeBSD$"); 36 #if 0 37 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $"); 38 #endif 39 40 #include "opt_compat.h" 41 42 #include <sys/param.h> 43 #include <sys/systm.h> 44 #include <sys/imgact.h> 45 #include <sys/kernel.h> 46 #include <sys/lock.h> 47 #include <sys/malloc.h> 48 #include <sys/mutex.h> 49 #include <sys/priv.h> 50 #include <sys/proc.h> 51 #include <sys/queue.h> 52 #include <sys/sched.h> 53 #include <sys/sx.h> 54 55 #ifdef COMPAT_LINUX32 56 #include <machine/../linux32/linux.h> 57 #include <machine/../linux32/linux32_proto.h> 58 #else 59 #include <machine/../linux/linux.h> 60 #include <machine/../linux/linux_proto.h> 61 #endif 62 #include <compat/linux/linux_futex.h> 63 #include <compat/linux/linux_emul.h> 64 65 struct futex; 66 67 struct waiting_proc { 68 struct thread *wp_t; 69 struct futex *wp_new_futex; 70 TAILQ_ENTRY(waiting_proc) wp_list; 71 }; 72 struct futex { 73 void *f_uaddr; 74 int f_refcount; 75 LIST_ENTRY(futex) f_list; 76 TAILQ_HEAD(lf_waiting_proc, waiting_proc) f_waiting_proc; 77 }; 78 79 LIST_HEAD(futex_list, futex) futex_list; 80 struct sx futex_sx; /* this protects the LIST of futexes */ 81 82 #define FUTEX_LOCK sx_xlock(&futex_sx) 83 #define FUTEX_UNLOCK sx_xunlock(&futex_sx) 84 85 #define FUTEX_LOCKED 1 86 #define FUTEX_UNLOCKED 0 87 88 #define FUTEX_SYSTEM_LOCK mtx_lock(&Giant) 89 #define FUTEX_SYSTEM_UNLOCK mtx_unlock(&Giant) 90 91 static struct futex *futex_get(void *, int); 92 static void futex_put(struct futex *); 93 static int futex_sleep(struct futex *, struct thread *, unsigned long); 94 static int futex_wake(struct futex *, int, struct futex *, int); 95 static int futex_atomic_op(struct thread *td, int encoded_op, caddr_t uaddr); 96 97 /* support.s */ 98 int futex_xchgl(int oparg, caddr_t uaddr, int *oldval); 99 int futex_addl(int oparg, caddr_t uaddr, int *oldval); 100 int futex_orl(int oparg, caddr_t uaddr, int *oldval); 101 int futex_andl(int oparg, caddr_t uaddr, int *oldval); 102 int futex_xorl(int oparg, caddr_t uaddr, int *oldval); 103 104 int 105 linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args) 106 { 107 int val; 108 int ret; 109 struct l_timespec timeout = {0, 0}; 110 int error = 0; 111 struct futex *f; 112 struct futex *newf; 113 int timeout_hz; 114 struct timeval tv = {0, 0}; 115 struct futex *f2; 116 int op_ret; 117 118 #ifdef DEBUG 119 if (ldebug(sys_futex)) 120 printf(ARGS(futex, "%p, %i, %i, *, %p, %i"), args->uaddr, args->op, 121 args->val, args->uaddr2, args->val3); 122 #endif 123 124 /* 125 * Our implementation provides only privates futexes. Most of the apps 126 * should use private futexes but don't claim so. Therefore we treat 127 * all futexes as private by clearing the FUTEX_PRIVATE_FLAG. It works 128 * in most cases (ie. when futexes are not shared on file descriptor 129 * or between different processes.). 130 */ 131 args->op = (args->op & ~LINUX_FUTEX_PRIVATE_FLAG); 132 133 switch (args->op) { 134 case LINUX_FUTEX_WAIT: 135 FUTEX_SYSTEM_LOCK; 136 137 if ((error = copyin(args->uaddr, 138 &val, sizeof(val))) != 0) { 139 FUTEX_SYSTEM_UNLOCK; 140 return error; 141 } 142 143 if (val != args->val) { 144 FUTEX_SYSTEM_UNLOCK; 145 return EWOULDBLOCK; 146 } 147 148 if (args->timeout != NULL) { 149 if ((error = copyin(args->timeout, 150 &timeout, sizeof(timeout))) != 0) { 151 FUTEX_SYSTEM_UNLOCK; 152 return error; 153 } 154 } 155 156 #ifdef DEBUG 157 if (ldebug(sys_futex)) 158 printf("FUTEX_WAIT %d: val = %d, uaddr = %p, " 159 "*uaddr = %d, timeout = %d.%09lu\n", 160 td->td_proc->p_pid, args->val, 161 args->uaddr, val, timeout.tv_sec, 162 (unsigned long)timeout.tv_nsec); 163 #endif 164 tv.tv_usec = timeout.tv_sec * 1000000 + timeout.tv_nsec / 1000; 165 timeout_hz = tvtohz(&tv); 166 167 if (timeout.tv_sec == 0 && timeout.tv_nsec == 0) 168 timeout_hz = 0; 169 /* 170 * If the user process requests a non null timeout, 171 * make sure we do not turn it into an infinite 172 * timeout because timeout_hz gets null. 173 * 174 * We use a minimal timeout of 1/hz. Maybe it would 175 * make sense to just return ETIMEDOUT without sleeping. 176 */ 177 if (((timeout.tv_sec != 0) || (timeout.tv_nsec != 0)) && 178 (timeout_hz == 0)) 179 timeout_hz = 1; 180 181 182 f = futex_get(args->uaddr, FUTEX_UNLOCKED); 183 ret = futex_sleep(f, td, timeout_hz); 184 futex_put(f); 185 186 #ifdef DEBUG 187 if (ldebug(sys_futex)) 188 printf("FUTEX_WAIT %d: uaddr = %p, " 189 "ret = %d\n", td->td_proc->p_pid, args->uaddr, ret); 190 #endif 191 192 FUTEX_SYSTEM_UNLOCK; 193 switch (ret) { 194 case EWOULDBLOCK: /* timeout */ 195 return ETIMEDOUT; 196 break; 197 case EINTR: /* signal */ 198 return EINTR; 199 break; 200 case 0: /* FUTEX_WAKE received */ 201 #ifdef DEBUG 202 if (ldebug(sys_futex)) 203 printf("FUTEX_WAIT %d: uaddr = %p, " 204 "got FUTEX_WAKE\n", 205 td->td_proc->p_pid, args->uaddr); 206 #endif 207 return 0; 208 break; 209 default: 210 #ifdef DEBUG 211 if (ldebug(sys_futex)) 212 printf("FUTEX_WAIT: unexpected ret = %d\n", 213 ret); 214 #endif 215 break; 216 } 217 218 /* NOTREACHED */ 219 break; 220 221 case LINUX_FUTEX_WAKE: 222 FUTEX_SYSTEM_LOCK; 223 224 /* 225 * XXX: Linux is able to cope with different addresses 226 * corresponding to the same mapped memory in the sleeping 227 * and waker process(es). 228 */ 229 #ifdef DEBUG 230 if (ldebug(sys_futex)) 231 printf("FUTEX_WAKE %d: uaddr = %p, val = %d\n", 232 td->td_proc->p_pid, args->uaddr, args->val); 233 #endif 234 f = futex_get(args->uaddr, FUTEX_UNLOCKED); 235 td->td_retval[0] = futex_wake(f, args->val, NULL, 0); 236 futex_put(f); 237 238 FUTEX_SYSTEM_UNLOCK; 239 break; 240 241 case LINUX_FUTEX_CMP_REQUEUE: 242 FUTEX_SYSTEM_LOCK; 243 244 if ((error = copyin(args->uaddr, 245 &val, sizeof(val))) != 0) { 246 FUTEX_SYSTEM_UNLOCK; 247 return error; 248 } 249 250 if (val != args->val3) { 251 FUTEX_SYSTEM_UNLOCK; 252 return EAGAIN; 253 } 254 255 f = futex_get(args->uaddr, FUTEX_UNLOCKED); 256 newf = futex_get(args->uaddr2, FUTEX_UNLOCKED); 257 td->td_retval[0] = futex_wake(f, args->val, newf, 258 (int)(unsigned long)args->timeout); 259 futex_put(f); 260 futex_put(newf); 261 262 FUTEX_SYSTEM_UNLOCK; 263 break; 264 265 case LINUX_FUTEX_REQUEUE: 266 FUTEX_SYSTEM_LOCK; 267 268 f = futex_get(args->uaddr, FUTEX_UNLOCKED); 269 newf = futex_get(args->uaddr2, FUTEX_UNLOCKED); 270 td->td_retval[0] = futex_wake(f, args->val, newf, 271 (int)(unsigned long)args->timeout); 272 futex_put(f); 273 futex_put(newf); 274 275 FUTEX_SYSTEM_UNLOCK; 276 break; 277 278 case LINUX_FUTEX_WAKE_OP: 279 FUTEX_SYSTEM_LOCK; 280 #ifdef DEBUG 281 if (ldebug(sys_futex)) 282 printf("FUTEX_WAKE_OP: %d: uaddr = %p, op = %d, " 283 "val = %x, uaddr2 = %p, val3 = %x\n", 284 td->td_proc->p_pid, args->uaddr, args->op, 285 args->val, args->uaddr2, args->val3); 286 #endif 287 f = futex_get(args->uaddr, FUTEX_UNLOCKED); 288 f2 = futex_get(args->uaddr2, FUTEX_UNLOCKED); 289 290 /* 291 * This function returns positive number as results and 292 * negative as errors 293 */ 294 op_ret = futex_atomic_op(td, args->val3, args->uaddr2); 295 #ifdef DEBUG 296 if (ldebug(sys_futex)) 297 printf("futex_atomic_op ret %d\n", op_ret); 298 #endif 299 if (op_ret < 0) { 300 /* XXX: We don't handle the EFAULT yet. */ 301 if (op_ret != -EFAULT) { 302 futex_put(f); 303 futex_put(f2); 304 FUTEX_SYSTEM_UNLOCK; 305 return (-op_ret); 306 } 307 308 futex_put(f); 309 futex_put(f2); 310 311 FUTEX_SYSTEM_UNLOCK; 312 return (EFAULT); 313 } 314 315 ret = futex_wake(f, args->val, NULL, 0); 316 futex_put(f); 317 if (op_ret > 0) { 318 op_ret = 0; 319 /* 320 * Linux abuses the address of the timespec parameter 321 * as the number of retries. 322 */ 323 op_ret += futex_wake(f2, 324 (int)(unsigned long)args->timeout, NULL, 0); 325 ret += op_ret; 326 } 327 futex_put(f2); 328 td->td_retval[0] = ret; 329 330 FUTEX_SYSTEM_UNLOCK; 331 break; 332 333 case LINUX_FUTEX_LOCK_PI: 334 /* not yet implemented */ 335 return (ENOSYS); 336 337 case LINUX_FUTEX_UNLOCK_PI: 338 /* not yet implemented */ 339 return (ENOSYS); 340 341 case LINUX_FUTEX_TRYLOCK_PI: 342 /* not yet implemented */ 343 return (ENOSYS); 344 345 default: 346 printf("linux_sys_futex: unknown op %d\n", 347 args->op); 348 return (ENOSYS); 349 } 350 return (0); 351 } 352 353 static struct futex * 354 futex_get(void *uaddr, int locked) 355 { 356 struct futex *f; 357 358 if (locked == FUTEX_UNLOCKED) 359 FUTEX_LOCK; 360 LIST_FOREACH(f, &futex_list, f_list) { 361 if (f->f_uaddr == uaddr) { 362 f->f_refcount++; 363 if (locked == FUTEX_UNLOCKED) 364 FUTEX_UNLOCK; 365 return f; 366 } 367 } 368 369 f = malloc(sizeof(*f), M_LINUX, M_WAITOK); 370 f->f_uaddr = uaddr; 371 f->f_refcount = 1; 372 TAILQ_INIT(&f->f_waiting_proc); 373 LIST_INSERT_HEAD(&futex_list, f, f_list); 374 if (locked == FUTEX_UNLOCKED) 375 FUTEX_UNLOCK; 376 377 return f; 378 } 379 380 static void 381 futex_put(f) 382 struct futex *f; 383 { 384 FUTEX_LOCK; 385 f->f_refcount--; 386 if (f->f_refcount == 0) { 387 LIST_REMOVE(f, f_list); 388 free(f, M_LINUX); 389 } 390 FUTEX_UNLOCK; 391 392 return; 393 } 394 395 static int 396 futex_sleep(struct futex *f, struct thread *td, unsigned long timeout) 397 { 398 struct waiting_proc *wp; 399 int ret; 400 401 wp = malloc(sizeof(*wp), M_LINUX, M_WAITOK); 402 wp->wp_t = td; 403 wp->wp_new_futex = NULL; 404 FUTEX_LOCK; 405 TAILQ_INSERT_TAIL(&f->f_waiting_proc, wp, wp_list); 406 FUTEX_UNLOCK; 407 408 #ifdef DEBUG 409 if (ldebug(sys_futex)) 410 printf("FUTEX --> %d tlseep timeout = %ld\n", 411 td->td_proc->p_pid, timeout); 412 #endif 413 ret = tsleep(wp, PCATCH | PZERO, "linuxfutex", timeout); 414 #ifdef DEBUG 415 if (ldebug(sys_futex)) 416 printf("FUTEX -> %d tsleep returns %d\n", 417 td->td_proc->p_pid, ret); 418 #endif 419 420 FUTEX_LOCK; 421 TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); 422 FUTEX_UNLOCK; 423 424 /* if we got woken up in futex_wake */ 425 if ((ret == 0) && (wp->wp_new_futex != NULL)) { 426 /* suspend us on the new futex */ 427 ret = futex_sleep(wp->wp_new_futex, td, timeout); 428 /* and release the old one */ 429 futex_put(wp->wp_new_futex); 430 } 431 432 free(wp, M_LINUX); 433 434 return ret; 435 } 436 437 static int 438 futex_wake(struct futex *f, int n, struct futex *newf, int n2) 439 { 440 struct waiting_proc *wp; 441 int count; 442 443 /* 444 * Linux is very strange it wakes up N threads for 445 * all operations BUT requeue ones where its N+1 446 * mimic this. 447 */ 448 count = newf ? 0 : 1; 449 450 FUTEX_LOCK; 451 TAILQ_FOREACH(wp, &f->f_waiting_proc, wp_list) { 452 if (count <= n) { 453 wakeup_one(wp); 454 count++; 455 } else { 456 if (newf != NULL) { 457 /* futex_put called after tsleep */ 458 wp->wp_new_futex = futex_get(newf->f_uaddr, 459 FUTEX_LOCKED); 460 wakeup_one(wp); 461 if (count - n >= n2) 462 break; 463 } 464 } 465 } 466 FUTEX_UNLOCK; 467 468 return count; 469 } 470 471 static int 472 futex_atomic_op(struct thread *td, int encoded_op, caddr_t uaddr) 473 { 474 int op = (encoded_op >> 28) & 7; 475 int cmp = (encoded_op >> 24) & 15; 476 int oparg = (encoded_op << 8) >> 20; 477 int cmparg = (encoded_op << 20) >> 20; 478 int oldval = 0, ret; 479 480 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 481 oparg = 1 << oparg; 482 483 #ifdef DEBUG 484 if (ldebug(sys_futex)) 485 printf("futex_atomic_op: op = %d, cmp = %d, oparg = %x, " 486 "cmparg = %x, uaddr = %p\n", 487 op, cmp, oparg, cmparg, uaddr); 488 #endif 489 /* XXX: linux verifies access here and returns EFAULT */ 490 491 switch (op) { 492 case FUTEX_OP_SET: 493 ret = futex_xchgl(oparg, uaddr, &oldval); 494 break; 495 case FUTEX_OP_ADD: 496 ret = futex_addl(oparg, uaddr, &oldval); 497 break; 498 case FUTEX_OP_OR: 499 ret = futex_orl(oparg, uaddr, &oldval); 500 break; 501 case FUTEX_OP_ANDN: 502 ret = futex_andl(~oparg, uaddr, &oldval); 503 break; 504 case FUTEX_OP_XOR: 505 ret = futex_xorl(oparg, uaddr, &oldval); 506 break; 507 default: 508 ret = -ENOSYS; 509 break; 510 } 511 512 if (ret) 513 return (ret); 514 515 switch (cmp) { 516 case FUTEX_OP_CMP_EQ: 517 return (oldval == cmparg); 518 case FUTEX_OP_CMP_NE: 519 return (oldval != cmparg); 520 case FUTEX_OP_CMP_LT: 521 return (oldval < cmparg); 522 case FUTEX_OP_CMP_GE: 523 return (oldval >= cmparg); 524 case FUTEX_OP_CMP_LE: 525 return (oldval <= cmparg); 526 case FUTEX_OP_CMP_GT: 527 return (oldval > cmparg); 528 default: 529 return (-ENOSYS); 530 } 531 } 532 533 int 534 linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args) 535 { 536 struct linux_emuldata *em; 537 538 #ifdef DEBUG 539 if (ldebug(set_robust_list)) 540 printf(ARGS(set_robust_list, "")); 541 #endif 542 if (args->len != sizeof(struct linux_robust_list_head)) 543 return (EINVAL); 544 545 em = em_find(td->td_proc, EMUL_DOLOCK); 546 em->robust_futexes = args->head; 547 EMUL_UNLOCK(&emul_lock); 548 549 return (0); 550 } 551 552 int 553 linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args) 554 { 555 struct linux_emuldata *em; 556 struct linux_robust_list_head *head; 557 l_size_t len = sizeof(struct linux_robust_list_head); 558 int error = 0; 559 560 #ifdef DEBUG 561 if (ldebug(get_robust_list)) 562 printf(ARGS(get_robust_list, "")); 563 #endif 564 565 if (!args->pid) { 566 em = em_find(td->td_proc, EMUL_DONTLOCK); 567 head = em->robust_futexes; 568 } else { 569 struct proc *p; 570 571 p = pfind(args->pid); 572 if (p == NULL) 573 return (ESRCH); 574 575 em = em_find(p, EMUL_DONTLOCK); 576 /* XXX: ptrace? */ 577 if (priv_check(td, PRIV_CRED_SETUID) || 578 priv_check(td, PRIV_CRED_SETEUID) || 579 p_candebug(td, p)) 580 return (EPERM); 581 head = em->robust_futexes; 582 583 PROC_UNLOCK(p); 584 } 585 586 error = copyout(&len, args->len, sizeof(l_size_t)); 587 if (error) 588 return (EFAULT); 589 590 error = copyout(head, args->head, sizeof(struct linux_robust_list_head)); 591 592 return (error); 593 } 594 595 static int 596 handle_futex_death(void *uaddr, pid_t pid, int pi) 597 { 598 int uval, nval, mval; 599 struct futex *f; 600 601 retry: 602 if (copyin(uaddr, &uval, 4)) 603 return (EFAULT); 604 605 if ((uval & FUTEX_TID_MASK) == pid) { 606 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 607 nval = casuword32(uaddr, uval, mval); 608 609 if (nval == -1) 610 return (EFAULT); 611 612 if (nval != uval) 613 goto retry; 614 615 if (!pi && (uval & FUTEX_WAITERS)) { 616 f = futex_get(uaddr, FUTEX_UNLOCKED); 617 futex_wake(f, 1, NULL, 0); 618 } 619 } 620 621 return (0); 622 } 623 624 static int 625 fetch_robust_entry(struct linux_robust_list **entry, 626 struct linux_robust_list **head, int *pi) 627 { 628 l_ulong uentry; 629 630 if (copyin((const void *)head, &uentry, sizeof(l_ulong))) 631 return (EFAULT); 632 633 *entry = (void *)(uentry & ~1UL); 634 *pi = uentry & 1; 635 636 return (0); 637 } 638 639 /* This walks the list of robust futexes releasing them. */ 640 void 641 release_futexes(struct proc *p) 642 { 643 struct linux_robust_list_head *head = NULL; 644 struct linux_robust_list *entry, *next_entry, *pending; 645 unsigned int limit = 2048, pi, next_pi, pip; 646 struct linux_emuldata *em; 647 l_long futex_offset; 648 int rc; 649 650 em = em_find(p, EMUL_DONTLOCK); 651 head = em->robust_futexes; 652 653 if (head == NULL) 654 return; 655 656 if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi)) 657 return; 658 659 if (copyin(&head->futex_offset, &futex_offset, sizeof(futex_offset))) 660 return; 661 662 if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip)) 663 return; 664 665 while (entry != &head->list) { 666 rc = fetch_robust_entry(&next_entry, PTRIN(&entry->next), &next_pi); 667 668 if (entry != pending) 669 if (handle_futex_death((char *)entry + futex_offset, 670 p->p_pid, pi)) 671 return; 672 673 if (rc) 674 return; 675 676 entry = next_entry; 677 pi = next_pi; 678 679 if (!--limit) 680 break; 681 682 sched_relinquish(curthread); 683 } 684 685 if (pending) 686 handle_futex_death((char *) pending + futex_offset, 687 p->p_pid, pip); 688 } 689