/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

/*
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
 * the receiving process can copy it directly from the pages in the sending
 * process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned back to the user-mode side.  In that case, the pipe code
 * arranges to copy the buffer supplied by the user process, to a pageable
 * kernel buffer, and the receiving process will grab the data from the
 * pageable kernel buffer.  Since signals don't happen all that often,
 * the copy operation is normally eliminated.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.
 *
 * In order to limit the resource use of pipes, two sysctls exist:
 *
 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
 * address space available to us in pipe_map.  Whenever the amount in use
 * exceeds half of this value, all new pipes will be created with size
 * SMALL_PIPE_SIZE, rather than PIPE_SIZE.  Big pipe creation will be limited
 * as well.  This value is loader tunable only.
 *
 * kern.ipc.maxpipekvawired - This value limits the amount of memory that may
 * be wired in order to facilitate direct copies using page flipping.
 * Whenever this value is exceeded, pipes will fall back to using regular
 * copies.  This value is sysctl controllable at all times.
 *
 * These values are autotuned in subr_param.c.
 *
 * Memory usage may be monitored through the sysctls
 * kern.ipc.pipes, kern.ipc.pipekva and kern.ipc.pipekvawired.
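 *
 * For example, a write(2) of a few hundred bytes is always staged through
 * the kernel buffer, while a blocking write of a large single-segment
 * buffer may instead have its user pages wired and mapped so that the
 * reader copies straight out of them, falling back to the buffered path
 * when the writer is non-blocking or kern.ipc.maxpipekvawired would be
 * exceeded.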
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/mutex.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/event.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/uma.h>

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * NetBSD or OpenBSD.
 */
/* #define PIPE_NODIRECT */

/*
 * interfaces to the outside world
 */
static fo_rdwr_t	pipe_read;
static fo_rdwr_t	pipe_write;
static fo_ioctl_t	pipe_ioctl;
static fo_poll_t	pipe_poll;
static fo_kqfilter_t	pipe_kqfilter;
static fo_stat_t	pipe_stat;
static fo_close_t	pipe_close;

static struct fileops pipeops = {
	.fo_read = pipe_read,
	.fo_write = pipe_write,
	.fo_ioctl = pipe_ioctl,
	.fo_poll = pipe_poll,
	.fo_kqfilter = pipe_kqfilter,
	.fo_stat = pipe_stat,
	.fo_close = pipe_close,
	.fo_flags = DFLAG_PASSABLE
};

static void	filt_pipedetach(struct knote *kn);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
static struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };

/*
 * Default pipe buffer size(s), this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
static int nbigpipe;

static int amountpipes;
static int amountpipekva;
static int amountpipekvawired;

SYSCTL_DECL(_kern_ipc);

SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
	   &maxpipekva, 0, "Pipe KVA limit");
SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW,
	   &maxpipekvawired, 0, "Pipe KVA wired limit");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
	   &amountpipes, 0, "Current # of pipes");
SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD,
	   &nbigpipe, 0, "Current # of big pipes");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
	   &amountpipekva, 0, "Pipe KVA usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD,
	   &amountpipekvawired, 0, "Pipe wired KVA usage");

static void pipeinit(void *dummy __unused);
static void pipeclose(struct pipe *cpipe);
static void pipe_free_kmem(struct pipe *cpipe);
static int pipe_create(struct pipe *pipe);
static __inline int pipelock(struct pipe *cpipe, int catch);
static __inline void pipeunlock(struct pipe *cpipe);
static __inline void pipeselwakeup(struct pipe *cpipe);
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
static void pipe_destroy_write_buffer(struct pipe *wpipe);
static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
static void pipe_clone_write_buffer(struct pipe *wpipe);
#endif
static int pipespace(struct pipe *cpipe, int size);

static void	pipe_zone_ctor(void *mem, int size, void *arg);
static void	pipe_zone_dtor(void *mem, int size, void *arg);
static void	pipe_zone_init(void *mem, int size);
static void	pipe_zone_fini(void *mem, int size);

static uma_zone_t pipe_zone;

SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);

static void
pipeinit(void *dummy __unused)
{

	pipe_zone = uma_zcreate("PIPE", sizeof(struct pipepair),
	    pipe_zone_ctor, pipe_zone_dtor, pipe_zone_init, pipe_zone_fini,
	    UMA_ALIGN_PTR, 0);
	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
}

static void
pipe_zone_ctor(void *mem, int size, void *arg)
{
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;

	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));

	pp = (struct pipepair *)mem;

	/*
	 * We zero both pipe endpoints to make sure all the kmem pointers
	 * are NULL, flag fields are zero'd, etc.  We timestamp both
	 * endpoints with the same time.
	 */
	rpipe = &pp->pp_rpipe;
	bzero(rpipe, sizeof(*rpipe));
	vfs_timestamp(&rpipe->pipe_ctime);
	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;

	wpipe = &pp->pp_wpipe;
	bzero(wpipe, sizeof(*wpipe));
	wpipe->pipe_ctime = rpipe->pipe_ctime;
	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;

	rpipe->pipe_peer = wpipe;
	rpipe->pipe_pair = pp;
	wpipe->pipe_peer = rpipe;
	wpipe->pipe_pair = pp;

	/*
	 * Mark both endpoints as present; they will later get free'd
	 * one at a time.  When both are free'd, then the whole pair
	 * is released.
	 */
	rpipe->pipe_present = 1;
	wpipe->pipe_present = 1;

	/*
	 * Eventually, the MAC Framework may initialize the label
	 * in ctor or init, but for now we do it elsewhere to avoid
	 * blocking in ctor or init.
	 */
	pp->pp_label = NULL;

	atomic_add_int(&amountpipes, 2);
}

static void
pipe_zone_dtor(void *mem, int size, void *arg)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_dtor: wrong size"));

	pp = (struct pipepair *)mem;

	atomic_subtract_int(&amountpipes, 2);
}

static void
pipe_zone_init(void *mem, int size)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));

	pp = (struct pipepair *)mem;

	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
}

static void
pipe_zone_fini(void *mem, int size)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));

	pp = (struct pipepair *)mem;

	mtx_destroy(&pp->pp_mtx);
}

/*
 * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail,
 * let the zone pick up the pieces via pipeclose().
 */

/* ARGSUSED */
int
pipe(td, uap)
	struct thread *td;
	struct pipe_args /* {
		int	dummy;
	} */ *uap;
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *rf, *wf;
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;
	int fd, error;

	pp = uma_zalloc(pipe_zone, M_WAITOK);
#ifdef MAC
	/*
	 * The MAC label is shared between the connected endpoints.  As a
	 * result mac_init_pipe() and mac_create_pipe() are called once
	 * for the pair, and not on the endpoints.
	 */
	mac_init_pipe(pp);
	mac_create_pipe(td->td_ucred, pp);
#endif
	rpipe = &pp->pp_rpipe;
	wpipe = &pp->pp_wpipe;

	if (pipe_create(rpipe) || pipe_create(wpipe)) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (ENFILE);
	}

	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe->pipe_state |= PIPE_DIRECTOK;

	error = falloc(td, &rf, &fd);
	if (error) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}
	/* An extra reference on `rf' has been held for us by falloc(). */
	td->td_retval[0] = fd;

	/*
	 * Warning: once we've gotten past allocation of the fd for the
	 * read-side, we can only drop the read side via fdrop() in order
	 * to avoid races against processes which manage to dup() the read
	 * side while we are blocked trying to allocate the write side.
	 */
	FILE_LOCK(rf);
	rf->f_flag = FREAD | FWRITE;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = rpipe;
	rf->f_ops = &pipeops;
	FILE_UNLOCK(rf);
	error = falloc(td, &wf, &fd);
	if (error) {
		FILEDESC_LOCK(fdp);
		if (fdp->fd_ofiles[td->td_retval[0]] == rf) {
			fdp->fd_ofiles[td->td_retval[0]] = NULL;
			fdunused(fdp, td->td_retval[0]);
			FILEDESC_UNLOCK(fdp);
			fdrop(rf, td);
		} else {
			FILEDESC_UNLOCK(fdp);
		}
		fdrop(rf, td);
		/* rpipe has been closed by fdrop(). */
		pipeclose(wpipe);
		return (error);
	}
	/* An extra reference on `wf' has been held for us by falloc(). */
	FILE_LOCK(wf);
	wf->f_flag = FREAD | FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = wpipe;
	wf->f_ops = &pipeops;
	FILE_UNLOCK(wf);
	fdrop(wf, td);
	td->td_retval[1] = fd;
	fdrop(rf, td);

	return (0);
}

/*
 * Allocate kva for pipe circular buffer, the space is pageable
 * This routine will 'realloc' the size of a pipe safely, if it fails
 * it will retain the old buffer.
 * If it fails it will return ENOMEM.
 */
static int
pipespace(cpipe, size)
	struct pipe *cpipe;
	int size;
{
	caddr_t buffer;
	int error;
	static int curfail = 0;
	static struct timeval lastfail;

	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));

	size = round_page(size);
	/*
	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
	 */
	buffer = (caddr_t) vm_map_min(pipe_map);

	/*
	 * The map entry is, by default, pageable.
	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
	 */
	error = vm_map_find(pipe_map, NULL, 0,
		(vm_offset_t *) &buffer, size, 1,
		VM_PROT_ALL, VM_PROT_ALL, 0);
	if (error != KERN_SUCCESS) {
		if (ppsratecheck(&lastfail, &curfail, 1))
			printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
		return (ENOMEM);
	}

	/* free old resources if we're resizing */
	pipe_free_kmem(cpipe);
	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = 0;
	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);
	return (0);
}

/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(cpipe, catch)
	struct pipe *cpipe;
	int catch;
{
	int error;

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	while (cpipe->pipe_state & PIPE_LOCKFL) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = msleep(cpipe, PIPE_MTX(cpipe),
		    catch ? (PRIBIO | PCATCH) : PRIBIO,
		    "pipelk", 0);
		if (error != 0)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCKFL;
	return (0);
}

/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(cpipe)
	struct pipe *cpipe;
{

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	cpipe->pipe_state &= ~PIPE_LOCKFL;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

static __inline void
pipeselwakeup(cpipe)
	struct pipe *cpipe;
{

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeuppri(&cpipe->pipe_sel, PSOCK);
	}
	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
	KNOTE(&cpipe->pipe_sel.si_note, 0);
}

/*
 * Initialize and allocate VM and memory for pipe.  The structure
 * will start out zero'd from the ctor, so we just manage the kmem.
 */
static int
pipe_create(pipe)
	struct pipe *pipe;
{
	int error;

	PIPE_LOCK(pipe);
	pipelock(pipe, 0);
	PIPE_UNLOCK(pipe);
	/*
	 * Reduce to 1/4th pipe size if we're over our global max.
	 */
	if (amountpipekva > maxpipekva / 2)
		error = pipespace(pipe, SMALL_PIPE_SIZE);
	else
		error = pipespace(pipe, PIPE_SIZE);
	PIPE_LOCK(pipe);
	pipeunlock(pipe);
	PIPE_UNLOCK(pipe);
	if (error)
		return (error);

	return (0);
}

/* ARGSUSED */
static int
pipe_read(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct pipe *rpipe = fp->f_data;
	int error;
	int nread = 0;
	u_int size;

	PIPE_LOCK(rpipe);
	++rpipe->pipe_busy;
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

#ifdef MAC
	error = mac_check_pipe_read(active_cred, rpipe->pipe_pair);
	if (error)
		goto locked_error;
#endif

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			PIPE_UNLOCK(rpipe);
			error = uiomove(
			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
			    size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;

			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
		 */
		} else if ((size = rpipe->pipe_map.cnt) &&
			   (rpipe->pipe_state & PIPE_DIRECTW)) {
			caddr_t	va;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			va = (caddr_t) rpipe->pipe_map.kva +
			    rpipe->pipe_map.pos;
			PIPE_UNLOCK(rpipe);
			error = uiomove(va, size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * Unlock the pipe buffer for our remaining processing.
			 * We will either break out with an error or we will
			 * sleep and relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * Handle non-blocking mode operation or
			 * wait for more data.
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
			} else {
				rpipe->pipe_state |= PIPE_WANTR;
				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
				    PRIBIO | PCATCH,
				    "piperd", 0)) == 0)
					error = pipelock(rpipe, 1);
			}
			if (error)
				goto unlocked_error;
		}
	}
#ifdef MAC
locked_error:
#endif
	pipeunlock(rpipe);

	/* XXX: should probably do this before getting any locks. */
	if (error == 0)
		vfs_timestamp(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
		pipeselwakeup(rpipe);

	PIPE_UNLOCK(rpipe);
	return (error);
}

#ifndef PIPE_NODIRECT
/*
 * Map the sending processes' buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	pmap_t pmap;
	u_int size;
	int i, j;
	vm_offset_t addr, endaddr;

	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);

	size = (u_int) uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	pmap = vmspace_pmap(curproc->p_vmspace);
	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
		/*
		 * vm_fault_quick() can sleep.  Consequently,
		 * vm_page_lock_queue() and vm_page_unlock_queue()
		 * should not be performed outside of this loop.
		 */
	race:
		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
			vm_page_lock_queues();
			for (j = 0; j < i; j++)
				vm_page_unhold(wpipe->pipe_map.ms[j]);
			vm_page_unlock_queues();
			return (EFAULT);
		}
		wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
		    VM_PROT_READ);
		if (wpipe->pipe_map.ms[i] == NULL)
			goto race;
	}

	/*
	 * set up the control block
	 */
	wpipe->pipe_map.npages = i;
	wpipe->pipe_map.pos =
	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

	/*
	 * and map the buffer
	 */
	if (wpipe->pipe_map.kva == 0) {
		/*
		 * We need to allocate space for an extra page because the
		 * address range might (will) span pages at times.
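		 * (An unaligned user buffer of pipe_buffer.size bytes can
		 * touch one page more than pipe_buffer.size / PAGE_SIZE
		 * pages, which is why size + PAGE_SIZE bytes of KVA are
		 * reserved below.)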
		 */
		wpipe->pipe_map.kva = kmem_alloc_nofault(kernel_map,
		    wpipe->pipe_buffer.size + PAGE_SIZE);
		atomic_add_int(&amountpipekvawired,
		    wpipe->pipe_buffer.size + PAGE_SIZE);
	}
	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
	    wpipe->pipe_map.npages);

	/*
	 * and update the uio data
	 */

	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return (0);
}

/*
 * unmap and unwire the process buffer
 */
static void
pipe_destroy_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int i;

	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
	if (wpipe->pipe_map.kva) {
		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);

		if (amountpipekvawired > maxpipekvawired / 2) {
			/* Conserve address space */
			vm_offset_t kva = wpipe->pipe_map.kva;
			wpipe->pipe_map.kva = 0;
			kmem_free(kernel_map, kva,
			    wpipe->pipe_buffer.size + PAGE_SIZE);
			atomic_subtract_int(&amountpipekvawired,
			    wpipe->pipe_buffer.size + PAGE_SIZE);
		}
	}
	vm_page_lock_queues();
	for (i = 0; i < wpipe->pipe_map.npages; i++) {
		vm_page_unhold(wpipe->pipe_map.ms[i]);
	}
	vm_page_unlock_queues();
	wpipe->pipe_map.npages = 0;
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int size;
	int pos;

	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	size = wpipe->pipe_map.cnt;
	pos = wpipe->pipe_map.pos;

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~PIPE_DIRECTW;

	PIPE_UNLOCK(wpipe);
	bcopy((caddr_t) wpipe->pipe_map.kva + pos,
	    wpipe->pipe_buffer.buffer, size);
	pipe_destroy_write_buffer(wpipe);
	PIPE_LOCK(wpipe);
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
 */
static int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error;

retry:
	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = msleep(wpipe, PIPE_MTX(wpipe),
		    PRIBIO | PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = msleep(wpipe, PIPE_MTX(wpipe),
		    PRIBIO | PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
		goto retry;
	}

	wpipe->pipe_state |= PIPE_DIRECTW;

	pipelock(wpipe, 0);
	if (wpipe->pipe_state & PIPE_EOF) {
		error = EPIPE;
		goto error2;
	}
	PIPE_UNLOCK(wpipe);
	error = pipe_build_write_buffer(wpipe, uio);
	PIPE_LOCK(wpipe);
	pipeunlock(wpipe);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		goto error1;
	}

	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipelock(wpipe, 0);
			PIPE_UNLOCK(wpipe);
			pipe_destroy_write_buffer(wpipe);
			PIPE_LOCK(wpipe);
			pipeselwakeup(wpipe);
			pipeunlock(wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
		    "pipdwt", 0);
	}

	pipelock(wpipe, 0);
	if (wpipe->pipe_state & PIPE_EOF)
		error = EPIPE;
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
	} else {
		PIPE_UNLOCK(wpipe);
		pipe_destroy_write_buffer(wpipe);
		PIPE_LOCK(wpipe);
	}
error2:
	pipeunlock(wpipe);
	return (error);

error1:
	wakeup(wpipe);
	return (error);
}
#endif

static int
pipe_write(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	int error = 0;
	int orig_resid;
	struct pipe *wpipe, *rpipe;

	rpipe = fp->f_data;
	wpipe = rpipe->pipe_peer;

	PIPE_LOCK(rpipe);
	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
		PIPE_UNLOCK(rpipe);
		return (EPIPE);
	}
#ifdef MAC
	error = mac_check_pipe_write(active_cred, wpipe->pipe_pair);
	if (error) {
		PIPE_UNLOCK(rpipe);
		return (error);
	}
#endif
	++wpipe->pipe_busy;

	/*
	 * If it is advantageous to resize the pipe buffer, do
	 * so.
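	 * (A write larger than PIPE_SIZE into an idle, still default-sized
	 * buffer is grown to BIG_PIPE_SIZE, subject to LIMITBIGPIPES and
	 * the kern.ipc.maxpipekva headroom checked below.)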
	 */
	if ((uio->uio_resid > PIPE_SIZE) &&
	    (amountpipekva < maxpipekva / 2) &&
	    (nbigpipe < LIMITBIGPIPES) &&
	    (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
	    (wpipe->pipe_buffer.size <= PIPE_SIZE) &&
	    (wpipe->pipe_buffer.cnt == 0)) {

		if ((error = pipelock(wpipe, 1)) == 0) {
			if (wpipe->pipe_state & PIPE_EOF)
				error = EPIPE;
			else {
				PIPE_UNLOCK(wpipe);
				if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
					atomic_add_int(&nbigpipe, 1);
				PIPE_LOCK(wpipe);
			}
			pipeunlock(wpipe);
		}
	}

	/*
	 * If an early error occurred unbusy and return, waking up any pending
	 * readers.
	 */
	if (error) {
		--wpipe->pipe_busy;
		if ((wpipe->pipe_busy == 0) &&
		    (wpipe->pipe_state & PIPE_WANT)) {
			wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
			wakeup(wpipe);
		}
		PIPE_UNLOCK(rpipe);
		return(error);
	}

	orig_resid = uio->uio_resid;

	while (uio->uio_resid) {
		int space;

#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * The direct write mechanism will detect the reader going
		 * away on us.
		 */
		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
		    (fp->f_flag & FNONBLOCK) == 0 &&
		    amountpipekvawired + uio->uio_resid < maxpipekvawired) {
			error = pipe_direct_write(wpipe, uio);
			if (error)
				break;
			continue;
		}
#endif

		/*
		 * Pipe buffered writes cannot be coincidental with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.  We break out if a signal occurs or the
		 * reader goes away.
		 */
	retrywrite:
		while (wpipe->pipe_state & PIPE_DIRECTW) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
			    "pipbww", 0);
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
			if (error)
				break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		if (space > 0) {
			if ((error = pipelock(wpipe, 1)) == 0) {
				int size;	/* Transfer size */
				int segsize;	/* first segment to transfer */

				/*
				 * It is possible for a direct write/EOF to
				 * slip in on us... handle them here...
				 */
				if (wpipe->pipe_state & PIPE_EOF)
					goto lost_wpipe;
				if (wpipe->pipe_state & PIPE_DIRECTW) {
					pipeunlock(wpipe);
					goto retrywrite;
				}
				/*
				 * If a process blocked in uiomove, our
				 * value for space might be bad.
				 *
				 * XXX will we be ok if the reader has gone
				 * away here?
				 */
				if (space > wpipe->pipe_buffer.size -
				    wpipe->pipe_buffer.cnt) {
					pipeunlock(wpipe);
					goto retrywrite;
				}

				/*
				 * Transfer size is minimum of uio transfer
				 * and free space in pipe buffer.
				 */
				if (space > uio->uio_resid)
					size = uio->uio_resid;
				else
					size = space;
				/*
				 * First segment to transfer is minimum of
				 * transfer size and contiguous space in
				 * pipe buffer.  If first segment to transfer
				 * is less than the transfer size, we've got
				 * a wraparound in the buffer.
				 */
				segsize = wpipe->pipe_buffer.size -
					wpipe->pipe_buffer.in;
				if (segsize > size)
					segsize = size;

				/* Transfer first segment */

				PIPE_UNLOCK(rpipe);
				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
						segsize, uio);
				PIPE_LOCK(rpipe);

				if (error == 0 && segsize < size) {
					/*
					 * Transfer remaining part now, to
					 * support atomic writes.  Wraparound
					 * happened.
					 */
					if (wpipe->pipe_buffer.in + segsize !=
					    wpipe->pipe_buffer.size)
						panic("Expected pipe buffer "
						    "wraparound disappeared");

					PIPE_UNLOCK(rpipe);
					error = uiomove(
					    &wpipe->pipe_buffer.buffer[0],
					    size - segsize, uio);
					PIPE_LOCK(rpipe);
				}
				if (error == 0) {
					wpipe->pipe_buffer.in += size;
					if (wpipe->pipe_buffer.in >=
					    wpipe->pipe_buffer.size) {
						if (wpipe->pipe_buffer.in !=
						    size - segsize +
						    wpipe->pipe_buffer.size)
							panic("Expected "
							    "wraparound bad");
						wpipe->pipe_buffer.in = size -
						    segsize;
					}

					wpipe->pipe_buffer.cnt += size;
					if (wpipe->pipe_buffer.cnt >
					    wpipe->pipe_buffer.size)
						panic("Pipe buffer overflow");

				}
lost_wpipe:
				pipeunlock(wpipe);
			}
			if (error)
				break;

		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			error = msleep(wpipe, PIPE_MTX(rpipe),
			    PRIBIO | PCATCH, "pipewr", 0);
			if (error != 0)
				break;
			/*
			 * If read side wants to go away, we just issue a signal
			 * to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}

	--wpipe->pipe_busy;

	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((wpipe->pipe_buffer.cnt == 0) &&
	    (uio->uio_resid == 0) &&
	    (error == EPIPE)) {
		error = 0;
	}

	if (error == 0)
		vfs_timestamp(&wpipe->pipe_mtime);

	/*
	 * We have something to offer,
	 * wake up select/poll.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	PIPE_UNLOCK(rpipe);
	return (error);
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
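 * Only FIONBIO, FIOASYNC, FIONREAD, FIOSETOWN/FIOGETOWN and the deprecated
 * TIOCSPGRP/TIOCGPGRP are handled; anything else returns ENOTTY.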
 */
static int
pipe_ioctl(fp, cmd, data, active_cred, td)
	struct file *fp;
	u_long cmd;
	void *data;
	struct ucred *active_cred;
	struct thread *td;
{
	struct pipe *mpipe = fp->f_data;
#ifdef MAC
	int error;
#endif

	PIPE_LOCK(mpipe);

#ifdef MAC
	error = mac_check_pipe_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
	if (error) {
		PIPE_UNLOCK(mpipe);
		return (error);
	}
#endif

	switch (cmd) {

	case FIONBIO:
		PIPE_UNLOCK(mpipe);
		return (0);

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		PIPE_UNLOCK(mpipe);
		return (0);

	case FIONREAD:
		if (mpipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = mpipe->pipe_map.cnt;
		else
			*(int *)data = mpipe->pipe_buffer.cnt;
		PIPE_UNLOCK(mpipe);
		return (0);

	case FIOSETOWN:
		PIPE_UNLOCK(mpipe);
		return (fsetown(*(int *)data, &mpipe->pipe_sigio));

	case FIOGETOWN:
		PIPE_UNLOCK(mpipe);
		*(int *)data = fgetown(&mpipe->pipe_sigio);
		return (0);

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		PIPE_UNLOCK(mpipe);
		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		PIPE_UNLOCK(mpipe);
		*(int *)data = -fgetown(&mpipe->pipe_sigio);
		return (0);

	}
	PIPE_UNLOCK(mpipe);
	return (ENOTTY);
}

static int
pipe_poll(fp, events, active_cred, td)
	struct file *fp;
	int events;
	struct ucred *active_cred;
	struct thread *td;
{
	struct pipe *rpipe = fp->f_data;
	struct pipe *wpipe;
	int revents = 0;
#ifdef MAC
	int error;
#endif

	wpipe = rpipe->pipe_peer;
	PIPE_LOCK(rpipe);
#ifdef MAC
	error = mac_check_pipe_poll(active_cred, rpipe->pipe_pair);
	if (error)
		goto locked_error;
#endif
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
		    (rpipe->pipe_buffer.cnt > 0) ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (!wpipe->pipe_present || (wpipe->pipe_state & PIPE_EOF) ||
		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (!wpipe->pipe_present) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(td, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(td, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}
#ifdef MAC
locked_error:
#endif
	PIPE_UNLOCK(rpipe);

	return (revents);
}

/*
 * We shouldn't need locks here as we're doing a read and this should
 * be a natural race.
 */
static int
pipe_stat(fp, ub, active_cred, td)
	struct file *fp;
	struct stat *ub;
	struct ucred *active_cred;
	struct thread *td;
{
	struct pipe *pipe = fp->f_data;
#ifdef MAC
	int error;

	PIPE_LOCK(pipe);
	error = mac_check_pipe_stat(active_cred, pipe->pipe_pair);
	PIPE_UNLOCK(pipe);
	if (error)
		return (error);
#endif
	bzero(ub, sizeof(*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atimespec = pipe->pipe_atime;
	ub->st_mtimespec = pipe->pipe_mtime;
	ub->st_ctimespec = pipe->pipe_ctime;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

/* ARGSUSED */
static int
pipe_close(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct pipe *cpipe = fp->f_data;

	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	funsetown(&cpipe->pipe_sigio);
	pipeclose(cpipe);
	return (0);
}

static void
pipe_free_kmem(cpipe)
	struct pipe *cpipe;
{

	KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
	    ("pipe_free_kmem: pipe mutex locked"));

	if (cpipe->pipe_buffer.buffer != NULL) {
		if (cpipe->pipe_buffer.size > PIPE_SIZE)
			atomic_subtract_int(&nbigpipe, 1);
		atomic_subtract_int(&amountpipekva, cpipe->pipe_buffer.size);
		vm_map_remove(pipe_map,
		    (vm_offset_t)cpipe->pipe_buffer.buffer,
		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
		cpipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	if (cpipe->pipe_map.kva != 0) {
		atomic_subtract_int(&amountpipekvawired,
		    cpipe->pipe_buffer.size + PAGE_SIZE);
		kmem_free(kernel_map,
		    cpipe->pipe_map.kva,
		    cpipe->pipe_buffer.size + PAGE_SIZE);
		cpipe->pipe_map.cnt = 0;
		cpipe->pipe_map.kva = 0;
		cpipe->pipe_map.pos = 0;
		cpipe->pipe_map.npages = 0;
	}
#endif
}

/*
 * shutdown the pipe
 */
static void
pipeclose(cpipe)
	struct pipe *cpipe;
{
	struct pipepair *pp;
	struct pipe *ppipe;

	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));

	PIPE_LOCK(cpipe);
	pp = cpipe->pipe_pair;

	pipeselwakeup(cpipe);

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	cpipe->pipe_state |= PIPE_EOF;
	while (cpipe->pipe_busy) {
		wakeup(cpipe);
		cpipe->pipe_state |= PIPE_WANT;
		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
	}


	/*
	 * Disconnect from peer, if any.
	 */
	ppipe = cpipe->pipe_peer;
	if (ppipe->pipe_present != 0) {
		pipeselwakeup(ppipe);

		ppipe->pipe_state |= PIPE_EOF;
		wakeup(ppipe);
		KNOTE(&ppipe->pipe_sel.si_note, 0);
	}

	/*
	 * Mark this endpoint as free.  Release kmem resources.  We
	 * don't mark this endpoint as unused until we've finished
	 * doing that, or the pipe might disappear out from under
	 * us.
	 */
	pipelock(cpipe, 0);
	PIPE_UNLOCK(cpipe);
	pipe_free_kmem(cpipe);
	PIPE_LOCK(cpipe);
	cpipe->pipe_present = 0;
	pipeunlock(cpipe);

	/*
	 * If both endpoints are now closed, release the memory for the
	 * pipe pair.  If not, unlock.
	 */
	if (ppipe->pipe_present == 0) {
		PIPE_UNLOCK(cpipe);
#ifdef MAC
		mac_destroy_pipe(pp);
#endif
		uma_zfree(pipe_zone, cpipe->pipe_pair);
	} else
		PIPE_UNLOCK(cpipe);
}

/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *cpipe;

	cpipe = kn->kn_fp->f_data;
	PIPE_LOCK(cpipe);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;
		if (!cpipe->pipe_peer->pipe_present) {
			/* other end of pipe has been closed */
			PIPE_UNLOCK(cpipe);
			return (EPIPE);
		}
		cpipe = cpipe->pipe_peer;
		break;
	default:
		PIPE_UNLOCK(cpipe);
		return (1);
	}

	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
	PIPE_UNLOCK(cpipe);
	return (0);
}

static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;

	PIPE_LOCK(cpipe);
	if (kn->kn_filter == EVFILT_WRITE) {
		if (!cpipe->pipe_peer->pipe_present) {
			PIPE_UNLOCK(cpipe);
			return;
		}
		cpipe = cpipe->pipe_peer;
	}
	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
	PIPE_UNLOCK(cpipe);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	PIPE_LOCK(rpipe);
	kn->kn_data = rpipe->pipe_buffer.cnt;
	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
		kn->kn_data = rpipe->pipe_map.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		PIPE_UNLOCK(rpipe);
		return (1);
	}
	PIPE_UNLOCK(rpipe);
	return (kn->kn_data > 0);
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	PIPE_LOCK(rpipe);
	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		PIPE_UNLOCK(rpipe);
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;

	PIPE_UNLOCK(rpipe);
	return (kn->kn_data >= PIPE_BUF);
}
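
/*
 * Illustrative userland usage sketch (a comment only, not part of the
 * kernel build): the facility implemented in this file is reached through
 * the pipe(2) system call.
 *
 *	int fds[2];
 *
 *	if (pipe(fds) == -1)
 *		err(1, "pipe");
 *	// fds[0] is the read end, fds[1] the write end.  Small writes are
 *	// staged through the kernel buffer above; large blocking writes
 *	// may instead use the direct (page-flipping) path.
 */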