/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

/*
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
 * the receiving process can copy it directly from the pages in the sending
 * process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned back to the user-mode side.  In that case, the pipe code
 * arranges to copy the buffer supplied by the user process, to a pageable
 * kernel buffer, and the receiving process will grab the data from the
 * pageable kernel buffer.  Since signals don't happen all that often,
 * the copy operation is normally eliminated.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.
 *
 * In order to limit the resource use of pipes, a sysctl exists:
 *
 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
 * address space available to us in pipe_map.  Whenever the amount in use
 * exceeds half of this value, all new pipes will be created with size
 * SMALL_PIPE_SIZE, rather than PIPE_SIZE.  Big pipe creation will be limited
 * as well.  This value is loader tunable only.
 *
 * These values are autotuned in subr_param.c.
 *
 * Memory usage may be monitored through the sysctls
 * kern.ipc.pipes, kern.ipc.bigpipes and kern.ipc.pipekva.
 */
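/*
 * Illustrative userland sketch (not part of the kernel code): the limits
 * and counters described above are exported with SYSCTL_INT() below, so a
 * program can watch pipe memory usage with sysctlbyname(3).  Only the
 * sysctl names declared in this file are assumed:
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int pipes, kva, maxkva;
 *		size_t len = sizeof(int);
 *
 *		if (sysctlbyname("kern.ipc.pipes", &pipes, &len, NULL, 0) == -1 ||
 *		    sysctlbyname("kern.ipc.pipekva", &kva, &len, NULL, 0) == -1 ||
 *		    sysctlbyname("kern.ipc.maxpipekva", &maxkva, &len, NULL, 0) == -1)
 *			return (1);
 *		printf("%d pipes, %d of %d bytes of pipe KVA in use\n",
 *		    pipes, kva, maxkva);
 *		return (0);
 *	}
 */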
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/mutex.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/event.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/uma.h>

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * NetBSD or OpenBSD.
 */
/* #define PIPE_NODIRECT */

/*
 * interfaces to the outside world
 */
static fo_rdwr_t pipe_read;
static fo_rdwr_t pipe_write;
static fo_ioctl_t pipe_ioctl;
static fo_poll_t pipe_poll;
static fo_kqfilter_t pipe_kqfilter;
static fo_stat_t pipe_stat;
static fo_close_t pipe_close;

static struct fileops pipeops = {
	.fo_read = pipe_read,
	.fo_write = pipe_write,
	.fo_ioctl = pipe_ioctl,
	.fo_poll = pipe_poll,
	.fo_kqfilter = pipe_kqfilter,
	.fo_stat = pipe_stat,
	.fo_close = pipe_close,
	.fo_flags = DFLAG_PASSABLE
};

static void filt_pipedetach(struct knote *kn);
static int filt_piperead(struct knote *kn, long hint);
static int filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
static struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };

/*
 * Default pipe buffer size(s), this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES 32
static int nbigpipe;

static int amountpipes;
static int amountpipekva;

SYSCTL_DECL(_kern_ipc);

SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
    &maxpipekva, 0, "Pipe KVA limit");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
    &amountpipes, 0, "Current # of pipes");
SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD,
    &nbigpipe, 0, "Current # of big pipes");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
    &amountpipekva, 0, "Pipe KVA usage");

static void pipeinit(void *dummy __unused);
static void pipeclose(struct pipe *cpipe);
static void pipe_free_kmem(struct pipe *cpipe);
static int pipe_create(struct pipe *pipe);
static __inline int pipelock(struct pipe *cpipe, int catch);
static __inline void pipeunlock(struct pipe *cpipe);
static __inline void pipeselwakeup(struct pipe *cpipe);
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
static void pipe_destroy_write_buffer(struct pipe *wpipe);
static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
static void pipe_clone_write_buffer(struct pipe *wpipe);
#endif
static int pipespace(struct pipe *cpipe, int size);
static int pipespace_new(struct pipe *cpipe, int size);

static int pipe_zone_ctor(void *mem, int size, void *arg, int flags);
static void pipe_zone_dtor(void *mem, int size, void *arg);
static int pipe_zone_init(void *mem, int size, int flags);
static void pipe_zone_fini(void *mem, int size);

static uma_zone_t pipe_zone;

SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);

static void
pipeinit(void *dummy __unused)
{

	pipe_zone = uma_zcreate("PIPE", sizeof(struct pipepair),
	    pipe_zone_ctor, pipe_zone_dtor, pipe_zone_init, pipe_zone_fini,
	    UMA_ALIGN_PTR, 0);
	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
}

static int
pipe_zone_ctor(void *mem, int size, void *arg, int flags)
{
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;

	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));

	pp = (struct pipepair *)mem;

	/*
	 * We zero both pipe endpoints to make sure all the kmem pointers
	 * are NULL, flag fields are zero'd, etc.  We timestamp both
	 * endpoints with the same time.
	 */
	rpipe = &pp->pp_rpipe;
	bzero(rpipe, sizeof(*rpipe));
	vfs_timestamp(&rpipe->pipe_ctime);
	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;

	wpipe = &pp->pp_wpipe;
	bzero(wpipe, sizeof(*wpipe));
	wpipe->pipe_ctime = rpipe->pipe_ctime;
	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;

	rpipe->pipe_peer = wpipe;
	rpipe->pipe_pair = pp;
	wpipe->pipe_peer = rpipe;
	wpipe->pipe_pair = pp;

	/*
	 * Mark both endpoints as present; they will later get free'd
	 * one at a time.  When both are free'd, then the whole pair
	 * is released.
	 */
	rpipe->pipe_present = 1;
	wpipe->pipe_present = 1;

	/*
	 * Eventually, the MAC Framework may initialize the label
	 * in ctor or init, but for now we do it elsewhere to avoid
	 * blocking in ctor or init.
	 */
	pp->pp_label = NULL;

	atomic_add_int(&amountpipes, 2);
	return (0);
}

static void
pipe_zone_dtor(void *mem, int size, void *arg)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_dtor: wrong size"));

	pp = (struct pipepair *)mem;

	atomic_subtract_int(&amountpipes, 2);
}

static int
pipe_zone_init(void *mem, int size, int flags)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));

	pp = (struct pipepair *)mem;

	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
	return (0);
}

static void
pipe_zone_fini(void *mem, int size)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));

	pp = (struct pipepair *)mem;

	mtx_destroy(&pp->pp_mtx);
}

/*
 * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail,
 * let the zone pick up the pieces via pipeclose().
 */

/* ARGSUSED */
int
pipe(td, uap)
	struct thread *td;
	struct pipe_args /* {
		int	dummy;
	} */ *uap;
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *rf, *wf;
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;
	int fd, error;

	pp = uma_zalloc(pipe_zone, M_WAITOK);
#ifdef MAC
	/*
	 * The MAC label is shared between the connected endpoints.  As a
	 * result mac_init_pipe() and mac_create_pipe() are called once
	 * for the pair, and not on the endpoints.
	 */
	mac_init_pipe(pp);
	mac_create_pipe(td->td_ucred, pp);
#endif
	rpipe = &pp->pp_rpipe;
	wpipe = &pp->pp_wpipe;

	if (pipe_create(rpipe) || pipe_create(wpipe)) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (ENFILE);
	}

	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe->pipe_state |= PIPE_DIRECTOK;

	error = falloc(td, &rf, &fd);
	if (error) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}
	/* An extra reference on `rf' has been held for us by falloc(). */
	td->td_retval[0] = fd;

	/*
	 * Warning: once we've gotten past allocation of the fd for the
	 * read-side, we can only drop the read side via fdrop() in order
	 * to avoid races against processes which manage to dup() the read
	 * side while we are blocked trying to allocate the write side.
	 */
	FILE_LOCK(rf);
	rf->f_flag = FREAD | FWRITE;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = rpipe;
	rf->f_ops = &pipeops;
	FILE_UNLOCK(rf);
	error = falloc(td, &wf, &fd);
	if (error) {
		FILEDESC_LOCK(fdp);
		if (fdp->fd_ofiles[td->td_retval[0]] == rf) {
			fdp->fd_ofiles[td->td_retval[0]] = NULL;
			fdunused(fdp, td->td_retval[0]);
			FILEDESC_UNLOCK(fdp);
			fdrop(rf, td);
		} else {
			FILEDESC_UNLOCK(fdp);
		}
		fdrop(rf, td);
		/* rpipe has been closed by fdrop(). */
		pipeclose(wpipe);
		return (error);
	}
	/* An extra reference on `wf' has been held for us by falloc(). */
	FILE_LOCK(wf);
	wf->f_flag = FREAD | FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = wpipe;
	wf->f_ops = &pipeops;
	FILE_UNLOCK(wf);
	fdrop(wf, td);
	td->td_retval[1] = fd;
	fdrop(rf, td);

	return (0);
}
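/*
 * Illustrative userland sketch (not part of this file): the system call
 * above is what backs the familiar pipe(2) pattern of handing one end to
 * a child across fork(2).  Standard POSIX interfaces only:
 *
 *	#include <sys/wait.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fds[2];
 *		char buf[32];
 *		ssize_t n;
 *
 *		if (pipe(fds) == -1)
 *			return (1);
 *		if (fork() == 0) {
 *			close(fds[1]);
 *			n = read(fds[0], buf, sizeof(buf));
 *			if (n > 0)
 *				write(STDOUT_FILENO, buf, n);
 *			_exit(0);
 *		}
 *		close(fds[0]);
 *		write(fds[1], "hello\n", strlen("hello\n"));
 *		close(fds[1]);
 *		wait(NULL);
 *		return (0);
 *	}
 */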
/*
 * Allocate kva for pipe circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if it fails,
 * it will retain the old buffer and return ENOMEM.
 */
static int
pipespace_new(cpipe, size)
	struct pipe *cpipe;
	int size;
{
	caddr_t buffer;
	int error;
	static int curfail = 0;
	static struct timeval lastfail;

	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));

	size = round_page(size);
	/*
	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
	 */
	buffer = (caddr_t) vm_map_min(pipe_map);

	/*
	 * The map entry is, by default, pageable.
	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
	 */
	error = vm_map_find(pipe_map, NULL, 0,
	    (vm_offset_t *) &buffer, size, 1,
	    VM_PROT_ALL, VM_PROT_ALL, 0);
	if (error != KERN_SUCCESS) {
		if (ppsratecheck(&lastfail, &curfail, 1))
			printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
		return (ENOMEM);
	}

	/* free old resources if we're resizing */
	pipe_free_kmem(cpipe);
	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = 0;
	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);
	return (0);
}

/*
 * Wrapper for pipespace_new() that performs locking assertions.
 */
static int
pipespace(cpipe, size)
	struct pipe *cpipe;
	int size;
{

	/*
	 * XXXRW: Seems like we should really assert PIPE_LOCKFL on the
	 * pipe_state here.
	 */

	return (pipespace_new(cpipe, size));
}

/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(cpipe, catch)
	struct pipe *cpipe;
	int catch;
{
	int error;

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	while (cpipe->pipe_state & PIPE_LOCKFL) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = msleep(cpipe, PIPE_MTX(cpipe),
		    catch ? (PRIBIO | PCATCH) : PRIBIO,
		    "pipelk", 0);
		if (error != 0)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCKFL;
	return (0);
}

/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(cpipe)
	struct pipe *cpipe;
{

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	cpipe->pipe_state &= ~PIPE_LOCKFL;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

static __inline void
pipeselwakeup(cpipe)
	struct pipe *cpipe;
{

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeuppri(&cpipe->pipe_sel, PSOCK);
	}
	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
	KNOTE(&cpipe->pipe_sel.si_note, 0);
}

/*
 * Initialize and allocate VM and memory for pipe.  The structure
 * will start out zero'd from the ctor, so we just manage the kmem.
 */
static int
pipe_create(pipe)
	struct pipe *pipe;
{
	int error;

	/*
	 * Reduce to 1/4th pipe size if we're over our global max.
	 */
	if (amountpipekva > maxpipekva / 2)
		error = pipespace(pipe, SMALL_PIPE_SIZE);
	else
		error = pipespace(pipe, PIPE_SIZE);
	return (error);
}

/* ARGSUSED */
static int
pipe_read(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct pipe *rpipe = fp->f_data;
	int error;
	int nread = 0;
	u_int size;

	PIPE_LOCK(rpipe);
	++rpipe->pipe_busy;
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

#ifdef MAC
	error = mac_check_pipe_read(active_cred, rpipe->pipe_pair);
	if (error)
		goto locked_error;
#endif

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			PIPE_UNLOCK(rpipe);
			error = uiomove(
			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
			    size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;

			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
		 */
		} else if ((size = rpipe->pipe_map.cnt) &&
		    (rpipe->pipe_state & PIPE_DIRECTW)) {
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			PIPE_UNLOCK(rpipe);
			error = uiomove_fromphys(rpipe->pipe_map.ms,
			    rpipe->pipe_map.pos, size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * Unlock the pipe buffer for our remaining processing.
			 * We will either break out with an error or we will
			 * sleep and relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * Handle non-blocking mode operation or
			 * wait for more data.
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
			} else {
				rpipe->pipe_state |= PIPE_WANTR;
				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
				    PRIBIO | PCATCH,
				    "piperd", 0)) == 0)
					error = pipelock(rpipe, 1);
			}
			if (error)
				goto unlocked_error;
		}
	}
#ifdef MAC
locked_error:
#endif
	pipeunlock(rpipe);

	/* XXX: should probably do this before getting any locks. */
	if (error == 0)
		vfs_timestamp(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
		pipeselwakeup(rpipe);

	PIPE_UNLOCK(rpipe);
	return (error);
}
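/*
 * Illustrative userland sketch (not part of this file): the FNONBLOCK
 * handling above is what a non-blocking reader observes as EAGAIN when
 * the pipe is empty:
 *
 *	#include <errno.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fds[2];
 *		char c;
 *
 *		if (pipe(fds) == -1)
 *			return (1);
 *		fcntl(fds[0], F_SETFL, O_NONBLOCK);
 *		if (read(fds[0], &c, 1) == -1 && errno == EAGAIN)
 *			printf("empty pipe, EAGAIN as expected\n");
 *		return (0);
 *	}
 */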
#ifndef PIPE_NODIRECT
/*
 * Map the sending process's buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	pmap_t pmap;
	u_int size;
	int i, j;
	vm_offset_t addr, endaddr;

	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);

	size = (u_int) uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	pmap = vmspace_pmap(curproc->p_vmspace);
	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
		/*
		 * vm_fault_quick() can sleep.  Consequently,
		 * vm_page_lock_queues() and vm_page_unlock_queues()
		 * should not be performed outside of this loop.
		 */
	race:
		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
			vm_page_lock_queues();
			for (j = 0; j < i; j++)
				vm_page_unhold(wpipe->pipe_map.ms[j]);
			vm_page_unlock_queues();
			return (EFAULT);
		}
		wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
		    VM_PROT_READ);
		if (wpipe->pipe_map.ms[i] == NULL)
			goto race;
	}

	/*
	 * set up the control block
	 */
	wpipe->pipe_map.npages = i;
	wpipe->pipe_map.pos =
	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

	/*
	 * and update the uio data
	 */

	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return (0);
}

/*
 * unmap and unwire the process buffer
 */
static void
pipe_destroy_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int i;

	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	vm_page_lock_queues();
	for (i = 0; i < wpipe->pipe_map.npages; i++) {
		vm_page_unhold(wpipe->pipe_map.ms[i]);
	}
	vm_page_unlock_queues();
	wpipe->pipe_map.npages = 0;
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(wpipe)
	struct pipe *wpipe;
{
	struct uio uio;
	struct iovec iov;
	int size;
	int pos;

	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	size = wpipe->pipe_map.cnt;
	pos = wpipe->pipe_map.pos;

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~PIPE_DIRECTW;

	PIPE_UNLOCK(wpipe);
	iov.iov_base = wpipe->pipe_buffer.buffer;
	iov.iov_len = size;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = 0;
	uio.uio_resid = size;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = curthread;
	uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
	PIPE_LOCK(wpipe);
	pipe_destroy_write_buffer(wpipe);
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
 */
static int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error;

retry:
	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = msleep(wpipe, PIPE_MTX(wpipe),
		    PRIBIO | PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = msleep(wpipe, PIPE_MTX(wpipe),
		    PRIBIO | PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
		goto retry;
	}

	wpipe->pipe_state |= PIPE_DIRECTW;

	pipelock(wpipe, 0);
	if (wpipe->pipe_state & PIPE_EOF) {
		error = EPIPE;
		goto error2;
	}
	PIPE_UNLOCK(wpipe);
	error = pipe_build_write_buffer(wpipe, uio);
	PIPE_LOCK(wpipe);
	pipeunlock(wpipe);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		goto error1;
	}

	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipelock(wpipe, 0);
			pipe_destroy_write_buffer(wpipe);
			pipeselwakeup(wpipe);
			pipeunlock(wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
		    "pipdwt", 0);
	}

	pipelock(wpipe, 0);
	if (wpipe->pipe_state & PIPE_EOF)
		error = EPIPE;
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
	} else {
		pipe_destroy_write_buffer(wpipe);
	}
error2:
	pipeunlock(wpipe);
	return (error);

error1:
	wakeup(wpipe);
	return (error);
}
#endif

static int
pipe_write(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	int error = 0;
	int orig_resid;
	struct pipe *wpipe, *rpipe;

	rpipe = fp->f_data;
	wpipe = rpipe->pipe_peer;

	PIPE_LOCK(rpipe);
	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
		PIPE_UNLOCK(rpipe);
		return (EPIPE);
	}
#ifdef MAC
	error = mac_check_pipe_write(active_cred, wpipe->pipe_pair);
	if (error) {
		PIPE_UNLOCK(rpipe);
		return (error);
	}
#endif
	++wpipe->pipe_busy;

	/*
	 * If it is advantageous to resize the pipe buffer, do
	 * so.
	 */
	if ((uio->uio_resid > PIPE_SIZE) &&
	    (amountpipekva < maxpipekva / 2) &&
	    (nbigpipe < LIMITBIGPIPES) &&
	    (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
	    (wpipe->pipe_buffer.size <= PIPE_SIZE) &&
	    (wpipe->pipe_buffer.cnt == 0)) {

		if ((error = pipelock(wpipe, 1)) == 0) {
			if (wpipe->pipe_state & PIPE_EOF)
				error = EPIPE;
			else {
				PIPE_UNLOCK(wpipe);
				if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
					atomic_add_int(&nbigpipe, 1);
				PIPE_LOCK(wpipe);
			}
			pipeunlock(wpipe);
		}
	}

	/*
	 * If an early error occurred, unbusy and return, waking up any
	 * pending readers.
	 */
	if (error) {
		--wpipe->pipe_busy;
		if ((wpipe->pipe_busy == 0) &&
		    (wpipe->pipe_state & PIPE_WANT)) {
			wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
			wakeup(wpipe);
		}
		PIPE_UNLOCK(rpipe);
		return (error);
	}

	orig_resid = uio->uio_resid;

	while (uio->uio_resid) {
		int space;

#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * The direct write mechanism will detect the reader going
		 * away on us.
		 */
		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
		    (fp->f_flag & FNONBLOCK) == 0) {
			error = pipe_direct_write(wpipe, uio);
			if (error)
				break;
			continue;
		}
#endif

		/*
		 * Pipe buffered writes cannot be coincident with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.  We break out if a signal occurs or the
		 * reader goes away.
		 */
	retrywrite:
		while (wpipe->pipe_state & PIPE_DIRECTW) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
			    "pipbww", 0);
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
			if (error)
				break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		if (space > 0) {
			if ((error = pipelock(wpipe, 1)) == 0) {
				int size;	/* Transfer size */
				int segsize;	/* first segment to transfer */

				/*
				 * It is possible for a direct write/EOF to
				 * slip in on us... handle them here...
				 */
				if (wpipe->pipe_state & PIPE_EOF)
					goto lost_wpipe;
				if (wpipe->pipe_state & PIPE_DIRECTW) {
					pipeunlock(wpipe);
					goto retrywrite;
				}
				/*
				 * If a process blocked in uiomove, our
				 * value for space might be bad.
				 *
				 * XXX will we be ok if the reader has gone
				 * away here?
				 */
				if (space > wpipe->pipe_buffer.size -
				    wpipe->pipe_buffer.cnt) {
					pipeunlock(wpipe);
					goto retrywrite;
				}

				/*
				 * Transfer size is minimum of uio transfer
				 * and free space in pipe buffer.
				 */
				if (space > uio->uio_resid)
					size = uio->uio_resid;
				else
					size = space;
				/*
				 * First segment to transfer is minimum of
				 * transfer size and contiguous space in
				 * pipe buffer.  If first segment to transfer
				 * is less than the transfer size, we've got
				 * a wraparound in the buffer.
				 */
				segsize = wpipe->pipe_buffer.size -
				    wpipe->pipe_buffer.in;
				if (segsize > size)
					segsize = size;

				/* Transfer first segment */

				PIPE_UNLOCK(rpipe);
				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
				    segsize, uio);
				PIPE_LOCK(rpipe);

				if (error == 0 && segsize < size) {
					/*
					 * Transfer remaining part now, to
					 * support atomic writes.  Wraparound
					 * happened.
					 */
					if (wpipe->pipe_buffer.in + segsize !=
					    wpipe->pipe_buffer.size)
						panic("Expected pipe buffer "
						    "wraparound disappeared");

					PIPE_UNLOCK(rpipe);
					error = uiomove(
					    &wpipe->pipe_buffer.buffer[0],
					    size - segsize, uio);
					PIPE_LOCK(rpipe);
				}
				if (error == 0) {
					wpipe->pipe_buffer.in += size;
					if (wpipe->pipe_buffer.in >=
					    wpipe->pipe_buffer.size) {
						if (wpipe->pipe_buffer.in !=
						    size - segsize +
						    wpipe->pipe_buffer.size)
							panic("Expected "
							    "wraparound bad");
						wpipe->pipe_buffer.in = size -
						    segsize;
					}

					wpipe->pipe_buffer.cnt += size;
					if (wpipe->pipe_buffer.cnt >
					    wpipe->pipe_buffer.size)
						panic("Pipe buffer overflow");

				}
lost_wpipe:
				pipeunlock(wpipe);
			}
			if (error)
				break;

		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			error = msleep(wpipe, PIPE_MTX(rpipe),
			    PRIBIO | PCATCH, "pipewr", 0);
			if (error != 0)
				break;
			/*
			 * If read side wants to go away, we just issue a
			 * signal to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}

	--wpipe->pipe_busy;

	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((wpipe->pipe_buffer.cnt == 0) &&
	    (uio->uio_resid == 0) &&
	    (error == EPIPE)) {
		error = 0;
	}

	if (error == 0)
		vfs_timestamp(&wpipe->pipe_mtime);

	/*
	 * We have something to offer,
	 * wake up select/poll.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	PIPE_UNLOCK(rpipe);
	return (error);
}
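/*
 * Illustrative userland sketch (not part of this file): the PIPE_BUF
 * atomicity enforced above means writes of at most PIPE_BUF bytes are
 * never interleaved with other writers.  A minimal sketch with several
 * children writing short records to one shared pipe:
 *
 *	#include <sys/wait.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fds[2], i;
 *		char rec[64], buf[64];
 *		ssize_t n;
 *
 *		if (pipe(fds) == -1)
 *			return (1);
 *		for (i = 0; i < 3; i++) {
 *			if (fork() == 0) {
 *				snprintf(rec, sizeof(rec), "record from %d\n",
 *				    (int)getpid());
 *				write(fds[1], rec, strlen(rec));
 *				_exit(0);
 *			}
 *		}
 *		close(fds[1]);
 *		while ((n = read(fds[0], buf, sizeof(buf))) > 0)
 *			write(STDOUT_FILENO, buf, n);
 *		while (wait(NULL) != -1)
 *			;
 *		return (0);
 *	}
 *
 * Each record is well under PIPE_BUF (at least 512 bytes per POSIX), so
 * every line arrives intact.
 */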
/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
static int
pipe_ioctl(fp, cmd, data, active_cred, td)
	struct file *fp;
	u_long cmd;
	void *data;
	struct ucred *active_cred;
	struct thread *td;
{
	struct pipe *mpipe = fp->f_data;
#ifdef MAC
	int error;
#endif

	PIPE_LOCK(mpipe);

#ifdef MAC
	error = mac_check_pipe_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
	if (error) {
		PIPE_UNLOCK(mpipe);
		return (error);
	}
#endif

	switch (cmd) {

	case FIONBIO:
		PIPE_UNLOCK(mpipe);
		return (0);

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		PIPE_UNLOCK(mpipe);
		return (0);

	case FIONREAD:
		if (mpipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = mpipe->pipe_map.cnt;
		else
			*(int *)data = mpipe->pipe_buffer.cnt;
		PIPE_UNLOCK(mpipe);
		return (0);

	case FIOSETOWN:
		PIPE_UNLOCK(mpipe);
		return (fsetown(*(int *)data, &mpipe->pipe_sigio));

	case FIOGETOWN:
		PIPE_UNLOCK(mpipe);
		*(int *)data = fgetown(&mpipe->pipe_sigio);
		return (0);

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		PIPE_UNLOCK(mpipe);
		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		PIPE_UNLOCK(mpipe);
		*(int *)data = -fgetown(&mpipe->pipe_sigio);
		return (0);

	}
	PIPE_UNLOCK(mpipe);
	return (ENOTTY);
}
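/*
 * Illustrative userland sketch (not part of this file): FIONREAD, handled
 * above, reports how many bytes are currently buffered in the pipe:
 *
 *	#include <sys/ioctl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fds[2], avail;
 *
 *		if (pipe(fds) == -1)
 *			return (1);
 *		write(fds[1], "abc", 3);
 *		if (ioctl(fds[0], FIONREAD, &avail) == 0)
 *			printf("%d bytes ready to read\n", avail);
 *		return (0);
 *	}
 */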
static int
pipe_poll(fp, events, active_cred, td)
	struct file *fp;
	int events;
	struct ucred *active_cred;
	struct thread *td;
{
	struct pipe *rpipe = fp->f_data;
	struct pipe *wpipe;
	int revents = 0;
#ifdef MAC
	int error;
#endif

	wpipe = rpipe->pipe_peer;
	PIPE_LOCK(rpipe);
#ifdef MAC
	error = mac_check_pipe_poll(active_cred, rpipe->pipe_pair);
	if (error)
		goto locked_error;
#endif
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
		    (rpipe->pipe_buffer.cnt > 0) ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (!wpipe->pipe_present || (wpipe->pipe_state & PIPE_EOF) ||
		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
		    (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (!wpipe->pipe_present) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(td, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(td, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}
#ifdef MAC
locked_error:
#endif
	PIPE_UNLOCK(rpipe);

	return (revents);
}

/*
 * We shouldn't need locks here as we're doing a read and this should
 * be a natural race.
 */
static int
pipe_stat(fp, ub, active_cred, td)
	struct file *fp;
	struct stat *ub;
	struct ucred *active_cred;
	struct thread *td;
{
	struct pipe *pipe = fp->f_data;
#ifdef MAC
	int error;

	PIPE_LOCK(pipe);
	error = mac_check_pipe_stat(active_cred, pipe->pipe_pair);
	PIPE_UNLOCK(pipe);
	if (error)
		return (error);
#endif
	bzero(ub, sizeof(*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	if (pipe->pipe_state & PIPE_DIRECTW)
		ub->st_size = pipe->pipe_map.cnt;
	else
		ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atimespec = pipe->pipe_atime;
	ub->st_mtimespec = pipe->pipe_mtime;
	ub->st_ctimespec = pipe->pipe_ctime;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}
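/*
 * Illustrative userland sketch (not part of this file): as filled in by
 * pipe_stat() above, fstat(2) on a pipe reports S_IFIFO and, in st_size,
 * the number of bytes currently pending:
 *
 *	#include <sys/stat.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fds[2];
 *		struct stat sb;
 *
 *		if (pipe(fds) == -1)
 *			return (1);
 *		write(fds[1], "abc", 3);
 *		if (fstat(fds[0], &sb) == 0 && S_ISFIFO(sb.st_mode))
 *			printf("%ld bytes pending\n", (long)sb.st_size);
 *		return (0);
 *	}
 */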
/* ARGSUSED */
static int
pipe_close(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct pipe *cpipe = fp->f_data;

	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	funsetown(&cpipe->pipe_sigio);
	pipeclose(cpipe);
	return (0);
}

static void
pipe_free_kmem(cpipe)
	struct pipe *cpipe;
{

	KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
	    ("pipe_free_kmem: pipe mutex locked"));

	if (cpipe->pipe_buffer.buffer != NULL) {
		if (cpipe->pipe_buffer.size > PIPE_SIZE)
			atomic_subtract_int(&nbigpipe, 1);
		atomic_subtract_int(&amountpipekva, cpipe->pipe_buffer.size);
		vm_map_remove(pipe_map,
		    (vm_offset_t)cpipe->pipe_buffer.buffer,
		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
		cpipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	{
		cpipe->pipe_map.cnt = 0;
		cpipe->pipe_map.pos = 0;
		cpipe->pipe_map.npages = 0;
	}
#endif
}

/*
 * shutdown the pipe
 */
static void
pipeclose(cpipe)
	struct pipe *cpipe;
{
	struct pipepair *pp;
	struct pipe *ppipe;

	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));

	PIPE_LOCK(cpipe);
	pp = cpipe->pipe_pair;

	pipeselwakeup(cpipe);

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	cpipe->pipe_state |= PIPE_EOF;
	while (cpipe->pipe_busy) {
		wakeup(cpipe);
		cpipe->pipe_state |= PIPE_WANT;
		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
	}

	/*
	 * Disconnect from peer, if any.
	 */
	ppipe = cpipe->pipe_peer;
	if (ppipe->pipe_present != 0) {
		pipeselwakeup(ppipe);

		ppipe->pipe_state |= PIPE_EOF;
		wakeup(ppipe);
		KNOTE(&ppipe->pipe_sel.si_note, 0);
	}

	/*
	 * Mark this endpoint as free.  Release kmem resources.  We
	 * don't mark this endpoint as unused until we've finished
	 * doing that, or the pipe might disappear out from under
	 * us.
	 */
	pipelock(cpipe, 0);
	PIPE_UNLOCK(cpipe);
	pipe_free_kmem(cpipe);
	PIPE_LOCK(cpipe);
	cpipe->pipe_present = 0;
	pipeunlock(cpipe);

	/*
	 * If both endpoints are now closed, release the memory for the
	 * pipe pair.  If not, unlock.
	 */
	if (ppipe->pipe_present == 0) {
		PIPE_UNLOCK(cpipe);
#ifdef MAC
		mac_destroy_pipe(pp);
#endif
		uma_zfree(pipe_zone, cpipe->pipe_pair);
	} else
		PIPE_UNLOCK(cpipe);
}
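/*
 * Illustrative userland sketch (not part of this file): once the read
 * side has been shut down via pipeclose() above, a write to the remaining
 * end fails with EPIPE (and the writer gets SIGPIPE unless it ignores it):
 *
 *	#include <errno.h>
 *	#include <signal.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fds[2];
 *
 *		if (pipe(fds) == -1)
 *			return (1);
 *		signal(SIGPIPE, SIG_IGN);
 *		close(fds[0]);
 *		if (write(fds[1], "x", 1) == -1 && errno == EPIPE)
 *			printf("write after reader closed: EPIPE\n");
 *		return (0);
 *	}
 */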
/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *cpipe;

	cpipe = kn->kn_fp->f_data;
	PIPE_LOCK(cpipe);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;
		if (!cpipe->pipe_peer->pipe_present) {
			/* other end of pipe has been closed */
			PIPE_UNLOCK(cpipe);
			return (EPIPE);
		}
		cpipe = cpipe->pipe_peer;
		break;
	default:
		PIPE_UNLOCK(cpipe);
		return (1);
	}

	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
	PIPE_UNLOCK(cpipe);
	return (0);
}

static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;

	PIPE_LOCK(cpipe);
	if (kn->kn_filter == EVFILT_WRITE) {
		if (!cpipe->pipe_peer->pipe_present) {
			PIPE_UNLOCK(cpipe);
			return;
		}
		cpipe = cpipe->pipe_peer;
	}
	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
	PIPE_UNLOCK(cpipe);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	PIPE_LOCK(rpipe);
	kn->kn_data = rpipe->pipe_buffer.cnt;
	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
		kn->kn_data = rpipe->pipe_map.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		PIPE_UNLOCK(rpipe);
		return (1);
	}
	PIPE_UNLOCK(rpipe);
	return (kn->kn_data > 0);
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	PIPE_LOCK(rpipe);
	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		PIPE_UNLOCK(rpipe);
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;

	PIPE_UNLOCK(rpipe);
	return (kn->kn_data >= PIPE_BUF);
}
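/*
 * Illustrative userland sketch (not part of this file): the filters above
 * back EVFILT_READ/EVFILT_WRITE on pipes, so a pipe descriptor can be
 * monitored with kqueue(2); kn_data set above shows up as ev.data:
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *	#include <sys/time.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fds[2], kq;
 *		struct kevent kev, ev;
 *
 *		if (pipe(fds) == -1 || (kq = kqueue()) == -1)
 *			return (1);
 *		EV_SET(&kev, fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
 *		if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
 *			return (1);
 *		write(fds[1], "hi", 2);
 *		if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)
 *			printf("readable, %ld bytes\n", (long)ev.data);
 *		return (0);
 *	}
 */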