/*-
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

/*
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, the sending process pins the underlying pages in
 * memory, and the receiving process copies directly from these pinned pages
 * in the sending process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned back to the user-mode side.  In that case, the pipe code
 * arranges to copy the buffer supplied by the user process, to a pageable
 * kernel buffer, and the receiving process will grab the data from the
 * pageable kernel buffer.  Since signals don't happen all that often,
 * the copy operation is normally eliminated.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.
 *
 * In order to limit the resource use of pipes, two sysctls exist:
 *
 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
 * address space available to us in pipe_map.  This value is normally
 * autotuned, but may also be loader tuned.
 *
 * kern.ipc.pipekva - This read-only sysctl tracks the current amount of
 * memory in use by pipes.
 *
 * Based on how large pipekva is relative to maxpipekva, the following
 * will happen:
 *
 * 0% - 50%:
 *     New pipes are given 16K of memory backing, pipes may dynamically
 *     grow to as large as 64K where needed.
 * 50% - 75%:
 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
 *     existing pipes may NOT grow.
 * 75% - 100%:
 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
 *     existing pipes will be shrunk down to 4K whenever possible.
 *
 * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0.  If
 * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE
 * resize which MUST occur for reverse-direction pipes when they are
 * first used.
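 *
 * For example, resizing can be disabled at run-time with
 * "sysctl kern.ipc.piperesizeallowed=0", and the pipe_map ceiling can be
 * raised from the loader with a tunable such as
 * "kern.ipc.maxpipekva=33554432" (the autotuned default depends on the
 * amount of kernel address space available).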
 *
 * Additional information about the current state of pipes may be obtained
 * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
 * and kern.ipc.piperesizefail.
 *
 * Locking rules:  There are two locks present here:  A mutex, used via
 * PIPE_LOCK, and a flag, used via pipelock().  All locking is done via
 * the flag, as mutexes can not persist over uiomove.  The mutex
 * exists only to guard access to the flag, and is not in itself a
 * locking mechanism.  Also note that there is only a single mutex for
 * both directions of a pipe.
 *
 * As pipelock() may have to sleep before it can acquire the flag, it
 * is important to reread all data after a call to pipelock(); everything
 * in the structure may have changed.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/event.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/uma.h>

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * NetBSD or OpenBSD.
 */
/* #define PIPE_NODIRECT */

/*
 * interfaces to the outside world
 */
static fo_rdwr_t	pipe_read;
static fo_rdwr_t	pipe_write;
static fo_truncate_t	pipe_truncate;
static fo_ioctl_t	pipe_ioctl;
static fo_poll_t	pipe_poll;
static fo_kqfilter_t	pipe_kqfilter;
static fo_stat_t	pipe_stat;
static fo_close_t	pipe_close;

static struct fileops pipeops = {
	.fo_read = pipe_read,
	.fo_write = pipe_write,
	.fo_truncate = pipe_truncate,
	.fo_ioctl = pipe_ioctl,
	.fo_poll = pipe_poll,
	.fo_kqfilter = pipe_kqfilter,
	.fo_stat = pipe_stat,
	.fo_close = pipe_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_flags = DFLAG_PASSABLE
};

static void	filt_pipedetach(struct knote *kn);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_pipedetach,
	.f_event = filt_piperead
};
static struct filterops pipe_wfiltops = {
	.f_isfd = 1,
	.f_detach = filt_pipedetach,
	.f_event = filt_pipewrite
};

/*
 * Default pipe buffer size(s), this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
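 *
 * MINPIPESIZE below is the low watermark used by pipe_read() for
 * write-side wakeup hysteresis: sleeping writers are only awakened once
 * the buffer has drained below that point.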
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

static long amountpipekva;
static int pipefragretry;
static int pipeallocfail;
static int piperesizefail;
static int piperesizeallowed = 1;

SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
    &maxpipekva, 0, "Pipe KVA limit");
SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
    &amountpipekva, 0, "Pipe KVA usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
    &pipefragretry, 0, "Pipe allocation retries due to fragmentation");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
    &pipeallocfail, 0, "Pipe allocation failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
    &piperesizefail, 0, "Pipe resize failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
    &piperesizeallowed, 0, "Pipe resizing allowed");

static void pipeinit(void *dummy __unused);
static void pipeclose(struct pipe *cpipe);
static void pipe_free_kmem(struct pipe *cpipe);
static int pipe_create(struct pipe *pipe, int backing);
static __inline int pipelock(struct pipe *cpipe, int catch);
static __inline void pipeunlock(struct pipe *cpipe);
static __inline void pipeselwakeup(struct pipe *cpipe);
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
static void pipe_destroy_write_buffer(struct pipe *wpipe);
static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
static void pipe_clone_write_buffer(struct pipe *wpipe);
#endif
static int pipespace(struct pipe *cpipe, int size);
static int pipespace_new(struct pipe *cpipe, int size);

static int	pipe_zone_ctor(void *mem, int size, void *arg, int flags);
static int	pipe_zone_init(void *mem, int size, int flags);
static void	pipe_zone_fini(void *mem, int size);

static uma_zone_t pipe_zone;
static struct unrhdr *pipeino_unr;
static dev_t pipedev_ino;

SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);

static void
pipeinit(void *dummy __unused)
{

	pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair),
	    pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini,
	    UMA_ALIGN_PTR, 0);
	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
	pipeino_unr = new_unrhdr(1, INT32_MAX, NULL);
	KASSERT(pipeino_unr != NULL, ("pipe fake inodes not initialized"));
	pipedev_ino = devfs_alloc_cdp_inode();
	KASSERT(pipedev_ino > 0, ("pipe dev inode not initialized"));
}

static int
pipe_zone_ctor(void *mem, int size, void *arg, int flags)
{
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;

	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));

	pp = (struct pipepair *)mem;

	/*
	 * We zero both pipe endpoints to make sure all the kmem pointers
	 * are NULL, flag fields are zero'd, etc.  We timestamp both
	 * endpoints with the same time.
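	 * (The pair's mutex is deliberately not touched here; it is set
	 * up in pipe_zone_init() and so survives ctor/dtor cycles while
	 * the item is cached in the zone.)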
	 */
	rpipe = &pp->pp_rpipe;
	bzero(rpipe, sizeof(*rpipe));
	vfs_timestamp(&rpipe->pipe_ctime);
	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;

	wpipe = &pp->pp_wpipe;
	bzero(wpipe, sizeof(*wpipe));
	wpipe->pipe_ctime = rpipe->pipe_ctime;
	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;

	rpipe->pipe_peer = wpipe;
	rpipe->pipe_pair = pp;
	wpipe->pipe_peer = rpipe;
	wpipe->pipe_pair = pp;

	/*
	 * Mark both endpoints as present; they will later get free'd
	 * one at a time.  When both are free'd, then the whole pair
	 * is released.
	 */
	rpipe->pipe_present = PIPE_ACTIVE;
	wpipe->pipe_present = PIPE_ACTIVE;

	/*
	 * Eventually, the MAC Framework may initialize the label
	 * in ctor or init, but for now we do it elsewhere to avoid
	 * blocking in ctor or init.
	 */
	pp->pp_label = NULL;

	return (0);
}

static int
pipe_zone_init(void *mem, int size, int flags)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));

	pp = (struct pipepair *)mem;

	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
	return (0);
}

static void
pipe_zone_fini(void *mem, int size)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));

	pp = (struct pipepair *)mem;

	mtx_destroy(&pp->pp_mtx);
}

/*
 * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail, let
 * the zone pick up the pieces via pipeclose().
 */
int
kern_pipe(struct thread *td, int fildes[2])
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *rf, *wf;
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;
	int fd, error;

	pp = uma_zalloc(pipe_zone, M_WAITOK);
#ifdef MAC
	/*
	 * The MAC label is shared between the connected endpoints.  As a
	 * result mac_pipe_init() and mac_pipe_create() are called once
	 * for the pair, and not on the endpoints.
	 */
	mac_pipe_init(pp);
	mac_pipe_create(td->td_ucred, pp);
#endif
	rpipe = &pp->pp_rpipe;
	wpipe = &pp->pp_wpipe;

	knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe));
	knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe));

	/* Only the forward direction pipe is backed by default */
	if ((error = pipe_create(rpipe, 1)) != 0 ||
	    (error = pipe_create(wpipe, 0)) != 0) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}

	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe->pipe_state |= PIPE_DIRECTOK;

	error = falloc(td, &rf, &fd, 0);
	if (error) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}
	/* An extra reference on `rf' has been held for us by falloc(). */
	fildes[0] = fd;

	/*
	 * Warning: once we've gotten past allocation of the fd for the
	 * read-side, we can only drop the read side via fdrop() in order
	 * to avoid races against processes which manage to dup() the read
	 * side while we are blocked trying to allocate the write side.
	 */
	finit(rf, FREAD | FWRITE, DTYPE_PIPE, rpipe, &pipeops);
	error = falloc(td, &wf, &fd, 0);
	if (error) {
		fdclose(fdp, rf, fildes[0], td);
		fdrop(rf, td);
		/* rpipe has been closed by fdrop(). */
		pipeclose(wpipe);
		return (error);
	}
	/*
	 * An extra reference on `wf' has been held for us by falloc().
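	 * As with `rf', it is dropped via fdrop() below, once the
	 * descriptor table holds its own reference.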
	 */
	finit(wf, FREAD | FWRITE, DTYPE_PIPE, wpipe, &pipeops);
	fdrop(wf, td);
	fildes[1] = fd;
	fdrop(rf, td);

	return (0);
}

/* ARGSUSED */
int
sys_pipe(struct thread *td, struct pipe_args *uap)
{
	int error;
	int fildes[2];

	error = kern_pipe(td, fildes);
	if (error)
		return (error);

	td->td_retval[0] = fildes[0];
	td->td_retval[1] = fildes[1];

	return (0);
}

/*
 * Allocate kva for the pipe circular buffer, the space is pageable.
 * This routine will 'realloc' the size of a pipe safely; if it fails
 * it will retain the old buffer and return ENOMEM.
 */
static int
pipespace_new(cpipe, size)
	struct pipe *cpipe;
	int size;
{
	caddr_t buffer;
	int error, cnt, firstseg;
	static int curfail = 0;
	static struct timeval lastfail;

	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
	KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
	    ("pipespace: resize of direct writes not allowed"));
retry:
	cnt = cpipe->pipe_buffer.cnt;
	if (cnt > size)
		size = cnt;

	size = round_page(size);
	buffer = (caddr_t) vm_map_min(pipe_map);

	error = vm_map_find(pipe_map, NULL, 0,
	    (vm_offset_t *) &buffer, size, 1,
	    VM_PROT_ALL, VM_PROT_ALL, 0);
	if (error != KERN_SUCCESS) {
		if ((cpipe->pipe_buffer.buffer == NULL) &&
		    (size > SMALL_PIPE_SIZE)) {
			size = SMALL_PIPE_SIZE;
			pipefragretry++;
			goto retry;
		}
		if (cpipe->pipe_buffer.buffer == NULL) {
			pipeallocfail++;
			if (ppsratecheck(&lastfail, &curfail, 1))
				printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
		} else {
			piperesizefail++;
		}
		return (ENOMEM);
	}

	/* copy data, then free old resources if we're resizing */
	if (cnt > 0) {
		if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) {
			firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out;
			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
				buffer, firstseg);
			if ((cnt - firstseg) > 0)
				bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg],
					cpipe->pipe_buffer.in);
		} else {
			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
				buffer, cnt);
		}
	}
	pipe_free_kmem(cpipe);
	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = cnt;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = cnt;
	atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size);
	return (0);
}

/*
 * Wrapper for pipespace_new() that performs locking assertions.
 */
static int
pipespace(cpipe, size)
	struct pipe *cpipe;
	int size;
{

	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
	    ("Unlocked pipe passed to pipespace"));
	return (pipespace_new(cpipe, size));
}

/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(cpipe, catch)
	struct pipe *cpipe;
	int catch;
{
	int error;

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	while (cpipe->pipe_state & PIPE_LOCKFL) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = msleep(cpipe, PIPE_MTX(cpipe),
		    catch ? (PRIBIO | PCATCH) : PRIBIO,
		    "pipelk", 0);
		if (error != 0)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCKFL;
	return (0);
}

/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(cpipe)
	struct pipe *cpipe;
{

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
	    ("Unlocked pipe passed to pipeunlock"));
	cpipe->pipe_state &= ~PIPE_LOCKFL;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

static __inline void
pipeselwakeup(cpipe)
	struct pipe *cpipe;
{

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	if (cpipe->pipe_state & PIPE_SEL) {
		selwakeuppri(&cpipe->pipe_sel, PSOCK);
		if (!SEL_WAITING(&cpipe->pipe_sel))
			cpipe->pipe_state &= ~PIPE_SEL;
	}
	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
	KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
}

/*
 * Initialize and allocate VM and memory for pipe.  The structure
 * will start out zero'd from the ctor, so we just manage the kmem.
 */
static int
pipe_create(pipe, backing)
	struct pipe *pipe;
	int backing;
{
	int error;

	if (backing) {
		if (amountpipekva > maxpipekva / 2)
			error = pipespace_new(pipe, SMALL_PIPE_SIZE);
		else
			error = pipespace_new(pipe, PIPE_SIZE);
	} else {
		/* If we're not backing this pipe, no need to do anything. */
		error = 0;
	}
	if (error == 0) {
		pipe->pipe_ino = alloc_unr(pipeino_unr);
		if (pipe->pipe_ino == -1)
			/* pipeclose will clear allocated kva */
			error = ENOMEM;
	}
	return (error);
}

/* ARGSUSED */
static int
pipe_read(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct pipe *rpipe = fp->f_data;
	int error;
	int nread = 0;
	u_int size;

	PIPE_LOCK(rpipe);
	++rpipe->pipe_busy;
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

#ifdef MAC
	error = mac_pipe_check_read(active_cred, rpipe->pipe_pair);
	if (error)
		goto locked_error;
#endif
	if (amountpipekva > (3 * maxpipekva) / 4) {
		if (!(rpipe->pipe_state & PIPE_DIRECTW) &&
		    (rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
		    (rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
		    (piperesizeallowed == 1)) {
			PIPE_UNLOCK(rpipe);
			pipespace(rpipe, SMALL_PIPE_SIZE);
			PIPE_LOCK(rpipe);
		}
	}

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			PIPE_UNLOCK(rpipe);
			error = uiomove(
			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
			    size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;

			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
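			 * Subsequent writes then start filling at offset
			 * zero again, so small transfers keep reusing the
			 * same, cache-warm pages.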
646 */ 647 if (rpipe->pipe_buffer.cnt == 0) { 648 rpipe->pipe_buffer.in = 0; 649 rpipe->pipe_buffer.out = 0; 650 } 651 nread += size; 652 #ifndef PIPE_NODIRECT 653 /* 654 * Direct copy, bypassing a kernel buffer. 655 */ 656 } else if ((size = rpipe->pipe_map.cnt) && 657 (rpipe->pipe_state & PIPE_DIRECTW)) { 658 if (size > (u_int) uio->uio_resid) 659 size = (u_int) uio->uio_resid; 660 661 PIPE_UNLOCK(rpipe); 662 error = uiomove_fromphys(rpipe->pipe_map.ms, 663 rpipe->pipe_map.pos, size, uio); 664 PIPE_LOCK(rpipe); 665 if (error) 666 break; 667 nread += size; 668 rpipe->pipe_map.pos += size; 669 rpipe->pipe_map.cnt -= size; 670 if (rpipe->pipe_map.cnt == 0) { 671 rpipe->pipe_state &= ~PIPE_DIRECTW; 672 wakeup(rpipe); 673 } 674 #endif 675 } else { 676 /* 677 * detect EOF condition 678 * read returns 0 on EOF, no need to set error 679 */ 680 if (rpipe->pipe_state & PIPE_EOF) 681 break; 682 683 /* 684 * If the "write-side" has been blocked, wake it up now. 685 */ 686 if (rpipe->pipe_state & PIPE_WANTW) { 687 rpipe->pipe_state &= ~PIPE_WANTW; 688 wakeup(rpipe); 689 } 690 691 /* 692 * Break if some data was read. 693 */ 694 if (nread > 0) 695 break; 696 697 /* 698 * Unlock the pipe buffer for our remaining processing. 699 * We will either break out with an error or we will 700 * sleep and relock to loop. 701 */ 702 pipeunlock(rpipe); 703 704 /* 705 * Handle non-blocking mode operation or 706 * wait for more data. 707 */ 708 if (fp->f_flag & FNONBLOCK) { 709 error = EAGAIN; 710 } else { 711 rpipe->pipe_state |= PIPE_WANTR; 712 if ((error = msleep(rpipe, PIPE_MTX(rpipe), 713 PRIBIO | PCATCH, 714 "piperd", 0)) == 0) 715 error = pipelock(rpipe, 1); 716 } 717 if (error) 718 goto unlocked_error; 719 } 720 } 721 #ifdef MAC 722 locked_error: 723 #endif 724 pipeunlock(rpipe); 725 726 /* XXX: should probably do this before getting any locks. */ 727 if (error == 0) 728 vfs_timestamp(&rpipe->pipe_atime); 729 unlocked_error: 730 --rpipe->pipe_busy; 731 732 /* 733 * PIPE_WANT processing only makes sense if pipe_busy is 0. 734 */ 735 if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { 736 rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); 737 wakeup(rpipe); 738 } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { 739 /* 740 * Handle write blocking hysteresis. 741 */ 742 if (rpipe->pipe_state & PIPE_WANTW) { 743 rpipe->pipe_state &= ~PIPE_WANTW; 744 wakeup(rpipe); 745 } 746 } 747 748 if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) 749 pipeselwakeup(rpipe); 750 751 PIPE_UNLOCK(rpipe); 752 return (error); 753 } 754 755 #ifndef PIPE_NODIRECT 756 /* 757 * Map the sending processes' buffer into kernel space and wire it. 758 * This is similar to a physical write operation. 
759 */ 760 static int 761 pipe_build_write_buffer(wpipe, uio) 762 struct pipe *wpipe; 763 struct uio *uio; 764 { 765 u_int size; 766 int i; 767 768 PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); 769 KASSERT(wpipe->pipe_state & PIPE_DIRECTW, 770 ("Clone attempt on non-direct write pipe!")); 771 772 size = (u_int) uio->uio_iov->iov_len; 773 if (size > wpipe->pipe_buffer.size) 774 size = wpipe->pipe_buffer.size; 775 776 if ((i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, 777 (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ, 778 wpipe->pipe_map.ms, PIPENPAGES)) < 0) 779 return (EFAULT); 780 781 /* 782 * set up the control block 783 */ 784 wpipe->pipe_map.npages = i; 785 wpipe->pipe_map.pos = 786 ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; 787 wpipe->pipe_map.cnt = size; 788 789 /* 790 * and update the uio data 791 */ 792 793 uio->uio_iov->iov_len -= size; 794 uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size; 795 if (uio->uio_iov->iov_len == 0) 796 uio->uio_iov++; 797 uio->uio_resid -= size; 798 uio->uio_offset += size; 799 return (0); 800 } 801 802 /* 803 * unmap and unwire the process buffer 804 */ 805 static void 806 pipe_destroy_write_buffer(wpipe) 807 struct pipe *wpipe; 808 { 809 810 PIPE_LOCK_ASSERT(wpipe, MA_OWNED); 811 vm_page_unhold_pages(wpipe->pipe_map.ms, wpipe->pipe_map.npages); 812 wpipe->pipe_map.npages = 0; 813 } 814 815 /* 816 * In the case of a signal, the writing process might go away. This 817 * code copies the data into the circular buffer so that the source 818 * pages can be freed without loss of data. 819 */ 820 static void 821 pipe_clone_write_buffer(wpipe) 822 struct pipe *wpipe; 823 { 824 struct uio uio; 825 struct iovec iov; 826 int size; 827 int pos; 828 829 PIPE_LOCK_ASSERT(wpipe, MA_OWNED); 830 size = wpipe->pipe_map.cnt; 831 pos = wpipe->pipe_map.pos; 832 833 wpipe->pipe_buffer.in = size; 834 wpipe->pipe_buffer.out = 0; 835 wpipe->pipe_buffer.cnt = size; 836 wpipe->pipe_state &= ~PIPE_DIRECTW; 837 838 PIPE_UNLOCK(wpipe); 839 iov.iov_base = wpipe->pipe_buffer.buffer; 840 iov.iov_len = size; 841 uio.uio_iov = &iov; 842 uio.uio_iovcnt = 1; 843 uio.uio_offset = 0; 844 uio.uio_resid = size; 845 uio.uio_segflg = UIO_SYSSPACE; 846 uio.uio_rw = UIO_READ; 847 uio.uio_td = curthread; 848 uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio); 849 PIPE_LOCK(wpipe); 850 pipe_destroy_write_buffer(wpipe); 851 } 852 853 /* 854 * This implements the pipe buffer write mechanism. Note that only 855 * a direct write OR a normal pipe write can be pending at any given time. 856 * If there are any characters in the pipe buffer, the direct write will 857 * be deferred until the receiving process grabs all of the bytes from 858 * the pipe buffer. Then the direct mapping write is set-up. 
859 */ 860 static int 861 pipe_direct_write(wpipe, uio) 862 struct pipe *wpipe; 863 struct uio *uio; 864 { 865 int error; 866 867 retry: 868 PIPE_LOCK_ASSERT(wpipe, MA_OWNED); 869 error = pipelock(wpipe, 1); 870 if (wpipe->pipe_state & PIPE_EOF) 871 error = EPIPE; 872 if (error) { 873 pipeunlock(wpipe); 874 goto error1; 875 } 876 while (wpipe->pipe_state & PIPE_DIRECTW) { 877 if (wpipe->pipe_state & PIPE_WANTR) { 878 wpipe->pipe_state &= ~PIPE_WANTR; 879 wakeup(wpipe); 880 } 881 pipeselwakeup(wpipe); 882 wpipe->pipe_state |= PIPE_WANTW; 883 pipeunlock(wpipe); 884 error = msleep(wpipe, PIPE_MTX(wpipe), 885 PRIBIO | PCATCH, "pipdww", 0); 886 if (error) 887 goto error1; 888 else 889 goto retry; 890 } 891 wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ 892 if (wpipe->pipe_buffer.cnt > 0) { 893 if (wpipe->pipe_state & PIPE_WANTR) { 894 wpipe->pipe_state &= ~PIPE_WANTR; 895 wakeup(wpipe); 896 } 897 pipeselwakeup(wpipe); 898 wpipe->pipe_state |= PIPE_WANTW; 899 pipeunlock(wpipe); 900 error = msleep(wpipe, PIPE_MTX(wpipe), 901 PRIBIO | PCATCH, "pipdwc", 0); 902 if (error) 903 goto error1; 904 else 905 goto retry; 906 } 907 908 wpipe->pipe_state |= PIPE_DIRECTW; 909 910 PIPE_UNLOCK(wpipe); 911 error = pipe_build_write_buffer(wpipe, uio); 912 PIPE_LOCK(wpipe); 913 if (error) { 914 wpipe->pipe_state &= ~PIPE_DIRECTW; 915 pipeunlock(wpipe); 916 goto error1; 917 } 918 919 error = 0; 920 while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { 921 if (wpipe->pipe_state & PIPE_EOF) { 922 pipe_destroy_write_buffer(wpipe); 923 pipeselwakeup(wpipe); 924 pipeunlock(wpipe); 925 error = EPIPE; 926 goto error1; 927 } 928 if (wpipe->pipe_state & PIPE_WANTR) { 929 wpipe->pipe_state &= ~PIPE_WANTR; 930 wakeup(wpipe); 931 } 932 pipeselwakeup(wpipe); 933 pipeunlock(wpipe); 934 error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, 935 "pipdwt", 0); 936 pipelock(wpipe, 0); 937 } 938 939 if (wpipe->pipe_state & PIPE_EOF) 940 error = EPIPE; 941 if (wpipe->pipe_state & PIPE_DIRECTW) { 942 /* 943 * this bit of trickery substitutes a kernel buffer for 944 * the process that might be going away. 945 */ 946 pipe_clone_write_buffer(wpipe); 947 } else { 948 pipe_destroy_write_buffer(wpipe); 949 } 950 pipeunlock(wpipe); 951 return (error); 952 953 error1: 954 wakeup(wpipe); 955 return (error); 956 } 957 #endif 958 959 static int 960 pipe_write(fp, uio, active_cred, flags, td) 961 struct file *fp; 962 struct uio *uio; 963 struct ucred *active_cred; 964 struct thread *td; 965 int flags; 966 { 967 int error = 0; 968 int desiredsize, orig_resid; 969 struct pipe *wpipe, *rpipe; 970 971 rpipe = fp->f_data; 972 wpipe = rpipe->pipe_peer; 973 974 PIPE_LOCK(rpipe); 975 error = pipelock(wpipe, 1); 976 if (error) { 977 PIPE_UNLOCK(rpipe); 978 return (error); 979 } 980 /* 981 * detect loss of pipe read side, issue SIGPIPE if lost. 
982 */ 983 if (wpipe->pipe_present != PIPE_ACTIVE || 984 (wpipe->pipe_state & PIPE_EOF)) { 985 pipeunlock(wpipe); 986 PIPE_UNLOCK(rpipe); 987 return (EPIPE); 988 } 989 #ifdef MAC 990 error = mac_pipe_check_write(active_cred, wpipe->pipe_pair); 991 if (error) { 992 pipeunlock(wpipe); 993 PIPE_UNLOCK(rpipe); 994 return (error); 995 } 996 #endif 997 ++wpipe->pipe_busy; 998 999 /* Choose a larger size if it's advantageous */ 1000 desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size); 1001 while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) { 1002 if (piperesizeallowed != 1) 1003 break; 1004 if (amountpipekva > maxpipekva / 2) 1005 break; 1006 if (desiredsize == BIG_PIPE_SIZE) 1007 break; 1008 desiredsize = desiredsize * 2; 1009 } 1010 1011 /* Choose a smaller size if we're in a OOM situation */ 1012 if ((amountpipekva > (3 * maxpipekva) / 4) && 1013 (wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) && 1014 (wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) && 1015 (piperesizeallowed == 1)) 1016 desiredsize = SMALL_PIPE_SIZE; 1017 1018 /* Resize if the above determined that a new size was necessary */ 1019 if ((desiredsize != wpipe->pipe_buffer.size) && 1020 ((wpipe->pipe_state & PIPE_DIRECTW) == 0)) { 1021 PIPE_UNLOCK(wpipe); 1022 pipespace(wpipe, desiredsize); 1023 PIPE_LOCK(wpipe); 1024 } 1025 if (wpipe->pipe_buffer.size == 0) { 1026 /* 1027 * This can only happen for reverse direction use of pipes 1028 * in a complete OOM situation. 1029 */ 1030 error = ENOMEM; 1031 --wpipe->pipe_busy; 1032 pipeunlock(wpipe); 1033 PIPE_UNLOCK(wpipe); 1034 return (error); 1035 } 1036 1037 pipeunlock(wpipe); 1038 1039 orig_resid = uio->uio_resid; 1040 1041 while (uio->uio_resid) { 1042 int space; 1043 1044 pipelock(wpipe, 0); 1045 if (wpipe->pipe_state & PIPE_EOF) { 1046 pipeunlock(wpipe); 1047 error = EPIPE; 1048 break; 1049 } 1050 #ifndef PIPE_NODIRECT 1051 /* 1052 * If the transfer is large, we can gain performance if 1053 * we do process-to-process copies directly. 1054 * If the write is non-blocking, we don't use the 1055 * direct write mechanism. 1056 * 1057 * The direct write mechanism will detect the reader going 1058 * away on us. 1059 */ 1060 if (uio->uio_segflg == UIO_USERSPACE && 1061 uio->uio_iov->iov_len >= PIPE_MINDIRECT && 1062 wpipe->pipe_buffer.size >= PIPE_MINDIRECT && 1063 (fp->f_flag & FNONBLOCK) == 0) { 1064 pipeunlock(wpipe); 1065 error = pipe_direct_write(wpipe, uio); 1066 if (error) 1067 break; 1068 continue; 1069 } 1070 #endif 1071 1072 /* 1073 * Pipe buffered writes cannot be coincidental with 1074 * direct writes. We wait until the currently executing 1075 * direct write is completed before we start filling the 1076 * pipe buffer. We break out if a signal occurs or the 1077 * reader goes away. 1078 */ 1079 if (wpipe->pipe_state & PIPE_DIRECTW) { 1080 if (wpipe->pipe_state & PIPE_WANTR) { 1081 wpipe->pipe_state &= ~PIPE_WANTR; 1082 wakeup(wpipe); 1083 } 1084 pipeselwakeup(wpipe); 1085 wpipe->pipe_state |= PIPE_WANTW; 1086 pipeunlock(wpipe); 1087 error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, 1088 "pipbww", 0); 1089 if (error) 1090 break; 1091 else 1092 continue; 1093 } 1094 1095 space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; 1096 1097 /* Writes of size <= PIPE_BUF must be atomic. 
		 */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		if (space > 0) {
			int size;	/* Transfer size */
			int segsize;	/* first segment to transfer */

			/*
			 * Transfer size is minimum of uio transfer
			 * and free space in pipe buffer.
			 */
			if (space > uio->uio_resid)
				size = uio->uio_resid;
			else
				size = space;
			/*
			 * First segment to transfer is minimum of
			 * transfer size and contiguous space in
			 * pipe buffer.  If first segment to transfer
			 * is less than the transfer size, we've got
			 * a wraparound in the buffer.
			 */
			segsize = wpipe->pipe_buffer.size -
				wpipe->pipe_buffer.in;
			if (segsize > size)
				segsize = size;

			/* Transfer first segment */

			PIPE_UNLOCK(rpipe);
			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
					segsize, uio);
			PIPE_LOCK(rpipe);

			if (error == 0 && segsize < size) {
				KASSERT(wpipe->pipe_buffer.in + segsize ==
					wpipe->pipe_buffer.size,
					("Pipe buffer wraparound disappeared"));
				/*
				 * Transfer remaining part now, to
				 * support atomic writes.  Wraparound
				 * happened.
				 */

				PIPE_UNLOCK(rpipe);
				error = uiomove(
				    &wpipe->pipe_buffer.buffer[0],
				    size - segsize, uio);
				PIPE_LOCK(rpipe);
			}
			if (error == 0) {
				wpipe->pipe_buffer.in += size;
				if (wpipe->pipe_buffer.in >=
				    wpipe->pipe_buffer.size) {
					KASSERT(wpipe->pipe_buffer.in ==
						size - segsize +
						wpipe->pipe_buffer.size,
						("Expected wraparound bad"));
					wpipe->pipe_buffer.in = size - segsize;
				}

				wpipe->pipe_buffer.cnt += size;
				KASSERT(wpipe->pipe_buffer.cnt <=
					wpipe->pipe_buffer.size,
					("Pipe buffer overflow"));
			}
			pipeunlock(wpipe);
			if (error != 0)
				break;
		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				pipeunlock(wpipe);
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			pipeunlock(wpipe);
			error = msleep(wpipe, PIPE_MTX(rpipe),
			    PRIBIO | PCATCH, "pipewr", 0);
			if (error != 0)
				break;
		}
	}

	pipelock(wpipe, 0);
	--wpipe->pipe_busy;

	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((wpipe->pipe_buffer.cnt == 0) &&
	    (uio->uio_resid == 0) &&
	    (error == EPIPE)) {
		error = 0;
	}

	if (error == 0)
		vfs_timestamp(&wpipe->pipe_mtime);

	/*
	 * We have something to offer,
	 * wake up select/poll.
	 */
1232 */ 1233 if (wpipe->pipe_buffer.cnt) 1234 pipeselwakeup(wpipe); 1235 1236 pipeunlock(wpipe); 1237 PIPE_UNLOCK(rpipe); 1238 return (error); 1239 } 1240 1241 /* ARGSUSED */ 1242 static int 1243 pipe_truncate(fp, length, active_cred, td) 1244 struct file *fp; 1245 off_t length; 1246 struct ucred *active_cred; 1247 struct thread *td; 1248 { 1249 1250 return (EINVAL); 1251 } 1252 1253 /* 1254 * we implement a very minimal set of ioctls for compatibility with sockets. 1255 */ 1256 static int 1257 pipe_ioctl(fp, cmd, data, active_cred, td) 1258 struct file *fp; 1259 u_long cmd; 1260 void *data; 1261 struct ucred *active_cred; 1262 struct thread *td; 1263 { 1264 struct pipe *mpipe = fp->f_data; 1265 int error; 1266 1267 PIPE_LOCK(mpipe); 1268 1269 #ifdef MAC 1270 error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data); 1271 if (error) { 1272 PIPE_UNLOCK(mpipe); 1273 return (error); 1274 } 1275 #endif 1276 1277 error = 0; 1278 switch (cmd) { 1279 1280 case FIONBIO: 1281 break; 1282 1283 case FIOASYNC: 1284 if (*(int *)data) { 1285 mpipe->pipe_state |= PIPE_ASYNC; 1286 } else { 1287 mpipe->pipe_state &= ~PIPE_ASYNC; 1288 } 1289 break; 1290 1291 case FIONREAD: 1292 if (mpipe->pipe_state & PIPE_DIRECTW) 1293 *(int *)data = mpipe->pipe_map.cnt; 1294 else 1295 *(int *)data = mpipe->pipe_buffer.cnt; 1296 break; 1297 1298 case FIOSETOWN: 1299 PIPE_UNLOCK(mpipe); 1300 error = fsetown(*(int *)data, &mpipe->pipe_sigio); 1301 goto out_unlocked; 1302 1303 case FIOGETOWN: 1304 *(int *)data = fgetown(&mpipe->pipe_sigio); 1305 break; 1306 1307 /* This is deprecated, FIOSETOWN should be used instead. */ 1308 case TIOCSPGRP: 1309 PIPE_UNLOCK(mpipe); 1310 error = fsetown(-(*(int *)data), &mpipe->pipe_sigio); 1311 goto out_unlocked; 1312 1313 /* This is deprecated, FIOGETOWN should be used instead. 
	case TIOCGPGRP:
		*(int *)data = -fgetown(&mpipe->pipe_sigio);
		break;

	default:
		error = ENOTTY;
		break;
	}
	PIPE_UNLOCK(mpipe);
out_unlocked:
	return (error);
}

static int
pipe_poll(fp, events, active_cred, td)
	struct file *fp;
	int events;
	struct ucred *active_cred;
	struct thread *td;
{
	struct pipe *rpipe = fp->f_data;
	struct pipe *wpipe;
	int revents = 0;
#ifdef MAC
	int error;
#endif

	wpipe = rpipe->pipe_peer;
	PIPE_LOCK(rpipe);
#ifdef MAC
	error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair);
	if (error)
		goto locked_error;
#endif
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
		    (rpipe->pipe_buffer.cnt > 0))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (wpipe->pipe_present != PIPE_ACTIVE ||
		    (wpipe->pipe_state & PIPE_EOF) ||
		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((events & POLLINIGNEOF) == 0) {
		if (rpipe->pipe_state & PIPE_EOF) {
			revents |= (events & (POLLIN | POLLRDNORM));
			if (wpipe->pipe_present != PIPE_ACTIVE ||
			    (wpipe->pipe_state & PIPE_EOF))
				revents |= POLLHUP;
		}
	}

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(td, &rpipe->pipe_sel);
			if (SEL_WAITING(&rpipe->pipe_sel))
				rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(td, &wpipe->pipe_sel);
			if (SEL_WAITING(&wpipe->pipe_sel))
				wpipe->pipe_state |= PIPE_SEL;
		}
	}
#ifdef MAC
locked_error:
#endif
	PIPE_UNLOCK(rpipe);

	return (revents);
}

/*
 * We shouldn't need locks here as we're doing a read and this should
 * be a natural race.
 */
static int
pipe_stat(fp, ub, active_cred, td)
	struct file *fp;
	struct stat *ub;
	struct ucred *active_cred;
	struct thread *td;
{
	struct pipe *pipe = fp->f_data;
#ifdef MAC
	int error;

	PIPE_LOCK(pipe);
	error = mac_pipe_check_stat(active_cred, pipe->pipe_pair);
	PIPE_UNLOCK(pipe);
	if (error)
		return (error);
#endif
	bzero(ub, sizeof(*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = PAGE_SIZE;
	if (pipe->pipe_state & PIPE_DIRECTW)
		ub->st_size = pipe->pipe_map.cnt;
	else
		ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atim = pipe->pipe_atime;
	ub->st_mtim = pipe->pipe_mtime;
	ub->st_ctim = pipe->pipe_ctime;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	ub->st_dev = pipedev_ino;
	ub->st_ino = pipe->pipe_ino;
	/*
	 * Left as 0: st_nlink, st_rdev, st_flags, st_gen.
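	 * The st_dev/st_ino values above let userland tell pipes apart;
	 * the inode number was handed out by the unr allocator seeded in
	 * pipeinit().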
1428 */ 1429 return (0); 1430 } 1431 1432 /* ARGSUSED */ 1433 static int 1434 pipe_close(fp, td) 1435 struct file *fp; 1436 struct thread *td; 1437 { 1438 struct pipe *cpipe = fp->f_data; 1439 1440 fp->f_ops = &badfileops; 1441 fp->f_data = NULL; 1442 funsetown(&cpipe->pipe_sigio); 1443 pipeclose(cpipe); 1444 return (0); 1445 } 1446 1447 static void 1448 pipe_free_kmem(cpipe) 1449 struct pipe *cpipe; 1450 { 1451 1452 KASSERT(!mtx_owned(PIPE_MTX(cpipe)), 1453 ("pipe_free_kmem: pipe mutex locked")); 1454 1455 if (cpipe->pipe_buffer.buffer != NULL) { 1456 atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size); 1457 vm_map_remove(pipe_map, 1458 (vm_offset_t)cpipe->pipe_buffer.buffer, 1459 (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size); 1460 cpipe->pipe_buffer.buffer = NULL; 1461 } 1462 #ifndef PIPE_NODIRECT 1463 { 1464 cpipe->pipe_map.cnt = 0; 1465 cpipe->pipe_map.pos = 0; 1466 cpipe->pipe_map.npages = 0; 1467 } 1468 #endif 1469 } 1470 1471 /* 1472 * shutdown the pipe 1473 */ 1474 static void 1475 pipeclose(cpipe) 1476 struct pipe *cpipe; 1477 { 1478 struct pipepair *pp; 1479 struct pipe *ppipe; 1480 ino_t ino; 1481 1482 KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL")); 1483 1484 PIPE_LOCK(cpipe); 1485 pipelock(cpipe, 0); 1486 pp = cpipe->pipe_pair; 1487 1488 pipeselwakeup(cpipe); 1489 1490 /* 1491 * If the other side is blocked, wake it up saying that 1492 * we want to close it down. 1493 */ 1494 cpipe->pipe_state |= PIPE_EOF; 1495 while (cpipe->pipe_busy) { 1496 wakeup(cpipe); 1497 cpipe->pipe_state |= PIPE_WANT; 1498 pipeunlock(cpipe); 1499 msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0); 1500 pipelock(cpipe, 0); 1501 } 1502 1503 1504 /* 1505 * Disconnect from peer, if any. 1506 */ 1507 ppipe = cpipe->pipe_peer; 1508 if (ppipe->pipe_present == PIPE_ACTIVE) { 1509 pipeselwakeup(ppipe); 1510 1511 ppipe->pipe_state |= PIPE_EOF; 1512 wakeup(ppipe); 1513 KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0); 1514 } 1515 1516 /* 1517 * Mark this endpoint as free. Release kmem resources. We 1518 * don't mark this endpoint as unused until we've finished 1519 * doing that, or the pipe might disappear out from under 1520 * us. 1521 */ 1522 PIPE_UNLOCK(cpipe); 1523 pipe_free_kmem(cpipe); 1524 PIPE_LOCK(cpipe); 1525 cpipe->pipe_present = PIPE_CLOSING; 1526 pipeunlock(cpipe); 1527 1528 /* 1529 * knlist_clear() may sleep dropping the PIPE_MTX. Set the 1530 * PIPE_FINALIZED, that allows other end to free the 1531 * pipe_pair, only after the knotes are completely dismantled. 1532 */ 1533 knlist_clear(&cpipe->pipe_sel.si_note, 1); 1534 cpipe->pipe_present = PIPE_FINALIZED; 1535 seldrain(&cpipe->pipe_sel); 1536 knlist_destroy(&cpipe->pipe_sel.si_note); 1537 1538 /* 1539 * Postpone the destroy of the fake inode number allocated for 1540 * our end, until pipe mtx is unlocked. 1541 */ 1542 ino = cpipe->pipe_ino; 1543 1544 /* 1545 * If both endpoints are now closed, release the memory for the 1546 * pipe pair. If not, unlock. 
1547 */ 1548 if (ppipe->pipe_present == PIPE_FINALIZED) { 1549 PIPE_UNLOCK(cpipe); 1550 #ifdef MAC 1551 mac_pipe_destroy(pp); 1552 #endif 1553 uma_zfree(pipe_zone, cpipe->pipe_pair); 1554 } else 1555 PIPE_UNLOCK(cpipe); 1556 1557 if (ino > 0) 1558 free_unr(pipeino_unr, cpipe->pipe_ino); 1559 } 1560 1561 /*ARGSUSED*/ 1562 static int 1563 pipe_kqfilter(struct file *fp, struct knote *kn) 1564 { 1565 struct pipe *cpipe; 1566 1567 cpipe = kn->kn_fp->f_data; 1568 PIPE_LOCK(cpipe); 1569 switch (kn->kn_filter) { 1570 case EVFILT_READ: 1571 kn->kn_fop = &pipe_rfiltops; 1572 break; 1573 case EVFILT_WRITE: 1574 kn->kn_fop = &pipe_wfiltops; 1575 if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) { 1576 /* other end of pipe has been closed */ 1577 PIPE_UNLOCK(cpipe); 1578 return (EPIPE); 1579 } 1580 cpipe = cpipe->pipe_peer; 1581 break; 1582 default: 1583 PIPE_UNLOCK(cpipe); 1584 return (EINVAL); 1585 } 1586 1587 knlist_add(&cpipe->pipe_sel.si_note, kn, 1); 1588 PIPE_UNLOCK(cpipe); 1589 return (0); 1590 } 1591 1592 static void 1593 filt_pipedetach(struct knote *kn) 1594 { 1595 struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data; 1596 1597 PIPE_LOCK(cpipe); 1598 if (kn->kn_filter == EVFILT_WRITE) 1599 cpipe = cpipe->pipe_peer; 1600 knlist_remove(&cpipe->pipe_sel.si_note, kn, 1); 1601 PIPE_UNLOCK(cpipe); 1602 } 1603 1604 /*ARGSUSED*/ 1605 static int 1606 filt_piperead(struct knote *kn, long hint) 1607 { 1608 struct pipe *rpipe = kn->kn_fp->f_data; 1609 struct pipe *wpipe = rpipe->pipe_peer; 1610 int ret; 1611 1612 PIPE_LOCK(rpipe); 1613 kn->kn_data = rpipe->pipe_buffer.cnt; 1614 if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) 1615 kn->kn_data = rpipe->pipe_map.cnt; 1616 1617 if ((rpipe->pipe_state & PIPE_EOF) || 1618 wpipe->pipe_present != PIPE_ACTIVE || 1619 (wpipe->pipe_state & PIPE_EOF)) { 1620 kn->kn_flags |= EV_EOF; 1621 PIPE_UNLOCK(rpipe); 1622 return (1); 1623 } 1624 ret = kn->kn_data > 0; 1625 PIPE_UNLOCK(rpipe); 1626 return ret; 1627 } 1628 1629 /*ARGSUSED*/ 1630 static int 1631 filt_pipewrite(struct knote *kn, long hint) 1632 { 1633 struct pipe *rpipe = kn->kn_fp->f_data; 1634 struct pipe *wpipe = rpipe->pipe_peer; 1635 1636 PIPE_LOCK(rpipe); 1637 if (wpipe->pipe_present != PIPE_ACTIVE || 1638 (wpipe->pipe_state & PIPE_EOF)) { 1639 kn->kn_data = 0; 1640 kn->kn_flags |= EV_EOF; 1641 PIPE_UNLOCK(rpipe); 1642 return (1); 1643 } 1644 kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; 1645 if (wpipe->pipe_state & PIPE_DIRECTW) 1646 kn->kn_data = 0; 1647 1648 PIPE_UNLOCK(rpipe); 1649 return (kn->kn_data >= PIPE_BUF); 1650 } 1651