/*-
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

/*
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
 * the receiving process can copy it directly from the pages in the sending
 * process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned back to the user-mode side.  In that case, the pipe code
 * arranges to copy the buffer supplied by the user process, to a pageable
 * kernel buffer, and the receiving process will grab the data from the
 * pageable kernel buffer.  Since signals don't happen all that often,
 * the copy operation is normally eliminated.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.
 *
 * In order to limit the resource use of pipes, two sysctls exist:
 *
 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
 * address space available to us in pipe_map.  This value is normally
 * autotuned, but may also be loader tuned.
 *
 * kern.ipc.pipekva - This read-only sysctl tracks the current amount of
 * memory in use by pipes.
 *
 * Based on how large pipekva is relative to maxpipekva, the following
 * will happen:
 *
 * 0% - 50%:
 *     New pipes are given 16K of memory backing, pipes may dynamically
 *     grow to as large as 64K where needed.
 * 50% - 75%:
 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
 *     existing pipes may NOT grow.
 * 75% - 100%:
 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
 *     existing pipes will be shrunk down to 4K whenever possible.
 *
 * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0.  If
 * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE
 * resize which MUST occur for reverse-direction pipes when they are
 * first used.
 *
 * Additional information about the current state of pipes may be obtained
 * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
 * and kern.ipc.piperesizefail.
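 *
 * For example, current usage can be inspected from userland with
 * "sysctl kern.ipc.pipekva kern.ipc.maxpipekva", and resizing can be
 * turned off with "sysctl kern.ipc.piperesizeallowed=0".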
 *
 * Locking rules:  There are two locks present here:  A mutex, used via
 * PIPE_LOCK, and a flag, used via pipelock().  All locking is done via
 * the flag, as mutexes can not persist over uiomove.  The mutex
 * exists only to guard access to the flag, and is not in itself a
 * locking mechanism.  Also note that there is only a single mutex for
 * both directions of a pipe.
 *
 * As pipelock() may have to sleep before it can acquire the flag, it
 * is important to reread all data after a call to pipelock(); everything
 * in the structure may have changed.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/mutex.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/event.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/uma.h>

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * NetBSD or OpenBSD.
 */
/* #define PIPE_NODIRECT */

/*
 * interfaces to the outside world
 */
static fo_rdwr_t pipe_read;
static fo_rdwr_t pipe_write;
static fo_ioctl_t pipe_ioctl;
static fo_poll_t pipe_poll;
static fo_kqfilter_t pipe_kqfilter;
static fo_stat_t pipe_stat;
static fo_close_t pipe_close;

static struct fileops pipeops = {
	.fo_read = pipe_read,
	.fo_write = pipe_write,
	.fo_ioctl = pipe_ioctl,
	.fo_poll = pipe_poll,
	.fo_kqfilter = pipe_kqfilter,
	.fo_stat = pipe_stat,
	.fo_close = pipe_close,
	.fo_flags = DFLAG_PASSABLE
};

static void filt_pipedetach(struct knote *kn);
static int filt_piperead(struct knote *kn, long hint);
static int filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
static struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };

/*
 * Default pipe buffer size(s), this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

static int amountpipes;
static int amountpipekva;
static int pipefragretry;
static int pipeallocfail;
static int piperesizefail;
static int piperesizeallowed = 1;

SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
	   &maxpipekva, 0, "Pipe KVA limit");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
	   &amountpipes, 0, "Current # of pipes");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
	   &amountpipekva, 0, "Pipe KVA usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
	   &pipefragretry, 0, "Pipe allocation retries due to fragmentation");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
	   &pipeallocfail, 0, "Pipe allocation failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
	   &piperesizefail, 0, "Pipe resize failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
	   &piperesizeallowed, 0, "Pipe resizing allowed");

static void pipeinit(void *dummy __unused);
static void pipeclose(struct pipe *cpipe);
static void pipe_free_kmem(struct pipe *cpipe);
static int pipe_create(struct pipe *pipe, int backing);
static __inline int pipelock(struct pipe *cpipe, int catch);
static __inline void pipeunlock(struct pipe *cpipe);
static __inline void pipeselwakeup(struct pipe *cpipe);
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
static void pipe_destroy_write_buffer(struct pipe *wpipe);
static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
static void pipe_clone_write_buffer(struct pipe *wpipe);
#endif
static int pipespace(struct pipe *cpipe, int size);
static int pipespace_new(struct pipe *cpipe, int size);

static int pipe_zone_ctor(void *mem, int size, void *arg, int flags);
static void pipe_zone_dtor(void *mem, int size, void *arg);
static int pipe_zone_init(void *mem, int size, int flags);
static void pipe_zone_fini(void *mem, int size);

static uma_zone_t pipe_zone;

SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);

static void
pipeinit(void *dummy __unused)
{

	pipe_zone = uma_zcreate("PIPE", sizeof(struct pipepair),
	    pipe_zone_ctor, pipe_zone_dtor, pipe_zone_init, pipe_zone_fini,
	    UMA_ALIGN_PTR, 0);
	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
}

static int
pipe_zone_ctor(void *mem, int size, void *arg, int flags)
{
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;

	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));

	pp = (struct pipepair *)mem;

	/*
	 * We zero both pipe endpoints to make sure all the kmem pointers
	 * are NULL, flag fields are zero'd, etc.  We timestamp both
	 * endpoints with the same time.
	 */
	rpipe = &pp->pp_rpipe;
	bzero(rpipe, sizeof(*rpipe));
	vfs_timestamp(&rpipe->pipe_ctime);
	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;

	wpipe = &pp->pp_wpipe;
	bzero(wpipe, sizeof(*wpipe));
	wpipe->pipe_ctime = rpipe->pipe_ctime;
	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;

	rpipe->pipe_peer = wpipe;
	rpipe->pipe_pair = pp;
	wpipe->pipe_peer = rpipe;
	wpipe->pipe_pair = pp;

	/*
	 * Mark both endpoints as present; they will later get free'd
	 * one at a time.
	 * When both are free'd, then the whole pair is released.
	 */
	rpipe->pipe_present = 1;
	wpipe->pipe_present = 1;

	/*
	 * Eventually, the MAC Framework may initialize the label
	 * in ctor or init, but for now we do it elsewhere to avoid
	 * blocking in ctor or init.
	 */
	pp->pp_label = NULL;

	atomic_add_int(&amountpipes, 2);
	return (0);
}

static void
pipe_zone_dtor(void *mem, int size, void *arg)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_dtor: wrong size"));

	pp = (struct pipepair *)mem;

	atomic_subtract_int(&amountpipes, 2);
}

static int
pipe_zone_init(void *mem, int size, int flags)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));

	pp = (struct pipepair *)mem;

	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
	return (0);
}

static void
pipe_zone_fini(void *mem, int size)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));

	pp = (struct pipepair *)mem;

	mtx_destroy(&pp->pp_mtx);
}

/*
 * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail,
 * let the zone pick up the pieces via pipeclose().
 */

/* ARGSUSED */
int
pipe(td, uap)
	struct thread *td;
	struct pipe_args /* {
		int dummy;
	} */ *uap;
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *rf, *wf;
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;
	int fd, error;

	pp = uma_zalloc(pipe_zone, M_WAITOK);
#ifdef MAC
	/*
	 * The MAC label is shared between the connected endpoints.  As a
	 * result mac_init_pipe() and mac_create_pipe() are called once
	 * for the pair, and not on the endpoints.
	 */
	mac_init_pipe(pp);
	mac_create_pipe(td->td_ucred, pp);
#endif
	rpipe = &pp->pp_rpipe;
	wpipe = &pp->pp_wpipe;

	knlist_init(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe), NULL, NULL,
	    NULL);
	knlist_init(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe), NULL, NULL,
	    NULL);

	/* Only the forward direction pipe is backed by default */
	if ((error = pipe_create(rpipe, 1)) != 0 ||
	    (error = pipe_create(wpipe, 0)) != 0) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}

	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe->pipe_state |= PIPE_DIRECTOK;

	error = falloc(td, &rf, &fd);
	if (error) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}
	/* An extra reference on `rf' has been held for us by falloc(). */
	td->td_retval[0] = fd;

	/*
	 * Warning: once we've gotten past allocation of the fd for the
	 * read-side, we can only drop the read side via fdrop() in order
	 * to avoid races against processes which manage to dup() the read
	 * side while we are blocked trying to allocate the write side.
	 */
	FILE_LOCK(rf);
	rf->f_flag = FREAD | FWRITE;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = rpipe;
	rf->f_ops = &pipeops;
	FILE_UNLOCK(rf);
	error = falloc(td, &wf, &fd);
	if (error) {
		fdclose(fdp, rf, td->td_retval[0], td);
		fdrop(rf, td);
		/* rpipe has been closed by fdrop(). */
		pipeclose(wpipe);
		return (error);
	}
	/* An extra reference on `wf' has been held for us by falloc(). */
	FILE_LOCK(wf);
	wf->f_flag = FREAD | FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = wpipe;
	wf->f_ops = &pipeops;
	FILE_UNLOCK(wf);
	fdrop(wf, td);
	td->td_retval[1] = fd;
	fdrop(rf, td);

	return (0);
}

/*
 * Allocate kva for pipe circular buffer, the space is pageable
 * This routine will 'realloc' the size of a pipe safely, if it fails
 * it will retain the old buffer.
 * If it fails it will return ENOMEM.
 */
static int
pipespace_new(cpipe, size)
	struct pipe *cpipe;
	int size;
{
	caddr_t buffer;
	int error, cnt, firstseg;
	static int curfail = 0;
	static struct timeval lastfail;

	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
	KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
	    ("pipespace: resize of direct writes not allowed"));
retry:
	cnt = cpipe->pipe_buffer.cnt;
	if (cnt > size)
		size = cnt;

	size = round_page(size);
	buffer = (caddr_t) vm_map_min(pipe_map);

	error = vm_map_find(pipe_map, NULL, 0,
	    (vm_offset_t *) &buffer, size, 1,
	    VM_PROT_ALL, VM_PROT_ALL, 0);
	if (error != KERN_SUCCESS) {
		if ((cpipe->pipe_buffer.buffer == NULL) &&
		    (size > SMALL_PIPE_SIZE)) {
			size = SMALL_PIPE_SIZE;
			pipefragretry++;
			goto retry;
		}
		if (cpipe->pipe_buffer.buffer == NULL) {
			pipeallocfail++;
			if (ppsratecheck(&lastfail, &curfail, 1))
				printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
		} else {
			piperesizefail++;
		}
		return (ENOMEM);
	}

	/* copy data, then free old resources if we're resizing */
	if (cnt > 0) {
		if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) {
			firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out;
			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
			    buffer, firstseg);
			if ((cnt - firstseg) > 0)
				bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg],
				    cpipe->pipe_buffer.in);
		} else {
			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
			    buffer, cnt);
		}
	}
	pipe_free_kmem(cpipe);
	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = cnt;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = cnt;
	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);
	return (0);
}

/*
 * Wrapper for pipespace_new() that performs locking assertions.
 */
static int
pipespace(cpipe, size)
	struct pipe *cpipe;
	int size;
{

	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
	    ("Unlocked pipe passed to pipespace"));
	return (pipespace_new(cpipe, size));
}

/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(cpipe, catch)
	struct pipe *cpipe;
	int catch;
{
	int error;

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	while (cpipe->pipe_state & PIPE_LOCKFL) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = msleep(cpipe, PIPE_MTX(cpipe), catch ?
		    (PRIBIO | PCATCH) : PRIBIO, "pipelk", 0);
		if (error != 0)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCKFL;
	return (0);
}

/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(cpipe)
	struct pipe *cpipe;
{

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
	    ("Unlocked pipe passed to pipeunlock"));
	cpipe->pipe_state &= ~PIPE_LOCKFL;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

static __inline void
pipeselwakeup(cpipe)
	struct pipe *cpipe;
{

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeuppri(&cpipe->pipe_sel, PSOCK);
	}
	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
	KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
}

/*
 * Initialize and allocate VM and memory for pipe.  The structure
 * will start out zero'd from the ctor, so we just manage the kmem.
 */
static int
pipe_create(pipe, backing)
	struct pipe *pipe;
	int backing;
{
	int error;

	if (backing) {
		if (amountpipekva > maxpipekva / 2)
			error = pipespace_new(pipe, SMALL_PIPE_SIZE);
		else
			error = pipespace_new(pipe, PIPE_SIZE);
	} else {
		/* If we're not backing this pipe, no need to do anything. */
		error = 0;
	}
	return (error);
}

/* ARGSUSED */
static int
pipe_read(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct pipe *rpipe = fp->f_data;
	int error;
	int nread = 0;
	u_int size;

	PIPE_LOCK(rpipe);
	++rpipe->pipe_busy;
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

#ifdef MAC
	error = mac_check_pipe_read(active_cred, rpipe->pipe_pair);
	if (error)
		goto locked_error;
#endif
	if (amountpipekva > (3 * maxpipekva) / 4) {
		if (!(rpipe->pipe_state & PIPE_DIRECTW) &&
		    (rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
		    (rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
		    (piperesizeallowed == 1)) {
			PIPE_UNLOCK(rpipe);
			pipespace(rpipe, SMALL_PIPE_SIZE);
			PIPE_LOCK(rpipe);
		}
	}

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			PIPE_UNLOCK(rpipe);
			error = uiomove(
			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
			    size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;

			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
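		 * The pages were faulted in and held by the writer in
		 * pipe_build_write_buffer(); uiomove_fromphys() copies the
		 * data straight out of them.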
		 */
		} else if ((size = rpipe->pipe_map.cnt) &&
		    (rpipe->pipe_state & PIPE_DIRECTW)) {
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			PIPE_UNLOCK(rpipe);
			error = uiomove_fromphys(rpipe->pipe_map.ms,
			    rpipe->pipe_map.pos, size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * Unlock the pipe buffer for our remaining processing.
			 * We will either break out with an error or we will
			 * sleep and relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * Handle non-blocking mode operation or
			 * wait for more data.
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
			} else {
				rpipe->pipe_state |= PIPE_WANTR;
				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
				    PRIBIO | PCATCH,
				    "piperd", 0)) == 0)
					error = pipelock(rpipe, 1);
			}
			if (error)
				goto unlocked_error;
		}
	}
#ifdef MAC
locked_error:
#endif
	pipeunlock(rpipe);

	/* XXX: should probably do this before getting any locks. */
	if (error == 0)
		vfs_timestamp(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
		pipeselwakeup(rpipe);

	PIPE_UNLOCK(rpipe);
	return (error);
}

#ifndef PIPE_NODIRECT
/*
 * Map the sending processes' buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	pmap_t pmap;
	u_int size;
	int i, j;
	vm_offset_t addr, endaddr;

	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
	KASSERT(wpipe->pipe_state & PIPE_DIRECTW,
	    ("Clone attempt on non-direct write pipe!"));

	size = (u_int) uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	pmap = vmspace_pmap(curproc->p_vmspace);
	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
		/*
		 * vm_fault_quick() can sleep.  Consequently,
		 * vm_page_lock_queue() and vm_page_unlock_queue()
		 * should not be performed outside of this loop.
		 */
	race:
		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
			vm_page_lock_queues();
			for (j = 0; j < i; j++)
				vm_page_unhold(wpipe->pipe_map.ms[j]);
			vm_page_unlock_queues();
			return (EFAULT);
		}
		wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
		    VM_PROT_READ);
		if (wpipe->pipe_map.ms[i] == NULL)
			goto race;
	}

	/*
	 * set up the control block
	 */
	wpipe->pipe_map.npages = i;
	wpipe->pipe_map.pos =
	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

	/*
	 * and update the uio data
	 */

	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return (0);
}

/*
 * unmap and unwire the process buffer
 */
static void
pipe_destroy_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int i;

	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	vm_page_lock_queues();
	for (i = 0; i < wpipe->pipe_map.npages; i++) {
		vm_page_unhold(wpipe->pipe_map.ms[i]);
	}
	vm_page_unlock_queues();
	wpipe->pipe_map.npages = 0;
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(wpipe)
	struct pipe *wpipe;
{
	struct uio uio;
	struct iovec iov;
	int size;
	int pos;

	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	size = wpipe->pipe_map.cnt;
	pos = wpipe->pipe_map.pos;

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~PIPE_DIRECTW;

	PIPE_UNLOCK(wpipe);
	iov.iov_base = wpipe->pipe_buffer.buffer;
	iov.iov_len = size;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = 0;
	uio.uio_resid = size;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = curthread;
	uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
	PIPE_LOCK(wpipe);
	pipe_destroy_write_buffer(wpipe);
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
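 *
 * The reader clears PIPE_DIRECTW once pipe_map.cnt has drained to zero
 * and wakes the writer, which then tears the mapping down, or, if the
 * write was interrupted, clones the remaining data into the kernel
 * buffer via pipe_clone_write_buffer().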
 */
static int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error;

retry:
	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	error = pipelock(wpipe, 1);
	if (wpipe->pipe_state & PIPE_EOF)
		error = EPIPE;
	if (error) {
		pipeunlock(wpipe);
		goto error1;
	}
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		pipeunlock(wpipe);
		error = msleep(wpipe, PIPE_MTX(wpipe),
		    PRIBIO | PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		else
			goto retry;
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		pipeunlock(wpipe);
		error = msleep(wpipe, PIPE_MTX(wpipe),
		    PRIBIO | PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		else
			goto retry;
	}

	wpipe->pipe_state |= PIPE_DIRECTW;

	PIPE_UNLOCK(wpipe);
	error = pipe_build_write_buffer(wpipe, uio);
	PIPE_LOCK(wpipe);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		pipeunlock(wpipe);
		goto error1;
	}

	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipe_destroy_write_buffer(wpipe);
			pipeselwakeup(wpipe);
			pipeunlock(wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		pipeunlock(wpipe);
		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
		    "pipdwt", 0);
		pipelock(wpipe, 0);
	}

	if (wpipe->pipe_state & PIPE_EOF)
		error = EPIPE;
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
	} else {
		pipe_destroy_write_buffer(wpipe);
	}
	pipeunlock(wpipe);
	return (error);

error1:
	wakeup(wpipe);
	return (error);
}
#endif

static int
pipe_write(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	int error = 0;
	int desiredsize, orig_resid;
	struct pipe *wpipe, *rpipe;

	rpipe = fp->f_data;
	wpipe = rpipe->pipe_peer;

	PIPE_LOCK(rpipe);
	error = pipelock(wpipe, 1);
	if (error) {
		PIPE_UNLOCK(rpipe);
		return (error);
	}
	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
		pipeunlock(wpipe);
		PIPE_UNLOCK(rpipe);
		return (EPIPE);
	}
#ifdef MAC
	error = mac_check_pipe_write(active_cred, wpipe->pipe_pair);
	if (error) {
		pipeunlock(wpipe);
		PIPE_UNLOCK(rpipe);
		return (error);
	}
#endif
	++wpipe->pipe_busy;

	/* Choose a larger size if it's advantageous */
	desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size);
	while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) {
		if (piperesizeallowed != 1)
			break;
		if (amountpipekva > maxpipekva / 2)
			break;
		if (desiredsize == BIG_PIPE_SIZE)
			break;
		desiredsize = desiredsize * 2;
	}

	/* Choose a smaller size if we're in an OOM situation */
	if ((amountpipekva > (3 * maxpipekva) / 4) &&
	    (wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
	    (wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
	    (piperesizeallowed == 1))
		desiredsize = SMALL_PIPE_SIZE;

	/* Resize if the above determined that a new size was necessary */
	if ((desiredsize != wpipe->pipe_buffer.size) &&
	    ((wpipe->pipe_state & PIPE_DIRECTW) == 0)) {
		PIPE_UNLOCK(wpipe);
		pipespace(wpipe, desiredsize);
		PIPE_LOCK(wpipe);
	}
	if (wpipe->pipe_buffer.size == 0) {
		/*
		 * This can only happen for reverse direction use of pipes
		 * in a complete OOM situation.
		 */
		error = ENOMEM;
		--wpipe->pipe_busy;
		pipeunlock(wpipe);
		PIPE_UNLOCK(wpipe);
		return (error);
	}

	pipeunlock(wpipe);

	orig_resid = uio->uio_resid;

	while (uio->uio_resid) {
		int space;

		pipelock(wpipe, 0);
		if (wpipe->pipe_state & PIPE_EOF) {
			pipeunlock(wpipe);
			error = EPIPE;
			break;
		}
#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * The direct write mechanism will detect the reader going
		 * away on us.
		 */
		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
		    (wpipe->pipe_buffer.size >= PIPE_MINDIRECT) &&
		    (fp->f_flag & FNONBLOCK) == 0) {
			pipeunlock(wpipe);
			error = pipe_direct_write(wpipe, uio);
			if (error)
				break;
			continue;
		}
#endif

		/*
		 * Pipe buffered writes cannot be coincidental with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.  We break out if a signal occurs or the
		 * reader goes away.
		 */
		if (wpipe->pipe_state & PIPE_DIRECTW) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			pipeunlock(wpipe);
			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
			    "pipbww", 0);
			if (error)
				break;
			else
				continue;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		if (space > 0) {
			int size;	/* Transfer size */
			int segsize;	/* first segment to transfer */

			/*
			 * Transfer size is minimum of uio transfer
			 * and free space in pipe buffer.
			 */
			if (space > uio->uio_resid)
				size = uio->uio_resid;
			else
				size = space;
			/*
			 * First segment to transfer is minimum of
			 * transfer size and contiguous space in
			 * pipe buffer.  If first segment to transfer
			 * is less than the transfer size, we've got
			 * a wraparound in the buffer.
			 */
			segsize = wpipe->pipe_buffer.size -
			    wpipe->pipe_buffer.in;
			if (segsize > size)
				segsize = size;

			/* Transfer first segment */

			PIPE_UNLOCK(rpipe);
			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
			    segsize, uio);
			PIPE_LOCK(rpipe);

			if (error == 0 && segsize < size) {
				KASSERT(wpipe->pipe_buffer.in + segsize ==
				    wpipe->pipe_buffer.size,
				    ("Pipe buffer wraparound disappeared"));
				/*
				 * Transfer remaining part now, to
				 * support atomic writes.  Wraparound
				 * happened.
				 */

				PIPE_UNLOCK(rpipe);
				error = uiomove(
				    &wpipe->pipe_buffer.buffer[0],
				    size - segsize, uio);
				PIPE_LOCK(rpipe);
			}
			if (error == 0) {
				wpipe->pipe_buffer.in += size;
				if (wpipe->pipe_buffer.in >=
				    wpipe->pipe_buffer.size) {
					KASSERT(wpipe->pipe_buffer.in ==
					    size - segsize +
					    wpipe->pipe_buffer.size,
					    ("Expected wraparound bad"));
					wpipe->pipe_buffer.in = size - segsize;
				}

				wpipe->pipe_buffer.cnt += size;
				KASSERT(wpipe->pipe_buffer.cnt <=
				    wpipe->pipe_buffer.size,
				    ("Pipe buffer overflow"));
			}
			pipeunlock(wpipe);
			if (error != 0)
				break;
		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				pipeunlock(wpipe);
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			pipeunlock(wpipe);
			error = msleep(wpipe, PIPE_MTX(rpipe),
			    PRIBIO | PCATCH, "pipewr", 0);
			if (error != 0)
				break;
		}
	}

	pipelock(wpipe, 0);
	--wpipe->pipe_busy;

	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((wpipe->pipe_buffer.cnt == 0) &&
	    (uio->uio_resid == 0) &&
	    (error == EPIPE)) {
		error = 0;
	}

	if (error == 0)
		vfs_timestamp(&wpipe->pipe_mtime);

	/*
	 * We have something to offer,
	 * wake up select/poll.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	pipeunlock(wpipe);
	PIPE_UNLOCK(rpipe);
	return (error);
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
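 * FIONBIO, FIOASYNC, FIONREAD, FIOSETOWN/FIOGETOWN and the deprecated
 * TIOCSPGRP/TIOCGPGRP are handled; anything else returns ENOTTY.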
 */
static int
pipe_ioctl(fp, cmd, data, active_cred, td)
	struct file *fp;
	u_long cmd;
	void *data;
	struct ucred *active_cred;
	struct thread *td;
{
	struct pipe *mpipe = fp->f_data;
	int error;

	PIPE_LOCK(mpipe);

#ifdef MAC
	error = mac_check_pipe_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
	if (error) {
		PIPE_UNLOCK(mpipe);
		return (error);
	}
#endif

	error = 0;
	switch (cmd) {

	case FIONBIO:
		break;

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		break;

	case FIONREAD:
		if (mpipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = mpipe->pipe_map.cnt;
		else
			*(int *)data = mpipe->pipe_buffer.cnt;
		break;

	case FIOSETOWN:
		PIPE_UNLOCK(mpipe);
		error = fsetown(*(int *)data, &mpipe->pipe_sigio);
		goto out_unlocked;

	case FIOGETOWN:
		*(int *)data = fgetown(&mpipe->pipe_sigio);
		break;

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		PIPE_UNLOCK(mpipe);
		error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
		goto out_unlocked;

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		*(int *)data = -fgetown(&mpipe->pipe_sigio);
		break;

	default:
		error = ENOTTY;
		break;
	}
	PIPE_UNLOCK(mpipe);
out_unlocked:
	return (error);
}

static int
pipe_poll(fp, events, active_cred, td)
	struct file *fp;
	int events;
	struct ucred *active_cred;
	struct thread *td;
{
	struct pipe *rpipe = fp->f_data;
	struct pipe *wpipe;
	int revents = 0;
#ifdef MAC
	int error;
#endif

	wpipe = rpipe->pipe_peer;
	PIPE_LOCK(rpipe);
#ifdef MAC
	error = mac_check_pipe_poll(active_cred, rpipe->pipe_pair);
	if (error)
		goto locked_error;
#endif
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
		    (rpipe->pipe_buffer.cnt > 0) ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (!wpipe->pipe_present || (wpipe->pipe_state & PIPE_EOF) ||
		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
		    (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (!wpipe->pipe_present) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(td, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(td, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}
#ifdef MAC
locked_error:
#endif
	PIPE_UNLOCK(rpipe);

	return (revents);
}

/*
 * We shouldn't need locks here as we're doing a read and this should
 * be a natural race.
 */
static int
pipe_stat(fp, ub, active_cred, td)
	struct file *fp;
	struct stat *ub;
	struct ucred *active_cred;
	struct thread *td;
{
	struct pipe *pipe = fp->f_data;
#ifdef MAC
	int error;

	PIPE_LOCK(pipe);
	error = mac_check_pipe_stat(active_cred, pipe->pipe_pair);
	PIPE_UNLOCK(pipe);
	if (error)
		return (error);
#endif
	bzero(ub, sizeof(*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = PAGE_SIZE;
	if (pipe->pipe_state & PIPE_DIRECTW)
		ub->st_size = pipe->pipe_map.cnt;
	else
		ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atimespec = pipe->pipe_atime;
	ub->st_mtimespec = pipe->pipe_mtime;
	ub->st_ctimespec = pipe->pipe_ctime;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

/* ARGSUSED */
static int
pipe_close(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct pipe *cpipe = fp->f_data;

	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	funsetown(&cpipe->pipe_sigio);
	pipeclose(cpipe);
	return (0);
}

static void
pipe_free_kmem(cpipe)
	struct pipe *cpipe;
{

	KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
	    ("pipe_free_kmem: pipe mutex locked"));

	if (cpipe->pipe_buffer.buffer != NULL) {
		atomic_subtract_int(&amountpipekva, cpipe->pipe_buffer.size);
		vm_map_remove(pipe_map,
		    (vm_offset_t)cpipe->pipe_buffer.buffer,
		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
		cpipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	{
		cpipe->pipe_map.cnt = 0;
		cpipe->pipe_map.pos = 0;
		cpipe->pipe_map.npages = 0;
	}
#endif
}

/*
 * shutdown the pipe
 */
static void
pipeclose(cpipe)
	struct pipe *cpipe;
{
	struct pipepair *pp;
	struct pipe *ppipe;

	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));

	PIPE_LOCK(cpipe);
	pipelock(cpipe, 0);
	pp = cpipe->pipe_pair;

	pipeselwakeup(cpipe);

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	cpipe->pipe_state |= PIPE_EOF;
	while (cpipe->pipe_busy) {
		wakeup(cpipe);
		cpipe->pipe_state |= PIPE_WANT;
		pipeunlock(cpipe);
		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
		pipelock(cpipe, 0);
	}


	/*
	 * Disconnect from peer, if any.
	 */
	ppipe = cpipe->pipe_peer;
	if (ppipe->pipe_present != 0) {
		pipeselwakeup(ppipe);

		ppipe->pipe_state |= PIPE_EOF;
		wakeup(ppipe);
		KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0);
	}

	/*
	 * Mark this endpoint as free.  Release kmem resources.  We
	 * don't mark this endpoint as unused until we've finished
	 * doing that, or the pipe might disappear out from under
	 * us.
	 */
	PIPE_UNLOCK(cpipe);
	pipe_free_kmem(cpipe);
	PIPE_LOCK(cpipe);
	cpipe->pipe_present = 0;
	pipeunlock(cpipe);
	knlist_clear(&cpipe->pipe_sel.si_note, 1);
	knlist_destroy(&cpipe->pipe_sel.si_note);

	/*
	 * If both endpoints are now closed, release the memory for the
	 * pipe pair.  If not, unlock.
	 */
	if (ppipe->pipe_present == 0) {
		PIPE_UNLOCK(cpipe);
#ifdef MAC
		mac_destroy_pipe(pp);
#endif
		uma_zfree(pipe_zone, cpipe->pipe_pair);
	} else
		PIPE_UNLOCK(cpipe);
}

/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *cpipe;

	cpipe = kn->kn_fp->f_data;
	PIPE_LOCK(cpipe);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;
		if (!cpipe->pipe_peer->pipe_present) {
			/* other end of pipe has been closed */
			PIPE_UNLOCK(cpipe);
			return (EPIPE);
		}
		cpipe = cpipe->pipe_peer;
		break;
	default:
		PIPE_UNLOCK(cpipe);
		return (EINVAL);
	}

	knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
	PIPE_UNLOCK(cpipe);
	return (0);
}

static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;

	PIPE_LOCK(cpipe);
	if (kn->kn_filter == EVFILT_WRITE) {
		if (!cpipe->pipe_peer->pipe_present) {
			PIPE_UNLOCK(cpipe);
			return;
		}
		cpipe = cpipe->pipe_peer;
	}
	knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
	PIPE_UNLOCK(cpipe);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;
	int ret;

	PIPE_LOCK(rpipe);
	kn->kn_data = rpipe->pipe_buffer.cnt;
	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
		kn->kn_data = rpipe->pipe_map.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		PIPE_UNLOCK(rpipe);
		return (1);
	}
	ret = kn->kn_data > 0;
	PIPE_UNLOCK(rpipe);
	return ret;
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	PIPE_LOCK(rpipe);
	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		PIPE_UNLOCK(rpipe);
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;

	PIPE_UNLOCK(rpipe);
	return (kn->kn_data >= PIPE_BUF);
}