/*-
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

/*
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
 * the receiving process can copy it directly from the pages in the sending
 * process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned back to the user-mode side.  In that case, the pipe code
 * arranges to copy the buffer supplied by the user process, to a pageable
 * kernel buffer, and the receiving process will grab the data from the
 * pageable kernel buffer.  Since signals don't happen all that often,
 * the copy operation is normally eliminated.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.
 *
 * In order to limit the resource use of pipes, two sysctls exist:
 *
 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
 * address space available to us in pipe_map.  This value is normally
 * autotuned, but may also be loader tuned.
 *
 * kern.ipc.pipekva - This read-only sysctl tracks the current amount of
 * memory in use by pipes.
 *
 * Based on how large pipekva is relative to maxpipekva, the following
 * will happen:
 *
 * 0% - 50%:
 *     New pipes are given 16K of memory backing, pipes may dynamically
 *     grow to as large as 64K where needed.
 * 50% - 75%:
 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
 *     existing pipes may NOT grow.
 * 75% - 100%:
 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
 *     existing pipes will be shrunk down to 4K whenever possible.
 *
 * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0.  If
 * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE
 * resize which MUST occur for reverse-direction pipes when they are
 * first used.
 *
 * Additional information about the current state of pipes may be obtained
 * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
 * and kern.ipc.piperesizefail.
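 *
 * As an illustration only (a hypothetical userland fragment, not part of
 * this file, compiled against <sys/types.h>, <sys/sysctl.h> and <stdio.h>),
 * the counters and limits above can be read through sysctlbyname(3); the
 * value printed by such a program is entirely system dependent:
 *
 *	int kva;
 *	size_t len = sizeof(kva);
 *
 *	if (sysctlbyname("kern.ipc.pipekva", &kva, &len, NULL, 0) == 0)
 *		printf("pipe KVA currently in use: %d bytes\n", kva);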
 *
 * Locking rules:  There are two locks present here:  A mutex, used via
 * PIPE_LOCK, and a flag, used via pipelock().  All locking is done via
 * the flag, as mutexes can not persist over uiomove.  The mutex
 * exists only to guard access to the flag, and is not in itself a
 * locking mechanism.  Also note that there is only a single mutex for
 * both directions of a pipe.
 *
 * As pipelock() may have to sleep before it can acquire the flag, it
 * is important to reread all data after a call to pipelock(); everything
 * in the structure may have changed.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/event.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/uma.h>

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * NetBSD or OpenBSD.
 */
/* #define PIPE_NODIRECT */

/*
 * interfaces to the outside world
 */
static fo_rdwr_t	pipe_read;
static fo_rdwr_t	pipe_write;
static fo_ioctl_t	pipe_ioctl;
static fo_poll_t	pipe_poll;
static fo_kqfilter_t	pipe_kqfilter;
static fo_stat_t	pipe_stat;
static fo_close_t	pipe_close;

static struct fileops pipeops = {
	.fo_read = pipe_read,
	.fo_write = pipe_write,
	.fo_ioctl = pipe_ioctl,
	.fo_poll = pipe_poll,
	.fo_kqfilter = pipe_kqfilter,
	.fo_stat = pipe_stat,
	.fo_close = pipe_close,
	.fo_flags = DFLAG_PASSABLE
};

static void	filt_pipedetach(struct knote *kn);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
static struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };

/*
 * Default pipe buffer size(s), this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
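 *
 * As a rough worked example (assuming the usual PIPE_SIZE of 16384 bytes
 * from sys/pipe.h; the exact value is not defined in this file), the
 * thresholds below evaluate to MINPIPESIZE = 16384 / 3 = 5461 bytes and
 * MAXPIPESIZE = 2 * 16384 / 3 = 10922 bytes; pipe_read() uses MINPIPESIZE
 * as the low-water mark below which a blocked writer is woken up again.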
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

static int amountpipes;
static int amountpipekva;
static int pipefragretry;
static int pipeallocfail;
static int piperesizefail;
static int piperesizeallowed = 1;

SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
	   &maxpipekva, 0, "Pipe KVA limit");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
	   &amountpipes, 0, "Current # of pipes");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
	   &amountpipekva, 0, "Pipe KVA usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
	   &pipefragretry, 0, "Pipe allocation retries due to fragmentation");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
	   &pipeallocfail, 0, "Pipe allocation failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
	   &piperesizefail, 0, "Pipe resize failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
	   &piperesizeallowed, 0, "Pipe resizing allowed");

static void pipeinit(void *dummy __unused);
static void pipeclose(struct pipe *cpipe);
static void pipe_free_kmem(struct pipe *cpipe);
static int pipe_create(struct pipe *pipe, int backing);
static __inline int pipelock(struct pipe *cpipe, int catch);
static __inline void pipeunlock(struct pipe *cpipe);
static __inline void pipeselwakeup(struct pipe *cpipe);
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
static void pipe_destroy_write_buffer(struct pipe *wpipe);
static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
static void pipe_clone_write_buffer(struct pipe *wpipe);
#endif
static int pipespace(struct pipe *cpipe, int size);
static int pipespace_new(struct pipe *cpipe, int size);

static int	pipe_zone_ctor(void *mem, int size, void *arg, int flags);
static void	pipe_zone_dtor(void *mem, int size, void *arg);
static int	pipe_zone_init(void *mem, int size, int flags);
static void	pipe_zone_fini(void *mem, int size);

static uma_zone_t pipe_zone;

SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);

static void
pipeinit(void *dummy __unused)
{

	pipe_zone = uma_zcreate("PIPE", sizeof(struct pipepair),
	    pipe_zone_ctor, pipe_zone_dtor, pipe_zone_init, pipe_zone_fini,
	    UMA_ALIGN_PTR, 0);
	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
}

static int
pipe_zone_ctor(void *mem, int size, void *arg, int flags)
{
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;

	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));

	pp = (struct pipepair *)mem;

	/*
	 * We zero both pipe endpoints to make sure all the kmem pointers
	 * are NULL, flag fields are zero'd, etc.  We timestamp both
	 * endpoints with the same time.
	 */
	rpipe = &pp->pp_rpipe;
	bzero(rpipe, sizeof(*rpipe));
	vfs_timestamp(&rpipe->pipe_ctime);
	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;

	wpipe = &pp->pp_wpipe;
	bzero(wpipe, sizeof(*wpipe));
	wpipe->pipe_ctime = rpipe->pipe_ctime;
	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;

	rpipe->pipe_peer = wpipe;
	rpipe->pipe_pair = pp;
	wpipe->pipe_peer = rpipe;
	wpipe->pipe_pair = pp;

	/*
	 * Mark both endpoints as present; they will later get free'd
	 * one at a time.
	 * When both are free'd, then the whole pair
	 * is released.
	 */
	rpipe->pipe_present = 1;
	wpipe->pipe_present = 1;

	/*
	 * Eventually, the MAC Framework may initialize the label
	 * in ctor or init, but for now we do it elsewhere to avoid
	 * blocking in ctor or init.
	 */
	pp->pp_label = NULL;

	atomic_add_int(&amountpipes, 2);
	return (0);
}

static void
pipe_zone_dtor(void *mem, int size, void *arg)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_dtor: wrong size"));

	pp = (struct pipepair *)mem;

	atomic_subtract_int(&amountpipes, 2);
}

static int
pipe_zone_init(void *mem, int size, int flags)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));

	pp = (struct pipepair *)mem;

	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
	return (0);
}

static void
pipe_zone_fini(void *mem, int size)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));

	pp = (struct pipepair *)mem;

	mtx_destroy(&pp->pp_mtx);
}

/*
 * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail, let
 * the zone pick up the pieces via pipeclose().
 */
/* ARGSUSED */
int
pipe(td, uap)
	struct thread *td;
	struct pipe_args /* {
		int	dummy;
	} */ *uap;
{
	struct filedesc *fdp = td->td_proc->p_fd;
	struct file *rf, *wf;
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;
	int fd, error;

	pp = uma_zalloc(pipe_zone, M_WAITOK);
#ifdef MAC
	/*
	 * The MAC label is shared between the connected endpoints.  As a
	 * result mac_init_pipe() and mac_create_pipe() are called once
	 * for the pair, and not on the endpoints.
	 */
	mac_init_pipe(pp);
	mac_create_pipe(td->td_ucred, pp);
#endif
	rpipe = &pp->pp_rpipe;
	wpipe = &pp->pp_wpipe;

	knlist_init(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe), NULL, NULL,
	    NULL);
	knlist_init(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe), NULL, NULL,
	    NULL);

	/* Only the forward direction pipe is backed by default */
	if ((error = pipe_create(rpipe, 1)) != 0 ||
	    (error = pipe_create(wpipe, 0)) != 0) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}

	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe->pipe_state |= PIPE_DIRECTOK;

	error = falloc(td, &rf, &fd);
	if (error) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}
	/* An extra reference on `rf' has been held for us by falloc(). */
	td->td_retval[0] = fd;

	/*
	 * Warning: once we've gotten past allocation of the fd for the
	 * read-side, we can only drop the read side via fdrop() in order
	 * to avoid races against processes which manage to dup() the read
	 * side while we are blocked trying to allocate the write side.
	 */
	FILE_LOCK(rf);
	rf->f_flag = FREAD | FWRITE;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = rpipe;
	rf->f_ops = &pipeops;
	FILE_UNLOCK(rf);
	error = falloc(td, &wf, &fd);
	if (error) {
		fdclose(fdp, rf, td->td_retval[0], td);
		fdrop(rf, td);
		/* rpipe has been closed by fdrop(). */
		pipeclose(wpipe);
		return (error);
	}
	/* An extra reference on `wf' has been held for us by falloc(). */
	FILE_LOCK(wf);
	wf->f_flag = FREAD | FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = wpipe;
	wf->f_ops = &pipeops;
	FILE_UNLOCK(wf);
	fdrop(wf, td);
	td->td_retval[1] = fd;
	fdrop(rf, td);

	return (0);
}

/*
 * Allocate kva for pipe circular buffer, the space is pageable
 * This routine will 'realloc' the size of a pipe safely, if it fails
 * it will retain the old buffer.
 * If it fails it will return ENOMEM.
 */
static int
pipespace_new(cpipe, size)
	struct pipe *cpipe;
	int size;
{
	caddr_t buffer;
	int error, cnt, firstseg;
	static int curfail = 0;
	static struct timeval lastfail;

	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
	KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
	    ("pipespace: resize of direct writes not allowed"));
retry:
	cnt = cpipe->pipe_buffer.cnt;
	if (cnt > size)
		size = cnt;

	size = round_page(size);
	buffer = (caddr_t) vm_map_min(pipe_map);

	error = vm_map_find(pipe_map, NULL, 0,
	    (vm_offset_t *) &buffer, size, 1,
	    VM_PROT_ALL, VM_PROT_ALL, 0);
	if (error != KERN_SUCCESS) {
		if ((cpipe->pipe_buffer.buffer == NULL) &&
		    (size > SMALL_PIPE_SIZE)) {
			size = SMALL_PIPE_SIZE;
			pipefragretry++;
			goto retry;
		}
		if (cpipe->pipe_buffer.buffer == NULL) {
			pipeallocfail++;
			if (ppsratecheck(&lastfail, &curfail, 1))
				printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
		} else {
			piperesizefail++;
		}
		return (ENOMEM);
	}

	/* copy data, then free old resources if we're resizing */
	if (cnt > 0) {
		if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) {
			firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out;
			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
			    buffer, firstseg);
			if ((cnt - firstseg) > 0)
				bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg],
				    cpipe->pipe_buffer.in);
		} else {
			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
			    buffer, cnt);
		}
	}
	pipe_free_kmem(cpipe);
	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = cnt;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = cnt;
	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);
	return (0);
}

/*
 * Wrapper for pipespace_new() that performs locking assertions.
 */
static int
pipespace(cpipe, size)
	struct pipe *cpipe;
	int size;
{

	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
	    ("Unlocked pipe passed to pipespace"));
	return (pipespace_new(cpipe, size));
}

/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(cpipe, catch)
	struct pipe *cpipe;
	int catch;
{
	int error;

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	while (cpipe->pipe_state & PIPE_LOCKFL) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = msleep(cpipe, PIPE_MTX(cpipe), catch ?
		    (PRIBIO | PCATCH) : PRIBIO, "pipelk", 0);
		if (error != 0)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCKFL;
	return (0);
}

/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(cpipe)
	struct pipe *cpipe;
{

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
	    ("Unlocked pipe passed to pipeunlock"));
	cpipe->pipe_state &= ~PIPE_LOCKFL;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

static __inline void
pipeselwakeup(cpipe)
	struct pipe *cpipe;
{

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeuppri(&cpipe->pipe_sel, PSOCK);
	}
	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
	KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
}

/*
 * Initialize and allocate VM and memory for pipe.  The structure
 * will start out zero'd from the ctor, so we just manage the kmem.
 */
static int
pipe_create(pipe, backing)
	struct pipe *pipe;
	int backing;
{
	int error;

	if (backing) {
		if (amountpipekva > maxpipekva / 2)
			error = pipespace_new(pipe, SMALL_PIPE_SIZE);
		else
			error = pipespace_new(pipe, PIPE_SIZE);
	} else {
		/* If we're not backing this pipe, no need to do anything. */
		error = 0;
	}
	return (error);
}

/* ARGSUSED */
static int
pipe_read(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct pipe *rpipe = fp->f_data;
	int error;
	int nread = 0;
	u_int size;

	PIPE_LOCK(rpipe);
	++rpipe->pipe_busy;
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

#ifdef MAC
	error = mac_check_pipe_read(active_cred, rpipe->pipe_pair);
	if (error)
		goto locked_error;
#endif
	if (amountpipekva > (3 * maxpipekva) / 4) {
		if (!(rpipe->pipe_state & PIPE_DIRECTW) &&
		    (rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
		    (rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
		    (piperesizeallowed == 1)) {
			PIPE_UNLOCK(rpipe);
			pipespace(rpipe, SMALL_PIPE_SIZE);
			PIPE_LOCK(rpipe);
		}
	}

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			PIPE_UNLOCK(rpipe);
			error = uiomove(
			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
			    size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;

			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
		 */
		} else if ((size = rpipe->pipe_map.cnt) &&
			   (rpipe->pipe_state & PIPE_DIRECTW)) {
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			PIPE_UNLOCK(rpipe);
			error = uiomove_fromphys(rpipe->pipe_map.ms,
			    rpipe->pipe_map.pos, size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * Unlock the pipe buffer for our remaining processing.
			 * We will either break out with an error or we will
			 * sleep and relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * Handle non-blocking mode operation or
			 * wait for more data.
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
			} else {
				rpipe->pipe_state |= PIPE_WANTR;
				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
				    PRIBIO | PCATCH,
				    "piperd", 0)) == 0)
					error = pipelock(rpipe, 1);
			}
			if (error)
				goto unlocked_error;
		}
	}
#ifdef MAC
locked_error:
#endif
	pipeunlock(rpipe);

	/* XXX: should probably do this before getting any locks. */
	if (error == 0)
		vfs_timestamp(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
		pipeselwakeup(rpipe);

	PIPE_UNLOCK(rpipe);
	return (error);
}

#ifndef PIPE_NODIRECT
/*
 * Map the sending processes' buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	pmap_t pmap;
	u_int size;
	int i, j;
	vm_offset_t addr, endaddr;

	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
	KASSERT(wpipe->pipe_state & PIPE_DIRECTW,
	    ("Clone attempt on non-direct write pipe!"));

	size = (u_int) uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	pmap = vmspace_pmap(curproc->p_vmspace);
	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
		/*
		 * vm_fault_quick() can sleep.  Consequently,
		 * vm_page_lock_queue() and vm_page_unlock_queue()
		 * should not be performed outside of this loop.
		 */
	race:
		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
			vm_page_lock_queues();
			for (j = 0; j < i; j++)
				vm_page_unhold(wpipe->pipe_map.ms[j]);
			vm_page_unlock_queues();
			return (EFAULT);
		}
		wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
		    VM_PROT_READ);
		if (wpipe->pipe_map.ms[i] == NULL)
			goto race;
	}

	/*
	 * set up the control block
	 */
	wpipe->pipe_map.npages = i;
	wpipe->pipe_map.pos =
	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

	/*
	 * and update the uio data
	 */

	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return (0);
}

/*
 * unmap and unwire the process buffer
 */
static void
pipe_destroy_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int i;

	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	vm_page_lock_queues();
	for (i = 0; i < wpipe->pipe_map.npages; i++) {
		vm_page_unhold(wpipe->pipe_map.ms[i]);
	}
	vm_page_unlock_queues();
	wpipe->pipe_map.npages = 0;
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(wpipe)
	struct pipe *wpipe;
{
	struct uio uio;
	struct iovec iov;
	int size;
	int pos;

	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	size = wpipe->pipe_map.cnt;
	pos = wpipe->pipe_map.pos;

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~PIPE_DIRECTW;

	PIPE_UNLOCK(wpipe);
	iov.iov_base = wpipe->pipe_buffer.buffer;
	iov.iov_len = size;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = 0;
	uio.uio_resid = size;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = curthread;
	uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
	PIPE_LOCK(wpipe);
	pipe_destroy_write_buffer(wpipe);
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
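 *
 * In outline, the handshake implemented below is roughly (a simplified
 * sketch of the code that follows, not additional behaviour):
 *
 *	writer:	wait until PIPE_DIRECTW is clear and pipe_buffer.cnt == 0
 *	writer:	wire the user pages, set PIPE_DIRECTW, sleep on the pipe
 *	reader:	copy from pipe_map, clear PIPE_DIRECTW when drained, wakeup
 *	writer:	unwire the pages, or clone them into the kernel buffer if
 *		a signal interrupts the writer before the reader drains them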
 */
static int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error;

retry:
	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	error = pipelock(wpipe, 1);
	if (wpipe->pipe_state & PIPE_EOF)
		error = EPIPE;
	if (error) {
		pipeunlock(wpipe);
		goto error1;
	}
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		pipeunlock(wpipe);
		error = msleep(wpipe, PIPE_MTX(wpipe),
		    PRIBIO | PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		else
			goto retry;
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		pipeunlock(wpipe);
		error = msleep(wpipe, PIPE_MTX(wpipe),
		    PRIBIO | PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		else
			goto retry;
	}

	wpipe->pipe_state |= PIPE_DIRECTW;

	PIPE_UNLOCK(wpipe);
	error = pipe_build_write_buffer(wpipe, uio);
	PIPE_LOCK(wpipe);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		pipeunlock(wpipe);
		goto error1;
	}

	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipe_destroy_write_buffer(wpipe);
			pipeselwakeup(wpipe);
			pipeunlock(wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		pipeunlock(wpipe);
		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
		    "pipdwt", 0);
		pipelock(wpipe, 0);
	}

	if (wpipe->pipe_state & PIPE_EOF)
		error = EPIPE;
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
	} else {
		pipe_destroy_write_buffer(wpipe);
	}
	pipeunlock(wpipe);
	return (error);

error1:
	wakeup(wpipe);
	return (error);
}
#endif

static int
pipe_write(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	int error = 0;
	int desiredsize, orig_resid;
	struct pipe *wpipe, *rpipe;

	rpipe = fp->f_data;
	wpipe = rpipe->pipe_peer;

	PIPE_LOCK(rpipe);
	error = pipelock(wpipe, 1);
	if (error) {
		PIPE_UNLOCK(rpipe);
		return (error);
	}
	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
		pipeunlock(wpipe);
		PIPE_UNLOCK(rpipe);
		return (EPIPE);
	}
#ifdef MAC
	error = mac_check_pipe_write(active_cred, wpipe->pipe_pair);
	if (error) {
		pipeunlock(wpipe);
		PIPE_UNLOCK(rpipe);
		return (error);
	}
#endif
	++wpipe->pipe_busy;

	/* Choose a larger size if it's advantageous */
	desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size);
	while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) {
		if (piperesizeallowed != 1)
			break;
		if (amountpipekva > maxpipekva / 2)
			break;
		if (desiredsize == BIG_PIPE_SIZE)
			break;
		desiredsize = desiredsize * 2;
	}

	/* Choose a smaller size if we're in an OOM situation */
	if ((amountpipekva > (3 * maxpipekva) / 4) &&
	    (wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
	    (wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
	    (piperesizeallowed == 1))
		desiredsize = SMALL_PIPE_SIZE;

	/* Resize if the above determined that a new size was necessary */
	if ((desiredsize != wpipe->pipe_buffer.size) &&
	    ((wpipe->pipe_state & PIPE_DIRECTW) == 0)) {
		PIPE_UNLOCK(wpipe);
		pipespace(wpipe, desiredsize);
		PIPE_LOCK(wpipe);
	}
	if (wpipe->pipe_buffer.size == 0) {
		/*
		 * This can only happen for reverse direction use of pipes
		 * in a complete OOM situation.
		 */
		error = ENOMEM;
		--wpipe->pipe_busy;
		pipeunlock(wpipe);
		PIPE_UNLOCK(wpipe);
		return (error);
	}

	pipeunlock(wpipe);

	orig_resid = uio->uio_resid;

	while (uio->uio_resid) {
		int space;

		pipelock(wpipe, 0);
		if (wpipe->pipe_state & PIPE_EOF) {
			pipeunlock(wpipe);
			error = EPIPE;
			break;
		}
#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * The direct write mechanism will detect the reader going
		 * away on us.
		 */
		if (uio->uio_segflg == UIO_USERSPACE &&
		    uio->uio_iov->iov_len >= PIPE_MINDIRECT &&
		    wpipe->pipe_buffer.size >= PIPE_MINDIRECT &&
		    (fp->f_flag & FNONBLOCK) == 0) {
			pipeunlock(wpipe);
			error = pipe_direct_write(wpipe, uio);
			if (error)
				break;
			continue;
		}
#endif

		/*
		 * Pipe buffered writes cannot be coincidental with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.  We break out if a signal occurs or the
		 * reader goes away.
		 */
		if (wpipe->pipe_state & PIPE_DIRECTW) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			pipeunlock(wpipe);
			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
			    "pipbww", 0);
			if (error)
				break;
			else
				continue;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		if (space > 0) {
			int size;	/* Transfer size */
			int segsize;	/* first segment to transfer */

			/*
			 * Transfer size is minimum of uio transfer
			 * and free space in pipe buffer.
			 */
			if (space > uio->uio_resid)
				size = uio->uio_resid;
			else
				size = space;
			/*
			 * First segment to transfer is minimum of
			 * transfer size and contiguous space in
			 * pipe buffer.  If first segment to transfer
			 * is less than the transfer size, we've got
			 * a wraparound in the buffer.
			 */
			segsize = wpipe->pipe_buffer.size -
			    wpipe->pipe_buffer.in;
			if (segsize > size)
				segsize = size;

			/* Transfer first segment */

			PIPE_UNLOCK(rpipe);
			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
			    segsize, uio);
			PIPE_LOCK(rpipe);

			if (error == 0 && segsize < size) {
				KASSERT(wpipe->pipe_buffer.in + segsize ==
				    wpipe->pipe_buffer.size,
				    ("Pipe buffer wraparound disappeared"));
				/*
				 * Transfer remaining part now, to
				 * support atomic writes.  Wraparound
				 * happened.
				 */

				PIPE_UNLOCK(rpipe);
				error = uiomove(
				    &wpipe->pipe_buffer.buffer[0],
				    size - segsize, uio);
				PIPE_LOCK(rpipe);
			}
			if (error == 0) {
				wpipe->pipe_buffer.in += size;
				if (wpipe->pipe_buffer.in >=
				    wpipe->pipe_buffer.size) {
					KASSERT(wpipe->pipe_buffer.in ==
					    size - segsize +
					    wpipe->pipe_buffer.size,
					    ("Expected wraparound bad"));
					wpipe->pipe_buffer.in = size - segsize;
				}

				wpipe->pipe_buffer.cnt += size;
				KASSERT(wpipe->pipe_buffer.cnt <=
				    wpipe->pipe_buffer.size,
				    ("Pipe buffer overflow"));
			}
			pipeunlock(wpipe);
			if (error != 0)
				break;
		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				pipeunlock(wpipe);
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			pipeunlock(wpipe);
			error = msleep(wpipe, PIPE_MTX(rpipe),
			    PRIBIO | PCATCH, "pipewr", 0);
			if (error != 0)
				break;
		}
	}

	pipelock(wpipe, 0);
	--wpipe->pipe_busy;

	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((wpipe->pipe_buffer.cnt == 0) &&
	    (uio->uio_resid == 0) &&
	    (error == EPIPE)) {
		error = 0;
	}

	if (error == 0)
		vfs_timestamp(&wpipe->pipe_mtime);

	/*
	 * We have something to offer,
	 * wake up select/poll.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	pipeunlock(wpipe);
	PIPE_UNLOCK(rpipe);
	return (error);
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
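 *
 * For example, a userland consumer might use FIONREAD to ask how much data
 * is currently buffered in a pipe (a hypothetical fragment, not part of
 * this file, assuming pfd[] came from pipe(2)):
 *
 *	int nbytes;
 *
 *	if (ioctl(pfd[0], FIONREAD, &nbytes) == 0)
 *		printf("%d bytes available for reading\n", nbytes);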
 */
static int
pipe_ioctl(fp, cmd, data, active_cred, td)
	struct file *fp;
	u_long cmd;
	void *data;
	struct ucred *active_cred;
	struct thread *td;
{
	struct pipe *mpipe = fp->f_data;
	int error;

	PIPE_LOCK(mpipe);

#ifdef MAC
	error = mac_check_pipe_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
	if (error) {
		PIPE_UNLOCK(mpipe);
		return (error);
	}
#endif

	error = 0;
	switch (cmd) {

	case FIONBIO:
		break;

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		break;

	case FIONREAD:
		if (mpipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = mpipe->pipe_map.cnt;
		else
			*(int *)data = mpipe->pipe_buffer.cnt;
		break;

	case FIOSETOWN:
		PIPE_UNLOCK(mpipe);
		error = fsetown(*(int *)data, &mpipe->pipe_sigio);
		goto out_unlocked;

	case FIOGETOWN:
		*(int *)data = fgetown(&mpipe->pipe_sigio);
		break;

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		PIPE_UNLOCK(mpipe);
		error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
		goto out_unlocked;

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		*(int *)data = -fgetown(&mpipe->pipe_sigio);
		break;

	default:
		error = ENOTTY;
		break;
	}
	PIPE_UNLOCK(mpipe);
out_unlocked:
	return (error);
}

static int
pipe_poll(fp, events, active_cred, td)
	struct file *fp;
	int events;
	struct ucred *active_cred;
	struct thread *td;
{
	struct pipe *rpipe = fp->f_data;
	struct pipe *wpipe;
	int revents = 0;
#ifdef MAC
	int error;
#endif

	wpipe = rpipe->pipe_peer;
	PIPE_LOCK(rpipe);
#ifdef MAC
	error = mac_check_pipe_poll(active_cred, rpipe->pipe_pair);
	if (error)
		goto locked_error;
#endif
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
		    (rpipe->pipe_buffer.cnt > 0) ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (!wpipe->pipe_present || (wpipe->pipe_state & PIPE_EOF) ||
		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
		    (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (!wpipe->pipe_present) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(td, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(td, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}
#ifdef MAC
locked_error:
#endif
	PIPE_UNLOCK(rpipe);

	return (revents);
}

/*
 * We shouldn't need locks here as we're doing a read and this should
 * be a natural race.
 */
static int
pipe_stat(fp, ub, active_cred, td)
	struct file *fp;
	struct stat *ub;
	struct ucred *active_cred;
	struct thread *td;
{
	struct pipe *pipe = fp->f_data;
#ifdef MAC
	int error;

	PIPE_LOCK(pipe);
	error = mac_check_pipe_stat(active_cred, pipe->pipe_pair);
	PIPE_UNLOCK(pipe);
	if (error)
		return (error);
#endif
	bzero(ub, sizeof(*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = PAGE_SIZE;
	if (pipe->pipe_state & PIPE_DIRECTW)
		ub->st_size = pipe->pipe_map.cnt;
	else
		ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atimespec = pipe->pipe_atime;
	ub->st_mtimespec = pipe->pipe_mtime;
	ub->st_ctimespec = pipe->pipe_ctime;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

/* ARGSUSED */
static int
pipe_close(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct pipe *cpipe = fp->f_data;

	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	funsetown(&cpipe->pipe_sigio);
	pipeclose(cpipe);
	return (0);
}

static void
pipe_free_kmem(cpipe)
	struct pipe *cpipe;
{

	KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
	    ("pipe_free_kmem: pipe mutex locked"));

	if (cpipe->pipe_buffer.buffer != NULL) {
		atomic_subtract_int(&amountpipekva, cpipe->pipe_buffer.size);
		vm_map_remove(pipe_map,
		    (vm_offset_t)cpipe->pipe_buffer.buffer,
		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
		cpipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	{
		cpipe->pipe_map.cnt = 0;
		cpipe->pipe_map.pos = 0;
		cpipe->pipe_map.npages = 0;
	}
#endif
}

/*
 * shutdown the pipe
 */
static void
pipeclose(cpipe)
	struct pipe *cpipe;
{
	struct pipepair *pp;
	struct pipe *ppipe;

	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));

	PIPE_LOCK(cpipe);
	pipelock(cpipe, 0);
	pp = cpipe->pipe_pair;

	pipeselwakeup(cpipe);

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	cpipe->pipe_state |= PIPE_EOF;
	while (cpipe->pipe_busy) {
		wakeup(cpipe);
		cpipe->pipe_state |= PIPE_WANT;
		pipeunlock(cpipe);
		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
		pipelock(cpipe, 0);
	}

	/*
	 * Disconnect from peer, if any.
	 */
	ppipe = cpipe->pipe_peer;
	if (ppipe->pipe_present != 0) {
		pipeselwakeup(ppipe);

		ppipe->pipe_state |= PIPE_EOF;
		wakeup(ppipe);
		KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0);
	}

	/*
	 * Mark this endpoint as free.  Release kmem resources.  We
	 * don't mark this endpoint as unused until we've finished
	 * doing that, or the pipe might disappear out from under
	 * us.
	 */
	PIPE_UNLOCK(cpipe);
	pipe_free_kmem(cpipe);
	PIPE_LOCK(cpipe);
	cpipe->pipe_present = 0;
	pipeunlock(cpipe);
	knlist_clear(&cpipe->pipe_sel.si_note, 1);
	knlist_destroy(&cpipe->pipe_sel.si_note);

	/*
	 * If both endpoints are now closed, release the memory for the
	 * pipe pair.  If not, unlock.
	 */
	if (ppipe->pipe_present == 0) {
		PIPE_UNLOCK(cpipe);
#ifdef MAC
		mac_destroy_pipe(pp);
#endif
		uma_zfree(pipe_zone, cpipe->pipe_pair);
	} else
		PIPE_UNLOCK(cpipe);
}

/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *cpipe;

	cpipe = kn->kn_fp->f_data;
	PIPE_LOCK(cpipe);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;
		if (!cpipe->pipe_peer->pipe_present) {
			/* other end of pipe has been closed */
			PIPE_UNLOCK(cpipe);
			return (EPIPE);
		}
		cpipe = cpipe->pipe_peer;
		break;
	default:
		PIPE_UNLOCK(cpipe);
		return (EINVAL);
	}

	knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
	PIPE_UNLOCK(cpipe);
	return (0);
}

static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;

	PIPE_LOCK(cpipe);
	if (kn->kn_filter == EVFILT_WRITE) {
		if (!cpipe->pipe_peer->pipe_present) {
			PIPE_UNLOCK(cpipe);
			return;
		}
		cpipe = cpipe->pipe_peer;
	}
	knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
	PIPE_UNLOCK(cpipe);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;
	int ret;

	PIPE_LOCK(rpipe);
	kn->kn_data = rpipe->pipe_buffer.cnt;
	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
		kn->kn_data = rpipe->pipe_map.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		PIPE_UNLOCK(rpipe);
		return (1);
	}
	ret = kn->kn_data > 0;
	PIPE_UNLOCK(rpipe);
	return ret;
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	PIPE_LOCK(rpipe);
	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		PIPE_UNLOCK(rpipe);
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;

	PIPE_UNLOCK(rpipe);
	return (kn->kn_data >= PIPE_BUF);
}
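
/*
 * Userland view of the filters above (a hypothetical fragment, not part of
 * this file, assuming pfd[] came from pipe(2) and kq from kqueue()): the
 * EVFILT_READ/EVFILT_WRITE cases are what back kevent(2) on a pipe
 * descriptor, e.g.
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, pfd[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 */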