/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 1996 John S. Dyson
 * Copyright (c) 2012 Giovanni Trematerra
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

/*
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, the sending process pins the underlying pages in
 * memory, and the receiving process copies directly from these pinned pages
 * in the sending process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned back to the user-mode side.  In that case, the pipe code
 * arranges to copy the buffer supplied by the user process, to a pageable
 * kernel buffer, and the receiving process will grab the data from the
 * pageable kernel buffer.  Since signals don't happen all that often,
 * the copy operation is normally eliminated.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.
 *
 * In order to limit the resource use of pipes, two sysctls exist:
 *
 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
 * address space available to us in pipe_map.  This value is normally
 * autotuned, but may also be loader tuned.
 *
 * kern.ipc.pipekva - This read-only sysctl tracks the current amount of
 * memory in use by pipes.
 *
 * Based on how large pipekva is relative to maxpipekva, the following
 * will happen:
 *
 * 0% - 50%:
 *     New pipes are given 16K of memory backing, pipes may dynamically
 *     grow to as large as 64K where needed.
 * 50% - 75%:
 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
 *     existing pipes may NOT grow.
 * 75% - 100%:
 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
 *     existing pipes will be shrunk down to 4K whenever possible.
 *
 * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0.  If
 * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE
 * resize which MUST occur for reverse-direction pipes when they are
 * first used.
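 *
 * As a worked illustration (the absolute figure below is assumed for the
 * example only; the ratios and sizes come from the table above): if
 * maxpipekva were autotuned to 512MB, new pipes would start with 16K of
 * backing and could grow to 64K while pipekva stayed under 256MB; between
 * 256MB and 384MB new pipes would get only PAGE_SIZE of backing and
 * existing pipes would stop growing; above 384MB existing pipes would also
 * be shrunk back toward PAGE_SIZE whenever possible.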
 *
 * Additional information about the current state of pipes may be obtained
 * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
 * and kern.ipc.piperesizefail.
 *
 * Locking rules:  There are two locks present here:  A mutex, used via
 * PIPE_LOCK, and a flag, used via pipelock().  All locking is done via
 * the flag, as mutexes can not persist over uiomove.  The mutex
 * exists only to guard access to the flag, and is not in itself a
 * locking mechanism.  Also note that there is only a single mutex for
 * both directions of a pipe.
 *
 * As pipelock() may have to sleep before it can acquire the flag, it
 * is important to reread all data after a call to pipelock(); everything
 * in the structure may have changed.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/user.h>
#include <sys/event.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/uma.h>

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * NetBSD or OpenBSD.
 */
/* #define PIPE_NODIRECT */

#define PIPE_PEER(pipe)	\
	(((pipe)->pipe_state & PIPE_NAMED) ? (pipe) : ((pipe)->pipe_peer))

/*
 * interfaces to the outside world
 */
static fo_rdwr_t	pipe_read;
static fo_rdwr_t	pipe_write;
static fo_truncate_t	pipe_truncate;
static fo_ioctl_t	pipe_ioctl;
static fo_poll_t	pipe_poll;
static fo_kqfilter_t	pipe_kqfilter;
static fo_stat_t	pipe_stat;
static fo_close_t	pipe_close;
static fo_chmod_t	pipe_chmod;
static fo_chown_t	pipe_chown;
static fo_fill_kinfo_t	pipe_fill_kinfo;

struct fileops pipeops = {
	.fo_read = pipe_read,
	.fo_write = pipe_write,
	.fo_truncate = pipe_truncate,
	.fo_ioctl = pipe_ioctl,
	.fo_poll = pipe_poll,
	.fo_kqfilter = pipe_kqfilter,
	.fo_stat = pipe_stat,
	.fo_close = pipe_close,
	.fo_chmod = pipe_chmod,
	.fo_chown = pipe_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = pipe_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE
};

static void	filt_pipedetach(struct knote *kn);
static void	filt_pipedetach_notsup(struct knote *kn);
static int	filt_pipenotsup(struct knote *kn, long hint);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_nfiltops = {
	.f_isfd = 1,
	.f_detach = filt_pipedetach_notsup,
	.f_event = filt_pipenotsup
};
static struct filterops pipe_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_pipedetach,
	.f_event = filt_piperead
};
static struct filterops pipe_wfiltops = {
	.f_isfd = 1,
	.f_detach = filt_pipedetach,
	.f_event = filt_pipewrite
};

/*
 * Default pipe buffer size(s), this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
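 *
 * As a small worked example of the watermark defined below (a sketch; the
 * 16K figure is the default backing size mentioned in the header comment,
 * not a constant defined here): MINPIPESIZE is one third of PIPE_SIZE, and
 * pipe_read() uses it as the low watermark for write-side wakeup
 * hysteresis, so with a 16K buffer a blocked writer starts being woken
 * once fewer than roughly 5461 unread bytes remain.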
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

static long amountpipekva;
static int pipefragretry;
static int pipeallocfail;
static int piperesizefail;
static int piperesizeallowed = 1;

SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
	   &maxpipekva, 0, "Pipe KVA limit");
SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
	   &amountpipekva, 0, "Pipe KVA usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
	  &pipefragretry, 0, "Pipe allocation retries due to fragmentation");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
	  &pipeallocfail, 0, "Pipe allocation failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
	  &piperesizefail, 0, "Pipe resize failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
	  &piperesizeallowed, 0, "Pipe resizing allowed");

static void pipeinit(void *dummy __unused);
static void pipeclose(struct pipe *cpipe);
static void pipe_free_kmem(struct pipe *cpipe);
static void pipe_create(struct pipe *pipe, int backing);
static void pipe_paircreate(struct thread *td, struct pipepair **p_pp);
static __inline int pipelock(struct pipe *cpipe, int catch);
static __inline void pipeunlock(struct pipe *cpipe);
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
static void pipe_destroy_write_buffer(struct pipe *wpipe);
static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
static void pipe_clone_write_buffer(struct pipe *wpipe);
#endif
static int pipespace(struct pipe *cpipe, int size);
static int pipespace_new(struct pipe *cpipe, int size);

static int	pipe_zone_ctor(void *mem, int size, void *arg, int flags);
static int	pipe_zone_init(void *mem, int size, int flags);
static void	pipe_zone_fini(void *mem, int size);

static uma_zone_t pipe_zone;
static struct unrhdr *pipeino_unr;
static dev_t pipedev_ino;

SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);

static void
pipeinit(void *dummy __unused)
{

	pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair),
	    pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini,
	    UMA_ALIGN_PTR, 0);
	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
	pipeino_unr = new_unrhdr(1, INT32_MAX, NULL);
	KASSERT(pipeino_unr != NULL, ("pipe fake inodes not initialized"));
	pipedev_ino = devfs_alloc_cdp_inode();
	KASSERT(pipedev_ino > 0, ("pipe dev inode not initialized"));
}

static int
pipe_zone_ctor(void *mem, int size, void *arg, int flags)
{
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;

	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));

	pp = (struct pipepair *)mem;

	/*
	 * We zero both pipe endpoints to make sure all the kmem pointers
	 * are NULL, flag fields are zero'd, etc.  We timestamp both
	 * endpoints with the same time.
	 */
	rpipe = &pp->pp_rpipe;
	bzero(rpipe, sizeof(*rpipe));
	vfs_timestamp(&rpipe->pipe_ctime);
	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;

	wpipe = &pp->pp_wpipe;
	bzero(wpipe, sizeof(*wpipe));
	wpipe->pipe_ctime = rpipe->pipe_ctime;
	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;

	rpipe->pipe_peer = wpipe;
	rpipe->pipe_pair = pp;
	wpipe->pipe_peer = rpipe;
	wpipe->pipe_pair = pp;

	/*
	 * Mark both endpoints as present; they will later get free'd
	 * one at a time.  When both are free'd, then the whole pair
	 * is released.
	 */
	rpipe->pipe_present = PIPE_ACTIVE;
	wpipe->pipe_present = PIPE_ACTIVE;

	/*
	 * Eventually, the MAC Framework may initialize the label
	 * in ctor or init, but for now we do it elsewhere to avoid
	 * blocking in ctor or init.
	 */
	pp->pp_label = NULL;

	return (0);
}

static int
pipe_zone_init(void *mem, int size, int flags)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));

	pp = (struct pipepair *)mem;

	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_NEW);
	return (0);
}

static void
pipe_zone_fini(void *mem, int size)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));

	pp = (struct pipepair *)mem;

	mtx_destroy(&pp->pp_mtx);
}

static void
pipe_paircreate(struct thread *td, struct pipepair **p_pp)
{
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;

	*p_pp = pp = uma_zalloc(pipe_zone, M_WAITOK);
#ifdef MAC
	/*
	 * The MAC label is shared between the connected endpoints.  As a
	 * result mac_pipe_init() and mac_pipe_create() are called once
	 * for the pair, and not on the endpoints.
	 */
	mac_pipe_init(pp);
	mac_pipe_create(td->td_ucred, pp);
#endif
	rpipe = &pp->pp_rpipe;
	wpipe = &pp->pp_wpipe;

	knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe));
	knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe));

	/* Only the forward direction pipe is backed by default */
	pipe_create(rpipe, 1);
	pipe_create(wpipe, 0);

	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe->pipe_state |= PIPE_DIRECTOK;
}

void
pipe_named_ctor(struct pipe **ppipe, struct thread *td)
{
	struct pipepair *pp;

	pipe_paircreate(td, &pp);
	pp->pp_rpipe.pipe_state |= PIPE_NAMED;
	*ppipe = &pp->pp_rpipe;
}

void
pipe_dtor(struct pipe *dpipe)
{
	struct pipe *peer;
	ino_t ino;

	ino = dpipe->pipe_ino;
	peer = (dpipe->pipe_state & PIPE_NAMED) != 0 ? dpipe->pipe_peer : NULL;
	funsetown(&dpipe->pipe_sigio);
	pipeclose(dpipe);
	if (peer != NULL) {
		funsetown(&peer->pipe_sigio);
		pipeclose(peer);
	}
	if (ino != 0 && ino != (ino_t)-1)
		free_unr(pipeino_unr, ino);
}

/*
 * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail, let
 * the zone pick up the pieces via pipeclose().
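 *
 * From userland this is reached via pipe(2) and pipe2(2).  A minimal
 * illustrative consumer (a sketch, not part of this file; it assumes the
 * usual userland <unistd.h>, <fcntl.h> and <err.h> headers):
 *
 *	int fds[2];
 *
 *	if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) == -1)
 *		err(1, "pipe2");
 *	(void)write(fds[1], "hi", 2);
 *	(void)close(fds[0]);
 *	(void)close(fds[1]);
 *
 * fds[0] is the read end and fds[1] the write end, matching the fildes[]
 * layout filled in by kern_pipe() below; O_CLOEXEC and O_NONBLOCK are the
 * only flags sys_pipe2() accepts.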
 */
int
kern_pipe(struct thread *td, int fildes[2], int flags, struct filecaps *fcaps1,
    struct filecaps *fcaps2)
{
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe;
	struct pipepair *pp;
	int fd, fflags, error;

	pipe_paircreate(td, &pp);
	rpipe = &pp->pp_rpipe;
	wpipe = &pp->pp_wpipe;
	error = falloc_caps(td, &rf, &fd, flags, fcaps1);
	if (error) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}
	/* An extra reference on `rf' has been held for us by falloc_caps(). */
	fildes[0] = fd;

	fflags = FREAD | FWRITE;
	if ((flags & O_NONBLOCK) != 0)
		fflags |= FNONBLOCK;

	/*
	 * Warning: once we've gotten past allocation of the fd for the
	 * read-side, we can only drop the read side via fdrop() in order
	 * to avoid races against processes which manage to dup() the read
	 * side while we are blocked trying to allocate the write side.
	 */
	finit(rf, fflags, DTYPE_PIPE, rpipe, &pipeops);
	error = falloc_caps(td, &wf, &fd, flags, fcaps2);
	if (error) {
		fdclose(td, rf, fildes[0]);
		fdrop(rf, td);
		/* rpipe has been closed by fdrop(). */
		pipeclose(wpipe);
		return (error);
	}
	/* An extra reference on `wf' has been held for us by falloc_caps(). */
	finit(wf, fflags, DTYPE_PIPE, wpipe, &pipeops);
	fdrop(wf, td);
	fildes[1] = fd;
	fdrop(rf, td);

	return (0);
}

#ifdef COMPAT_FREEBSD10
/* ARGSUSED */
int
freebsd10_pipe(struct thread *td, struct freebsd10_pipe_args *uap __unused)
{
	int error;
	int fildes[2];

	error = kern_pipe(td, fildes, 0, NULL, NULL);
	if (error)
		return (error);

	td->td_retval[0] = fildes[0];
	td->td_retval[1] = fildes[1];

	return (0);
}
#endif

int
sys_pipe2(struct thread *td, struct pipe2_args *uap)
{
	int error, fildes[2];

	if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK))
		return (EINVAL);
	error = kern_pipe(td, fildes, uap->flags, NULL, NULL);
	if (error)
		return (error);
	error = copyout(fildes, uap->fildes, 2 * sizeof(int));
	if (error) {
		(void)kern_close(td, fildes[0]);
		(void)kern_close(td, fildes[1]);
	}
	return (error);
}

/*
 * Allocate kva for pipe circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely, if it fails
 * it will retain the old buffer.
 * If it fails it will return ENOMEM.
 */
static int
pipespace_new(struct pipe *cpipe, int size)
{
	caddr_t buffer;
	int error, cnt, firstseg;
	static int curfail = 0;
	static struct timeval lastfail;

	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
	KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
	    ("pipespace: resize of direct writes not allowed"));
retry:
	cnt = cpipe->pipe_buffer.cnt;
	if (cnt > size)
		size = cnt;

	size = round_page(size);
	buffer = (caddr_t) vm_map_min(pipe_map);

	error = vm_map_find(pipe_map, NULL, 0,
	    (vm_offset_t *) &buffer, size, 0, VMFS_ANY_SPACE,
	    VM_PROT_ALL, VM_PROT_ALL, 0);
	if (error != KERN_SUCCESS) {
		if ((cpipe->pipe_buffer.buffer == NULL) &&
		    (size > SMALL_PIPE_SIZE)) {
			size = SMALL_PIPE_SIZE;
			pipefragretry++;
			goto retry;
		}
		if (cpipe->pipe_buffer.buffer == NULL) {
			pipeallocfail++;
			if (ppsratecheck(&lastfail, &curfail, 1))
				printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
		} else {
			piperesizefail++;
		}
		return (ENOMEM);
	}

	/* copy data, then free old resources if we're resizing */
	if (cnt > 0) {
		if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) {
			firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out;
			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
				buffer, firstseg);
			if ((cnt - firstseg) > 0)
				bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg],
					cpipe->pipe_buffer.in);
		} else {
			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
				buffer, cnt);
		}
	}
	pipe_free_kmem(cpipe);
	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = cnt;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = cnt;
	atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size);
	return (0);
}

/*
 * Wrapper for pipespace_new() that performs locking assertions.
 */
static int
pipespace(struct pipe *cpipe, int size)
{

	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
	    ("Unlocked pipe passed to pipespace"));
	return (pipespace_new(cpipe, size));
}

/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(struct pipe *cpipe, int catch)
{
	int error;

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	while (cpipe->pipe_state & PIPE_LOCKFL) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = msleep(cpipe, PIPE_MTX(cpipe),
		    catch ? (PRIBIO | PCATCH) : PRIBIO,
		    "pipelk", 0);
		if (error != 0)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCKFL;
	return (0);
}

/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(struct pipe *cpipe)
{

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
	    ("Unlocked pipe passed to pipeunlock"));
	cpipe->pipe_state &= ~PIPE_LOCKFL;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

void
pipeselwakeup(struct pipe *cpipe)
{

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	if (cpipe->pipe_state & PIPE_SEL) {
		selwakeuppri(&cpipe->pipe_sel, PSOCK);
		if (!SEL_WAITING(&cpipe->pipe_sel))
			cpipe->pipe_state &= ~PIPE_SEL;
	}
	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
	KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
}

/*
 * Initialize and allocate VM and memory for pipe.  The structure
 * will start out zero'd from the ctor, so we just manage the kmem.
 */
static void
pipe_create(struct pipe *pipe, int backing)
{

	if (backing) {
		/*
		 * Note that these functions can fail if pipe map is exhausted
		 * (as a result of too many pipes created), but we ignore the
		 * error as it is not fatal and could be provoked by
		 * unprivileged users.  The only consequence is worse
		 * performance with given pipe.
		 */
		if (amountpipekva > maxpipekva / 2)
			(void)pipespace_new(pipe, SMALL_PIPE_SIZE);
		else
			(void)pipespace_new(pipe, PIPE_SIZE);
	}

	pipe->pipe_ino = -1;
}

/* ARGSUSED */
static int
pipe_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct pipe *rpipe;
	int error;
	int nread = 0;
	int size;

	rpipe = fp->f_data;
	PIPE_LOCK(rpipe);
	++rpipe->pipe_busy;
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

#ifdef MAC
	error = mac_pipe_check_read(active_cred, rpipe->pipe_pair);
	if (error)
		goto locked_error;
#endif
	if (amountpipekva > (3 * maxpipekva) / 4) {
		if (!(rpipe->pipe_state & PIPE_DIRECTW) &&
		    (rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
		    (rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
		    (piperesizeallowed == 1)) {
			PIPE_UNLOCK(rpipe);
			pipespace(rpipe, SMALL_PIPE_SIZE);
			PIPE_LOCK(rpipe);
		}
	}

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > uio->uio_resid)
				size = uio->uio_resid;

			PIPE_UNLOCK(rpipe);
			error = uiomove(
			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
			    size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;

			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
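		 * The reader pulls bytes straight out of the writer's wired
		 * pages (pipe_map.ms) with uiomove_fromphys(); once
		 * pipe_map.cnt drops to zero, PIPE_DIRECTW is cleared and
		 * the sleeping direct writer is woken below.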
		 */
		} else if ((size = rpipe->pipe_map.cnt) &&
			   (rpipe->pipe_state & PIPE_DIRECTW)) {
			if (size > uio->uio_resid)
				size = (u_int) uio->uio_resid;

			PIPE_UNLOCK(rpipe);
			error = uiomove_fromphys(rpipe->pipe_map.ms,
			    rpipe->pipe_map.pos, size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				rpipe->pipe_state &= ~(PIPE_DIRECTW|PIPE_WANTW);
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * Unlock the pipe buffer for our remaining processing.
			 * We will either break out with an error or we will
			 * sleep and relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * Handle non-blocking mode operation or
			 * wait for more data.
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
			} else {
				rpipe->pipe_state |= PIPE_WANTR;
				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
				    PRIBIO | PCATCH,
				    "piperd", 0)) == 0)
					error = pipelock(rpipe, 1);
			}
			if (error)
				goto unlocked_error;
		}
	}
#ifdef MAC
locked_error:
#endif
	pipeunlock(rpipe);

	/* XXX: should probably do this before getting any locks. */
	if (error == 0)
		vfs_timestamp(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
		pipeselwakeup(rpipe);

	PIPE_UNLOCK(rpipe);
	return (error);
}

#ifndef PIPE_NODIRECT
/*
 * Map the sending process's buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio)
{
	u_int size;
	int i;

	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
	KASSERT(wpipe->pipe_state & PIPE_DIRECTW,
	    ("Clone attempt on non-direct write pipe!"));

	if (uio->uio_iov->iov_len > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;
	else
		size = uio->uio_iov->iov_len;

	if ((i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
	    (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ,
	    wpipe->pipe_map.ms, PIPENPAGES)) < 0)
		return (EFAULT);

	/*
	 * set up the control block
	 */
	wpipe->pipe_map.npages = i;
	wpipe->pipe_map.pos =
	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

	/*
	 * and update the uio data
	 */

	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return (0);
}

/*
 * unmap and unwire the process buffer
 */
static void
pipe_destroy_write_buffer(struct pipe *wpipe)
{

	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	vm_page_unhold_pages(wpipe->pipe_map.ms, wpipe->pipe_map.npages);
	wpipe->pipe_map.npages = 0;
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(struct pipe *wpipe)
{
	struct uio uio;
	struct iovec iov;
	int size;
	int pos;

	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	size = wpipe->pipe_map.cnt;
	pos = wpipe->pipe_map.pos;

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~PIPE_DIRECTW;

	PIPE_UNLOCK(wpipe);
	iov.iov_base = wpipe->pipe_buffer.buffer;
	iov.iov_len = size;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = 0;
	uio.uio_resid = size;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = curthread;
	uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
	PIPE_LOCK(wpipe);
	pipe_destroy_write_buffer(wpipe);
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
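 *
 * In outline (a summary of the code below, added for clarity): the writer
 * waits for any previous direct write to finish and for the pipe buffer to
 * drain, sets PIPE_DIRECTW, wires its user pages with
 * pipe_build_write_buffer(), and then sleeps until the reader has consumed
 * pipe_map.cnt bytes.  If EOF or a signal intervenes while data is still
 * pending, pipe_clone_write_buffer() copies the remainder into the kernel
 * buffer so the writer's pages can be released.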
 */
static int
pipe_direct_write(struct pipe *wpipe, struct uio *uio)
{
	int error;

retry:
	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	error = pipelock(wpipe, 1);
	if (error != 0)
		goto error1;
	if ((wpipe->pipe_state & PIPE_EOF) != 0) {
		error = EPIPE;
		pipeunlock(wpipe);
		goto error1;
	}
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		wpipe->pipe_state |= PIPE_WANTW;
		pipeunlock(wpipe);
		error = msleep(wpipe, PIPE_MTX(wpipe),
		    PRIBIO | PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		else
			goto retry;
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		wpipe->pipe_state |= PIPE_WANTW;
		pipeunlock(wpipe);
		error = msleep(wpipe, PIPE_MTX(wpipe),
		    PRIBIO | PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		else
			goto retry;
	}

	wpipe->pipe_state |= PIPE_DIRECTW;

	PIPE_UNLOCK(wpipe);
	error = pipe_build_write_buffer(wpipe, uio);
	PIPE_LOCK(wpipe);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		pipeunlock(wpipe);
		goto error1;
	}

	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipe_destroy_write_buffer(wpipe);
			pipeselwakeup(wpipe);
			pipeunlock(wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		wpipe->pipe_state |= PIPE_WANTW;
		pipeunlock(wpipe);
		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
		    "pipdwt", 0);
		pipelock(wpipe, 0);
	}

	if (wpipe->pipe_state & PIPE_EOF)
		error = EPIPE;
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
	} else {
		pipe_destroy_write_buffer(wpipe);
	}
	pipeunlock(wpipe);
	return (error);

error1:
	wakeup(wpipe);
	return (error);
}
#endif

static int
pipe_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	int error = 0;
	int desiredsize;
	ssize_t orig_resid;
	struct pipe *wpipe, *rpipe;

	rpipe = fp->f_data;
	wpipe = PIPE_PEER(rpipe);
	PIPE_LOCK(rpipe);
	error = pipelock(wpipe, 1);
	if (error) {
		PIPE_UNLOCK(rpipe);
		return (error);
	}
	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if (wpipe->pipe_present != PIPE_ACTIVE ||
	    (wpipe->pipe_state & PIPE_EOF)) {
		pipeunlock(wpipe);
		PIPE_UNLOCK(rpipe);
		return (EPIPE);
	}
#ifdef MAC
	error = mac_pipe_check_write(active_cred, wpipe->pipe_pair);
	if (error) {
		pipeunlock(wpipe);
		PIPE_UNLOCK(rpipe);
		return (error);
	}
#endif
	++wpipe->pipe_busy;

	/* Choose a larger size if it's advantageous */
	desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size);
	while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) {
		if (piperesizeallowed != 1)
			break;
		if (amountpipekva > maxpipekva / 2)
			break;
		if (desiredsize == BIG_PIPE_SIZE)
			break;
		desiredsize = desiredsize * 2;
	}

	/* Choose a smaller size if we're in an OOM situation */
	if ((amountpipekva > (3 * maxpipekva) / 4) &&
	    (wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
	    (wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
	    (piperesizeallowed == 1))
		desiredsize = SMALL_PIPE_SIZE;

	/* Resize if the above determined that a new size was necessary */
	if ((desiredsize != wpipe->pipe_buffer.size) &&
	    ((wpipe->pipe_state & PIPE_DIRECTW) == 0)) {
		PIPE_UNLOCK(wpipe);
		pipespace(wpipe, desiredsize);
		PIPE_LOCK(wpipe);
	}
	if (wpipe->pipe_buffer.size == 0) {
		/*
		 * This can only happen for reverse direction use of pipes
		 * in a complete OOM situation.
		 */
		error = ENOMEM;
		--wpipe->pipe_busy;
		pipeunlock(wpipe);
		PIPE_UNLOCK(wpipe);
		return (error);
	}

	pipeunlock(wpipe);

	orig_resid = uio->uio_resid;

	while (uio->uio_resid) {
		int space;

		pipelock(wpipe, 0);
		if (wpipe->pipe_state & PIPE_EOF) {
			pipeunlock(wpipe);
			error = EPIPE;
			break;
		}
#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * The direct write mechanism will detect the reader going
		 * away on us.
		 */
		if (uio->uio_segflg == UIO_USERSPACE &&
		    uio->uio_iov->iov_len >= PIPE_MINDIRECT &&
		    wpipe->pipe_buffer.size >= PIPE_MINDIRECT &&
		    (fp->f_flag & FNONBLOCK) == 0) {
			pipeunlock(wpipe);
			error = pipe_direct_write(wpipe, uio);
			if (error)
				break;
			continue;
		}
#endif

		/*
		 * Pipe buffered writes cannot be coincidental with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.  We break out if a signal occurs or the
		 * reader goes away.
		 */
		if (wpipe->pipe_state & PIPE_DIRECTW) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			pipeselwakeup(wpipe);
			wpipe->pipe_state |= PIPE_WANTW;
			pipeunlock(wpipe);
			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
			    "pipbww", 0);
			if (error)
				break;
			else
				continue;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		if (space > 0) {
			int size;	/* Transfer size */
			int segsize;	/* first segment to transfer */

			/*
			 * Transfer size is minimum of uio transfer
			 * and free space in pipe buffer.
			 */
			if (space > uio->uio_resid)
				size = uio->uio_resid;
			else
				size = space;
			/*
			 * First segment to transfer is minimum of
			 * transfer size and contiguous space in
			 * pipe buffer.  If first segment to transfer
			 * is less than the transfer size, we've got
			 * a wraparound in the buffer.
			 */
			segsize = wpipe->pipe_buffer.size -
				wpipe->pipe_buffer.in;
			if (segsize > size)
				segsize = size;

			/* Transfer first segment */

			PIPE_UNLOCK(rpipe);
			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
					segsize, uio);
			PIPE_LOCK(rpipe);

			if (error == 0 && segsize < size) {
				KASSERT(wpipe->pipe_buffer.in + segsize ==
					wpipe->pipe_buffer.size,
					("Pipe buffer wraparound disappeared"));
				/*
				 * Transfer remaining part now, to
				 * support atomic writes.  Wraparound
				 * happened.
				 */

				PIPE_UNLOCK(rpipe);
				error = uiomove(
				    &wpipe->pipe_buffer.buffer[0],
				    size - segsize, uio);
				PIPE_LOCK(rpipe);
			}
			if (error == 0) {
				wpipe->pipe_buffer.in += size;
				if (wpipe->pipe_buffer.in >=
				    wpipe->pipe_buffer.size) {
					KASSERT(wpipe->pipe_buffer.in ==
						size - segsize +
						wpipe->pipe_buffer.size,
						("Expected wraparound bad"));
					wpipe->pipe_buffer.in = size - segsize;
				}

				wpipe->pipe_buffer.cnt += size;
				KASSERT(wpipe->pipe_buffer.cnt <=
					wpipe->pipe_buffer.size,
					("Pipe buffer overflow"));
			}
			pipeunlock(wpipe);
			if (error != 0)
				break;
		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				pipeunlock(wpipe);
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			pipeunlock(wpipe);
			error = msleep(wpipe, PIPE_MTX(rpipe),
			    PRIBIO | PCATCH, "pipewr", 0);
			if (error != 0)
				break;
		}
	}

	pipelock(wpipe, 0);
	--wpipe->pipe_busy;

	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if any byte was written.
	 * EINTR and other interrupts are handled by generic I/O layer.
	 * Do not pretend that I/O succeeded for obvious user error
	 * like EFAULT.
	 */
	if (uio->uio_resid != orig_resid && error == EPIPE)
		error = 0;

	if (error == 0)
		vfs_timestamp(&wpipe->pipe_mtime);

	/*
	 * We have something to offer,
	 * wake up select/poll.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	pipeunlock(wpipe);
	PIPE_UNLOCK(rpipe);
	return (error);
}

/* ARGSUSED */
static int
pipe_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
	struct pipe *cpipe;
	int error;

	cpipe = fp->f_data;
	if (cpipe->pipe_state & PIPE_NAMED)
		error = vnops.fo_truncate(fp, length, active_cred, td);
	else
		error = invfo_truncate(fp, length, active_cred, td);
	return (error);
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
static int
pipe_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred,
    struct thread *td)
{
	struct pipe *mpipe = fp->f_data;
	int error;

	PIPE_LOCK(mpipe);

#ifdef MAC
	error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
	if (error) {
		PIPE_UNLOCK(mpipe);
		return (error);
	}
#endif

	error = 0;
	switch (cmd) {

	case FIONBIO:
		break;

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		break;

	case FIONREAD:
		if (!(fp->f_flag & FREAD)) {
			*(int *)data = 0;
			PIPE_UNLOCK(mpipe);
			return (0);
		}
		if (mpipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = mpipe->pipe_map.cnt;
		else
			*(int *)data = mpipe->pipe_buffer.cnt;
		break;

	case FIOSETOWN:
		PIPE_UNLOCK(mpipe);
		error = fsetown(*(int *)data, &mpipe->pipe_sigio);
		goto out_unlocked;

	case FIOGETOWN:
		*(int *)data = fgetown(&mpipe->pipe_sigio);
		break;

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		PIPE_UNLOCK(mpipe);
		error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
		goto out_unlocked;

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		*(int *)data = -fgetown(&mpipe->pipe_sigio);
		break;

	default:
		error = ENOTTY;
		break;
	}
	PIPE_UNLOCK(mpipe);
out_unlocked:
	return (error);
}

static int
pipe_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	struct pipe *rpipe;
	struct pipe *wpipe;
	int levents, revents;
#ifdef MAC
	int error;
#endif

	revents = 0;
	rpipe = fp->f_data;
	wpipe = PIPE_PEER(rpipe);
	PIPE_LOCK(rpipe);
#ifdef MAC
	error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair);
	if (error)
		goto locked_error;
#endif
	if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
		    (rpipe->pipe_buffer.cnt > 0))
			revents |= events & (POLLIN | POLLRDNORM);

	if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM))
		if (wpipe->pipe_present != PIPE_ACTIVE ||
		    (wpipe->pipe_state & PIPE_EOF) ||
		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
		     ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF ||
			 wpipe->pipe_buffer.size == 0)))
			revents |= events & (POLLOUT | POLLWRNORM);

	levents = events &
	    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND);
	if (rpipe->pipe_state & PIPE_NAMED && fp->f_flag & FREAD && levents &&
	    fp->f_seqcount == rpipe->pipe_wgen)
		events |= POLLINIGNEOF;

	if ((events & POLLINIGNEOF) == 0) {
		if (rpipe->pipe_state & PIPE_EOF) {
			revents |= (events & (POLLIN | POLLRDNORM));
			if (wpipe->pipe_present != PIPE_ACTIVE ||
			    (wpipe->pipe_state & PIPE_EOF))
				revents |= POLLHUP;
		}
	}

	if (revents == 0) {
		if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM)) {
			selrecord(td, &rpipe->pipe_sel);
			if (SEL_WAITING(&rpipe->pipe_sel))
				rpipe->pipe_state |= PIPE_SEL;
		}

		if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM)) {
			selrecord(td, &wpipe->pipe_sel);
			if (SEL_WAITING(&wpipe->pipe_sel))
				wpipe->pipe_state |= PIPE_SEL;
		}
	}
#ifdef MAC
locked_error:
#endif
	PIPE_UNLOCK(rpipe);

	return (revents);
}

/*
 * We shouldn't need locks here as we're doing a read and this should
 * be a natural race.
 */
static int
pipe_stat(struct file *fp, struct stat *ub, struct ucred *active_cred,
    struct thread *td)
{
	struct pipe *pipe;
	int new_unr;
#ifdef MAC
	int error;
#endif

	pipe = fp->f_data;
	PIPE_LOCK(pipe);
#ifdef MAC
	error = mac_pipe_check_stat(active_cred, pipe->pipe_pair);
	if (error) {
		PIPE_UNLOCK(pipe);
		return (error);
	}
#endif

	/* For named pipes ask the underlying filesystem. */
	if (pipe->pipe_state & PIPE_NAMED) {
		PIPE_UNLOCK(pipe);
		return (vnops.fo_stat(fp, ub, active_cred, td));
	}

	/*
	 * Lazily allocate an inode number for the pipe.  Most pipe
	 * users do not call fstat(2) on the pipe, which means that
	 * postponing the inode allocation until it must be returned
	 * to userland is useful.  If alloc_unr failed, assign st_ino
	 * zero instead of returning an error.
	 * Special pipe_ino values:
	 *  -1 - not yet initialized;
	 *   0 - alloc_unr failed, return 0 as st_ino forever.
	 */
	if (pipe->pipe_ino == (ino_t)-1) {
		new_unr = alloc_unr(pipeino_unr);
		if (new_unr != -1)
			pipe->pipe_ino = new_unr;
		else
			pipe->pipe_ino = 0;
	}
	PIPE_UNLOCK(pipe);

	bzero(ub, sizeof(*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = PAGE_SIZE;
	if (pipe->pipe_state & PIPE_DIRECTW)
		ub->st_size = pipe->pipe_map.cnt;
	else
		ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = howmany(ub->st_size, ub->st_blksize);
	ub->st_atim = pipe->pipe_atime;
	ub->st_mtim = pipe->pipe_mtime;
	ub->st_ctim = pipe->pipe_ctime;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	ub->st_dev = pipedev_ino;
	ub->st_ino = pipe->pipe_ino;
	/*
	 * Left as 0: st_nlink, st_rdev, st_flags, st_gen.
	 */
	return (0);
}

/* ARGSUSED */
static int
pipe_close(struct file *fp, struct thread *td)
{

	if (fp->f_vnode != NULL)
		return vnops.fo_close(fp, td);
	fp->f_ops = &badfileops;
	pipe_dtor(fp->f_data);
	fp->f_data = NULL;
	return (0);
}

static int
pipe_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td)
{
	struct pipe *cpipe;
	int error;

	cpipe = fp->f_data;
	if (cpipe->pipe_state & PIPE_NAMED)
		error = vn_chmod(fp, mode, active_cred, td);
	else
		error = invfo_chmod(fp, mode, active_cred, td);
	return (error);
}

static int
pipe_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{
	struct pipe *cpipe;
	int error;

	cpipe = fp->f_data;
	if (cpipe->pipe_state & PIPE_NAMED)
		error = vn_chown(fp, uid, gid, active_cred, td);
	else
		error = invfo_chown(fp, uid, gid, active_cred, td);
	return (error);
}

static int
pipe_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{
	struct pipe *pi;

	if (fp->f_type == DTYPE_FIFO)
		return (vn_fill_kinfo(fp, kif, fdp));
	kif->kf_type = KF_TYPE_PIPE;
	pi = fp->f_data;
	kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi;
	kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer;
	kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt;
	return (0);
}

static void
pipe_free_kmem(struct pipe *cpipe)
{

	KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
	    ("pipe_free_kmem: pipe mutex locked"));

	if (cpipe->pipe_buffer.buffer != NULL) {
		atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size);
		vm_map_remove(pipe_map,
		    (vm_offset_t)cpipe->pipe_buffer.buffer,
		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
		cpipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	{
		cpipe->pipe_map.cnt = 0;
		cpipe->pipe_map.pos = 0;
		cpipe->pipe_map.npages = 0;
	}
#endif
}

/*
 * shutdown the pipe
 */
static void
pipeclose(struct pipe *cpipe)
{
	struct pipepair *pp;
	struct pipe *ppipe;

	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));

	PIPE_LOCK(cpipe);
	pipelock(cpipe, 0);
	pp = cpipe->pipe_pair;

	pipeselwakeup(cpipe);

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	cpipe->pipe_state |= PIPE_EOF;
	while (cpipe->pipe_busy) {
		wakeup(cpipe);
		cpipe->pipe_state |= PIPE_WANT;
		pipeunlock(cpipe);
		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
		pipelock(cpipe, 0);
	}

	/*
	 * Disconnect from peer, if any.
	 */
	ppipe = cpipe->pipe_peer;
	if (ppipe->pipe_present == PIPE_ACTIVE) {
		pipeselwakeup(ppipe);

		ppipe->pipe_state |= PIPE_EOF;
		wakeup(ppipe);
		KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0);
	}

	/*
	 * Mark this endpoint as free.  Release kmem resources.  We
	 * don't mark this endpoint as unused until we've finished
	 * doing that, or the pipe might disappear out from under
	 * us.
	 */
	PIPE_UNLOCK(cpipe);
	pipe_free_kmem(cpipe);
	PIPE_LOCK(cpipe);
	cpipe->pipe_present = PIPE_CLOSING;
	pipeunlock(cpipe);

	/*
	 * knlist_clear() may sleep dropping the PIPE_MTX.  Set
	 * PIPE_FINALIZED, which allows the other end to free the
	 * pipe_pair, only after the knotes are completely dismantled.
	 */
	knlist_clear(&cpipe->pipe_sel.si_note, 1);
	cpipe->pipe_present = PIPE_FINALIZED;
	seldrain(&cpipe->pipe_sel);
	knlist_destroy(&cpipe->pipe_sel.si_note);

	/*
	 * If both endpoints are now closed, release the memory for the
	 * pipe pair.  If not, unlock.
	 */
	if (ppipe->pipe_present == PIPE_FINALIZED) {
		PIPE_UNLOCK(cpipe);
#ifdef MAC
		mac_pipe_destroy(pp);
#endif
		uma_zfree(pipe_zone, cpipe->pipe_pair);
	} else
		PIPE_UNLOCK(cpipe);
}

/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *cpipe;

	/*
	 * If a filter is requested that is not supported by this file
	 * descriptor, don't return an error, but also don't ever generate an
	 * event.
	 */
	if ((kn->kn_filter == EVFILT_READ) && !(fp->f_flag & FREAD)) {
		kn->kn_fop = &pipe_nfiltops;
		return (0);
	}
	if ((kn->kn_filter == EVFILT_WRITE) && !(fp->f_flag & FWRITE)) {
		kn->kn_fop = &pipe_nfiltops;
		return (0);
	}
	cpipe = fp->f_data;
	PIPE_LOCK(cpipe);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;
		if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) {
			/* other end of pipe has been closed */
			PIPE_UNLOCK(cpipe);
			return (EPIPE);
		}
		cpipe = PIPE_PEER(cpipe);
		break;
	default:
		PIPE_UNLOCK(cpipe);
		return (EINVAL);
	}

	kn->kn_hook = cpipe;
	knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
	PIPE_UNLOCK(cpipe);
	return (0);
}

static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = kn->kn_hook;

	PIPE_LOCK(cpipe);
	knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
	PIPE_UNLOCK(cpipe);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = kn->kn_hook;
	struct pipe *wpipe = rpipe->pipe_peer;
	int ret;

	PIPE_LOCK_ASSERT(rpipe, MA_OWNED);
	kn->kn_data = rpipe->pipe_buffer.cnt;
	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
		kn->kn_data = rpipe->pipe_map.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    wpipe->pipe_present != PIPE_ACTIVE ||
	    (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	ret = kn->kn_data > 0;
	return ret;
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *wpipe;

	wpipe = kn->kn_hook;
	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	if (wpipe->pipe_present != PIPE_ACTIVE ||
	    (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	kn->kn_data = (wpipe->pipe_buffer.size > 0) ?
	    (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) : PIPE_BUF;
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;

	return (kn->kn_data >= PIPE_BUF);
}

static void
filt_pipedetach_notsup(struct knote *kn)
{

}

static int
filt_pipenotsup(struct knote *kn, long hint)
{

	return (0);
}