/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 1996 John S. Dyson
 * Copyright (c) 2012 Giovanni Trematerra
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

/*
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, the sending process pins the underlying pages in
 * memory, and the receiving process copies directly from these pinned pages
 * in the sending process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned back to the user-mode side.  In that case, the pipe code
 * arranges to copy the buffer supplied by the user process, to a pageable
 * kernel buffer, and the receiving process will grab the data from the
 * pageable kernel buffer.  Since signals don't happen all that often,
 * the copy operation is normally eliminated.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.
 *
 * In order to limit the resource use of pipes, two sysctls exist:
 *
 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
 * address space available to us in pipe_map.  This value is normally
 * autotuned, but may also be loader tuned.
 *
 * kern.ipc.pipekva - This read-only sysctl tracks the current amount of
 * memory in use by pipes.
 *
 * Based on how large pipekva is relative to maxpipekva, the following
 * will happen:
 *
 * 0% - 50%:
 *     New pipes are given 16K of memory backing, pipes may dynamically
 *     grow to as large as 64K where needed.
 * 50% - 75%:
 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
 *     existing pipes may NOT grow.
 * 75% - 100%:
 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
 *     existing pipes will be shrunk down to 4K whenever possible.
 *
 * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0.  If
 * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE
 * resize which MUST occur for reverse-direction pipes when they are
 * first used.
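 *
 * As an illustration only (the exact values are autotuned and system
 * dependent), the knobs described above can be read and, where writable,
 * set from userland with sysctl(8):
 *
 *	sysctl kern.ipc.maxpipekva kern.ipc.pipekva
 *	sysctl kern.ipc.piperesizeallowed=0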
 *
 * Additional information about the current state of pipes may be obtained
 * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
 * and kern.ipc.piperesizefail.
 *
 * Locking rules:  There are two locks present here:  A mutex, used via
 * PIPE_LOCK, and a flag, used via pipelock().  All locking is done via
 * the flag, as mutexes can not persist over uiomove.  The mutex
 * exists only to guard access to the flag, and is not in itself a
 * locking mechanism.  Also note that there is only a single mutex for
 * both directions of a pipe.
 *
 * As pipelock() may have to sleep before it can acquire the flag, it
 * is important to reread all data after a call to pipelock(); everything
 * in the structure may have changed.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/user.h>
#include <sys/event.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/uma.h>

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * NetBSD or OpenBSD.
 */
/* #define PIPE_NODIRECT */

#define PIPE_PEER(pipe)	\
	(((pipe)->pipe_state & PIPE_NAMED) ? (pipe) : ((pipe)->pipe_peer))

/*
 * interfaces to the outside world
 */
static fo_rdwr_t	pipe_read;
static fo_rdwr_t	pipe_write;
static fo_truncate_t	pipe_truncate;
static fo_ioctl_t	pipe_ioctl;
static fo_poll_t	pipe_poll;
static fo_kqfilter_t	pipe_kqfilter;
static fo_stat_t	pipe_stat;
static fo_close_t	pipe_close;
static fo_chmod_t	pipe_chmod;
static fo_chown_t	pipe_chown;
static fo_fill_kinfo_t	pipe_fill_kinfo;

struct fileops pipeops = {
	.fo_read = pipe_read,
	.fo_write = pipe_write,
	.fo_truncate = pipe_truncate,
	.fo_ioctl = pipe_ioctl,
	.fo_poll = pipe_poll,
	.fo_kqfilter = pipe_kqfilter,
	.fo_stat = pipe_stat,
	.fo_close = pipe_close,
	.fo_chmod = pipe_chmod,
	.fo_chown = pipe_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = pipe_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE
};

static void	filt_pipedetach(struct knote *kn);
static void	filt_pipedetach_notsup(struct knote *kn);
static int	filt_pipenotsup(struct knote *kn, long hint);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_nfiltops = {
	.f_isfd = 1,
	.f_detach = filt_pipedetach_notsup,
	.f_event = filt_pipenotsup
};
static struct filterops pipe_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_pipedetach,
	.f_event = filt_piperead
};
static struct filterops pipe_wfiltops = {
	.f_isfd = 1,
	.f_detach = filt_pipedetach,
	.f_event = filt_pipewrite
};

/*
 * Default pipe buffer size(s), this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
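 *
 * As a worked example (assuming the stock PIPE_SIZE of 16384 bytes from
 * sys/pipe.h), the hysteresis thresholds defined below come out to roughly
 * 5461 bytes (MINPIPESIZE) and 10922 bytes (MAXPIPESIZE).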
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

static long amountpipekva;
static int pipefragretry;
static int pipeallocfail;
static int piperesizefail;
static int piperesizeallowed = 1;

SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
	   &maxpipekva, 0, "Pipe KVA limit");
SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
	   &amountpipekva, 0, "Pipe KVA usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
	  &pipefragretry, 0, "Pipe allocation retries due to fragmentation");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
	  &pipeallocfail, 0, "Pipe allocation failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
	  &piperesizefail, 0, "Pipe resize failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
	  &piperesizeallowed, 0, "Pipe resizing allowed");

static void pipeinit(void *dummy __unused);
static void pipeclose(struct pipe *cpipe);
static void pipe_free_kmem(struct pipe *cpipe);
static void pipe_create(struct pipe *pipe, int backing);
static void pipe_paircreate(struct thread *td, struct pipepair **p_pp);
static __inline int pipelock(struct pipe *cpipe, int catch);
static __inline void pipeunlock(struct pipe *cpipe);
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
static void pipe_destroy_write_buffer(struct pipe *wpipe);
static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
static void pipe_clone_write_buffer(struct pipe *wpipe);
#endif
static int pipespace(struct pipe *cpipe, int size);
static int pipespace_new(struct pipe *cpipe, int size);

static int	pipe_zone_ctor(void *mem, int size, void *arg, int flags);
static int	pipe_zone_init(void *mem, int size, int flags);
static void	pipe_zone_fini(void *mem, int size);

static uma_zone_t pipe_zone;
static struct unrhdr *pipeino_unr;
static dev_t pipedev_ino;

SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);

static void
pipeinit(void *dummy __unused)
{

	pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair),
	    pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini,
	    UMA_ALIGN_PTR, 0);
	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
	pipeino_unr = new_unrhdr(1, INT32_MAX, NULL);
	KASSERT(pipeino_unr != NULL, ("pipe fake inodes not initialized"));
	pipedev_ino = devfs_alloc_cdp_inode();
	KASSERT(pipedev_ino > 0, ("pipe dev inode not initialized"));
}

static int
pipe_zone_ctor(void *mem, int size, void *arg, int flags)
{
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;

	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));

	pp = (struct pipepair *)mem;

	/*
	 * We zero both pipe endpoints to make sure all the kmem pointers
	 * are NULL, flag fields are zero'd, etc.  We timestamp both
	 * endpoints with the same time.
	 */
	rpipe = &pp->pp_rpipe;
	bzero(rpipe, sizeof(*rpipe));
	vfs_timestamp(&rpipe->pipe_ctime);
	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;

	wpipe = &pp->pp_wpipe;
	bzero(wpipe, sizeof(*wpipe));
	wpipe->pipe_ctime = rpipe->pipe_ctime;
	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;

	rpipe->pipe_peer = wpipe;
	rpipe->pipe_pair = pp;
	wpipe->pipe_peer = rpipe;
	wpipe->pipe_pair = pp;

	/*
	 * Mark both endpoints as present; they will later get free'd
	 * one at a time.  When both are free'd, then the whole pair
	 * is released.
	 */
	rpipe->pipe_present = PIPE_ACTIVE;
	wpipe->pipe_present = PIPE_ACTIVE;

	/*
	 * Eventually, the MAC Framework may initialize the label
	 * in ctor or init, but for now we do it elsewhere to avoid
	 * blocking in ctor or init.
	 */
	pp->pp_label = NULL;

	return (0);
}

static int
pipe_zone_init(void *mem, int size, int flags)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));

	pp = (struct pipepair *)mem;

	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_NEW);
	return (0);
}

static void
pipe_zone_fini(void *mem, int size)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));

	pp = (struct pipepair *)mem;

	mtx_destroy(&pp->pp_mtx);
}

static void
pipe_paircreate(struct thread *td, struct pipepair **p_pp)
{
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;

	*p_pp = pp = uma_zalloc(pipe_zone, M_WAITOK);
#ifdef MAC
	/*
	 * The MAC label is shared between the connected endpoints.  As a
	 * result mac_pipe_init() and mac_pipe_create() are called once
	 * for the pair, and not on the endpoints.
	 */
	mac_pipe_init(pp);
	mac_pipe_create(td->td_ucred, pp);
#endif
	rpipe = &pp->pp_rpipe;
	wpipe = &pp->pp_wpipe;

	knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe));
	knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe));

	/* Only the forward direction pipe is backed by default */
	pipe_create(rpipe, 1);
	pipe_create(wpipe, 0);

	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe->pipe_state |= PIPE_DIRECTOK;
}

void
pipe_named_ctor(struct pipe **ppipe, struct thread *td)
{
	struct pipepair *pp;

	pipe_paircreate(td, &pp);
	pp->pp_rpipe.pipe_state |= PIPE_NAMED;
	*ppipe = &pp->pp_rpipe;
}

void
pipe_dtor(struct pipe *dpipe)
{
	struct pipe *peer;
	ino_t ino;

	ino = dpipe->pipe_ino;
	peer = (dpipe->pipe_state & PIPE_NAMED) != 0 ? dpipe->pipe_peer : NULL;
	funsetown(&dpipe->pipe_sigio);
	pipeclose(dpipe);
	if (peer != NULL) {
		funsetown(&peer->pipe_sigio);
		pipeclose(peer);
	}
	if (ino != 0 && ino != (ino_t)-1)
		free_unr(pipeino_unr, ino);
}

/*
 * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail, let
 * the zone pick up the pieces via pipeclose().
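 *
 * A minimal userland sketch of the interface this implements (illustrative
 * only; error handling trimmed):
 *
 *	int fds[2];
 *
 *	if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) == -1)
 *		err(1, "pipe2");
 *	(void)write(fds[1], "x", 1);	(byte lands in the pipe buffer)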
400 */ 401 int 402 kern_pipe(struct thread *td, int fildes[2], int flags, struct filecaps *fcaps1, 403 struct filecaps *fcaps2) 404 { 405 struct file *rf, *wf; 406 struct pipe *rpipe, *wpipe; 407 struct pipepair *pp; 408 int fd, fflags, error; 409 410 pipe_paircreate(td, &pp); 411 rpipe = &pp->pp_rpipe; 412 wpipe = &pp->pp_wpipe; 413 error = falloc_caps(td, &rf, &fd, flags, fcaps1); 414 if (error) { 415 pipeclose(rpipe); 416 pipeclose(wpipe); 417 return (error); 418 } 419 /* An extra reference on `rf' has been held for us by falloc_caps(). */ 420 fildes[0] = fd; 421 422 fflags = FREAD | FWRITE; 423 if ((flags & O_NONBLOCK) != 0) 424 fflags |= FNONBLOCK; 425 426 /* 427 * Warning: once we've gotten past allocation of the fd for the 428 * read-side, we can only drop the read side via fdrop() in order 429 * to avoid races against processes which manage to dup() the read 430 * side while we are blocked trying to allocate the write side. 431 */ 432 finit(rf, fflags, DTYPE_PIPE, rpipe, &pipeops); 433 error = falloc_caps(td, &wf, &fd, flags, fcaps2); 434 if (error) { 435 fdclose(td, rf, fildes[0]); 436 fdrop(rf, td); 437 /* rpipe has been closed by fdrop(). */ 438 pipeclose(wpipe); 439 return (error); 440 } 441 /* An extra reference on `wf' has been held for us by falloc_caps(). */ 442 finit(wf, fflags, DTYPE_PIPE, wpipe, &pipeops); 443 fdrop(wf, td); 444 fildes[1] = fd; 445 fdrop(rf, td); 446 447 return (0); 448 } 449 450 #ifdef COMPAT_FREEBSD10 451 /* ARGSUSED */ 452 int 453 freebsd10_pipe(struct thread *td, struct freebsd10_pipe_args *uap __unused) 454 { 455 int error; 456 int fildes[2]; 457 458 error = kern_pipe(td, fildes, 0, NULL, NULL); 459 if (error) 460 return (error); 461 462 td->td_retval[0] = fildes[0]; 463 td->td_retval[1] = fildes[1]; 464 465 return (0); 466 } 467 #endif 468 469 int 470 sys_pipe2(struct thread *td, struct pipe2_args *uap) 471 { 472 int error, fildes[2]; 473 474 if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK)) 475 return (EINVAL); 476 error = kern_pipe(td, fildes, uap->flags, NULL, NULL); 477 if (error) 478 return (error); 479 error = copyout(fildes, uap->fildes, 2 * sizeof(int)); 480 if (error) { 481 (void)kern_close(td, fildes[0]); 482 (void)kern_close(td, fildes[1]); 483 } 484 return (error); 485 } 486 487 /* 488 * Allocate kva for pipe circular buffer, the space is pageable 489 * This routine will 'realloc' the size of a pipe safely, if it fails 490 * it will retain the old buffer. 491 * If it fails it will return ENOMEM. 
492 */ 493 static int 494 pipespace_new(struct pipe *cpipe, int size) 495 { 496 caddr_t buffer; 497 int error, cnt, firstseg; 498 static int curfail = 0; 499 static struct timeval lastfail; 500 501 KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked")); 502 KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW), 503 ("pipespace: resize of direct writes not allowed")); 504 retry: 505 cnt = cpipe->pipe_buffer.cnt; 506 if (cnt > size) 507 size = cnt; 508 509 size = round_page(size); 510 buffer = (caddr_t) vm_map_min(pipe_map); 511 512 error = vm_map_find(pipe_map, NULL, 0, (vm_offset_t *)&buffer, size, 0, 513 VMFS_ANY_SPACE, VM_PROT_RW, VM_PROT_RW, 0); 514 if (error != KERN_SUCCESS) { 515 if ((cpipe->pipe_buffer.buffer == NULL) && 516 (size > SMALL_PIPE_SIZE)) { 517 size = SMALL_PIPE_SIZE; 518 pipefragretry++; 519 goto retry; 520 } 521 if (cpipe->pipe_buffer.buffer == NULL) { 522 pipeallocfail++; 523 if (ppsratecheck(&lastfail, &curfail, 1)) 524 printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n"); 525 } else { 526 piperesizefail++; 527 } 528 return (ENOMEM); 529 } 530 531 /* copy data, then free old resources if we're resizing */ 532 if (cnt > 0) { 533 if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) { 534 firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out; 535 bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], 536 buffer, firstseg); 537 if ((cnt - firstseg) > 0) 538 bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg], 539 cpipe->pipe_buffer.in); 540 } else { 541 bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], 542 buffer, cnt); 543 } 544 } 545 pipe_free_kmem(cpipe); 546 cpipe->pipe_buffer.buffer = buffer; 547 cpipe->pipe_buffer.size = size; 548 cpipe->pipe_buffer.in = cnt; 549 cpipe->pipe_buffer.out = 0; 550 cpipe->pipe_buffer.cnt = cnt; 551 atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size); 552 return (0); 553 } 554 555 /* 556 * Wrapper for pipespace_new() that performs locking assertions. 557 */ 558 static int 559 pipespace(struct pipe *cpipe, int size) 560 { 561 562 KASSERT(cpipe->pipe_state & PIPE_LOCKFL, 563 ("Unlocked pipe passed to pipespace")); 564 return (pipespace_new(cpipe, size)); 565 } 566 567 /* 568 * lock a pipe for I/O, blocking other access 569 */ 570 static __inline int 571 pipelock(struct pipe *cpipe, int catch) 572 { 573 int error; 574 575 PIPE_LOCK_ASSERT(cpipe, MA_OWNED); 576 while (cpipe->pipe_state & PIPE_LOCKFL) { 577 cpipe->pipe_state |= PIPE_LWANT; 578 error = msleep(cpipe, PIPE_MTX(cpipe), 579 catch ? (PRIBIO | PCATCH) : PRIBIO, 580 "pipelk", 0); 581 if (error != 0) 582 return (error); 583 } 584 cpipe->pipe_state |= PIPE_LOCKFL; 585 return (0); 586 } 587 588 /* 589 * unlock a pipe I/O lock 590 */ 591 static __inline void 592 pipeunlock(struct pipe *cpipe) 593 { 594 595 PIPE_LOCK_ASSERT(cpipe, MA_OWNED); 596 KASSERT(cpipe->pipe_state & PIPE_LOCKFL, 597 ("Unlocked pipe passed to pipeunlock")); 598 cpipe->pipe_state &= ~PIPE_LOCKFL; 599 if (cpipe->pipe_state & PIPE_LWANT) { 600 cpipe->pipe_state &= ~PIPE_LWANT; 601 wakeup(cpipe); 602 } 603 } 604 605 void 606 pipeselwakeup(struct pipe *cpipe) 607 { 608 609 PIPE_LOCK_ASSERT(cpipe, MA_OWNED); 610 if (cpipe->pipe_state & PIPE_SEL) { 611 selwakeuppri(&cpipe->pipe_sel, PSOCK); 612 if (!SEL_WAITING(&cpipe->pipe_sel)) 613 cpipe->pipe_state &= ~PIPE_SEL; 614 } 615 if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) 616 pgsigio(&cpipe->pipe_sigio, SIGIO, 0); 617 KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0); 618 } 619 620 /* 621 * Initialize and allocate VM and memory for pipe. 
The structure 622 * will start out zero'd from the ctor, so we just manage the kmem. 623 */ 624 static void 625 pipe_create(struct pipe *pipe, int backing) 626 { 627 628 if (backing) { 629 /* 630 * Note that these functions can fail if pipe map is exhausted 631 * (as a result of too many pipes created), but we ignore the 632 * error as it is not fatal and could be provoked by 633 * unprivileged users. The only consequence is worse performance 634 * with given pipe. 635 */ 636 if (amountpipekva > maxpipekva / 2) 637 (void)pipespace_new(pipe, SMALL_PIPE_SIZE); 638 else 639 (void)pipespace_new(pipe, PIPE_SIZE); 640 } 641 642 pipe->pipe_ino = -1; 643 } 644 645 /* ARGSUSED */ 646 static int 647 pipe_read(struct file *fp, struct uio *uio, struct ucred *active_cred, 648 int flags, struct thread *td) 649 { 650 struct pipe *rpipe; 651 int error; 652 int nread = 0; 653 int size; 654 655 rpipe = fp->f_data; 656 PIPE_LOCK(rpipe); 657 ++rpipe->pipe_busy; 658 error = pipelock(rpipe, 1); 659 if (error) 660 goto unlocked_error; 661 662 #ifdef MAC 663 error = mac_pipe_check_read(active_cred, rpipe->pipe_pair); 664 if (error) 665 goto locked_error; 666 #endif 667 if (amountpipekva > (3 * maxpipekva) / 4) { 668 if (!(rpipe->pipe_state & PIPE_DIRECTW) && 669 (rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) && 670 (rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) && 671 (piperesizeallowed == 1)) { 672 PIPE_UNLOCK(rpipe); 673 pipespace(rpipe, SMALL_PIPE_SIZE); 674 PIPE_LOCK(rpipe); 675 } 676 } 677 678 while (uio->uio_resid) { 679 /* 680 * normal pipe buffer receive 681 */ 682 if (rpipe->pipe_buffer.cnt > 0) { 683 size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; 684 if (size > rpipe->pipe_buffer.cnt) 685 size = rpipe->pipe_buffer.cnt; 686 if (size > uio->uio_resid) 687 size = uio->uio_resid; 688 689 PIPE_UNLOCK(rpipe); 690 error = uiomove( 691 &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], 692 size, uio); 693 PIPE_LOCK(rpipe); 694 if (error) 695 break; 696 697 rpipe->pipe_buffer.out += size; 698 if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) 699 rpipe->pipe_buffer.out = 0; 700 701 rpipe->pipe_buffer.cnt -= size; 702 703 /* 704 * If there is no more to read in the pipe, reset 705 * its pointers to the beginning. This improves 706 * cache hit stats. 707 */ 708 if (rpipe->pipe_buffer.cnt == 0) { 709 rpipe->pipe_buffer.in = 0; 710 rpipe->pipe_buffer.out = 0; 711 } 712 nread += size; 713 #ifndef PIPE_NODIRECT 714 /* 715 * Direct copy, bypassing a kernel buffer. 716 */ 717 } else if ((size = rpipe->pipe_map.cnt) && 718 (rpipe->pipe_state & PIPE_DIRECTW)) { 719 if (size > uio->uio_resid) 720 size = (u_int) uio->uio_resid; 721 722 PIPE_UNLOCK(rpipe); 723 error = uiomove_fromphys(rpipe->pipe_map.ms, 724 rpipe->pipe_map.pos, size, uio); 725 PIPE_LOCK(rpipe); 726 if (error) 727 break; 728 nread += size; 729 rpipe->pipe_map.pos += size; 730 rpipe->pipe_map.cnt -= size; 731 if (rpipe->pipe_map.cnt == 0) { 732 rpipe->pipe_state &= ~(PIPE_DIRECTW|PIPE_WANTW); 733 wakeup(rpipe); 734 } 735 #endif 736 } else { 737 /* 738 * detect EOF condition 739 * read returns 0 on EOF, no need to set error 740 */ 741 if (rpipe->pipe_state & PIPE_EOF) 742 break; 743 744 /* 745 * If the "write-side" has been blocked, wake it up now. 746 */ 747 if (rpipe->pipe_state & PIPE_WANTW) { 748 rpipe->pipe_state &= ~PIPE_WANTW; 749 wakeup(rpipe); 750 } 751 752 /* 753 * Break if some data was read. 754 */ 755 if (nread > 0) 756 break; 757 758 /* 759 * Unlock the pipe buffer for our remaining processing. 
760 * We will either break out with an error or we will 761 * sleep and relock to loop. 762 */ 763 pipeunlock(rpipe); 764 765 /* 766 * Handle non-blocking mode operation or 767 * wait for more data. 768 */ 769 if (fp->f_flag & FNONBLOCK) { 770 error = EAGAIN; 771 } else { 772 rpipe->pipe_state |= PIPE_WANTR; 773 if ((error = msleep(rpipe, PIPE_MTX(rpipe), 774 PRIBIO | PCATCH, 775 "piperd", 0)) == 0) 776 error = pipelock(rpipe, 1); 777 } 778 if (error) 779 goto unlocked_error; 780 } 781 } 782 #ifdef MAC 783 locked_error: 784 #endif 785 pipeunlock(rpipe); 786 787 /* XXX: should probably do this before getting any locks. */ 788 if (error == 0) 789 vfs_timestamp(&rpipe->pipe_atime); 790 unlocked_error: 791 --rpipe->pipe_busy; 792 793 /* 794 * PIPE_WANT processing only makes sense if pipe_busy is 0. 795 */ 796 if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { 797 rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); 798 wakeup(rpipe); 799 } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { 800 /* 801 * Handle write blocking hysteresis. 802 */ 803 if (rpipe->pipe_state & PIPE_WANTW) { 804 rpipe->pipe_state &= ~PIPE_WANTW; 805 wakeup(rpipe); 806 } 807 } 808 809 if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) 810 pipeselwakeup(rpipe); 811 812 PIPE_UNLOCK(rpipe); 813 return (error); 814 } 815 816 #ifndef PIPE_NODIRECT 817 /* 818 * Map the sending processes' buffer into kernel space and wire it. 819 * This is similar to a physical write operation. 820 */ 821 static int 822 pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio) 823 { 824 u_int size; 825 int i; 826 827 PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED); 828 KASSERT(wpipe->pipe_state & PIPE_DIRECTW, 829 ("Clone attempt on non-direct write pipe!")); 830 831 if (uio->uio_iov->iov_len > wpipe->pipe_buffer.size) 832 size = wpipe->pipe_buffer.size; 833 else 834 size = uio->uio_iov->iov_len; 835 836 if ((i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, 837 (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ, 838 wpipe->pipe_map.ms, PIPENPAGES)) < 0) 839 return (EFAULT); 840 841 /* 842 * set up the control block 843 */ 844 wpipe->pipe_map.npages = i; 845 wpipe->pipe_map.pos = 846 ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; 847 wpipe->pipe_map.cnt = size; 848 849 /* 850 * and update the uio data 851 */ 852 853 uio->uio_iov->iov_len -= size; 854 uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size; 855 if (uio->uio_iov->iov_len == 0) 856 uio->uio_iov++; 857 uio->uio_resid -= size; 858 uio->uio_offset += size; 859 return (0); 860 } 861 862 /* 863 * unmap and unwire the process buffer 864 */ 865 static void 866 pipe_destroy_write_buffer(struct pipe *wpipe) 867 { 868 869 PIPE_LOCK_ASSERT(wpipe, MA_OWNED); 870 vm_page_unhold_pages(wpipe->pipe_map.ms, wpipe->pipe_map.npages); 871 wpipe->pipe_map.npages = 0; 872 } 873 874 /* 875 * In the case of a signal, the writing process might go away. This 876 * code copies the data into the circular buffer so that the source 877 * pages can be freed without loss of data. 
878 */ 879 static void 880 pipe_clone_write_buffer(struct pipe *wpipe) 881 { 882 struct uio uio; 883 struct iovec iov; 884 int size; 885 int pos; 886 887 PIPE_LOCK_ASSERT(wpipe, MA_OWNED); 888 size = wpipe->pipe_map.cnt; 889 pos = wpipe->pipe_map.pos; 890 891 wpipe->pipe_buffer.in = size; 892 wpipe->pipe_buffer.out = 0; 893 wpipe->pipe_buffer.cnt = size; 894 wpipe->pipe_state &= ~PIPE_DIRECTW; 895 896 PIPE_UNLOCK(wpipe); 897 iov.iov_base = wpipe->pipe_buffer.buffer; 898 iov.iov_len = size; 899 uio.uio_iov = &iov; 900 uio.uio_iovcnt = 1; 901 uio.uio_offset = 0; 902 uio.uio_resid = size; 903 uio.uio_segflg = UIO_SYSSPACE; 904 uio.uio_rw = UIO_READ; 905 uio.uio_td = curthread; 906 uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio); 907 PIPE_LOCK(wpipe); 908 pipe_destroy_write_buffer(wpipe); 909 } 910 911 /* 912 * This implements the pipe buffer write mechanism. Note that only 913 * a direct write OR a normal pipe write can be pending at any given time. 914 * If there are any characters in the pipe buffer, the direct write will 915 * be deferred until the receiving process grabs all of the bytes from 916 * the pipe buffer. Then the direct mapping write is set-up. 917 */ 918 static int 919 pipe_direct_write(struct pipe *wpipe, struct uio *uio) 920 { 921 int error; 922 923 retry: 924 PIPE_LOCK_ASSERT(wpipe, MA_OWNED); 925 error = pipelock(wpipe, 1); 926 if (error != 0) 927 goto error1; 928 if ((wpipe->pipe_state & PIPE_EOF) != 0) { 929 error = EPIPE; 930 pipeunlock(wpipe); 931 goto error1; 932 } 933 while (wpipe->pipe_state & PIPE_DIRECTW) { 934 if (wpipe->pipe_state & PIPE_WANTR) { 935 wpipe->pipe_state &= ~PIPE_WANTR; 936 wakeup(wpipe); 937 } 938 pipeselwakeup(wpipe); 939 wpipe->pipe_state |= PIPE_WANTW; 940 pipeunlock(wpipe); 941 error = msleep(wpipe, PIPE_MTX(wpipe), 942 PRIBIO | PCATCH, "pipdww", 0); 943 if (error) 944 goto error1; 945 else 946 goto retry; 947 } 948 wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ 949 if (wpipe->pipe_buffer.cnt > 0) { 950 if (wpipe->pipe_state & PIPE_WANTR) { 951 wpipe->pipe_state &= ~PIPE_WANTR; 952 wakeup(wpipe); 953 } 954 pipeselwakeup(wpipe); 955 wpipe->pipe_state |= PIPE_WANTW; 956 pipeunlock(wpipe); 957 error = msleep(wpipe, PIPE_MTX(wpipe), 958 PRIBIO | PCATCH, "pipdwc", 0); 959 if (error) 960 goto error1; 961 else 962 goto retry; 963 } 964 965 wpipe->pipe_state |= PIPE_DIRECTW; 966 967 PIPE_UNLOCK(wpipe); 968 error = pipe_build_write_buffer(wpipe, uio); 969 PIPE_LOCK(wpipe); 970 if (error) { 971 wpipe->pipe_state &= ~PIPE_DIRECTW; 972 pipeunlock(wpipe); 973 goto error1; 974 } 975 976 error = 0; 977 while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { 978 if (wpipe->pipe_state & PIPE_EOF) { 979 pipe_destroy_write_buffer(wpipe); 980 pipeselwakeup(wpipe); 981 pipeunlock(wpipe); 982 error = EPIPE; 983 goto error1; 984 } 985 if (wpipe->pipe_state & PIPE_WANTR) { 986 wpipe->pipe_state &= ~PIPE_WANTR; 987 wakeup(wpipe); 988 } 989 pipeselwakeup(wpipe); 990 wpipe->pipe_state |= PIPE_WANTW; 991 pipeunlock(wpipe); 992 error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, 993 "pipdwt", 0); 994 pipelock(wpipe, 0); 995 } 996 997 if (wpipe->pipe_state & PIPE_EOF) 998 error = EPIPE; 999 if (wpipe->pipe_state & PIPE_DIRECTW) { 1000 /* 1001 * this bit of trickery substitutes a kernel buffer for 1002 * the process that might be going away. 
1003 */ 1004 pipe_clone_write_buffer(wpipe); 1005 } else { 1006 pipe_destroy_write_buffer(wpipe); 1007 } 1008 pipeunlock(wpipe); 1009 return (error); 1010 1011 error1: 1012 wakeup(wpipe); 1013 return (error); 1014 } 1015 #endif 1016 1017 static int 1018 pipe_write(struct file *fp, struct uio *uio, struct ucred *active_cred, 1019 int flags, struct thread *td) 1020 { 1021 int error = 0; 1022 int desiredsize; 1023 ssize_t orig_resid; 1024 struct pipe *wpipe, *rpipe; 1025 1026 rpipe = fp->f_data; 1027 wpipe = PIPE_PEER(rpipe); 1028 PIPE_LOCK(rpipe); 1029 error = pipelock(wpipe, 1); 1030 if (error) { 1031 PIPE_UNLOCK(rpipe); 1032 return (error); 1033 } 1034 /* 1035 * detect loss of pipe read side, issue SIGPIPE if lost. 1036 */ 1037 if (wpipe->pipe_present != PIPE_ACTIVE || 1038 (wpipe->pipe_state & PIPE_EOF)) { 1039 pipeunlock(wpipe); 1040 PIPE_UNLOCK(rpipe); 1041 return (EPIPE); 1042 } 1043 #ifdef MAC 1044 error = mac_pipe_check_write(active_cred, wpipe->pipe_pair); 1045 if (error) { 1046 pipeunlock(wpipe); 1047 PIPE_UNLOCK(rpipe); 1048 return (error); 1049 } 1050 #endif 1051 ++wpipe->pipe_busy; 1052 1053 /* Choose a larger size if it's advantageous */ 1054 desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size); 1055 while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) { 1056 if (piperesizeallowed != 1) 1057 break; 1058 if (amountpipekva > maxpipekva / 2) 1059 break; 1060 if (desiredsize == BIG_PIPE_SIZE) 1061 break; 1062 desiredsize = desiredsize * 2; 1063 } 1064 1065 /* Choose a smaller size if we're in a OOM situation */ 1066 if ((amountpipekva > (3 * maxpipekva) / 4) && 1067 (wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) && 1068 (wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) && 1069 (piperesizeallowed == 1)) 1070 desiredsize = SMALL_PIPE_SIZE; 1071 1072 /* Resize if the above determined that a new size was necessary */ 1073 if ((desiredsize != wpipe->pipe_buffer.size) && 1074 ((wpipe->pipe_state & PIPE_DIRECTW) == 0)) { 1075 PIPE_UNLOCK(wpipe); 1076 pipespace(wpipe, desiredsize); 1077 PIPE_LOCK(wpipe); 1078 } 1079 if (wpipe->pipe_buffer.size == 0) { 1080 /* 1081 * This can only happen for reverse direction use of pipes 1082 * in a complete OOM situation. 1083 */ 1084 error = ENOMEM; 1085 --wpipe->pipe_busy; 1086 pipeunlock(wpipe); 1087 PIPE_UNLOCK(wpipe); 1088 return (error); 1089 } 1090 1091 pipeunlock(wpipe); 1092 1093 orig_resid = uio->uio_resid; 1094 1095 while (uio->uio_resid) { 1096 int space; 1097 1098 pipelock(wpipe, 0); 1099 if (wpipe->pipe_state & PIPE_EOF) { 1100 pipeunlock(wpipe); 1101 error = EPIPE; 1102 break; 1103 } 1104 #ifndef PIPE_NODIRECT 1105 /* 1106 * If the transfer is large, we can gain performance if 1107 * we do process-to-process copies directly. 1108 * If the write is non-blocking, we don't use the 1109 * direct write mechanism. 1110 * 1111 * The direct write mechanism will detect the reader going 1112 * away on us. 1113 */ 1114 if (uio->uio_segflg == UIO_USERSPACE && 1115 uio->uio_iov->iov_len >= PIPE_MINDIRECT && 1116 wpipe->pipe_buffer.size >= PIPE_MINDIRECT && 1117 (fp->f_flag & FNONBLOCK) == 0) { 1118 pipeunlock(wpipe); 1119 error = pipe_direct_write(wpipe, uio); 1120 if (error) 1121 break; 1122 continue; 1123 } 1124 #endif 1125 1126 /* 1127 * Pipe buffered writes cannot be coincidental with 1128 * direct writes. We wait until the currently executing 1129 * direct write is completed before we start filling the 1130 * pipe buffer. We break out if a signal occurs or the 1131 * reader goes away. 
1132 */ 1133 if (wpipe->pipe_state & PIPE_DIRECTW) { 1134 if (wpipe->pipe_state & PIPE_WANTR) { 1135 wpipe->pipe_state &= ~PIPE_WANTR; 1136 wakeup(wpipe); 1137 } 1138 pipeselwakeup(wpipe); 1139 wpipe->pipe_state |= PIPE_WANTW; 1140 pipeunlock(wpipe); 1141 error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, 1142 "pipbww", 0); 1143 if (error) 1144 break; 1145 else 1146 continue; 1147 } 1148 1149 space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; 1150 1151 /* Writes of size <= PIPE_BUF must be atomic. */ 1152 if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) 1153 space = 0; 1154 1155 if (space > 0) { 1156 int size; /* Transfer size */ 1157 int segsize; /* first segment to transfer */ 1158 1159 /* 1160 * Transfer size is minimum of uio transfer 1161 * and free space in pipe buffer. 1162 */ 1163 if (space > uio->uio_resid) 1164 size = uio->uio_resid; 1165 else 1166 size = space; 1167 /* 1168 * First segment to transfer is minimum of 1169 * transfer size and contiguous space in 1170 * pipe buffer. If first segment to transfer 1171 * is less than the transfer size, we've got 1172 * a wraparound in the buffer. 1173 */ 1174 segsize = wpipe->pipe_buffer.size - 1175 wpipe->pipe_buffer.in; 1176 if (segsize > size) 1177 segsize = size; 1178 1179 /* Transfer first segment */ 1180 1181 PIPE_UNLOCK(rpipe); 1182 error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], 1183 segsize, uio); 1184 PIPE_LOCK(rpipe); 1185 1186 if (error == 0 && segsize < size) { 1187 KASSERT(wpipe->pipe_buffer.in + segsize == 1188 wpipe->pipe_buffer.size, 1189 ("Pipe buffer wraparound disappeared")); 1190 /* 1191 * Transfer remaining part now, to 1192 * support atomic writes. Wraparound 1193 * happened. 1194 */ 1195 1196 PIPE_UNLOCK(rpipe); 1197 error = uiomove( 1198 &wpipe->pipe_buffer.buffer[0], 1199 size - segsize, uio); 1200 PIPE_LOCK(rpipe); 1201 } 1202 if (error == 0) { 1203 wpipe->pipe_buffer.in += size; 1204 if (wpipe->pipe_buffer.in >= 1205 wpipe->pipe_buffer.size) { 1206 KASSERT(wpipe->pipe_buffer.in == 1207 size - segsize + 1208 wpipe->pipe_buffer.size, 1209 ("Expected wraparound bad")); 1210 wpipe->pipe_buffer.in = size - segsize; 1211 } 1212 1213 wpipe->pipe_buffer.cnt += size; 1214 KASSERT(wpipe->pipe_buffer.cnt <= 1215 wpipe->pipe_buffer.size, 1216 ("Pipe buffer overflow")); 1217 } 1218 pipeunlock(wpipe); 1219 if (error != 0) 1220 break; 1221 } else { 1222 /* 1223 * If the "read-side" has been blocked, wake it up now. 1224 */ 1225 if (wpipe->pipe_state & PIPE_WANTR) { 1226 wpipe->pipe_state &= ~PIPE_WANTR; 1227 wakeup(wpipe); 1228 } 1229 1230 /* 1231 * don't block on non-blocking I/O 1232 */ 1233 if (fp->f_flag & FNONBLOCK) { 1234 error = EAGAIN; 1235 pipeunlock(wpipe); 1236 break; 1237 } 1238 1239 /* 1240 * We have no more space and have something to offer, 1241 * wake up select/poll. 1242 */ 1243 pipeselwakeup(wpipe); 1244 1245 wpipe->pipe_state |= PIPE_WANTW; 1246 pipeunlock(wpipe); 1247 error = msleep(wpipe, PIPE_MTX(rpipe), 1248 PRIBIO | PCATCH, "pipewr", 0); 1249 if (error != 0) 1250 break; 1251 } 1252 } 1253 1254 pipelock(wpipe, 0); 1255 --wpipe->pipe_busy; 1256 1257 if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { 1258 wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); 1259 wakeup(wpipe); 1260 } else if (wpipe->pipe_buffer.cnt > 0) { 1261 /* 1262 * If we have put any characters in the buffer, we wake up 1263 * the reader. 
1264 */ 1265 if (wpipe->pipe_state & PIPE_WANTR) { 1266 wpipe->pipe_state &= ~PIPE_WANTR; 1267 wakeup(wpipe); 1268 } 1269 } 1270 1271 /* 1272 * Don't return EPIPE if any byte was written. 1273 * EINTR and other interrupts are handled by generic I/O layer. 1274 * Do not pretend that I/O succeeded for obvious user error 1275 * like EFAULT. 1276 */ 1277 if (uio->uio_resid != orig_resid && error == EPIPE) 1278 error = 0; 1279 1280 if (error == 0) 1281 vfs_timestamp(&wpipe->pipe_mtime); 1282 1283 /* 1284 * We have something to offer, 1285 * wake up select/poll. 1286 */ 1287 if (wpipe->pipe_buffer.cnt) 1288 pipeselwakeup(wpipe); 1289 1290 pipeunlock(wpipe); 1291 PIPE_UNLOCK(rpipe); 1292 return (error); 1293 } 1294 1295 /* ARGSUSED */ 1296 static int 1297 pipe_truncate(struct file *fp, off_t length, struct ucred *active_cred, 1298 struct thread *td) 1299 { 1300 struct pipe *cpipe; 1301 int error; 1302 1303 cpipe = fp->f_data; 1304 if (cpipe->pipe_state & PIPE_NAMED) 1305 error = vnops.fo_truncate(fp, length, active_cred, td); 1306 else 1307 error = invfo_truncate(fp, length, active_cred, td); 1308 return (error); 1309 } 1310 1311 /* 1312 * we implement a very minimal set of ioctls for compatibility with sockets. 1313 */ 1314 static int 1315 pipe_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, 1316 struct thread *td) 1317 { 1318 struct pipe *mpipe = fp->f_data; 1319 int error; 1320 1321 PIPE_LOCK(mpipe); 1322 1323 #ifdef MAC 1324 error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data); 1325 if (error) { 1326 PIPE_UNLOCK(mpipe); 1327 return (error); 1328 } 1329 #endif 1330 1331 error = 0; 1332 switch (cmd) { 1333 1334 case FIONBIO: 1335 break; 1336 1337 case FIOASYNC: 1338 if (*(int *)data) { 1339 mpipe->pipe_state |= PIPE_ASYNC; 1340 } else { 1341 mpipe->pipe_state &= ~PIPE_ASYNC; 1342 } 1343 break; 1344 1345 case FIONREAD: 1346 if (!(fp->f_flag & FREAD)) { 1347 *(int *)data = 0; 1348 PIPE_UNLOCK(mpipe); 1349 return (0); 1350 } 1351 if (mpipe->pipe_state & PIPE_DIRECTW) 1352 *(int *)data = mpipe->pipe_map.cnt; 1353 else 1354 *(int *)data = mpipe->pipe_buffer.cnt; 1355 break; 1356 1357 case FIOSETOWN: 1358 PIPE_UNLOCK(mpipe); 1359 error = fsetown(*(int *)data, &mpipe->pipe_sigio); 1360 goto out_unlocked; 1361 1362 case FIOGETOWN: 1363 *(int *)data = fgetown(&mpipe->pipe_sigio); 1364 break; 1365 1366 /* This is deprecated, FIOSETOWN should be used instead. */ 1367 case TIOCSPGRP: 1368 PIPE_UNLOCK(mpipe); 1369 error = fsetown(-(*(int *)data), &mpipe->pipe_sigio); 1370 goto out_unlocked; 1371 1372 /* This is deprecated, FIOGETOWN should be used instead. 
 */
	case TIOCGPGRP:
		*(int *)data = -fgetown(&mpipe->pipe_sigio);
		break;

	default:
		error = ENOTTY;
		break;
	}
	PIPE_UNLOCK(mpipe);
out_unlocked:
	return (error);
}

static int
pipe_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	struct pipe *rpipe;
	struct pipe *wpipe;
	int levents, revents;
#ifdef MAC
	int error;
#endif

	revents = 0;
	rpipe = fp->f_data;
	wpipe = PIPE_PEER(rpipe);
	PIPE_LOCK(rpipe);
#ifdef MAC
	error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair);
	if (error)
		goto locked_error;
#endif
	if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
		    (rpipe->pipe_buffer.cnt > 0))
			revents |= events & (POLLIN | POLLRDNORM);

	if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM))
		if (wpipe->pipe_present != PIPE_ACTIVE ||
		    (wpipe->pipe_state & PIPE_EOF) ||
		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
		    ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF ||
		    wpipe->pipe_buffer.size == 0)))
			revents |= events & (POLLOUT | POLLWRNORM);

	levents = events &
	    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND);
	if (rpipe->pipe_state & PIPE_NAMED && fp->f_flag & FREAD && levents &&
	    fp->f_seqcount == rpipe->pipe_wgen)
		events |= POLLINIGNEOF;

	if ((events & POLLINIGNEOF) == 0) {
		if (rpipe->pipe_state & PIPE_EOF) {
			revents |= (events & (POLLIN | POLLRDNORM));
			if (wpipe->pipe_present != PIPE_ACTIVE ||
			    (wpipe->pipe_state & PIPE_EOF))
				revents |= POLLHUP;
		}
	}

	if (revents == 0) {
		if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM)) {
			selrecord(td, &rpipe->pipe_sel);
			if (SEL_WAITING(&rpipe->pipe_sel))
				rpipe->pipe_state |= PIPE_SEL;
		}

		if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM)) {
			selrecord(td, &wpipe->pipe_sel);
			if (SEL_WAITING(&wpipe->pipe_sel))
				wpipe->pipe_state |= PIPE_SEL;
		}
	}
#ifdef MAC
locked_error:
#endif
	PIPE_UNLOCK(rpipe);

	return (revents);
}

/*
 * We shouldn't need locks here as we're doing a read and this should
 * be a natural race.
 */
static int
pipe_stat(struct file *fp, struct stat *ub, struct ucred *active_cred,
    struct thread *td)
{
	struct pipe *pipe;
	int new_unr;
#ifdef MAC
	int error;
#endif

	pipe = fp->f_data;
	PIPE_LOCK(pipe);
#ifdef MAC
	error = mac_pipe_check_stat(active_cred, pipe->pipe_pair);
	if (error) {
		PIPE_UNLOCK(pipe);
		return (error);
	}
#endif

	/* For named pipes ask the underlying filesystem. */
	if (pipe->pipe_state & PIPE_NAMED) {
		PIPE_UNLOCK(pipe);
		return (vnops.fo_stat(fp, ub, active_cred, td));
	}

	/*
	 * Lazily allocate an inode number for the pipe.  Most pipe
	 * users do not call fstat(2) on the pipe, which means that
	 * postponing the inode allocation until it must be returned
	 * to userland is useful.  If alloc_unr failed, assign st_ino
	 * zero instead of returning an error.
	 * Special pipe_ino values:
	 *  -1 - not yet initialized;
	 *  0  - alloc_unr failed, return 0 as st_ino forever.
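	 *
	 * Illustrative userland view of the result (assumes an anonymous
	 * pipe; st_size mirrors the number of bytes currently buffered):
	 *
	 *	struct stat sb;
	 *
	 *	if (fstat(fds[0], &sb) == 0 && S_ISFIFO(sb.st_mode))
	 *		printf("%ju buffered, ino %ju\n",
	 *		    (uintmax_t)sb.st_size, (uintmax_t)sb.st_ino);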
1494 */ 1495 if (pipe->pipe_ino == (ino_t)-1) { 1496 new_unr = alloc_unr(pipeino_unr); 1497 if (new_unr != -1) 1498 pipe->pipe_ino = new_unr; 1499 else 1500 pipe->pipe_ino = 0; 1501 } 1502 PIPE_UNLOCK(pipe); 1503 1504 bzero(ub, sizeof(*ub)); 1505 ub->st_mode = S_IFIFO; 1506 ub->st_blksize = PAGE_SIZE; 1507 if (pipe->pipe_state & PIPE_DIRECTW) 1508 ub->st_size = pipe->pipe_map.cnt; 1509 else 1510 ub->st_size = pipe->pipe_buffer.cnt; 1511 ub->st_blocks = howmany(ub->st_size, ub->st_blksize); 1512 ub->st_atim = pipe->pipe_atime; 1513 ub->st_mtim = pipe->pipe_mtime; 1514 ub->st_ctim = pipe->pipe_ctime; 1515 ub->st_uid = fp->f_cred->cr_uid; 1516 ub->st_gid = fp->f_cred->cr_gid; 1517 ub->st_dev = pipedev_ino; 1518 ub->st_ino = pipe->pipe_ino; 1519 /* 1520 * Left as 0: st_nlink, st_rdev, st_flags, st_gen. 1521 */ 1522 return (0); 1523 } 1524 1525 /* ARGSUSED */ 1526 static int 1527 pipe_close(struct file *fp, struct thread *td) 1528 { 1529 1530 if (fp->f_vnode != NULL) 1531 return vnops.fo_close(fp, td); 1532 fp->f_ops = &badfileops; 1533 pipe_dtor(fp->f_data); 1534 fp->f_data = NULL; 1535 return (0); 1536 } 1537 1538 static int 1539 pipe_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) 1540 { 1541 struct pipe *cpipe; 1542 int error; 1543 1544 cpipe = fp->f_data; 1545 if (cpipe->pipe_state & PIPE_NAMED) 1546 error = vn_chmod(fp, mode, active_cred, td); 1547 else 1548 error = invfo_chmod(fp, mode, active_cred, td); 1549 return (error); 1550 } 1551 1552 static int 1553 pipe_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, 1554 struct thread *td) 1555 { 1556 struct pipe *cpipe; 1557 int error; 1558 1559 cpipe = fp->f_data; 1560 if (cpipe->pipe_state & PIPE_NAMED) 1561 error = vn_chown(fp, uid, gid, active_cred, td); 1562 else 1563 error = invfo_chown(fp, uid, gid, active_cred, td); 1564 return (error); 1565 } 1566 1567 static int 1568 pipe_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) 1569 { 1570 struct pipe *pi; 1571 1572 if (fp->f_type == DTYPE_FIFO) 1573 return (vn_fill_kinfo(fp, kif, fdp)); 1574 kif->kf_type = KF_TYPE_PIPE; 1575 pi = fp->f_data; 1576 kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi; 1577 kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer; 1578 kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt; 1579 return (0); 1580 } 1581 1582 static void 1583 pipe_free_kmem(struct pipe *cpipe) 1584 { 1585 1586 KASSERT(!mtx_owned(PIPE_MTX(cpipe)), 1587 ("pipe_free_kmem: pipe mutex locked")); 1588 1589 if (cpipe->pipe_buffer.buffer != NULL) { 1590 atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size); 1591 vm_map_remove(pipe_map, 1592 (vm_offset_t)cpipe->pipe_buffer.buffer, 1593 (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size); 1594 cpipe->pipe_buffer.buffer = NULL; 1595 } 1596 #ifndef PIPE_NODIRECT 1597 { 1598 cpipe->pipe_map.cnt = 0; 1599 cpipe->pipe_map.pos = 0; 1600 cpipe->pipe_map.npages = 0; 1601 } 1602 #endif 1603 } 1604 1605 /* 1606 * shutdown the pipe 1607 */ 1608 static void 1609 pipeclose(struct pipe *cpipe) 1610 { 1611 struct pipepair *pp; 1612 struct pipe *ppipe; 1613 1614 KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL")); 1615 1616 PIPE_LOCK(cpipe); 1617 pipelock(cpipe, 0); 1618 pp = cpipe->pipe_pair; 1619 1620 pipeselwakeup(cpipe); 1621 1622 /* 1623 * If the other side is blocked, wake it up saying that 1624 * we want to close it down. 
	 */
	cpipe->pipe_state |= PIPE_EOF;
	while (cpipe->pipe_busy) {
		wakeup(cpipe);
		cpipe->pipe_state |= PIPE_WANT;
		pipeunlock(cpipe);
		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
		pipelock(cpipe, 0);
	}

	/*
	 * Disconnect from peer, if any.
	 */
	ppipe = cpipe->pipe_peer;
	if (ppipe->pipe_present == PIPE_ACTIVE) {
		pipeselwakeup(ppipe);

		ppipe->pipe_state |= PIPE_EOF;
		wakeup(ppipe);
		KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0);
	}

	/*
	 * Mark this endpoint as free.  Release kmem resources.  We
	 * don't mark this endpoint as unused until we've finished
	 * doing that, or the pipe might disappear out from under
	 * us.
	 */
	PIPE_UNLOCK(cpipe);
	pipe_free_kmem(cpipe);
	PIPE_LOCK(cpipe);
	cpipe->pipe_present = PIPE_CLOSING;
	pipeunlock(cpipe);

	/*
	 * knlist_clear() may sleep dropping the PIPE_MTX.  Set
	 * PIPE_FINALIZED, which allows the other end to free the
	 * pipe_pair, only after the knotes are completely dismantled.
	 */
	knlist_clear(&cpipe->pipe_sel.si_note, 1);
	cpipe->pipe_present = PIPE_FINALIZED;
	seldrain(&cpipe->pipe_sel);
	knlist_destroy(&cpipe->pipe_sel.si_note);

	/*
	 * If both endpoints are now closed, release the memory for the
	 * pipe pair.  If not, unlock.
	 */
	if (ppipe->pipe_present == PIPE_FINALIZED) {
		PIPE_UNLOCK(cpipe);
#ifdef MAC
		mac_pipe_destroy(pp);
#endif
		uma_zfree(pipe_zone, cpipe->pipe_pair);
	} else
		PIPE_UNLOCK(cpipe);
}

/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *cpipe;

	/*
	 * If a filter is requested that is not supported by this file
	 * descriptor, don't return an error, but also don't ever generate an
	 * event.
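	 *
	 * For instance (illustrative), an EVFILT_READ knote attached to a
	 * FIFO endpoint that was opened O_WRONLY is accepted but wired to
	 * the "notsup" filter, so a registration such as
	 *
	 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	 *	kevent(kq, &kev, 1, NULL, 0, NULL);
	 *
	 * succeeds yet never reports an event.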
1694 */ 1695 if ((kn->kn_filter == EVFILT_READ) && !(fp->f_flag & FREAD)) { 1696 kn->kn_fop = &pipe_nfiltops; 1697 return (0); 1698 } 1699 if ((kn->kn_filter == EVFILT_WRITE) && !(fp->f_flag & FWRITE)) { 1700 kn->kn_fop = &pipe_nfiltops; 1701 return (0); 1702 } 1703 cpipe = fp->f_data; 1704 PIPE_LOCK(cpipe); 1705 switch (kn->kn_filter) { 1706 case EVFILT_READ: 1707 kn->kn_fop = &pipe_rfiltops; 1708 break; 1709 case EVFILT_WRITE: 1710 kn->kn_fop = &pipe_wfiltops; 1711 if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) { 1712 /* other end of pipe has been closed */ 1713 PIPE_UNLOCK(cpipe); 1714 return (EPIPE); 1715 } 1716 cpipe = PIPE_PEER(cpipe); 1717 break; 1718 default: 1719 PIPE_UNLOCK(cpipe); 1720 return (EINVAL); 1721 } 1722 1723 kn->kn_hook = cpipe; 1724 knlist_add(&cpipe->pipe_sel.si_note, kn, 1); 1725 PIPE_UNLOCK(cpipe); 1726 return (0); 1727 } 1728 1729 static void 1730 filt_pipedetach(struct knote *kn) 1731 { 1732 struct pipe *cpipe = kn->kn_hook; 1733 1734 PIPE_LOCK(cpipe); 1735 knlist_remove(&cpipe->pipe_sel.si_note, kn, 1); 1736 PIPE_UNLOCK(cpipe); 1737 } 1738 1739 /*ARGSUSED*/ 1740 static int 1741 filt_piperead(struct knote *kn, long hint) 1742 { 1743 struct pipe *rpipe = kn->kn_hook; 1744 struct pipe *wpipe = rpipe->pipe_peer; 1745 int ret; 1746 1747 PIPE_LOCK_ASSERT(rpipe, MA_OWNED); 1748 kn->kn_data = rpipe->pipe_buffer.cnt; 1749 if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW)) 1750 kn->kn_data = rpipe->pipe_map.cnt; 1751 1752 if ((rpipe->pipe_state & PIPE_EOF) || 1753 wpipe->pipe_present != PIPE_ACTIVE || 1754 (wpipe->pipe_state & PIPE_EOF)) { 1755 kn->kn_flags |= EV_EOF; 1756 return (1); 1757 } 1758 ret = kn->kn_data > 0; 1759 return ret; 1760 } 1761 1762 /*ARGSUSED*/ 1763 static int 1764 filt_pipewrite(struct knote *kn, long hint) 1765 { 1766 struct pipe *wpipe; 1767 1768 wpipe = kn->kn_hook; 1769 PIPE_LOCK_ASSERT(wpipe, MA_OWNED); 1770 if (wpipe->pipe_present != PIPE_ACTIVE || 1771 (wpipe->pipe_state & PIPE_EOF)) { 1772 kn->kn_data = 0; 1773 kn->kn_flags |= EV_EOF; 1774 return (1); 1775 } 1776 kn->kn_data = (wpipe->pipe_buffer.size > 0) ? 1777 (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) : PIPE_BUF; 1778 if (wpipe->pipe_state & PIPE_DIRECTW) 1779 kn->kn_data = 0; 1780 1781 return (kn->kn_data >= PIPE_BUF); 1782 } 1783 1784 static void 1785 filt_pipedetach_notsup(struct knote *kn) 1786 { 1787 1788 } 1789 1790 static int 1791 filt_pipenotsup(struct knote *kn, long hint) 1792 { 1793 1794 return (0); 1795 } 1796
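/*
 * Illustrative userland sketch (not built with the kernel): how the kevent
 * filters above surface pipe state.  EVFILT_READ reports the number of
 * buffered bytes in kn_data, while EVFILT_WRITE reports the free space and
 * only fires once at least PIPE_BUF bytes can be written.  This assumes a
 * stock FreeBSD userland; names such as "fds" and "kq" are local to the
 * example.
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *	#include <err.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct kevent kev[2], ev;
 *		int fds[2], kq;
 *
 *		if (pipe(fds) == -1 || (kq = kqueue()) == -1)
 *			err(1, "setup");
 *		EV_SET(&kev[0], fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
 *		EV_SET(&kev[1], fds[1], EVFILT_WRITE, EV_ADD, 0, 0, NULL);
 *		if (kevent(kq, kev, 2, NULL, 0, NULL) == -1)
 *			err(1, "kevent");
 *		(void)write(fds[1], "hello", 5);
 *		if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)
 *			printf("filter %d, data %jd\n", (int)ev.filter,
 *			    (intmax_t)ev.data);
 *		return (0);
 *	}
 */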