/*-
 * Copyright (c) 1996 John S. Dyson
 * Copyright (c) 2012 Giovanni Trematerra
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

/*
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, the sending process pins the underlying pages in
 * memory, and the receiving process copies directly from these pinned pages
 * in the sending process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned back to the user-mode side.  In that case, the pipe code
 * arranges to copy the buffer supplied by the user process, to a pageable
 * kernel buffer, and the receiving process will grab the data from the
 * pageable kernel buffer.  Since signals don't happen all that often,
 * the copy operation is normally eliminated.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.
 *
 * In order to limit the resource use of pipes, two sysctls exist:
 *
 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
 * address space available to us in pipe_map.  This value is normally
 * autotuned, but may also be loader tuned.
 *
 * kern.ipc.pipekva - This read-only sysctl tracks the current amount of
 * memory in use by pipes.
 *
 * Based on how large pipekva is relative to maxpipekva, the following
 * will happen:
 *
 * 0% - 50%:
 *     New pipes are given 16K of memory backing, pipes may dynamically
 *     grow to as large as 64K where needed.
 * 50% - 75%:
 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
 *     existing pipes may NOT grow.
 * 75% - 100%:
 *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
 *     existing pipes will be shrunk down to 4K whenever possible.
 *
 * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0.  If
 * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE
 * resize which MUST occur for reverse-direction pipes when they are
 * first used.
 *
 * Additional information about the current state of pipes may be obtained
 * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
 * and kern.ipc.piperesizefail.
 *
 * Locking rules:  There are two locks present here:  A mutex, used via
 * PIPE_LOCK, and a flag, used via pipelock().  All locking is done via
 * the flag, as mutexes cannot persist over uiomove.  The mutex
 * exists only to guard access to the flag, and is not in itself a
 * locking mechanism.  Also note that there is only a single mutex for
 * both directions of a pipe.
 *
 * As pipelock() may have to sleep before it can acquire the flag, it
 * is important to reread all data after a call to pipelock(); everything
 * in the structure may have changed.
 */
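/*
 * Illustrative userspace sketch of the behavior described above; this is a
 * hand-written reading aid, not part of the kernel build.  pipe2(2) creates
 * the descriptor pair serviced by this file, a write of at most PIPE_BUF
 * bytes is atomic and always takes the buffered path, and only larger writes
 * from a blocking writer become candidates for the direct (page-pinning)
 * path once they reach kern.ipc.pipe_mindirect bytes.
 *
 *	#include <fcntl.h>
 *	#include <limits.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fds[2];
 *		char buf[PIPE_BUF] = { 0 };
 *
 *		if (pipe2(fds, O_NONBLOCK | O_CLOEXEC) != 0)
 *			return (1);
 *		(void)write(fds[1], buf, sizeof(buf));	// atomic, buffered path
 *		(void)read(fds[0], buf, sizeof(buf));
 *		close(fds[0]);
 *		close(fds[1]);
 *		return (0);
 *	}
 */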
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/priv.h>
#include <sys/selinfo.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/user.h>
#include <sys/event.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/uma.h>

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * NetBSD or OpenBSD.
 */
/* #define PIPE_NODIRECT */

#define	PIPE_PEER(pipe)	\
	(((pipe)->pipe_type & PIPE_TYPE_NAMED) ? (pipe) : ((pipe)->pipe_peer))

/*
 * interfaces to the outside world
 */
static fo_rdwr_t	pipe_read;
static fo_rdwr_t	pipe_write;
static fo_truncate_t	pipe_truncate;
static fo_ioctl_t	pipe_ioctl;
static fo_poll_t	pipe_poll;
static fo_kqfilter_t	pipe_kqfilter;
static fo_stat_t	pipe_stat;
static fo_close_t	pipe_close;
static fo_chmod_t	pipe_chmod;
static fo_chown_t	pipe_chown;
static fo_fill_kinfo_t	pipe_fill_kinfo;

struct fileops pipeops = {
	.fo_read = pipe_read,
	.fo_write = pipe_write,
	.fo_truncate = pipe_truncate,
	.fo_ioctl = pipe_ioctl,
	.fo_poll = pipe_poll,
	.fo_kqfilter = pipe_kqfilter,
	.fo_stat = pipe_stat,
	.fo_close = pipe_close,
	.fo_chmod = pipe_chmod,
	.fo_chown = pipe_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = pipe_fill_kinfo,
	.fo_cmp = file_kcmp_generic,
	.fo_flags = DFLAG_PASSABLE
};

static void	filt_pipedetach(struct knote *kn);
static void	filt_pipedetach_notsup(struct knote *kn);
static int	filt_pipenotsup(struct knote *kn, long hint);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_nfiltops = {
	.f_isfd = 1,
	.f_detach = filt_pipedetach_notsup,
	.f_event = filt_pipenotsup
};
static struct filterops pipe_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_pipedetach,
	.f_event = filt_piperead
};
static struct filterops pipe_wfiltops = {
	.f_isfd = 1,
	.f_detach = filt_pipedetach,
	.f_event = filt_pipewrite
};

/*
 * Default pipe buffer size(s), this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

static long amountpipekva;
static int pipefragretry;
static int pipeallocfail;
static int piperesizefail;
static int piperesizeallowed = 1;
static long pipe_mindirect = PIPE_MINDIRECT;
static int pipebuf_reserv = 2;

SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
	   &maxpipekva, 0, "Pipe KVA limit");
SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
	   &amountpipekva, 0, "Pipe KVA usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
	  &pipefragretry, 0, "Pipe allocation retries due to fragmentation");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
	  &pipeallocfail, 0, "Pipe allocation failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
	  &piperesizefail, 0, "Pipe resize failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
	  &piperesizeallowed, 0, "Pipe resizing allowed");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipebuf_reserv, CTLFLAG_RW,
    &pipebuf_reserv, 0,
    "Superuser-reserved percentage of the pipe buffers space");

static void pipeinit(void *dummy __unused);
static void pipeclose(struct pipe *cpipe);
static void pipe_free_kmem(struct pipe *cpipe);
static int pipe_create(struct pipe *pipe, bool backing);
static int pipe_paircreate(struct thread *td, struct pipepair **p_pp);
static __inline int pipelock(struct pipe *cpipe, bool catch);
static __inline void pipeunlock(struct pipe *cpipe);
static void pipe_timestamp(struct timespec *tsp);
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
static void pipe_destroy_write_buffer(struct pipe *wpipe);
static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
static void pipe_clone_write_buffer(struct pipe *wpipe);
#endif
static int pipespace(struct pipe *cpipe, int size);
static int pipespace_new(struct pipe *cpipe, int size);

static int pipe_zone_ctor(void *mem, int size, void *arg, int flags);
static int pipe_zone_init(void *mem, int size, int flags);
static void pipe_zone_fini(void *mem, int size);

static uma_zone_t pipe_zone;
static struct unrhdr64 pipeino_unr;
static dev_t pipedev_ino;

SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);

static void
pipeinit(void *dummy __unused)
{

	pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair),
	    pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini,
	    UMA_ALIGN_PTR, 0);
	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
	new_unrhdr64(&pipeino_unr, 1);
	pipedev_ino = devfs_alloc_cdp_inode();
	KASSERT(pipedev_ino > 0, ("pipe dev inode not initialized"));
}

static int
sysctl_handle_pipe_mindirect(SYSCTL_HANDLER_ARGS)
{
	int error = 0;
	long tmp_pipe_mindirect = pipe_mindirect;

	error = sysctl_handle_long(oidp, &tmp_pipe_mindirect, arg2, req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	/*
	 * Don't allow pipe_mindirect to be set so low that we violate
	 * atomicity requirements.
	 */
	if (tmp_pipe_mindirect <= PIPE_BUF)
		return (EINVAL);
	pipe_mindirect = tmp_pipe_mindirect;
	return (0);
}
SYSCTL_OID(_kern_ipc, OID_AUTO, pipe_mindirect, CTLTYPE_LONG | CTLFLAG_RW,
    &pipe_mindirect, 0, sysctl_handle_pipe_mindirect, "L",
    "Minimum write size triggering VM optimization");

static int
pipe_zone_ctor(void *mem, int size, void *arg, int flags)
{
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;

	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));

	pp = (struct pipepair *)mem;

	/*
	 * We zero both pipe endpoints to make sure all the kmem pointers
	 * are NULL, flag fields are zero'd, etc.  We timestamp both
	 * endpoints with the same time.
	 */
	rpipe = &pp->pp_rpipe;
	bzero(rpipe, sizeof(*rpipe));
	pipe_timestamp(&rpipe->pipe_ctime);
	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;

	wpipe = &pp->pp_wpipe;
	bzero(wpipe, sizeof(*wpipe));
	wpipe->pipe_ctime = rpipe->pipe_ctime;
	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;

	rpipe->pipe_peer = wpipe;
	rpipe->pipe_pair = pp;
	wpipe->pipe_peer = rpipe;
	wpipe->pipe_pair = pp;

	/*
	 * Mark both endpoints as present; they will later get free'd
	 * one at a time.  When both are free'd, then the whole pair
	 * is released.
	 */
	rpipe->pipe_present = PIPE_ACTIVE;
	wpipe->pipe_present = PIPE_ACTIVE;

	/*
	 * Eventually, the MAC Framework may initialize the label
	 * in ctor or init, but for now we do it elsewhere to avoid
	 * blocking in ctor or init.
	 */
	pp->pp_label = NULL;

	return (0);
}

static int
pipe_zone_init(void *mem, int size, int flags)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));

	pp = (struct pipepair *)mem;

	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_NEW);
	return (0);
}

static void
pipe_zone_fini(void *mem, int size)
{
	struct pipepair *pp;

	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));

	pp = (struct pipepair *)mem;

	mtx_destroy(&pp->pp_mtx);
}

static int
pipe_paircreate(struct thread *td, struct pipepair **p_pp)
{
	struct pipepair *pp;
	struct pipe *rpipe, *wpipe;
	int error;

	*p_pp = pp = uma_zalloc(pipe_zone, M_WAITOK);
#ifdef MAC
	/*
	 * The MAC label is shared between the connected endpoints.  As a
	 * result mac_pipe_init() and mac_pipe_create() are called once
	 * for the pair, and not on the endpoints.
	 */
	mac_pipe_init(pp);
	mac_pipe_create(td->td_ucred, pp);
#endif
	rpipe = &pp->pp_rpipe;
	wpipe = &pp->pp_wpipe;
	pp->pp_owner = crhold(td->td_ucred);

	knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe));
	knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe));

	/*
	 * Only the forward direction pipe is backed by big buffer by
	 * default.
	 */
	error = pipe_create(rpipe, true);
	if (error != 0)
		goto fail;
	error = pipe_create(wpipe, false);
	if (error != 0) {
		/*
		 * This cleanup leaves the pipe inode number for rpipe
		 * still allocated, but never used.  We do not free
		 * inode numbers for opened pipes, which is required
		 * for correctness because numbers must be unique.
		 * But also it avoids any memory use by the unr
		 * allocator, so stashing away the transient inode
		 * number is reasonable.
		 */
		pipe_free_kmem(rpipe);
		goto fail;
	}

	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe->pipe_state |= PIPE_DIRECTOK;
	return (0);

fail:
	knlist_destroy(&rpipe->pipe_sel.si_note);
	knlist_destroy(&wpipe->pipe_sel.si_note);
	crfree(pp->pp_owner);
#ifdef MAC
	mac_pipe_destroy(pp);
#endif
	uma_zfree(pipe_zone, pp);
	return (error);
}

int
pipe_named_ctor(struct pipe **ppipe, struct thread *td)
{
	struct pipepair *pp;
	int error;

	error = pipe_paircreate(td, &pp);
	if (error != 0)
		return (error);
	pp->pp_rpipe.pipe_type |= PIPE_TYPE_NAMED;
	*ppipe = &pp->pp_rpipe;
	return (0);
}

void
pipe_dtor(struct pipe *dpipe)
{
	struct pipe *peer;

	peer = (dpipe->pipe_type & PIPE_TYPE_NAMED) != 0 ? dpipe->pipe_peer : NULL;
	funsetown(&dpipe->pipe_sigio);
	pipeclose(dpipe);
	if (peer != NULL) {
		funsetown(&peer->pipe_sigio);
		pipeclose(peer);
	}
}

/*
 * Get a timestamp.
 *
 * This used to be vfs_timestamp but the higher precision is unnecessary and
 * can very negatively affect performance in virtualized environments (e.g., on
 * vms running on amd64 when using the rdtscp instruction).
 */
static void
pipe_timestamp(struct timespec *tsp)
{

	getnanotime(tsp);
}

/*
 * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail, let
 * the zone pick up the pieces via pipeclose().
 */
int
kern_pipe(struct thread *td, int fildes[2], int flags, struct filecaps *fcaps1,
    struct filecaps *fcaps2)
{
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe;
	struct pipepair *pp;
	int fd, fflags, error;

	error = pipe_paircreate(td, &pp);
	if (error != 0)
		return (error);
	rpipe = &pp->pp_rpipe;
	wpipe = &pp->pp_wpipe;
	error = falloc_caps(td, &rf, &fd, flags, fcaps1);
	if (error) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}
	/* An extra reference on `rf' has been held for us by falloc_caps(). */
	fildes[0] = fd;

	fflags = FREAD | FWRITE;
	if ((flags & O_NONBLOCK) != 0)
		fflags |= FNONBLOCK;

	/*
	 * Warning: once we've gotten past allocation of the fd for the
	 * read-side, we can only drop the read side via fdrop() in order
	 * to avoid races against processes which manage to dup() the read
	 * side while we are blocked trying to allocate the write side.
	 */
	finit(rf, fflags, DTYPE_PIPE, rpipe, &pipeops);
	error = falloc_caps(td, &wf, &fd, flags, fcaps2);
	if (error) {
		fdclose(td, rf, fildes[0]);
		fdrop(rf, td);
		/* rpipe has been closed by fdrop(). */
		pipeclose(wpipe);
		return (error);
	}
	/* An extra reference on `wf' has been held for us by falloc_caps(). */
	finit(wf, fflags, DTYPE_PIPE, wpipe, &pipeops);
	fdrop(wf, td);
	fildes[1] = fd;
	fdrop(rf, td);

	return (0);
}

#ifdef COMPAT_FREEBSD10
/* ARGSUSED */
int
freebsd10_pipe(struct thread *td, struct freebsd10_pipe_args *uap __unused)
{
	int error;
	int fildes[2];

	error = kern_pipe(td, fildes, 0, NULL, NULL);
	if (error)
		return (error);

	td->td_retval[0] = fildes[0];
	td->td_retval[1] = fildes[1];

	return (0);
}
#endif

int
sys_pipe2(struct thread *td, struct pipe2_args *uap)
{
	int error, fildes[2];

	if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK))
		return (EINVAL);
	error = kern_pipe(td, fildes, uap->flags, NULL, NULL);
	if (error)
		return (error);
	error = copyout(fildes, uap->fildes, 2 * sizeof(int));
	if (error) {
		(void)kern_close(td, fildes[0]);
		(void)kern_close(td, fildes[1]);
	}
	return (error);
}

/*
 * Allocate kva for pipe circular buffer, the space is pageable
 * This routine will 'realloc' the size of a pipe safely, if it fails
 * it will retain the old buffer.
 * If it fails it will return ENOMEM.
 */
static int
pipespace_new(struct pipe *cpipe, int size)
{
	caddr_t buffer;
	int error, cnt, firstseg;
	static int curfail = 0;
	static struct timeval lastfail;

	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
	KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
	    ("pipespace: resize of direct writes not allowed"));
retry:
	cnt = cpipe->pipe_buffer.cnt;
	if (cnt > size)
		size = cnt;

	size = round_page(size);
	buffer = (caddr_t) vm_map_min(pipe_map);

	if (!chgpipecnt(cpipe->pipe_pair->pp_owner->cr_ruidinfo,
	    size, lim_cur(curthread, RLIMIT_PIPEBUF))) {
		if (cpipe->pipe_buffer.buffer == NULL &&
		    size > SMALL_PIPE_SIZE) {
			size = SMALL_PIPE_SIZE;
			goto retry;
		}
		return (ENOMEM);
	}

	vm_map_lock(pipe_map);
	if (priv_check(curthread, PRIV_PIPEBUF) != 0 && maxpipekva / 100 *
	    (100 - pipebuf_reserv) < amountpipekva + size) {
		vm_map_unlock(pipe_map);
		chgpipecnt(cpipe->pipe_pair->pp_owner->cr_ruidinfo, -size, 0);
		if (cpipe->pipe_buffer.buffer == NULL &&
		    size > SMALL_PIPE_SIZE) {
			size = SMALL_PIPE_SIZE;
			pipefragretry++;
			goto retry;
		}
		return (ENOMEM);
	}
	error = vm_map_find_locked(pipe_map, NULL, 0, (vm_offset_t *)&buffer,
	    size, 0, VMFS_ANY_SPACE, VM_PROT_RW, VM_PROT_RW, 0);
	vm_map_unlock(pipe_map);
	if (error != KERN_SUCCESS) {
		chgpipecnt(cpipe->pipe_pair->pp_owner->cr_ruidinfo, -size, 0);
		if (cpipe->pipe_buffer.buffer == NULL &&
		    size > SMALL_PIPE_SIZE) {
			size = SMALL_PIPE_SIZE;
			pipefragretry++;
			goto retry;
		}
		if (cpipe->pipe_buffer.buffer == NULL) {
			pipeallocfail++;
			if (ppsratecheck(&lastfail, &curfail, 1))
				printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
		} else {
			piperesizefail++;
		}
		return (ENOMEM);
	}

	/* copy data, then free old resources if we're resizing */
	if (cnt > 0) {
		if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) {
			firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out;
			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
			    buffer, firstseg);
			if ((cnt - firstseg) > 0)
				bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg],
				    cpipe->pipe_buffer.in);
		} else {
			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
			    buffer, cnt);
		}
	}
	pipe_free_kmem(cpipe);
	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = cnt;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = cnt;
	atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size);
	return (0);
}

/*
 * Wrapper for pipespace_new() that performs locking assertions.
 */
static int
pipespace(struct pipe *cpipe, int size)
{

	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
	    ("Unlocked pipe passed to pipespace"));
	return (pipespace_new(cpipe, size));
}

/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(struct pipe *cpipe, bool catch)
{
	int error, prio;

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);

	prio = PRIBIO;
	if (catch)
		prio |= PCATCH;
	while (cpipe->pipe_state & PIPE_LOCKFL) {
		KASSERT(cpipe->pipe_waiters >= 0,
		    ("%s: bad waiter count %d", __func__,
		    cpipe->pipe_waiters));
		cpipe->pipe_waiters++;
		error = msleep(&cpipe->pipe_waiters, PIPE_MTX(cpipe), prio,
		    "pipelk", 0);
		cpipe->pipe_waiters--;
		if (error != 0)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCKFL;
	return (0);
}

/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(struct pipe *cpipe)
{

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
	    ("Unlocked pipe passed to pipeunlock"));
	KASSERT(cpipe->pipe_waiters >= 0,
	    ("%s: bad waiter count %d", __func__,
	    cpipe->pipe_waiters));
	cpipe->pipe_state &= ~PIPE_LOCKFL;
	if (cpipe->pipe_waiters > 0)
		wakeup_one(&cpipe->pipe_waiters);
}
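/*
 * Illustrative sketch (comment only, not compiled) of the locking protocol
 * from the header comment, as used by pipe_read() and pipe_write() below:
 * the pipe mutex only guards the PIPE_LOCKFL flag and the pipe fields, the
 * flag serializes I/O, and the mutex is dropped around uiomove().  Error
 * handling is elided.
 *
 *	PIPE_LOCK(cpipe);
 *	error = pipelock(cpipe, true);	// may sleep; re-read state after
 *	...
 *	PIPE_UNLOCK(cpipe);		// mutex cannot be held over uiomove()
 *	error = uiomove(buf, len, uio);
 *	PIPE_LOCK(cpipe);
 *	...
 *	pipeunlock(cpipe);
 *	PIPE_UNLOCK(cpipe);
 */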
void
pipeselwakeup(struct pipe *cpipe)
{

	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	if (cpipe->pipe_state & PIPE_SEL) {
		selwakeuppri(&cpipe->pipe_sel, PSOCK);
		if (!SEL_WAITING(&cpipe->pipe_sel))
			cpipe->pipe_state &= ~PIPE_SEL;
	}
	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
	KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
}

/*
 * Initialize and allocate VM and memory for pipe.  The structure
 * will start out zero'd from the ctor, so we just manage the kmem.
 */
static int
pipe_create(struct pipe *pipe, bool large_backing)
{
	int error;

	error = pipespace_new(pipe, !large_backing || amountpipekva >
	    maxpipekva / 2 ? SMALL_PIPE_SIZE : PIPE_SIZE);
	if (error == 0)
		pipe->pipe_ino = alloc_unr64(&pipeino_unr);
	return (error);
}

/* ARGSUSED */
static int
pipe_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct pipe *rpipe;
	int error;
	int nread = 0;
	int size;

	rpipe = fp->f_data;

	/*
	 * Try to avoid locking the pipe if we have nothing to do.
	 *
	 * There are programs which share one pipe amongst multiple processes
	 * and perform non-blocking reads in parallel, even if the pipe is
	 * empty.  This in particular is the case with BSD make, which when
	 * spawned with a high -j number can find itself with over half of the
	 * calls failing to find anything.
	 */
	if ((fp->f_flag & FNONBLOCK) != 0 && !mac_pipe_check_read_enabled()) {
		if (__predict_false(uio->uio_resid == 0))
			return (0);
		if ((atomic_load_short(&rpipe->pipe_state) & PIPE_EOF) == 0 &&
		    atomic_load_int(&rpipe->pipe_buffer.cnt) == 0 &&
		    atomic_load_int(&rpipe->pipe_pages.cnt) == 0)
			return (EAGAIN);
	}

	PIPE_LOCK(rpipe);
	++rpipe->pipe_busy;
	error = pipelock(rpipe, true);
	if (error)
		goto unlocked_error;

#ifdef MAC
	error = mac_pipe_check_read(active_cred, rpipe->pipe_pair);
	if (error)
		goto locked_error;
#endif
	if (amountpipekva > (3 * maxpipekva) / 4) {
		if ((rpipe->pipe_state & PIPE_DIRECTW) == 0 &&
		    rpipe->pipe_buffer.size > SMALL_PIPE_SIZE &&
		    rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE &&
		    piperesizeallowed == 1) {
			PIPE_UNLOCK(rpipe);
			pipespace(rpipe, SMALL_PIPE_SIZE);
			PIPE_LOCK(rpipe);
		}
	}

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > uio->uio_resid)
				size = uio->uio_resid;

			PIPE_UNLOCK(rpipe);
			error = uiomove(
			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
			    size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;

			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
		 */
		} else if ((size = rpipe->pipe_pages.cnt) != 0) {
			if (size > uio->uio_resid)
				size = (u_int) uio->uio_resid;
			PIPE_UNLOCK(rpipe);
			error = uiomove_fromphys(rpipe->pipe_pages.ms,
			    rpipe->pipe_pages.pos, size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;
			nread += size;
			rpipe->pipe_pages.pos += size;
			rpipe->pipe_pages.cnt -= size;
			if (rpipe->pipe_pages.cnt == 0) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * Unlock the pipe buffer for our remaining processing.
			 * We will either break out with an error or we will
			 * sleep and relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * Handle non-blocking mode operation or
			 * wait for more data.
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
			} else {
				rpipe->pipe_state |= PIPE_WANTR;
				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
				    PRIBIO | PCATCH,
				    "piperd", 0)) == 0)
					error = pipelock(rpipe, true);
			}
			if (error)
				goto unlocked_error;
		}
	}
#ifdef MAC
locked_error:
#endif
	pipeunlock(rpipe);

	/* XXX: should probably do this before getting any locks. */
	if (error == 0)
		pipe_timestamp(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	/*
	 * Only wake up writers if there was actually something read.
	 * Otherwise, when calling read(2) at EOF, a spurious wakeup occurs.
	 */
	if (nread > 0 &&
	    rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt >= PIPE_BUF)
		pipeselwakeup(rpipe);

	PIPE_UNLOCK(rpipe);
	if (nread > 0)
		td->td_ru.ru_msgrcv++;
	return (error);
}

#ifndef PIPE_NODIRECT
/*
 * Map the sending processes' buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio)
{
	u_int size;
	int i;

	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0,
	    ("%s: PIPE_DIRECTW set on %p", __func__, wpipe));
	KASSERT(wpipe->pipe_pages.cnt == 0,
	    ("%s: pipe map for %p contains residual data", __func__, wpipe));

	if (uio->uio_iov->iov_len > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;
	else
		size = uio->uio_iov->iov_len;

	wpipe->pipe_state |= PIPE_DIRECTW;
	PIPE_UNLOCK(wpipe);
	i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
	    (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ,
	    wpipe->pipe_pages.ms, PIPENPAGES);
	PIPE_LOCK(wpipe);
	if (i < 0) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		return (EFAULT);
	}

	wpipe->pipe_pages.npages = i;
	wpipe->pipe_pages.pos =
	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_pages.cnt = size;

	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
	if (uio->uio_iov->iov_len == 0) {
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return (0);
}

/*
 * Unwire the process buffer.
 */
static void
pipe_destroy_write_buffer(struct pipe *wpipe)
{

	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0,
	    ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe));
	KASSERT(wpipe->pipe_pages.cnt == 0,
	    ("%s: pipe map for %p contains residual data", __func__, wpipe));

	wpipe->pipe_state &= ~PIPE_DIRECTW;
	vm_page_unhold_pages(wpipe->pipe_pages.ms, wpipe->pipe_pages.npages);
	wpipe->pipe_pages.npages = 0;
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(struct pipe *wpipe)
{
	struct uio uio;
	struct iovec iov;
	int size;
	int pos;

	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0,
	    ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe));

	size = wpipe->pipe_pages.cnt;
	pos = wpipe->pipe_pages.pos;
	wpipe->pipe_pages.cnt = 0;

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;

	PIPE_UNLOCK(wpipe);
	iov.iov_base = wpipe->pipe_buffer.buffer;
	iov.iov_len = size;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = 0;
	uio.uio_resid = size;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = curthread;
	uiomove_fromphys(wpipe->pipe_pages.ms, pos, size, &uio);
	PIPE_LOCK(wpipe);
	pipe_destroy_write_buffer(wpipe);
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
 */
static int
pipe_direct_write(struct pipe *wpipe, struct uio *uio)
{
	int error;

retry:
	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	if ((wpipe->pipe_state & PIPE_EOF) != 0) {
		error = EPIPE;
		goto error1;
	}
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		wpipe->pipe_state |= PIPE_WANTW;
		pipeunlock(wpipe);
		error = msleep(wpipe, PIPE_MTX(wpipe),
		    PRIBIO | PCATCH, "pipdww", 0);
		pipelock(wpipe, false);
		if (error != 0)
			goto error1;
		goto retry;
	}
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		wpipe->pipe_state |= PIPE_WANTW;
		pipeunlock(wpipe);
		error = msleep(wpipe, PIPE_MTX(wpipe),
		    PRIBIO | PCATCH, "pipdwc", 0);
		pipelock(wpipe, false);
		if (error != 0)
			goto error1;
		goto retry;
	}

	error = pipe_build_write_buffer(wpipe, uio);
	if (error) {
		goto error1;
	}

	while (wpipe->pipe_pages.cnt != 0 &&
	    (wpipe->pipe_state & PIPE_EOF) == 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		wpipe->pipe_state |= PIPE_WANTW;
		pipeunlock(wpipe);
		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
		    "pipdwt", 0);
		pipelock(wpipe, false);
		if (error != 0)
			break;
	}

	if ((wpipe->pipe_state & PIPE_EOF) != 0) {
		wpipe->pipe_pages.cnt = 0;
		pipe_destroy_write_buffer(wpipe);
		pipeselwakeup(wpipe);
		error = EPIPE;
	} else if (error == EINTR || error == ERESTART) {
		pipe_clone_write_buffer(wpipe);
	} else {
		pipe_destroy_write_buffer(wpipe);
	}
	KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0,
	    ("pipe %p leaked PIPE_DIRECTW", wpipe));
	return (error);

error1:
	wakeup(wpipe);
	return (error);
}
#endif
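/*
 * Rough sequence of a direct write, as a hand-written reading aid for the
 * functions above (summary only, not generated from the code):
 *
 *	writer				reader
 *	------				------
 *	pipe_direct_write()
 *	  wait out a prior direct write
 *	  ("pipdww") and drain of the
 *	  kernel buffer ("pipdwc")
 *	  pipe_build_write_buffer()
 *	    wire the user pages and set
 *	    PIPE_DIRECTW
 *	  sleep "pipdwt"		pipe_read()
 *					  uiomove_fromphys() from the wired
 *					  pages, wake the writer when drained
 *	  pipe_destroy_write_buffer()
 *	    unwire the pages
 *	  (on EINTR/ERESTART the data is first copied into the kernel
 *	   buffer by pipe_clone_write_buffer())
 */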
static int
pipe_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct pipe *wpipe, *rpipe;
	ssize_t orig_resid;
	int desiredsize, error;

	rpipe = fp->f_data;
	wpipe = PIPE_PEER(rpipe);
	PIPE_LOCK(rpipe);
	error = pipelock(wpipe, true);
	if (error) {
		PIPE_UNLOCK(rpipe);
		return (error);
	}
	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if (wpipe->pipe_present != PIPE_ACTIVE ||
	    (wpipe->pipe_state & PIPE_EOF)) {
		pipeunlock(wpipe);
		PIPE_UNLOCK(rpipe);
		return (EPIPE);
	}
#ifdef MAC
	error = mac_pipe_check_write(active_cred, wpipe->pipe_pair);
	if (error) {
		pipeunlock(wpipe);
		PIPE_UNLOCK(rpipe);
		return (error);
	}
#endif
	++wpipe->pipe_busy;

	/* Choose a larger size if it's advantageous */
	desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size);
	while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) {
		if (piperesizeallowed != 1)
			break;
		if (amountpipekva > maxpipekva / 2)
			break;
		if (desiredsize == BIG_PIPE_SIZE)
			break;
		desiredsize = desiredsize * 2;
	}

	/* Choose a smaller size if we're in an OOM situation */
	if (amountpipekva > (3 * maxpipekva) / 4 &&
	    wpipe->pipe_buffer.size > SMALL_PIPE_SIZE &&
	    wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE &&
	    piperesizeallowed == 1)
		desiredsize = SMALL_PIPE_SIZE;

	/* Resize if the above determined that a new size was necessary */
	if (desiredsize != wpipe->pipe_buffer.size &&
	    (wpipe->pipe_state & PIPE_DIRECTW) == 0) {
		PIPE_UNLOCK(wpipe);
		pipespace(wpipe, desiredsize);
		PIPE_LOCK(wpipe);
	}
	MPASS(wpipe->pipe_buffer.size != 0);

	orig_resid = uio->uio_resid;

	while (uio->uio_resid) {
		int space;

		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}
#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * The direct write mechanism will detect the reader going
		 * away on us.
		 */
		if (uio->uio_segflg == UIO_USERSPACE &&
		    uio->uio_iov->iov_len >= pipe_mindirect &&
		    wpipe->pipe_buffer.size >= pipe_mindirect &&
		    (fp->f_flag & FNONBLOCK) == 0) {
			error = pipe_direct_write(wpipe, uio);
			if (error != 0)
				break;
			continue;
		}
#endif

		/*
		 * Pipe buffered writes cannot be coincidental with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.  We break out if a signal occurs or the
		 * reader goes away.
		 */
		if (wpipe->pipe_pages.cnt != 0) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			pipeselwakeup(wpipe);
			wpipe->pipe_state |= PIPE_WANTW;
			pipeunlock(wpipe);
			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
			    "pipbww", 0);
			pipelock(wpipe, false);
			if (error != 0)
				break;
			continue;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		if (space > 0) {
			int size;	/* Transfer size */
			int segsize;	/* first segment to transfer */

			/*
			 * Transfer size is minimum of uio transfer
			 * and free space in pipe buffer.
			 */
			if (space > uio->uio_resid)
				size = uio->uio_resid;
			else
				size = space;
			/*
			 * First segment to transfer is minimum of
			 * transfer size and contiguous space in
			 * pipe buffer.  If first segment to transfer
			 * is less than the transfer size, we've got
			 * a wraparound in the buffer.
			 */
			segsize = wpipe->pipe_buffer.size -
			    wpipe->pipe_buffer.in;
			if (segsize > size)
				segsize = size;

			/* Transfer first segment */

			PIPE_UNLOCK(rpipe);
			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
			    segsize, uio);
			PIPE_LOCK(rpipe);

			if (error == 0 && segsize < size) {
				KASSERT(wpipe->pipe_buffer.in + segsize ==
				    wpipe->pipe_buffer.size,
				    ("Pipe buffer wraparound disappeared"));
				/*
				 * Transfer remaining part now, to
				 * support atomic writes.  Wraparound
				 * happened.
				 */

				PIPE_UNLOCK(rpipe);
				error = uiomove(
				    &wpipe->pipe_buffer.buffer[0],
				    size - segsize, uio);
				PIPE_LOCK(rpipe);
			}
			if (error == 0) {
				wpipe->pipe_buffer.in += size;
				if (wpipe->pipe_buffer.in >=
				    wpipe->pipe_buffer.size) {
					KASSERT(wpipe->pipe_buffer.in ==
					    size - segsize +
					    wpipe->pipe_buffer.size,
					    ("Expected wraparound bad"));
					wpipe->pipe_buffer.in = size - segsize;
				}

				wpipe->pipe_buffer.cnt += size;
				KASSERT(wpipe->pipe_buffer.cnt <=
				    wpipe->pipe_buffer.size,
				    ("Pipe buffer overflow"));
			}
			if (error != 0)
				break;
			continue;
		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			pipeunlock(wpipe);
			error = msleep(wpipe, PIPE_MTX(rpipe),
			    PRIBIO | PCATCH, "pipewr", 0);
			pipelock(wpipe, false);
			if (error != 0)
				break;
			continue;
		}
	}

	--wpipe->pipe_busy;

	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if any byte was written.
	 * EINTR and other interrupts are handled by generic I/O layer.
	 * Do not pretend that I/O succeeded for obvious user error
	 * like EFAULT.
	 */
	if (uio->uio_resid != orig_resid && error == EPIPE)
		error = 0;

	if (error == 0)
		pipe_timestamp(&wpipe->pipe_mtime);

	/*
	 * We have something to offer,
	 * wake up select/poll.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	pipeunlock(wpipe);
	PIPE_UNLOCK(rpipe);
	if (uio->uio_resid != orig_resid)
		td->td_ru.ru_msgsnd++;
	return (error);
}

/* ARGSUSED */
static int
pipe_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
	struct pipe *cpipe;
	int error;

	cpipe = fp->f_data;
	if (cpipe->pipe_type & PIPE_TYPE_NAMED)
		error = vnops.fo_truncate(fp, length, active_cred, td);
	else
		error = invfo_truncate(fp, length, active_cred, td);
	return (error);
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
static int
pipe_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred,
    struct thread *td)
{
	struct pipe *mpipe = fp->f_data;
	int error;

	PIPE_LOCK(mpipe);

#ifdef MAC
	error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
	if (error) {
		PIPE_UNLOCK(mpipe);
		return (error);
	}
#endif

	error = 0;
	switch (cmd) {
	case FIONBIO:
		break;

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		break;

	case FIONREAD:
		if (!(fp->f_flag & FREAD)) {
			*(int *)data = 0;
			PIPE_UNLOCK(mpipe);
			return (0);
		}
		if (mpipe->pipe_pages.cnt != 0)
			*(int *)data = mpipe->pipe_pages.cnt;
		else
			*(int *)data = mpipe->pipe_buffer.cnt;
		break;

	case FIOSETOWN:
		PIPE_UNLOCK(mpipe);
		error = fsetown(*(int *)data, &mpipe->pipe_sigio);
		goto out_unlocked;

	case FIOGETOWN:
		*(int *)data = fgetown(&mpipe->pipe_sigio);
		break;

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		PIPE_UNLOCK(mpipe);
		error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
		goto out_unlocked;

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		*(int *)data = -fgetown(&mpipe->pipe_sigio);
		break;

	default:
		error = ENOTTY;
		break;
	}
	PIPE_UNLOCK(mpipe);
out_unlocked:
	return (error);
}

static int
pipe_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	struct pipe *rpipe;
	struct pipe *wpipe;
	int levents, revents;
#ifdef MAC
	int error;
#endif

	revents = 0;
	rpipe = fp->f_data;
	wpipe = PIPE_PEER(rpipe);
	PIPE_LOCK(rpipe);
#ifdef MAC
	error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair);
	if (error)
		goto locked_error;
#endif
	if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM))
		if (rpipe->pipe_pages.cnt > 0 || rpipe->pipe_buffer.cnt > 0)
			revents |= events & (POLLIN | POLLRDNORM);

	if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM))
		if (wpipe->pipe_present != PIPE_ACTIVE ||
		    (wpipe->pipe_state & PIPE_EOF) ||
		    ((wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
		    ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF ||
		    wpipe->pipe_buffer.size == 0)))
			revents |= events & (POLLOUT | POLLWRNORM);

	levents = events &
	    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND);
	if (rpipe->pipe_type & PIPE_TYPE_NAMED && fp->f_flag & FREAD && levents &&
	    fp->f_pipegen == rpipe->pipe_wgen)
		events |= POLLINIGNEOF;

	if ((events & POLLINIGNEOF) == 0) {
		if (rpipe->pipe_state & PIPE_EOF) {
			if (fp->f_flag & FREAD)
				revents |= (events & (POLLIN | POLLRDNORM));
			if (wpipe->pipe_present != PIPE_ACTIVE ||
			    (wpipe->pipe_state & PIPE_EOF))
				revents |= POLLHUP;
		}
	}

	if (revents == 0) {
		/*
		 * Add ourselves regardless of eventmask as we have to return
		 * POLLHUP even if it was not asked for.
		 */
		if ((fp->f_flag & FREAD) != 0) {
			selrecord(td, &rpipe->pipe_sel);
			if (SEL_WAITING(&rpipe->pipe_sel))
				rpipe->pipe_state |= PIPE_SEL;
		}

		if ((fp->f_flag & FWRITE) != 0 &&
		    wpipe->pipe_present == PIPE_ACTIVE) {
			selrecord(td, &wpipe->pipe_sel);
			if (SEL_WAITING(&wpipe->pipe_sel))
				wpipe->pipe_state |= PIPE_SEL;
		}
	}
#ifdef MAC
locked_error:
#endif
	PIPE_UNLOCK(rpipe);

	return (revents);
}

/*
 * We shouldn't need locks here as we're doing a read and this should
 * be a natural race.
 */
static int
pipe_stat(struct file *fp, struct stat *ub, struct ucred *active_cred)
{
	struct pipe *pipe;
#ifdef MAC
	int error;
#endif

	pipe = fp->f_data;
#ifdef MAC
	if (mac_pipe_check_stat_enabled()) {
		PIPE_LOCK(pipe);
		error = mac_pipe_check_stat(active_cred, pipe->pipe_pair);
		PIPE_UNLOCK(pipe);
		if (error) {
			return (error);
		}
	}
#endif

	/* For named pipes ask the underlying filesystem. */
	if (pipe->pipe_type & PIPE_TYPE_NAMED) {
		return (vnops.fo_stat(fp, ub, active_cred));
	}

	bzero(ub, sizeof(*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = PAGE_SIZE;
	if (pipe->pipe_pages.cnt != 0)
		ub->st_size = pipe->pipe_pages.cnt;
	else
		ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = howmany(ub->st_size, ub->st_blksize);
	ub->st_atim = pipe->pipe_atime;
	ub->st_mtim = pipe->pipe_mtime;
	ub->st_ctim = pipe->pipe_ctime;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	ub->st_dev = pipedev_ino;
	ub->st_ino = pipe->pipe_ino;
	/*
	 * Left as 0: st_nlink, st_rdev, st_flags, st_gen.
	 */
	return (0);
}

/* ARGSUSED */
static int
pipe_close(struct file *fp, struct thread *td)
{

	if (fp->f_vnode != NULL)
		return vnops.fo_close(fp, td);
	fp->f_ops = &badfileops;
	pipe_dtor(fp->f_data);
	fp->f_data = NULL;
	return (0);
}

static int
pipe_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td)
{
	struct pipe *cpipe;
	int error;

	cpipe = fp->f_data;
	if (cpipe->pipe_type & PIPE_TYPE_NAMED)
		error = vn_chmod(fp, mode, active_cred, td);
	else
		error = invfo_chmod(fp, mode, active_cred, td);
	return (error);
}

static int
pipe_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{
	struct pipe *cpipe;
	int error;

	cpipe = fp->f_data;
	if (cpipe->pipe_type & PIPE_TYPE_NAMED)
		error = vn_chown(fp, uid, gid, active_cred, td);
	else
		error = invfo_chown(fp, uid, gid, active_cred, td);
	return (error);
}

static int
pipe_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{
	struct pipe *pi;

	if (fp->f_type == DTYPE_FIFO)
		return (vn_fill_kinfo(fp, kif, fdp));
	kif->kf_type = KF_TYPE_PIPE;
	pi = fp->f_data;
	kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi;
	kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer;
	kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt;
	kif->kf_un.kf_pipe.kf_pipe_buffer_in = pi->pipe_buffer.in;
	kif->kf_un.kf_pipe.kf_pipe_buffer_out = pi->pipe_buffer.out;
	kif->kf_un.kf_pipe.kf_pipe_buffer_size = pi->pipe_buffer.size;
	return (0);
}

static void
pipe_free_kmem(struct pipe *cpipe)
{

	KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
	    ("pipe_free_kmem: pipe mutex locked"));

	if (cpipe->pipe_buffer.buffer != NULL) {
		atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size);
		chgpipecnt(cpipe->pipe_pair->pp_owner->cr_ruidinfo,
		    -cpipe->pipe_buffer.size, 0);
		vm_map_remove(pipe_map,
		    (vm_offset_t)cpipe->pipe_buffer.buffer,
		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
		cpipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	{
		cpipe->pipe_pages.cnt = 0;
		cpipe->pipe_pages.pos = 0;
		cpipe->pipe_pages.npages = 0;
	}
#endif
}

/*
 * shutdown the pipe
 */
static void
pipeclose(struct pipe *cpipe)
{
#ifdef MAC
	struct pipepair *pp;
#endif
	struct pipe *ppipe;

	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));

	PIPE_LOCK(cpipe);
	pipelock(cpipe, false);
#ifdef MAC
	pp = cpipe->pipe_pair;
#endif

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	cpipe->pipe_state |= PIPE_EOF;
	while (cpipe->pipe_busy) {
		wakeup(cpipe);
		cpipe->pipe_state |= PIPE_WANT;
		pipeunlock(cpipe);
		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
		pipelock(cpipe, false);
	}

	pipeselwakeup(cpipe);

	/*
	 * Disconnect from peer, if any.
	 */
	ppipe = cpipe->pipe_peer;
	if (ppipe->pipe_present == PIPE_ACTIVE) {
		ppipe->pipe_state |= PIPE_EOF;
		wakeup(ppipe);
		pipeselwakeup(ppipe);
	}

	/*
	 * Mark this endpoint as free.  Release kmem resources.  We
	 * don't mark this endpoint as unused until we've finished
	 * doing that, or the pipe might disappear out from under
	 * us.
	 */
	PIPE_UNLOCK(cpipe);
	pipe_free_kmem(cpipe);
	PIPE_LOCK(cpipe);
	cpipe->pipe_present = PIPE_CLOSING;
	pipeunlock(cpipe);

	/*
	 * knlist_clear() may sleep dropping the PIPE_MTX.  Set
	 * PIPE_FINALIZED, which allows the other end to free the
	 * pipe_pair, only after the knotes are completely dismantled.
	 */
	knlist_clear(&cpipe->pipe_sel.si_note, 1);
	cpipe->pipe_present = PIPE_FINALIZED;
	seldrain(&cpipe->pipe_sel);
	knlist_destroy(&cpipe->pipe_sel.si_note);

	/*
	 * If both endpoints are now closed, release the memory for the
	 * pipe pair.  If not, unlock.
	 */
	if (ppipe->pipe_present == PIPE_FINALIZED) {
		PIPE_UNLOCK(cpipe);
		crfree(cpipe->pipe_pair->pp_owner);
#ifdef MAC
		mac_pipe_destroy(pp);
#endif
		uma_zfree(pipe_zone, cpipe->pipe_pair);
	} else
		PIPE_UNLOCK(cpipe);
}

/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *cpipe;

	/*
	 * If a filter is requested that is not supported by this file
	 * descriptor, don't return an error, but also don't ever generate an
	 * event.
	 */
	if ((kn->kn_filter == EVFILT_READ) && !(fp->f_flag & FREAD)) {
		kn->kn_fop = &pipe_nfiltops;
		return (0);
	}
	if ((kn->kn_filter == EVFILT_WRITE) && !(fp->f_flag & FWRITE)) {
		kn->kn_fop = &pipe_nfiltops;
		return (0);
	}
	cpipe = fp->f_data;
	PIPE_LOCK(cpipe);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;
		if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) {
			/* other end of pipe has been closed */
			PIPE_UNLOCK(cpipe);
			return (EPIPE);
		}
		cpipe = PIPE_PEER(cpipe);
		break;
	default:
		if ((cpipe->pipe_type & PIPE_TYPE_NAMED) != 0) {
			PIPE_UNLOCK(cpipe);
			return (vnops.fo_kqfilter(fp, kn));
		}
		PIPE_UNLOCK(cpipe);
		return (EINVAL);
	}

	kn->kn_hook = cpipe;
	knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
	PIPE_UNLOCK(cpipe);
	return (0);
}

static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = kn->kn_hook;

	PIPE_LOCK(cpipe);
	knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
	PIPE_UNLOCK(cpipe);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct file *fp = kn->kn_fp;
	struct pipe *rpipe = kn->kn_hook;

	PIPE_LOCK_ASSERT(rpipe, MA_OWNED);
	kn->kn_data = rpipe->pipe_buffer.cnt;
	if (kn->kn_data == 0)
		kn->kn_data = rpipe->pipe_pages.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) != 0 &&
	    ((rpipe->pipe_type & PIPE_TYPE_NAMED) == 0 ||
	    fp->f_pipegen != rpipe->pipe_wgen)) {
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	kn->kn_flags &= ~EV_EOF;
	return (kn->kn_data > 0);
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *wpipe = kn->kn_hook;

	/*
	 * If this end of the pipe is closed, the knote was removed from the
	 * knlist and the list lock (i.e., the pipe lock) is therefore not held.
	 */
	if (wpipe->pipe_present == PIPE_ACTIVE ||
	    (wpipe->pipe_type & PIPE_TYPE_NAMED) != 0) {
		PIPE_LOCK_ASSERT(wpipe, MA_OWNED);

		if (wpipe->pipe_state & PIPE_DIRECTW) {
			kn->kn_data = 0;
		} else if (wpipe->pipe_buffer.size > 0) {
			kn->kn_data = wpipe->pipe_buffer.size -
			    wpipe->pipe_buffer.cnt;
		} else {
			kn->kn_data = PIPE_BUF;
		}
	}

	if (wpipe->pipe_present != PIPE_ACTIVE ||
	    (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	kn->kn_flags &= ~EV_EOF;
	return (kn->kn_data >= PIPE_BUF);
}

static void
filt_pipedetach_notsup(struct knote *kn)
{

}

static int
filt_pipenotsup(struct knote *kn, long hint)
{

	return (0);
}