/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $FreeBSD$
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

/*
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
 * the receiving process can copy it directly from the pages in the sending
 * process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned to the user-mode side.  In that case, the pipe code
 * arranges to copy the buffer supplied by the user process to a pageable
 * kernel buffer, and the receiving process will grab the data from the
 * pageable kernel buffer.  Since signals don't happen all that often,
 * the copy operation is normally eliminated.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.  PIPE_SIZE is constrained by the
 * amount of kernel virtual memory.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/select.h>
#include <sys/signalvar.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/uio.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_zone.h>

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * NetBSD or OpenBSD.
 */
/* #define PIPE_NODIRECT */

/*
 * interfaces to the outside world
 */
static int pipe_read __P((struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct proc *p));
static int pipe_write __P((struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct proc *p));
static int pipe_close __P((struct file *fp, struct proc *p));
static int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
		struct proc *p));
static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p));

static struct fileops pipeops =
    { pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_close };

/*
 * Default pipe buffer size(s); this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

/*
 * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
 * is there so that on large systems, we don't exhaust it.
 */
#define MAXPIPEKVA (8*1024*1024)

/*
 * Limit for direct transfers; we cannot, of course, limit
 * the amount of kva for pipes in general, though.
 */
#define LIMITPIPEKVA (16*1024*1024)

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
static int nbigpipe;

static int amountpipekva;

static void pipeclose __P((struct pipe *cpipe));
static void pipeinit __P((struct pipe *cpipe));
static __inline int pipelock __P((struct pipe *cpipe, int catch));
static __inline void pipeunlock __P((struct pipe *cpipe));
static __inline void pipeselwakeup __P((struct pipe *cpipe));
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
static void pipe_destroy_write_buffer __P((struct pipe *wpipe));
static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
static void pipe_clone_write_buffer __P((struct pipe *wpipe));
#endif
static void pipespace __P((struct pipe *cpipe));

static vm_zone_t pipe_zone;

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 */

/* ARGSUSED */
int
pipe(p, uap)
	struct proc *p;
	struct pipe_args /* {
		int	dummy;
	} */ *uap;
{
	register struct filedesc *fdp = p->p_fd;
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe;
	int fd, error;

	if (pipe_zone == NULL)
		pipe_zone = zinit("PIPE", sizeof (struct pipe), 0, 0, 4);

	rpipe = zalloc(pipe_zone);
	pipeinit(rpipe);
	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe = zalloc(pipe_zone);
	pipeinit(wpipe);
	wpipe->pipe_state |= PIPE_DIRECTOK;

	error = falloc(p, &rf, &fd);
	if (error)
		goto free2;
	p->p_retval[0] = fd;
	rf->f_flag = FREAD | FWRITE;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = (caddr_t)rpipe;
	rf->f_ops = &pipeops;
	error = falloc(p, &wf, &fd);
	if (error)
		goto free3;
	wf->f_flag = FREAD | FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = (caddr_t)wpipe;
	wf->f_ops = &pipeops;
	p->p_retval[1] = fd;

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;

	return (0);
free3:
	fdp->fd_ofiles[p->p_retval[0]] = 0;
	ffree(rf);
free2:
	(void)pipeclose(wpipe);
	(void)pipeclose(rpipe);
	return (error);
}

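/*
 * Illustrative userland sketch (not compiled here) of how the two
 * descriptors set up above are normally used: p_retval[0] becomes fd[0],
 * the conventional read end, and p_retval[1] becomes fd[1], the write
 * end, although both ends are opened FREAD | FWRITE by this code.
 *
 *	int fd[2];
 *	char buf[5];
 *
 *	if (pipe(fd) == 0) {
 *		write(fd[1], "hello", 5);
 *		read(fd[0], buf, 5);
 *	}
 */
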
/*
 * Allocate kva for pipe circular buffer; the space is pageable
 */
static void
pipespace(cpipe)
	struct pipe *cpipe;
{
	int npages, error;

	npages = round_page(cpipe->pipe_buffer.size)/PAGE_SIZE;
	/*
	 * Create an object; I don't like the idea of paging to/from
	 * kernel_object.
	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
	 */
	cpipe->pipe_buffer.object = vm_object_allocate(OBJT_DEFAULT, npages);
	cpipe->pipe_buffer.buffer = (caddr_t) vm_map_min(kernel_map);

	/*
	 * Insert the object into the kernel map, and allocate kva for it.
	 * The map entry is, by default, pageable.
	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
	 */
	error = vm_map_find(kernel_map, cpipe->pipe_buffer.object, 0,
		(vm_offset_t *) &cpipe->pipe_buffer.buffer,
		cpipe->pipe_buffer.size, 1,
		VM_PROT_ALL, VM_PROT_ALL, 0);

	if (error != KERN_SUCCESS)
		panic("pipeinit: cannot allocate pipe -- out of kvm -- code = %d", error);
	amountpipekva += cpipe->pipe_buffer.size;
}

/*
 * initialize and allocate VM and memory for pipe
 */
static void
pipeinit(cpipe)
	struct pipe *cpipe;
{

	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = 0;
	cpipe->pipe_buffer.size = PIPE_SIZE;

	/* Buffer kva gets dynamically allocated */
	cpipe->pipe_buffer.buffer = NULL;
	/* cpipe->pipe_buffer.object = invalid */

	cpipe->pipe_state = 0;
	cpipe->pipe_peer = NULL;
	cpipe->pipe_busy = 0;
	getnanotime(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;
	bzero(&cpipe->pipe_sel, sizeof cpipe->pipe_sel);

#ifndef PIPE_NODIRECT
	/*
	 * pipe data structure initializations to support direct pipe I/O
	 */
	cpipe->pipe_map.cnt = 0;
	cpipe->pipe_map.kva = 0;
	cpipe->pipe_map.pos = 0;
	cpipe->pipe_map.npages = 0;
	/* cpipe->pipe_map.ms[] = invalid */
#endif
}


/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(cpipe, catch)
	struct pipe *cpipe;
	int catch;
{
	int error;
	while (cpipe->pipe_state & PIPE_LOCK) {
		cpipe->pipe_state |= PIPE_LWANT;
		if ((error = tsleep(cpipe,
			catch ? (PRIBIO | PCATCH) : PRIBIO, "pipelk", 0)) != 0) {
			return error;
		}
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return 0;
}

/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(cpipe)
	struct pipe *cpipe;
{
	cpipe->pipe_state &= ~PIPE_LOCK;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

static __inline void
pipeselwakeup(cpipe)
	struct pipe *cpipe;
{
	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeup(&cpipe->pipe_sel);
	}
	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
		pgsigio(cpipe->pipe_sigio, SIGIO, 0);
}

/* ARGSUSED */
static int
pipe_read(fp, uio, cred, flags, p)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	struct proc *p;
	int flags;
{

	struct pipe *rpipe = (struct pipe *) fp->f_data;
	int error;
	int nread = 0;
	u_int size;

	++rpipe->pipe_busy;
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
				size, uio);
			if (error) {
				break;
			}
			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
		 */
		} else if ((size = rpipe->pipe_map.cnt) &&
			   (rpipe->pipe_state & PIPE_DIRECTW)) {
			caddr_t	va;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos;
			error = uiomove(va, size, uio);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 */
			if (rpipe->pipe_state & PIPE_EOF) {
				/* XXX error = ? */
				break;
			}

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * Unlock the pipe buffer for our remaining processing.  We
			 * will either break out with an error or we will sleep and
			 * relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * Handle non-blocking mode operation or
			 * wait for more data.
			 */
			if (fp->f_flag & FNONBLOCK)
				error = EAGAIN;
			else {
				rpipe->pipe_state |= PIPE_WANTR;
				if ((error = tsleep(rpipe, PRIBIO | PCATCH, "piperd", 0)) == 0)
					error = pipelock(rpipe, 1);
			}
			if (error)
				goto unlocked_error;
		}
	}
	pipeunlock(rpipe);

	if (error == 0)
		getnanotime(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
		pipeselwakeup(rpipe);

	return error;
}

#ifndef PIPE_NODIRECT
/*
 * Map the sending process's buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	u_int size;
	int i;
	vm_offset_t addr, endaddr, paddr;

	size = (u_int) uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
	for (i = 0, addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
	     addr < endaddr;
	     addr += PAGE_SIZE, i += 1) {

		vm_page_t m;

		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
		    (paddr = pmap_kextract(addr)) == 0) {
			int j;
			for (j = 0; j < i; j++)
				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
			return EFAULT;
		}

		m = PHYS_TO_VM_PAGE(paddr);
		vm_page_wire(m);
		wpipe->pipe_map.ms[i] = m;
	}

	/*
	 * set up the control block
	 */
	wpipe->pipe_map.npages = i;
	wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

	/*
	 * and map the buffer
	 */
	if (wpipe->pipe_map.kva == 0) {
		/*
		 * We need to allocate space for an extra page because the
		 * address range might (will) span pages at times.
		 */
		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
			wpipe->pipe_buffer.size + PAGE_SIZE);
		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
	}
	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
		wpipe->pipe_map.npages);

	/*
	 * and update the uio data
	 */

	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base += size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return 0;
}

/*
 * unmap and unwire the process buffer
 */
static void
pipe_destroy_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int i;
	if (wpipe->pipe_map.kva) {
		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);

		if (amountpipekva > MAXPIPEKVA) {
			vm_offset_t kva = wpipe->pipe_map.kva;
			wpipe->pipe_map.kva = 0;
			kmem_free(kernel_map, kva,
				wpipe->pipe_buffer.size + PAGE_SIZE);
			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
		}
	}
	for (i = 0; i < wpipe->pipe_map.npages; i++)
		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int size;
	int pos;

	size = wpipe->pipe_map.cnt;
	pos = wpipe->pipe_map.pos;
	bcopy((caddr_t) wpipe->pipe_map.kva + pos,
	    (caddr_t) wpipe->pipe_buffer.buffer,
	    size);

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~PIPE_DIRECTW;

	pipe_destroy_write_buffer(wpipe);
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set up.
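 *
 * The writer marks the pipe PIPE_DIRECTW once its pages are wired and
 * mapped, then sleeps on the pipe ("pipdwt") until the reader has drained
 * pipe_map.cnt to zero and cleared PIPE_DIRECTW.  If the sleep is instead
 * interrupted by a signal, any remaining data is cloned into the ordinary
 * pipe buffer so the wired user pages can be released.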
 */
static int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error;
retry:
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe,
			PRIBIO | PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe,
			PRIBIO | PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
		goto retry;
	}

	wpipe->pipe_state |= PIPE_DIRECTW;

	error = pipe_build_write_buffer(wpipe, uio);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		goto error1;
	}

	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipelock(wpipe, 0);
			pipe_destroy_write_buffer(wpipe);
			pipeunlock(wpipe);
			pipeselwakeup(wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
	}

	pipelock(wpipe, 0);
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
	} else {
		pipe_destroy_write_buffer(wpipe);
	}
	pipeunlock(wpipe);
	return error;

error1:
	wakeup(wpipe);
	return error;
}
#endif

static int
pipe_write(fp, uio, cred, flags, p)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	struct proc *p;
	int flags;
{
	int error = 0;
	int orig_resid;

	struct pipe *wpipe, *rpipe;

	rpipe = (struct pipe *) fp->f_data;
	wpipe = rpipe->pipe_peer;

	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		return EPIPE;
	}

	/*
	 * If it is advantageous to resize the pipe buffer, do so.
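	 * "Advantageous" here means the write is larger than PIPE_SIZE,
	 * the buffer is still the default size and currently empty, no
	 * direct write is pending, and fewer than LIMITBIGPIPES pipes
	 * have already been enlarged; in that case the buffer is grown
	 * to BIG_PIPE_SIZE before any data is queued.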
	 */
	if ((uio->uio_resid > PIPE_SIZE) &&
	    (nbigpipe < LIMITBIGPIPES) &&
	    (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
	    (wpipe->pipe_buffer.size <= PIPE_SIZE) &&
	    (wpipe->pipe_buffer.cnt == 0)) {

		if (wpipe->pipe_buffer.buffer) {
			amountpipekva -= wpipe->pipe_buffer.size;
			kmem_free(kernel_map,
				(vm_offset_t)wpipe->pipe_buffer.buffer,
				wpipe->pipe_buffer.size);
		}

#ifndef PIPE_NODIRECT
		if (wpipe->pipe_map.kva) {
			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
			kmem_free(kernel_map,
				wpipe->pipe_map.kva,
				wpipe->pipe_buffer.size + PAGE_SIZE);
		}
#endif

		wpipe->pipe_buffer.in = 0;
		wpipe->pipe_buffer.out = 0;
		wpipe->pipe_buffer.cnt = 0;
		wpipe->pipe_buffer.size = BIG_PIPE_SIZE;
		wpipe->pipe_buffer.buffer = NULL;
		++nbigpipe;

#ifndef PIPE_NODIRECT
		wpipe->pipe_map.cnt = 0;
		wpipe->pipe_map.kva = 0;
		wpipe->pipe_map.pos = 0;
		wpipe->pipe_map.npages = 0;
#endif

	}

	if (wpipe->pipe_buffer.buffer == NULL) {
		if ((error = pipelock(wpipe, 1)) == 0) {
			pipespace(wpipe);
			pipeunlock(wpipe);
		} else {
			return error;
		}
	}

	++wpipe->pipe_busy;
	orig_resid = uio->uio_resid;
	while (uio->uio_resid) {
		int space;
#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 */
		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
		    (fp->f_flag & FNONBLOCK) == 0 &&
		    (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) &&
		    (uio->uio_iov->iov_len >= PIPE_MINDIRECT)) {
			error = pipe_direct_write(wpipe, uio);
			if (error) {
				break;
			}
			continue;
		}
#endif

		/*
		 * Pipe buffered writes cannot be coincident with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.
		 */
	retrywrite:
		while (wpipe->pipe_state & PIPE_DIRECTW) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			error = tsleep(wpipe,
				PRIBIO | PCATCH, "pipbww", 0);
			if (error)
				break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		/* XXX perhaps they need to be contiguous to be atomic? */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
			/*
			 * This sets the maximum transfer as a segment of
			 * the buffer.
			 */
			int size = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in;
			/*
			 * space is the size left in the buffer
			 */
			if (size > space)
				size = space;
			/*
			 * now limit it to the size of the uio transfer
			 */
			if (size > uio->uio_resid)
				size = uio->uio_resid;
			if ((error = pipelock(wpipe, 1)) == 0) {
				/*
				 * It is possible for a direct write to
				 * slip in on us... handle it here...
				 */
				if (wpipe->pipe_state & PIPE_DIRECTW) {
					pipeunlock(wpipe);
					goto retrywrite;
				}
				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
					size, uio);
				pipeunlock(wpipe);
			}
			if (error)
				break;

			wpipe->pipe_buffer.in += size;
			if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size)
				wpipe->pipe_buffer.in = 0;

			wpipe->pipe_buffer.cnt += size;
		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			if ((error = tsleep(wpipe, (PRIBIO + 1) | PCATCH, "pipewr", 0)) != 0) {
				break;
			}
			/*
			 * If read side wants to go away, we just issue a signal
			 * to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}

	--wpipe->pipe_busy;
	if ((wpipe->pipe_busy == 0) &&
	    (wpipe->pipe_state & PIPE_WANT)) {
		wpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((wpipe->pipe_buffer.cnt == 0) &&
	    (uio->uio_resid == 0) &&
	    (error == EPIPE))
		error = 0;

	if (error == 0)
		getnanotime(&wpipe->pipe_mtime);

	/*
	 * We have something to offer,
	 * wake up select/poll.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	return error;
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
int
pipe_ioctl(fp, cmd, data, p)
	struct file *fp;
	u_long cmd;
	register caddr_t data;
	struct proc *p;
{
	register struct pipe *mpipe = (struct pipe *)fp->f_data;

	switch (cmd) {

	case FIONBIO:
		return (0);

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		return (0);

	case FIONREAD:
		if (mpipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = mpipe->pipe_map.cnt;
		else
			*(int *)data = mpipe->pipe_buffer.cnt;
		return (0);

	case FIOSETOWN:
		return (fsetown(*(int *)data, &mpipe->pipe_sigio));

	case FIOGETOWN:
		*(int *)data = fgetown(mpipe->pipe_sigio);
		return (0);

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		*(int *)data = -fgetown(mpipe->pipe_sigio);
		return (0);

	}
	return (ENOTTY);
}

int
pipe_poll(fp, events, cred, p)
	struct file *fp;
	int events;
	struct ucred *cred;
	struct proc *p;
{
	register struct pipe *rpipe = (struct pipe *)fp->f_data;
	struct pipe *wpipe;
	int revents = 0;

	wpipe = rpipe->pipe_peer;
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
		    (rpipe->pipe_buffer.cnt > 0) ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(p, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(p, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}

	return (revents);
}

int
pipe_stat(pipe, ub)
	register struct pipe *pipe;
	register struct stat *ub;
{
	bzero((caddr_t)ub, sizeof (*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atimespec = pipe->pipe_atime;
	ub->st_mtimespec = pipe->pipe_mtime;
	ub->st_ctimespec = pipe->pipe_ctime;
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_uid, st_gid, st_rdev,
	 * st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return 0;
}

/* ARGSUSED */
static int
pipe_close(fp, p)
	struct file *fp;
	struct proc *p;
{
	struct pipe *cpipe = (struct pipe *)fp->f_data;

	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	funsetown(cpipe->pipe_sigio);
	pipeclose(cpipe);
	return 0;
}

/*
 * shutdown the pipe
 */
static void
pipeclose(cpipe)
	struct pipe *cpipe;
{
	struct pipe *ppipe;
	if (cpipe) {

		pipeselwakeup(cpipe);

		/*
		 * If the other side is blocked, wake it up saying that
		 * we want to close it down.
		 */
		while (cpipe->pipe_busy) {
			wakeup(cpipe);
			cpipe->pipe_state |= PIPE_WANT|PIPE_EOF;
			tsleep(cpipe, PRIBIO, "pipecl", 0);
		}

		/*
		 * Disconnect from peer
		 */
		if ((ppipe = cpipe->pipe_peer) != NULL) {
			pipeselwakeup(ppipe);

			ppipe->pipe_state |= PIPE_EOF;
			wakeup(ppipe);
			ppipe->pipe_peer = NULL;
		}

		/*
		 * free resources
		 */
		if (cpipe->pipe_buffer.buffer) {
			if (cpipe->pipe_buffer.size > PIPE_SIZE)
				--nbigpipe;
			amountpipekva -= cpipe->pipe_buffer.size;
			kmem_free(kernel_map,
				(vm_offset_t)cpipe->pipe_buffer.buffer,
				cpipe->pipe_buffer.size);
		}
#ifndef PIPE_NODIRECT
		if (cpipe->pipe_map.kva) {
			amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
			kmem_free(kernel_map,
				cpipe->pipe_map.kva,
				cpipe->pipe_buffer.size + PAGE_SIZE);
		}
#endif
		zfree(pipe_zone, cpipe);
	}
}