/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>

/*
 * The max size that a non-root user is allowed to grow the pipe to. Can
 * be set by root in /proc/sys/fs/pipe-max-pages
 */
unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16;

/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
 * -- Florian Coosmann (FGC)
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->inode)
		mutex_lock_nested(&pipe->inode->i_mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);
}
EXPORT_SYMBOL(pipe_unlock);

void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}

/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(wait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->wait, &wait);
	pipe_lock(pipe);
}

static int
pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
			int atomic)
{
	unsigned long copy;

	while (len > 0) {
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, len, iov->iov_len);

		if (atomic) {
			if (__copy_from_user_inatomic(to, iov->iov_base, copy))
				return -EFAULT;
		} else {
			if (copy_from_user(to, iov->iov_base, copy))
				return -EFAULT;
		}
		to += copy;
		len -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}

static int
pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
		      int atomic)
{
	unsigned long copy;

	while (len > 0) {
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, len, iov->iov_len);

		if (atomic) {
			if (__copy_to_user_inatomic(iov->iov_base, from, copy))
				return -EFAULT;
		} else {
			if (copy_to_user(iov->iov_base, from, copy))
				return -EFAULT;
		}
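		/*
		 * The copy succeeded: advance the kernel-side source pointer
		 * and consume the copied bytes from the current iovec.
		 */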
		from += copy;
		len -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}

/*
 * Attempt to pre-fault in the user memory, so we can use atomic copies.
 * Returns the number of bytes not faulted in.
 */
static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
{
	while (!iov->iov_len)
		iov++;

	while (len > 0) {
		unsigned long this_len;

		this_len = min_t(unsigned long, len, iov->iov_len);
		if (fault_in_pages_writeable(iov->iov_base, this_len))
			break;

		len -= this_len;
		iov++;
	}

	return len;
}

/*
 * Pre-fault in the user memory, so we can use atomic copies.
 */
static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
{
	while (!iov->iov_len)
		iov++;

	while (len > 0) {
		unsigned long this_len;

		this_len = min_t(unsigned long, len, iov->iov_len);
		fault_in_pages_readable(iov->iov_base, this_len);
		len -= this_len;
		iov++;
	}
}

static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		page_cache_release(page);
}

/**
 * generic_pipe_buf_map - virtually map a pipe buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer that should be mapped
 * @atomic:	whether to use an atomic map
 *
 * Description:
 *	This function returns a kernel virtual address mapping for the
 *	pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided
 *	and the caller has to be careful not to fault before calling
 *	the unmap function.
 *
 *	Note that this function occupies KM_USER0 if @atomic != 0.
 */
void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf, int atomic)
{
	if (atomic) {
		buf->flags |= PIPE_BUF_FLAG_ATOMIC;
		return kmap_atomic(buf->page, KM_USER0);
	}

	return kmap(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_map);

/**
 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer that should be unmapped
 * @map_data:	the data that the mapping function returned
 *
 * Description:
 *	This function undoes the mapping that ->map() provided.
 */
void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, void *map_data)
{
	if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
		buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
		kunmap_atomic(map_data, KM_USER0);
	} else
		kunmap(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_unmap);

/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns 0 and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	he wishes; the typical use is insertion into a different file
 *	page cache.
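 *
 *	If somebody else still holds a reference to the page, the steal
 *	fails: this function returns 1 and ownership is not transferred.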
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference of one is golden, that means that the owner of this
	 * page is the only one holding a reference to it. Lock the page
	 * and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL(generic_pipe_buf_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	page_cache_get(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info:	the pipe that the buffer belongs to
 * @buf:	the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.can_merge = 1,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static ssize_t
pipe_read(struct kiocb *iocb, const struct iovec *_iov,
	  unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	int do_wakeup;
	ssize_t ret;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;

	total_len = iov_length(iov, nr_segs);
	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;
	for (;;) {
		int bufs = pipe->nrbufs;
		if (bufs) {
			int curbuf = pipe->curbuf;
			struct pipe_buffer *buf = pipe->bufs + curbuf;
			const struct pipe_buf_operations *ops = buf->ops;
			void *addr;
			size_t chars = buf->len;
			int error, atomic;

			if (chars > total_len)
				chars = total_len;

			error = ops->confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			atomic = !iov_fault_in_pages_write(iov, chars);
redo:
			addr = ops->map(pipe, buf, atomic);
			error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
			ops->unmap(pipe, buf, addr);
			if (unlikely(error)) {
				/*
				 * Just retry with the slow path if we failed.
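				 * The non-atomic copy_to_user() path may
				 * sleep to fault the destination pages in.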
				 */
				if (atomic) {
					atomic = 0;
					goto redo;
				}
				if (!ret)
					ret = error;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;
			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				curbuf = (curbuf + 1) & (pipe->buffers - 1);
				pipe->curbuf = curbuf;
				pipe->nrbufs = --bufs;
				do_wakeup = 1;
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
		}
		if (bufs)	/* More to do? */
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			/* syscall merging: Usually we must not sleep
			 * if O_NONBLOCK is set, or if we got some data.
			 * But if a writer sleeps in kernel space, then
			 * we can wait for that data without violating POSIX.
			 */
			if (ret)
				break;
			if (filp->f_flags & O_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}
		pipe_wait(pipe);
	}
	mutex_unlock(&inode->i_mutex);

	/* Signal writers asynchronously that there is more room. */
	if (do_wakeup) {
		wake_up_interruptible_sync(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (ret > 0)
		file_accessed(filp);
	return ret;
}

static ssize_t
pipe_write(struct kiocb *iocb, const struct iovec *_iov,
	   unsigned long nr_segs, loff_t ppos)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	ssize_t ret;
	int do_wakeup;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;
	ssize_t chars;

	total_len = iov_length(iov, nr_segs);
	/* Null write succeeds.
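	 * We return before the readers check below, so a zero-length
	 * write never raises SIGPIPE, even when there are no readers.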
	 */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/* We try to merge small writes */
	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
	if (pipe->nrbufs && chars != 0) {
		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
							(pipe->buffers - 1);
		struct pipe_buffer *buf = pipe->bufs + lastbuf;
		const struct pipe_buf_operations *ops = buf->ops;
		int offset = buf->offset + buf->len;

		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
			int error, atomic = 1;
			void *addr;

			error = ops->confirm(pipe, buf);
			if (error)
				goto out;

			iov_fault_in_pages_read(iov, chars);
redo1:
			addr = ops->map(pipe, buf, atomic);
			error = pipe_iov_copy_from_user(offset + addr, iov,
							chars, atomic);
			ops->unmap(pipe, buf, addr);
			ret = error;
			do_wakeup = 1;
			if (error) {
				if (atomic) {
					atomic = 0;
					goto redo1;
				}
				goto out;
			}
			buf->len += chars;
			total_len -= chars;
			ret = chars;
			if (!total_len)
				goto out;
		}
	}

	for (;;) {
		int bufs;

		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}
		bufs = pipe->nrbufs;
		if (bufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;
			struct page *page = pipe->tmp_page;
			char *src;
			int error, atomic = 1;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}
			/* Always wake up, even if the copy fails. Otherwise
			 * we lock up (O_NONBLOCK-)readers that sleep due to
			 * syscall merging.
			 * FIXME! Is this really true?
			 */
			do_wakeup = 1;
			chars = PAGE_SIZE;
			if (chars > total_len)
				chars = total_len;

			iov_fault_in_pages_read(iov, chars);
redo2:
			if (atomic)
				src = kmap_atomic(page, KM_USER0);
			else
				src = kmap(page);

			error = pipe_iov_copy_from_user(src, iov, chars,
							atomic);
			if (atomic)
				kunmap_atomic(src, KM_USER0);
			else
				kunmap(page);

			if (unlikely(error)) {
				if (atomic) {
					atomic = 0;
					goto redo2;
				}
				if (!ret)
					ret = error;
				break;
			}
			ret += chars;

			/* Insert it into the buffer array */
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = chars;
			pipe->nrbufs = ++bufs;
			pipe->tmp_page = NULL;

			total_len -= chars;
			if (!total_len)
				break;
		}
		if (bufs < pipe->buffers)
			continue;
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}
out:
	mutex_unlock(&inode->i_mutex);
	if (do_wakeup) {
		wake_up_interruptible_sync(&pipe->wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (ret > 0)
		file_update_time(filp);
	return ret;
}

static ssize_t
bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
	return -EBADF;
}

static ssize_t
bad_pipe_w(struct file *filp, const char __user *buf, size_t count,
	   loff_t *ppos)
{
	return -EBADF;
}

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	int count, buf, nrbufs;

	switch (cmd) {
	case FIONREAD:
		mutex_lock(&inode->i_mutex);
		pipe = inode->i_pipe;
		count = 0;
		buf = pipe->curbuf;
		nrbufs = pipe->nrbufs;
		while (--nrbufs >= 0) {
			count += pipe->bufs[buf].len;
			buf = (buf+1) & (pipe->buffers - 1);
		}
		mutex_unlock(&inode->i_mutex);

		return put_user(count, (int __user *)arg);
	default:
		return -EINVAL;
	}
}

/* No kernel lock held - fine */
static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
	unsigned int mask;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe = inode->i_pipe;
	int nrbufs;

	poll_wait(filp, &pipe->wait, wait);

	/* Reading only -- no need for acquiring the semaphore. */
	nrbufs = pipe->nrbufs;
	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= POLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
		/*
		 * Most Unices do not set POLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= POLLERR;
	}

	return mask;
}

static int
pipe_release(struct inode *inode, int decr, int decw)
{
	struct pipe_inode_info *pipe;

	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;
	pipe->readers -= decr;
	pipe->writers -= decw;

	if (!pipe->readers && !pipe->writers) {
		free_pipe_info(inode);
	} else {
		wake_up_interruptible_sync(&pipe->wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	mutex_unlock(&inode->i_mutex);

	return 0;
}

static int
pipe_read_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
	mutex_unlock(&inode->i_mutex);

	return retval;
}


static int
pipe_write_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
	mutex_unlock(&inode->i_mutex);

	return retval;
}


static int
pipe_rdwr_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe = inode->i_pipe;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if (retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0) /* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	mutex_unlock(&inode->i_mutex);
	return retval;
}


static int
pipe_read_release(struct inode *inode, struct file *filp)
{
	return pipe_release(inode, 1, 0);
}

static int
pipe_write_release(struct inode *inode, struct file *filp)
{
	return pipe_release(inode, 0, 1);
}

static int
pipe_rdwr_release(struct inode *inode, struct file *filp)
{
	int decr, decw;

	decr = (filp->f_mode & FMODE_READ) != 0;
	decw = (filp->f_mode & FMODE_WRITE) != 0;
	return pipe_release(inode, decr, decw);
}

static int
pipe_read_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		inode->i_pipe->readers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

static int
pipe_write_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		inode->i_pipe->writers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

static int
pipe_rdwr_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		if (filp->f_mode & FMODE_READ)
			inode->i_pipe->readers++;
		if (filp->f_mode & FMODE_WRITE)
			inode->i_pipe->writers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

/*
 * The file_operations structs are not static because they
 * are also used in linux/fs/fifo.c to do operations on FIFOs.
 *
 * Pipes reuse fifos' file_operations structs.
 */
const struct file_operations read_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= do_sync_read,
	.aio_read	= pipe_read,
	.write		= bad_pipe_w,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_read_open,
	.release	= pipe_read_release,
	.fasync		= pipe_read_fasync,
};

const struct file_operations write_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= bad_pipe_r,
	.write		= do_sync_write,
	.aio_write	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_write_open,
	.release	= pipe_write_release,
	.fasync		= pipe_write_fasync,
};

const struct file_operations rdwr_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= do_sync_read,
	.aio_read	= pipe_read,
	.write		= do_sync_write,
	.aio_write	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_rdwr_open,
	.release	= pipe_rdwr_release,
	.fasync		= pipe_rdwr_fasync,
};

struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
{
	struct pipe_inode_info *pipe;

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
	if (pipe) {
		pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
		if (pipe->bufs) {
			init_waitqueue_head(&pipe->wait);
			pipe->r_counter = pipe->w_counter = 1;
			pipe->inode = inode;
			pipe->buffers = PIPE_DEF_BUFFERS;
			return pipe;
		}
		kfree(pipe);
	}

	return NULL;
}

void __free_pipe_info(struct pipe_inode_info *pipe)
{
	int i;

	for (i = 0; i < pipe->buffers; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			buf->ops->release(pipe, buf);
	}
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}

void free_pipe_info(struct inode *inode)
{
	__free_pipe_info(inode->i_pipe);
	inode->i_pipe = NULL;
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
				dentry->d_inode->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};

static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	pipe = alloc_pipe_info(inode);
	if (!pipe)
		goto fail_iput;
	inode->i_pipe = pipe;

	pipe->readers = pipe->writers = 1;
	inode->i_fop = &rdwr_pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

struct file *create_write_pipe(int flags)
{
	int err;
	struct inode *inode;
	struct file *f;
	struct path path;
	struct qstr name = { .name = "" };

	err = -ENFILE;
	inode = get_pipe_inode();
	if (!inode)
		goto err;

	err = -ENOMEM;
	path.dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name);
	if (!path.dentry)
		goto err_inode;
	path.mnt = mntget(pipe_mnt);

	path.dentry->d_op = &pipefs_dentry_operations;
	d_instantiate(path.dentry, inode);

	err = -ENFILE;
	f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
	if (!f)
		goto err_dentry;
	f->f_mapping = inode->i_mapping;

	f->f_flags = O_WRONLY | (flags & O_NONBLOCK);
	f->f_version = 0;

	return f;

err_dentry:
	free_pipe_info(inode);
	path_put(&path);
	return ERR_PTR(err);

err_inode:
	free_pipe_info(inode);
	iput(inode);
err:
	return ERR_PTR(err);
}

void free_write_pipe(struct file *f)
{
	free_pipe_info(f->f_dentry->d_inode);
	path_put(&f->f_path);
	put_filp(f);
}

struct file *create_read_pipe(struct file *wrf, int flags)
{
	/* Grab pipe from the writer */
	struct file *f = alloc_file(&wrf->f_path, FMODE_READ,
				    &read_pipefifo_fops);
	if (!f)
		return ERR_PTR(-ENFILE);

	path_get(&wrf->f_path);
	f->f_flags = O_RDONLY | (flags & O_NONBLOCK);

	return f;
}

int do_pipe_flags(int *fd, int flags)
{
	struct file *fw, *fr;
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK))
		return -EINVAL;

	fw = create_write_pipe(flags);
	if (IS_ERR(fw))
		return PTR_ERR(fw);
	fr = create_read_pipe(fw, flags);
	error = PTR_ERR(fr);
	if (IS_ERR(fr))
		goto err_write_pipe;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd_install(fdr, fr);
	fd_install(fdw, fw);
	fd[0] = fdr;
	fd[1] = fdw;

	return 0;

err_fdr:
	put_unused_fd(fdr);
err_read_pipe:
	path_put(&fr->f_path);
	put_filp(fr);
err_write_pipe:
	free_write_pipe(fw);
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	int fd[2];
	int error;

	error = do_pipe_flags(fd, flags);
	if (!error) {
		if (copy_to_user(fildes, fd, sizeof(fd))) {
			sys_close(fd[0]);
			sys_close(fd[1]);
			error = -EFAULT;
		}
	}
	return error;
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return sys_pipe2(fildes, 0);
}

/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or a negative error code on failure.
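 *
 * Called with the pipe's inode mutex held (pipe_fcntl() takes it before
 * calling in here).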
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
	struct pipe_buffer *bufs;

	/*
	 * Must be a power-of-2 currently
	 */
	if (!is_power_of_2(arg))
		return -EINVAL;

	/*
	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
	 * expect a lot of shrink+grow operations, just free and allocate
	 * again like we would do for growing. If the pipe currently
	 * contains more buffers than arg, then return busy.
	 */
	if (arg < pipe->nrbufs)
		return -EBUSY;

	bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
	if (unlikely(!bufs))
		return -ENOMEM;

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indexes.
	 */
	if (pipe->nrbufs) {
		const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1);
		const unsigned int head = pipe->nrbufs - tail;

		if (head)
			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
		if (tail)
			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
	}

	pipe->curbuf = 0;
	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->buffers = arg;
	return arg;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = file->f_path.dentry->d_inode->i_pipe;
	if (!pipe)
		return -EBADF;

	mutex_lock(&pipe->inode->i_mutex);

	switch (cmd) {
	case F_SETPIPE_SZ:
		if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) {
			ret = -EINVAL;
			goto out;
		}
		/*
		 * The pipe needs to be at least 2 pages large to
		 * guarantee POSIX behaviour.
		 */
		if (arg < 2) {
			ret = -EINVAL;
			goto out;
		}
		ret = pipe_set_size(pipe, arg);
		break;
	case F_GETPIPE_SZ:
		ret = pipe->buffers;
		break;
	default:
		ret = -EINVAL;
		break;
	}

out:
	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * pipefs should _never_ be mounted by userland - too much of a security
 * hassle, no real gain from having the whole whorehouse mounted. So we don't
 * need any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */
static int pipefs_get_sb(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data,
			 struct vfsmount *mnt)
{
	return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt);
}

static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.get_sb		= pipefs_get_sb,
	.kill_sb	= kill_anon_super,
};

static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
	return err;
}

static void __exit exit_pipe_fs(void)
{
	unregister_filesystem(&pipe_fs_type);
	mntput(pipe_mnt);
}

fs_initcall(init_pipe_fs);
module_exit(exit_pipe_fs);