/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
unsigned int pipe_max_size = 1048576;

/*
 * Minimum pipe size, as required by POSIX
 */
unsigned int pipe_min_size = PAGE_SIZE;

/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
 * -- Florian Coosmann (FGC)
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->inode)
		mutex_lock_nested(&pipe->inode->i_mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);
}
EXPORT_SYMBOL(pipe_unlock);

void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}

/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(wait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->wait, &wait);
	pipe_lock(pipe);
}

static int
pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
			int atomic)
{
	unsigned long copy;

	while (len > 0) {
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, len, iov->iov_len);

		if (atomic) {
			if (__copy_from_user_inatomic(to, iov->iov_base, copy))
				return -EFAULT;
		} else {
			if (copy_from_user(to, iov->iov_base, copy))
				return -EFAULT;
		}
		to += copy;
		len -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}

static int
pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
		      int atomic)
{
	unsigned long copy;

	while (len > 0) {
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, len, iov->iov_len);

		if (atomic) {
			if (__copy_to_user_inatomic(iov->iov_base, from, copy))
				return -EFAULT;
		} else {
			if (copy_to_user(iov->iov_base, from, copy))
				return -EFAULT;
		}
		from += copy;
		len -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}

/*
 * Attempt to pre-fault in the user memory, so we can use atomic copies.
 * Returns the number of bytes not faulted in.
 */
static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
{
	while (!iov->iov_len)
		iov++;

	while (len > 0) {
		unsigned long this_len;

		this_len = min_t(unsigned long, len, iov->iov_len);
		if (fault_in_pages_writeable(iov->iov_base, this_len))
			break;

		len -= this_len;
		iov++;
	}

	return len;
}

/*
 * Pre-fault in the user memory, so we can use atomic copies.
 */
static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
{
	while (!iov->iov_len)
		iov++;

	while (len > 0) {
		unsigned long this_len;

		this_len = min_t(unsigned long, len, iov->iov_len);
		fault_in_pages_readable(iov->iov_base, this_len);
		len -= this_len;
		iov++;
	}
}

static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		page_cache_release(page);
}

/**
 * generic_pipe_buf_map - virtually map a pipe buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer that should be mapped
 * @atomic:	whether to use an atomic map
 *
 * Description:
 *	This function returns a kernel virtual address mapping for the
 *	pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided
 *	and the caller has to be careful not to fault before calling
 *	the unmap function.
 *
 *	Note that this function occupies KM_USER0 if @atomic != 0.
 */
void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf, int atomic)
{
	if (atomic) {
		buf->flags |= PIPE_BUF_FLAG_ATOMIC;
		return kmap_atomic(buf->page);
	}

	return kmap(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_map);

/**
 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer that should be unmapped
 * @map_data:	the data that the mapping function returned
 *
 * Description:
 *	This function undoes the mapping that ->map() provided.
 */
void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, void *map_data)
{
	if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
		buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
		kunmap_atomic(map_data);
	} else
		kunmap(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_unmap);
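
/*
 * Illustrative sketch (not part of the original file): the usual way a
 * consumer drives ->map()/->unmap() is to pre-fault the user pages, try
 * an atomic mapping first, and retry with a sleeping map if the atomic
 * copy still faults. pipe_read() below does exactly this; the outline
 * is roughly:
 *
 *	atomic = !iov_fault_in_pages_write(iov, chars);
 * redo:
 *	addr = ops->map(pipe, buf, atomic);
 *	error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
 *	ops->unmap(pipe, buf, addr);
 *	if (error && atomic) {
 *		atomic = 0;		(fall back to the slow path)
 *		goto redo;
 *	}
 */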

/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns 0 and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	he wishes; the typical use is insertion into a different file
 *	page cache.
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference of one is golden, that means that the owner of this
	 * page is the only one holding a reference to it. lock the page
	 * and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL(generic_pipe_buf_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	page_cache_get(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info:	the pipe that the buffer belongs to
 * @buf:	the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.can_merge = 1,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static ssize_t
pipe_read(struct kiocb *iocb, const struct iovec *_iov,
	  unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	int do_wakeup;
	ssize_t ret;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;

	total_len = iov_length(iov, nr_segs);
	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;
	for (;;) {
		int bufs = pipe->nrbufs;
		if (bufs) {
			int curbuf = pipe->curbuf;
			struct pipe_buffer *buf = pipe->bufs + curbuf;
			const struct pipe_buf_operations *ops = buf->ops;
			void *addr;
			size_t chars = buf->len;
			int error, atomic;

			if (chars > total_len)
				chars = total_len;

			error = ops->confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			atomic = !iov_fault_in_pages_write(iov, chars);
redo:
			addr = ops->map(pipe, buf, atomic);
			error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
			ops->unmap(pipe, buf, addr);
			if (unlikely(error)) {
				/*
				 * Just retry with the slow path if we failed.
				 */
				if (atomic) {
					atomic = 0;
					goto redo;
				}
				if (!ret)
					ret = error;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;
			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				curbuf = (curbuf + 1) & (pipe->buffers - 1);
				pipe->curbuf = curbuf;
				pipe->nrbufs = --bufs;
				do_wakeup = 1;
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
		}
		if (bufs)	/* More to do? */
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			/* syscall merging: Usually we must not sleep
			 * if O_NONBLOCK is set, or if we got some data.
			 * But if a writer sleeps in kernel space, then
			 * we can wait for that data without violating POSIX.
			 */
			if (ret)
				break;
			if (filp->f_flags & O_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}
		pipe_wait(pipe);
	}
	mutex_unlock(&inode->i_mutex);

	/* Signal writers asynchronously that there is more room. */
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (ret > 0)
		file_accessed(filp);
	return ret;
}
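
/*
 * Illustrative sketch (userspace, not part of the original file): how the
 * return values produced by pipe_read() show up through read(2). The fd
 * and buffer names are arbitrary.
 *
 *	char buf[4096];
 *	ssize_t n = read(pipefd[0], buf, sizeof(buf));
 *	if (n > 0)
 *		;			(got data from the pipe buffers)
 *	else if (n == 0)
 *		;			(all writers closed and pipe empty)
 *	else if (errno == EAGAIN)
 *		;			(O_NONBLOCK set and no data yet)
 *	else if (errno == EINTR)
 *		;			(signal arrived before any data was copied)
 */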

static ssize_t
pipe_write(struct kiocb *iocb, const struct iovec *_iov,
	   unsigned long nr_segs, loff_t ppos)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	ssize_t ret;
	int do_wakeup;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;
	ssize_t chars;

	total_len = iov_length(iov, nr_segs);
	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/* We try to merge small writes */
	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
	if (pipe->nrbufs && chars != 0) {
		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
							(pipe->buffers - 1);
		struct pipe_buffer *buf = pipe->bufs + lastbuf;
		const struct pipe_buf_operations *ops = buf->ops;
		int offset = buf->offset + buf->len;

		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
			int error, atomic = 1;
			void *addr;

			error = ops->confirm(pipe, buf);
			if (error)
				goto out;

			iov_fault_in_pages_read(iov, chars);
redo1:
			addr = ops->map(pipe, buf, atomic);
			error = pipe_iov_copy_from_user(offset + addr, iov,
							chars, atomic);
			ops->unmap(pipe, buf, addr);
			ret = error;
			do_wakeup = 1;
			if (error) {
				if (atomic) {
					atomic = 0;
					goto redo1;
				}
				goto out;
			}
			buf->len += chars;
			total_len -= chars;
			ret = chars;
			if (!total_len)
				goto out;
		}
	}

	for (;;) {
		int bufs;

		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}
		bufs = pipe->nrbufs;
		if (bufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;
			struct page *page = pipe->tmp_page;
			char *src;
			int error, atomic = 1;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}
			/* Always wake up, even if the copy fails. Otherwise
			 * we lock up (O_NONBLOCK-)readers that sleep due to
			 * syscall merging.
			 * FIXME! Is this really true?
			 */
			do_wakeup = 1;
			chars = PAGE_SIZE;
			if (chars > total_len)
				chars = total_len;

			iov_fault_in_pages_read(iov, chars);
redo2:
			if (atomic)
				src = kmap_atomic(page);
			else
				src = kmap(page);

			error = pipe_iov_copy_from_user(src, iov, chars,
							atomic);
			if (atomic)
				kunmap_atomic(src);
			else
				kunmap(page);

			if (unlikely(error)) {
				if (atomic) {
					atomic = 0;
					goto redo2;
				}
				if (!ret)
					ret = error;
				break;
			}
			ret += chars;

			/* Insert it into the buffer array */
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = chars;
			pipe->nrbufs = ++bufs;
			pipe->tmp_page = NULL;

			total_len -= chars;
			if (!total_len)
				break;
		}
		if (bufs < pipe->buffers)
			continue;
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}
out:
	mutex_unlock(&inode->i_mutex);
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (ret > 0)
		file_update_time(filp);
	return ret;
}

static ssize_t
bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
	return -EBADF;
}

static ssize_t
bad_pipe_w(struct file *filp, const char __user *buf, size_t count,
	   loff_t *ppos)
{
	return -EBADF;
}

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	int count, buf, nrbufs;

	switch (cmd) {
		case FIONREAD:
			mutex_lock(&inode->i_mutex);
			pipe = inode->i_pipe;
			count = 0;
			buf = pipe->curbuf;
			nrbufs = pipe->nrbufs;
			while (--nrbufs >= 0) {
				count += pipe->bufs[buf].len;
				buf = (buf+1) & (pipe->buffers - 1);
			}
			mutex_unlock(&inode->i_mutex);

			return put_user(count, (int __user *)arg);
		default:
			return -EINVAL;
	}
}

/* No kernel lock held - fine */
static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
	unsigned int mask;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe = inode->i_pipe;
	int nrbufs;

	poll_wait(filp, &pipe->wait, wait);

	/* Reading only -- no need for acquiring the semaphore. */
	nrbufs = pipe->nrbufs;
	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= POLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
		/*
		 * Most Unices do not set POLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= POLLERR;
	}

	return mask;
}
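
/*
 * Illustrative sketch (userspace, not part of the original file): the mask
 * computed by pipe_poll() as seen through poll(2) on the read end of a
 * pipe. Variable names are arbitrary.
 *
 *	struct pollfd pfd = { .fd = pipefd[0], .events = POLLIN };
 *	if (poll(&pfd, 1, -1) > 0) {
 *		if (pfd.revents & POLLIN)
 *			;	(at least one pipe buffer is non-empty)
 *		if (pfd.revents & POLLHUP)
 *			;	(the write side has gone away)
 *	}
 */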

static int
pipe_release(struct inode *inode, int decr, int decw)
{
	struct pipe_inode_info *pipe;

	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;
	pipe->readers -= decr;
	pipe->writers -= decw;

	if (!pipe->readers && !pipe->writers) {
		free_pipe_info(inode);
	} else {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	mutex_unlock(&inode->i_mutex);

	return 0;
}

static int
pipe_read_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
	mutex_unlock(&inode->i_mutex);

	return retval;
}


static int
pipe_write_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
	mutex_unlock(&inode->i_mutex);

	return retval;
}


static int
pipe_rdwr_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe = inode->i_pipe;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if (retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0) /* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	mutex_unlock(&inode->i_mutex);
	return retval;
}


static int
pipe_read_release(struct inode *inode, struct file *filp)
{
	return pipe_release(inode, 1, 0);
}

static int
pipe_write_release(struct inode *inode, struct file *filp)
{
	return pipe_release(inode, 0, 1);
}

static int
pipe_rdwr_release(struct inode *inode, struct file *filp)
{
	int decr, decw;

	decr = (filp->f_mode & FMODE_READ) != 0;
	decw = (filp->f_mode & FMODE_WRITE) != 0;
	return pipe_release(inode, decr, decw);
}

static int
pipe_read_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		inode->i_pipe->readers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

static int
pipe_write_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		inode->i_pipe->writers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

static int
pipe_rdwr_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		if (filp->f_mode & FMODE_READ)
			inode->i_pipe->readers++;
		if (filp->f_mode & FMODE_WRITE)
			inode->i_pipe->writers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

/*
 * The file_operations structs are not static because they
 * are also used in linux/fs/fifo.c to do operations on FIFOs.
 *
 * Pipes reuse fifos' file_operations structs.
 */
const struct file_operations read_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= do_sync_read,
	.aio_read	= pipe_read,
	.write		= bad_pipe_w,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_read_open,
	.release	= pipe_read_release,
	.fasync		= pipe_read_fasync,
};

const struct file_operations write_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= bad_pipe_r,
	.write		= do_sync_write,
	.aio_write	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_write_open,
	.release	= pipe_write_release,
	.fasync		= pipe_write_fasync,
};

const struct file_operations rdwr_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= do_sync_read,
	.aio_read	= pipe_read,
	.write		= do_sync_write,
	.aio_write	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_rdwr_open,
	.release	= pipe_rdwr_release,
	.fasync		= pipe_rdwr_fasync,
};

struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
{
	struct pipe_inode_info *pipe;

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
	if (pipe) {
		pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
		if (pipe->bufs) {
			init_waitqueue_head(&pipe->wait);
			pipe->r_counter = pipe->w_counter = 1;
			pipe->inode = inode;
			pipe->buffers = PIPE_DEF_BUFFERS;
			return pipe;
		}
		kfree(pipe);
	}

	return NULL;
}

void __free_pipe_info(struct pipe_inode_info *pipe)
{
	int i;

	for (i = 0; i < pipe->buffers; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			buf->ops->release(pipe, buf);
	}
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}

void free_pipe_info(struct inode *inode)
{
	__free_pipe_info(inode->i_pipe);
	inode->i_pipe = NULL;
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
				dentry->d_inode->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};

static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info(inode);
	if (!pipe)
		goto fail_iput;
	inode->i_pipe = pipe;

	pipe->readers = pipe->writers = 1;
	inode->i_fop = &rdwr_pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

struct file *create_write_pipe(int flags)
{
	int err;
	struct inode *inode;
	struct file *f;
	struct path path;
	struct qstr name = { .name = "" };

	err = -ENFILE;
	inode = get_pipe_inode();
	if (!inode)
		goto err;

	err = -ENOMEM;
	path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
	if (!path.dentry)
		goto err_inode;
	path.mnt = mntget(pipe_mnt);

	d_instantiate(path.dentry, inode);

	err = -ENFILE;
	f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
	if (!f)
		goto err_dentry;
	f->f_mapping = inode->i_mapping;

	f->f_flags = O_WRONLY | (flags & O_NONBLOCK);
	f->f_version = 0;

	return f;

 err_dentry:
	free_pipe_info(inode);
	path_put(&path);
	return ERR_PTR(err);

 err_inode:
	free_pipe_info(inode);
	iput(inode);
 err:
	return ERR_PTR(err);
}

void free_write_pipe(struct file *f)
{
	free_pipe_info(f->f_dentry->d_inode);
	path_put(&f->f_path);
	put_filp(f);
}

struct file *create_read_pipe(struct file *wrf, int flags)
{
	/* Grab pipe from the writer */
	struct file *f = alloc_file(&wrf->f_path, FMODE_READ,
				    &read_pipefifo_fops);
	if (!f)
		return ERR_PTR(-ENFILE);

	path_get(&wrf->f_path);
	f->f_flags = O_RDONLY | (flags & O_NONBLOCK);

	return f;
}

int do_pipe_flags(int *fd, int flags)
{
	struct file *fw, *fr;
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK))
		return -EINVAL;

	fw = create_write_pipe(flags);
	if (IS_ERR(fw))
		return PTR_ERR(fw);
	fr = create_read_pipe(fw, flags);
	error = PTR_ERR(fr);
	if (IS_ERR(fr))
		goto err_write_pipe;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd_install(fdr, fr);
	fd_install(fdw, fw);
	fd[0] = fdr;
	fd[1] = fdw;

	return 0;

 err_fdr:
	put_unused_fd(fdr);
 err_read_pipe:
	path_put(&fr->f_path);
	put_filp(fr);
 err_write_pipe:
	free_write_pipe(fw);
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	int fd[2];
	int error;

	error = do_pipe_flags(fd, flags);
	if (!error) {
		if (copy_to_user(fildes, fd, sizeof(fd))) {
			sys_close(fd[0]);
			sys_close(fd[1]);
			error = -EFAULT;
		}
	}
	return error;
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return sys_pipe2(fildes, 0);
}
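
/*
 * Illustrative sketch (userspace, not part of the original file): the two
 * entry points above back pipe(2) and pipe2(2). do_pipe_flags() rejects
 * anything other than O_CLOEXEC and O_NONBLOCK, so for example:
 *
 *	int fds[2];
 *	if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) == 0) {
 *		...	(fds[0] is the read end, fds[1] the write end)
 *	}
 *	(passing any other flag makes the call fail with EINVAL here)
 */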

/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or -ERROR on error.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
{
	struct pipe_buffer *bufs;

	/*
	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
	 * expect a lot of shrink+grow operations, just free and allocate
	 * again like we would do for growing. If the pipe currently
	 * contains more buffers than arg, then return busy.
	 */
	if (nr_pages < pipe->nrbufs)
		return -EBUSY;

	bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
	if (unlikely(!bufs))
		return -ENOMEM;

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indexes.
	 */
	if (pipe->nrbufs) {
		unsigned int tail;
		unsigned int head;

		tail = pipe->curbuf + pipe->nrbufs;
		if (tail < pipe->buffers)
			tail = 0;
		else
			tail &= (pipe->buffers - 1);

		head = pipe->nrbufs - tail;
		if (head)
			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
		if (tail)
			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
	}

	pipe->curbuf = 0;
	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->buffers = nr_pages;
	return nr_pages * PAGE_SIZE;
}

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages.
 */
static inline unsigned int round_pipe_size(unsigned int size)
{
	unsigned long nr_pages;

	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}

/*
 * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
 * will return an error.
 */
int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
		 size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
	if (ret < 0 || !write)
		return ret;

	pipe_max_size = round_pipe_size(pipe_max_size);
	return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file)
{
	struct inode *i = file->f_path.dentry->d_inode;

	return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	mutex_lock(&pipe->inode->i_mutex);

	switch (cmd) {
	case F_SETPIPE_SZ: {
		unsigned int size, nr_pages;

		size = round_pipe_size(arg);
		nr_pages = size >> PAGE_SHIFT;

		ret = -EINVAL;
		if (!nr_pages)
			goto out;

		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
			ret = -EPERM;
			goto out;
		}
		ret = pipe_set_size(pipe, nr_pages);
		break;
		}
	case F_GETPIPE_SZ:
		ret = pipe->buffers * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

out:
	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */
static struct dentry *pipefs_mount(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
			&pipefs_dentry_operations, PIPEFS_MAGIC);
}

static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.mount		= pipefs_mount,
	.kill_sb	= kill_anon_super,
};

static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
	return err;
}

fs_initcall(init_pipe_fs);
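
/*
 * Illustrative sketch (userspace, not part of the original file): resizing
 * a pipe through the fcntl(2) interface handled by pipe_fcntl() above. The
 * requested size is rounded up to a power-of-two number of pages, and an
 * unprivileged caller is capped at /proc/sys/fs/pipe-max-size.
 *
 *	long sz = fcntl(pipefd[1], F_SETPIPE_SZ, 1 << 20);
 *	if (sz < 0 && errno == EPERM)
 *		;	(over pipe_max_size without CAP_SYS_RESOURCE)
 *	long cur = fcntl(pipefd[1], F_GETPIPE_SZ);	(capacity in bytes)
 */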