/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/aio.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
unsigned int pipe_max_size = 1048576;

/*
 * Minimum pipe size, as required by POSIX
 */
unsigned int pipe_min_size = PAGE_SIZE;

/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
 * -- Florian Coosmann (FGC)
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->files)
		mutex_lock_nested(&pipe->mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->files)
		mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

static inline void __pipe_lock(struct pipe_inode_info *pipe)
{
	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
}

static inline void __pipe_unlock(struct pipe_inode_info *pipe)
{
	mutex_unlock(&pipe->mutex);
}

void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}
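
/*
 * Note on pipe_double_lock(): comparing the pipe addresses gives every
 * caller the same global lock order, which is what rules out an ABBA
 * deadlock. A minimal sketch of the idea (the tasks and pipe pointers
 * below are hypothetical, not code from this file):
 *
 *	task A:  pipe_double_lock(p1, p2);
 *	task B:  pipe_double_lock(p2, p1);
 *
 * If p1 < p2, both tasks internally lock p1 first and p2 second, so one
 * of them simply waits on p1's mutex instead of each task holding the
 * mutex the other one needs.
 */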

/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(wait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->wait, &wait);
	pipe_lock(pipe);
}

static int
pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
			int atomic)
{
	unsigned long copy;

	while (len > 0) {
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, len, iov->iov_len);

		if (atomic) {
			if (__copy_from_user_inatomic(to, iov->iov_base, copy))
				return -EFAULT;
		} else {
			if (copy_from_user(to, iov->iov_base, copy))
				return -EFAULT;
		}
		to += copy;
		len -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}

/*
 * Pre-fault in the user memory, so we can use atomic copies.
 */
static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
{
	while (!iov->iov_len)
		iov++;

	while (len > 0) {
		unsigned long this_len;

		this_len = min_t(unsigned long, len, iov->iov_len);
		fault_in_pages_readable(iov->iov_base, this_len);
		len -= this_len;
		iov++;
	}
}

static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		page_cache_release(page);
}

/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns 0 and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	he wishes; the typical use is insertion into a different file
 *	page cache.
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference of one is golden, that means that the owner of this
	 * page is the only one holding a reference to it. lock the page
	 * and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL(generic_pipe_buf_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	page_cache_get(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info:	the pipe that the buffer belongs to
 * @buf:	the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.can_merge = 1,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static const struct pipe_buf_operations packet_pipe_buf_ops = {
	.can_merge = 0,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
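
/*
 * A pipe buffer's behaviour is described entirely by its ops table, so
 * another buffer source only has to supply its own callbacks and can
 * reuse the generic helpers above for the rest. A minimal sketch of
 * such a table (the "my_" names are hypothetical, not part of this
 * file):
 *
 *	static void my_buf_release(struct pipe_inode_info *pipe,
 *				   struct pipe_buffer *buf)
 *	{
 *		page_cache_release(buf->page);	// drop our page reference
 *	}
 *
 *	static const struct pipe_buf_operations my_pipe_buf_ops = {
 *		.can_merge = 0,			// never append to this buffer
 *		.confirm = generic_pipe_buf_confirm,
 *		.release = my_buf_release,
 *		.steal = generic_pipe_buf_steal,
 *		.get = generic_pipe_buf_get,
 *	};
 */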

static ssize_t
pipe_read(struct kiocb *iocb, const struct iovec *_iov,
	  unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	int do_wakeup;
	ssize_t ret;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;
	struct iov_iter iter;

	total_len = iov_length(iov, nr_segs);
	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	iov_iter_init(&iter, iov, nr_segs, total_len, 0);

	do_wakeup = 0;
	ret = 0;
	__pipe_lock(pipe);
	for (;;) {
		int bufs = pipe->nrbufs;
		if (bufs) {
			int curbuf = pipe->curbuf;
			struct pipe_buffer *buf = pipe->bufs + curbuf;
			const struct pipe_buf_operations *ops = buf->ops;
			size_t chars = buf->len;
			size_t written;
			int error;

			if (chars > total_len)
				chars = total_len;

			error = ops->confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			written = copy_page_to_iter(buf->page, buf->offset, chars, &iter);
			if (unlikely(written < chars)) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;

			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				curbuf = (curbuf + 1) & (pipe->buffers - 1);
				pipe->curbuf = curbuf;
				pipe->nrbufs = --bufs;
				do_wakeup = 1;
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
		}
		if (bufs)	/* More to do? */
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			/* syscall merging: Usually we must not sleep
			 * if O_NONBLOCK is set, or if we got some data.
			 * But if a writer sleeps in kernel space, then
			 * we can wait for that data without violating POSIX.
			 */
			if (ret)
				break;
			if (filp->f_flags & O_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}
		pipe_wait(pipe);
	}
	__pipe_unlock(pipe);

	/* Signal writers asynchronously that there is more room. */
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (ret > 0)
		file_accessed(filp);
	return ret;
}

static inline int is_packetized(struct file *file)
{
	return (file->f_flags & O_DIRECT) != 0;
}
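
/*
 * O_DIRECT selects the packetized buffer ops. A sketch of the
 * userspace-visible effect (ordinary pipe2(2)/write(2)/read(2) usage,
 * not code from this file):
 *
 *	int fds[2];
 *	char buf[100];
 *
 *	pipe2(fds, O_DIRECT);		// packet mode
 *	write(fds[1], "abc", 3);	// one 3-byte packet
 *	write(fds[1], "de", 2);		// a second packet
 *	read(fds[0], buf, sizeof(buf));	// returns 3, not 5
 *
 * Each read consumes at most one packet even when the supplied buffer
 * could hold more: pipe_read() forces buf->len to 0 as soon as any part
 * of a PIPE_BUF_FLAG_PACKET buffer has been copied out, discarding
 * whatever the reader's buffer could not hold.
 */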

static ssize_t
pipe_write(struct kiocb *iocb, const struct iovec *_iov,
	   unsigned long nr_segs, loff_t ppos)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	ssize_t ret;
	int do_wakeup;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;
	ssize_t chars;

	total_len = iov_length(iov, nr_segs);
	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	__pipe_lock(pipe);

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/* We try to merge small writes */
	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
	if (pipe->nrbufs && chars != 0) {
		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
							(pipe->buffers - 1);
		struct pipe_buffer *buf = pipe->bufs + lastbuf;
		const struct pipe_buf_operations *ops = buf->ops;
		int offset = buf->offset + buf->len;

		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
			int error, atomic = 1;
			void *addr;

			error = ops->confirm(pipe, buf);
			if (error)
				goto out;

			iov_fault_in_pages_read(iov, chars);
redo1:
			if (atomic)
				addr = kmap_atomic(buf->page);
			else
				addr = kmap(buf->page);
			error = pipe_iov_copy_from_user(offset + addr, iov,
							chars, atomic);
			if (atomic)
				kunmap_atomic(addr);
			else
				kunmap(buf->page);
			ret = error;
			do_wakeup = 1;
			if (error) {
				if (atomic) {
					atomic = 0;
					goto redo1;
				}
				goto out;
			}
			buf->len += chars;
			total_len -= chars;
			ret = chars;
			if (!total_len)
				goto out;
		}
	}

	for (;;) {
		int bufs;

		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}
		bufs = pipe->nrbufs;
		if (bufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;
			struct page *page = pipe->tmp_page;
			char *src;
			int error, atomic = 1;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}
			/* Always wake up, even if the copy fails. Otherwise
			 * we lock up (O_NONBLOCK-)readers that sleep due to
			 * syscall merging.
			 * FIXME! Is this really true?
			 */
			do_wakeup = 1;
			chars = PAGE_SIZE;
			if (chars > total_len)
				chars = total_len;

			iov_fault_in_pages_read(iov, chars);
redo2:
			if (atomic)
				src = kmap_atomic(page);
			else
				src = kmap(page);

			error = pipe_iov_copy_from_user(src, iov, chars,
							atomic);
			if (atomic)
				kunmap_atomic(src);
			else
				kunmap(page);

			if (unlikely(error)) {
				if (atomic) {
					atomic = 0;
					goto redo2;
				}
				if (!ret)
					ret = error;
				break;
			}
			ret += chars;

			/* Insert it into the buffer array */
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = chars;
			buf->flags = 0;
			if (is_packetized(filp)) {
				buf->ops = &packet_pipe_buf_ops;
				buf->flags = PIPE_BUF_FLAG_PACKET;
			}
			pipe->nrbufs = ++bufs;
			pipe->tmp_page = NULL;

			total_len -= chars;
			if (!total_len)
				break;
		}
		if (bufs < pipe->buffers)
			continue;
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}
out:
	__pipe_unlock(pipe);
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
		int err = file_update_time(filp);
		if (err)
			ret = err;
		sb_end_write(file_inode(filp)->i_sb);
	}
	return ret;
}
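
/*
 * A write of at most PAGE_SIZE bytes either merges completely into the
 * last buffer or lands whole in a single fresh page, all under the pipe
 * mutex, which is what makes writes of up to PIPE_BUF (defined as
 * PAGE_SIZE on Linux) atomic, as POSIX requires. A sketch of what that
 * guarantees to userspace (logfd is a hypothetical pipe fd shared by
 * several writer processes, not something from this file):
 *
 *	char rec[PIPE_BUF];
 *
 *	write(logfd, rec, sizeof(rec));	// never interleaved with records
 *					// from the other writers
 *
 * Writes larger than PIPE_BUF may be split across several buffers and
 * can interleave with data from other writers.
 */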

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int count, buf, nrbufs;

	switch (cmd) {
	case FIONREAD:
		__pipe_lock(pipe);
		count = 0;
		buf = pipe->curbuf;
		nrbufs = pipe->nrbufs;
		while (--nrbufs >= 0) {
			count += pipe->bufs[buf].len;
			buf = (buf+1) & (pipe->buffers - 1);
		}
		__pipe_unlock(pipe);

		return put_user(count, (int __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}

/* No kernel lock held - fine */
static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
	unsigned int mask;
	struct pipe_inode_info *pipe = filp->private_data;
	int nrbufs;

	poll_wait(filp, &pipe->wait, wait);

	/* Reading only -- no need for acquiring the semaphore. */
	nrbufs = pipe->nrbufs;
	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= POLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
		/*
		 * Most Unices do not set POLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= POLLERR;
	}

	return mask;
}
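
/*
 * FIONREAD, handled by pipe_ioctl() above, reports the number of unread
 * bytes currently queued in the pipe. A sketch of the userspace side
 * (standard ioctl(2) usage; fds[0] is a hypothetical pipe read end):
 *
 *	#include <sys/ioctl.h>
 *
 *	int unread;
 *
 *	if (ioctl(fds[0], FIONREAD, &unread) == 0)
 *		printf("%d bytes waiting\n", unread);
 *
 * The value is a snapshot: the pipe mutex is dropped before returning,
 * so more data may have arrived (or been read) by the time the caller
 * looks at it.
 */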

static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
{
	int kill = 0;

	spin_lock(&inode->i_lock);
	if (!--pipe->files) {
		inode->i_pipe = NULL;
		kill = 1;
	}
	spin_unlock(&inode->i_lock);

	if (kill)
		free_pipe_info(pipe);
}

static int
pipe_release(struct inode *inode, struct file *file)
{
	struct pipe_inode_info *pipe = file->private_data;

	__pipe_lock(pipe);
	if (file->f_mode & FMODE_READ)
		pipe->readers--;
	if (file->f_mode & FMODE_WRITE)
		pipe->writers--;

	if (pipe->readers || pipe->writers) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return 0;
}

static int
pipe_fasync(int fd, struct file *filp, int on)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int retval = 0;

	__pipe_lock(pipe);
	if (filp->f_mode & FMODE_READ)
		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0 && (filp->f_mode & FMODE_READ))
			/* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	__pipe_unlock(pipe);
	return retval;
}

struct pipe_inode_info *alloc_pipe_info(void)
{
	struct pipe_inode_info *pipe;

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
	if (pipe) {
		pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
		if (pipe->bufs) {
			init_waitqueue_head(&pipe->wait);
			pipe->r_counter = pipe->w_counter = 1;
			pipe->buffers = PIPE_DEF_BUFFERS;
			mutex_init(&pipe->mutex);
			return pipe;
		}
		kfree(pipe);
	}

	return NULL;
}

void free_pipe_info(struct pipe_inode_info *pipe)
{
	int i;

	for (i = 0; i < pipe->buffers; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			buf->ops->release(pipe, buf);
	}
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
				dentry->d_inode->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};
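
/*
 * The d_dname callback above is what produces the familiar names for
 * anonymous pipes, e.g. in /proc/<pid>/fd:
 *
 *	$ ls -l /proc/self/fd
 *	...
 *	lr-x------ ... 3 -> pipe:[1234567]
 *
 * where 1234567 is the pipefs inode number (the listing is
 * illustrative; exact fields vary).
 */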

static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info();
	if (!pipe)
		goto fail_iput;

	inode->i_pipe = pipe;
	pipe->files = 2;
	pipe->readers = pipe->writers = 1;
	inode->i_fop = &pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

int create_pipe_files(struct file **res, int flags)
{
	int err;
	struct inode *inode = get_pipe_inode();
	struct file *f;
	struct path path;
	static struct qstr name = { .name = "" };

	if (!inode)
		return -ENFILE;

	err = -ENOMEM;
	path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
	if (!path.dentry)
		goto err_inode;
	path.mnt = mntget(pipe_mnt);

	d_instantiate(path.dentry, inode);

	err = -ENFILE;
	f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);
	if (IS_ERR(f))
		goto err_dentry;

	f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
	f->private_data = inode->i_pipe;

	res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);
	if (IS_ERR(res[0]))
		goto err_file;

	path_get(&path);
	res[0]->private_data = inode->i_pipe;
	res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK);
	res[1] = f;
	return 0;

err_file:
	put_filp(f);
err_dentry:
	free_pipe_info(inode->i_pipe);
	path_put(&path);
	return err;

err_inode:
	free_pipe_info(inode->i_pipe);
	iput(inode);
	return err;
}

static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
		return -EINVAL;

	error = create_pipe_files(files, flags);
	if (error)
		return error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd[0] = fdr;
	fd[1] = fdw;
	return 0;

err_fdr:
	put_unused_fd(fdr);
err_read_pipe:
	fput(files[0]);
	fput(files[1]);
	return error;
}

int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		fd_install(fd[0], files[0]);
		fd_install(fd[1], files[1]);
	}
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	struct file *files[2];
	int fd[2];
	int error;

	error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
			fput(files[0]);
			fput(files[1]);
			put_unused_fd(fd[0]);
			put_unused_fd(fd[1]);
			error = -EFAULT;
		} else {
			fd_install(fd[0], files[0]);
			fd_install(fd[1], files[1]);
		}
	}
	return error;
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return sys_pipe2(fildes, 0);
}
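
/*
 * The syscall pair above is what the C library wraps. A minimal sketch
 * of typical userspace usage (ordinary POSIX calls, not code from this
 * file):
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fds[2];
 *	char buf[16];
 *
 *	if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) < 0)
 *		return -1;
 *	write(fds[1], "hi", 2);		// fds[1] is the write end
 *	read(fds[0], buf, sizeof(buf));	// fds[0] is the read end
 *
 * Flags other than O_CLOEXEC, O_NONBLOCK and O_DIRECT make
 * __do_pipe_flags() fail with -EINVAL.
 */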

static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
	int cur = *cnt;

	while (cur == *cnt) {
		pipe_wait(pipe);
		if (signal_pending(current))
			break;
	}
	return cur == *cnt ? -ERESTARTSYS : 0;
}

static void wake_up_partner(struct pipe_inode_info *pipe)
{
	wake_up_interruptible(&pipe->wait);
}

static int fifo_open(struct inode *inode, struct file *filp)
{
	struct pipe_inode_info *pipe;
	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
	int ret;

	filp->f_version = 0;

	spin_lock(&inode->i_lock);
	if (inode->i_pipe) {
		pipe = inode->i_pipe;
		pipe->files++;
		spin_unlock(&inode->i_lock);
	} else {
		spin_unlock(&inode->i_lock);
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;
		pipe->files = 1;
		spin_lock(&inode->i_lock);
		if (unlikely(inode->i_pipe)) {
			inode->i_pipe->files++;
			spin_unlock(&inode->i_lock);
			free_pipe_info(pipe);
			pipe = inode->i_pipe;
		} else {
			inode->i_pipe = pipe;
			spin_unlock(&inode->i_lock);
		}
	}
	filp->private_data = pipe;
	/* OK, we have a pipe and it's pinned down */

	__pipe_lock(pipe);

	/* We can only do regular read/write on fifos */
	filp->f_mode &= (FMODE_READ | FMODE_WRITE);

	switch (filp->f_mode) {
	case FMODE_READ:
	/*
	 *  O_RDONLY
	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
	 *  opened, even when there is no process writing the FIFO.
	 */
		pipe->r_counter++;
		if (pipe->readers++ == 0)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->writers) {
			if ((filp->f_flags & O_NONBLOCK)) {
				/* suppress POLLHUP until we have
				 * seen a writer */
				filp->f_version = pipe->w_counter;
			} else {
				if (wait_for_partner(pipe, &pipe->w_counter))
					goto err_rd;
			}
		}
		break;

	case FMODE_WRITE:
	/*
	 *  O_WRONLY
	 *  POSIX.1 says that O_NONBLOCK means return -1 with
	 *  errno=ENXIO when there is no process reading the FIFO.
	 */
		ret = -ENXIO;
		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
			goto err;

		pipe->w_counter++;
		if (!pipe->writers++)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->readers) {
			if (wait_for_partner(pipe, &pipe->r_counter))
				goto err_wr;
		}
		break;

	case FMODE_READ | FMODE_WRITE:
	/*
	 *  O_RDWR
	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
	 *  This implementation will NEVER block on a O_RDWR open, since
	 *  the process can at least talk to itself.
	 */

		pipe->readers++;
		pipe->writers++;
		pipe->r_counter++;
		pipe->w_counter++;
		if (pipe->readers == 1 || pipe->writers == 1)
			wake_up_partner(pipe);
		break;

	default:
		ret = -EINVAL;
		goto err;
	}

	/* Ok! */
	__pipe_unlock(pipe);
	return 0;

err_rd:
	if (!--pipe->readers)
		wake_up_interruptible(&pipe->wait);
	ret = -ERESTARTSYS;
	goto err;

err_wr:
	if (!--pipe->writers)
		wake_up_interruptible(&pipe->wait);
	ret = -ERESTARTSYS;
	goto err;

err:
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return ret;
}

const struct file_operations pipefifo_fops = {
	.open		= fifo_open,
	.llseek		= no_llseek,
	.read		= do_sync_read,
	.aio_read	= pipe_read,
	.write		= do_sync_write,
	.aio_write	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.release	= pipe_release,
	.fasync		= pipe_fasync,
};
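
/*
 * The O_NONBLOCK cases in fifo_open() are easiest to see from
 * userspace. A sketch against a FIFO that has no reader yet (ordinary
 * open(2) usage; "/tmp/fifo" is a hypothetical path created earlier
 * with mkfifo(3)):
 *
 *	int rfd = open("/tmp/fifo", O_RDONLY | O_NONBLOCK);
 *		// succeeds immediately, even with no writer
 *
 *	int wfd = open("/tmp/fifo", O_WRONLY | O_NONBLOCK);
 *		// fails with errno == ENXIO while no reader exists
 *
 * Without O_NONBLOCK, both opens block in wait_for_partner() until the
 * opposite end shows up (or a signal arrives).
 */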

/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or an error code on failure.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
{
	struct pipe_buffer *bufs;

	/*
	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
	 * expect a lot of shrink+grow operations, just free and allocate
	 * again like we would do for growing. If the pipe currently
	 * contains more buffers than arg, then return busy.
	 */
	if (nr_pages < pipe->nrbufs)
		return -EBUSY;

	bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
	if (unlikely(!bufs))
		return -ENOMEM;

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indexes.
	 */
	if (pipe->nrbufs) {
		unsigned int tail;
		unsigned int head;

		tail = pipe->curbuf + pipe->nrbufs;
		if (tail < pipe->buffers)
			tail = 0;
		else
			tail &= (pipe->buffers - 1);

		head = pipe->nrbufs - tail;
		if (head)
			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
		if (tail)
			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
	}

	pipe->curbuf = 0;
	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->buffers = nr_pages;
	return nr_pages * PAGE_SIZE;
}

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages.
 */
static inline unsigned int round_pipe_size(unsigned int size)
{
	unsigned long nr_pages;

	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}
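
/*
 * A worked example of the rounding above, assuming PAGE_SIZE == 4096
 * (PAGE_SHIFT == 12):
 *
 *	round_pipe_size(1)     -> 1 page            -> 4096
 *	round_pipe_size(4097)  -> 2 pages           -> 8192
 *	round_pipe_size(20000) -> 5 pages, rounded
 *	                          up to 8 pages     -> 32768
 *
 * so pipe->buffers stays a power of two and the index arithmetic can
 * keep using the "& (pipe->buffers - 1)" masking seen throughout this
 * file.
 */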

/*
 * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
 * will return an error.
 */
int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
		 size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
	if (ret < 0 || !write)
		return ret;

	pipe_max_size = round_pipe_size(pipe_max_size);
	return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file)
{
	return file->f_op == &pipefifo_fops ? file->private_data : NULL;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	__pipe_lock(pipe);

	switch (cmd) {
	case F_SETPIPE_SZ: {
		unsigned int size, nr_pages;

		size = round_pipe_size(arg);
		nr_pages = size >> PAGE_SHIFT;

		ret = -EINVAL;
		if (!nr_pages)
			goto out;

		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
			ret = -EPERM;
			goto out;
		}
		ret = pipe_set_size(pipe, nr_pages);
		break;
	}
	case F_GETPIPE_SZ:
		ret = pipe->buffers * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

out:
	__pipe_unlock(pipe);
	return ret;
}

static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */
static struct dentry *pipefs_mount(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
			&pipefs_dentry_operations, PIPEFS_MAGIC);
}

static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.mount		= pipefs_mount,
	.kill_sb	= kill_anon_super,
};

static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
	return err;
}

fs_initcall(init_pipe_fs);
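
/*
 * A sketch of how userspace drives pipe_fcntl() above (standard
 * fcntl(2) usage; fds[1] is a hypothetical pipe write end, not
 * something from this file):
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *
 *	long size = fcntl(fds[1], F_GETPIPE_SZ);	// current capacity
 *
 *	if (fcntl(fds[1], F_SETPIPE_SZ, 1 << 20) < 0) {
 *		// errno == EPERM: request exceeds
 *		// /proc/sys/fs/pipe-max-size without CAP_SYS_RESOURCE;
 *		// errno == EBUSY: pipe currently holds too much data
 *	}
 *
 * The requested size is rounded up by round_pipe_size(), so a
 * subsequent F_GETPIPE_SZ may report more than was asked for.
 */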