/*
 * linux/fs/pipe.c
 *
 * Copyright (C) 1991, 1992, 1999 Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
unsigned int pipe_max_size = 1048576;

/*
 * Minimum pipe size, as required by POSIX
 */
unsigned int pipe_min_size = PAGE_SIZE;

/* Maximum allocatable pages per user. Hard limit is unset by default, soft
 * matches default values.
 */
unsigned long pipe_user_pages_hard;
unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;

/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
 * -- Florian Coosmann (FGC)
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->files)
		mutex_lock_nested(&pipe->mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->files)
		mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

static inline void __pipe_lock(struct pipe_inode_info *pipe)
{
	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
}

static inline void __pipe_unlock(struct pipe_inode_info *pipe)
{
	mutex_unlock(&pipe->mutex);
}

void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}

/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(wait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->wait, &wait);
	pipe_lock(pipe);
}
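/*
 * A minimal sketch of the address-ordering idea behind pipe_double_lock()
 * above: whichever task runs first, both take the lower-addressed mutex
 * before the higher-addressed one, so two tasks locking the same pair in
 * opposite argument order can never deadlock ABBA-style. lock_pair() is
 * hypothetical and for illustration only:
 *
 *	static void lock_pair(struct mutex *a, struct mutex *b)
 *	{
 *		if (a < b) {
 *			mutex_lock(a);
 *			mutex_lock(b);
 *		} else {
 *			mutex_lock(b);
 *			mutex_lock(a);
 *		}
 *	}
 */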
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		put_page(page);
}

/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns 0 and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	he wishes; the typical use is insertion into a different file
 *	page cache.
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference of one is golden, that means that the owner of this
	 * page is the only one holding a reference to it. Lock the page
	 * and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL(generic_pipe_buf_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info:	the pipe that the buffer belongs to
 * @buf:	the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	put_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.can_merge = 1,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static const struct pipe_buf_operations packet_pipe_buf_ops = {
	.can_merge = 0,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
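/*
 * Example (userspace, illustrative): tee() duplicates pipe contents from
 * one pipe into another without copying the data; it only takes extra
 * page references via the ->get hook above:
 *
 *	ssize_t n = tee(pipe_a[0], pipe_b[1], 65536, 0);
 */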
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	size_t total_len = iov_iter_count(to);
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	int do_wakeup;
	ssize_t ret;

	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	__pipe_lock(pipe);
	for (;;) {
		int bufs = pipe->nrbufs;
		if (bufs) {
			int curbuf = pipe->curbuf;
			struct pipe_buffer *buf = pipe->bufs + curbuf;
			const struct pipe_buf_operations *ops = buf->ops;
			size_t chars = buf->len;
			size_t written;
			int error;

			if (chars > total_len)
				chars = total_len;

			error = ops->confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
			if (unlikely(written < chars)) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;

			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				curbuf = (curbuf + 1) & (pipe->buffers - 1);
				pipe->curbuf = curbuf;
				pipe->nrbufs = --bufs;
				do_wakeup = 1;
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
		}
		if (bufs)	/* More to do? */
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			/* syscall merging: Usually we must not sleep
			 * if O_NONBLOCK is set, or if we got some data.
			 * But if a writer sleeps in kernel space, then
			 * we can wait for that data without violating POSIX.
			 */
			if (ret)
				break;
			if (filp->f_flags & O_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}
		pipe_wait(pipe);
	}
	__pipe_unlock(pipe);

	/* Signal writers asynchronously that there is more room. */
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (ret > 0)
		file_accessed(filp);
	return ret;
}

static inline int is_packetized(struct file *file)
{
	return (file->f_flags & O_DIRECT) != 0;
}
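/*
 * Example (userspace, illustrative): a pipe opened with O_DIRECT is
 * "packetized"; each write() becomes one packet, a read() returns at
 * most one packet, and any unread tail of a packet is discarded (see
 * the PIPE_BUF_FLAG_PACKET handling in pipe_read() above):
 *
 *	int fd[2];
 *	char buf[64];
 *
 *	pipe2(fd, O_DIRECT);
 *	write(fd[1], "hello", 5);
 *	write(fd[1], "world", 5);
 *	read(fd[0], buf, sizeof(buf));	returns 5 ("hello"), not 10
 */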
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	ssize_t ret = 0;
	int do_wakeup = 0;
	size_t total_len = iov_iter_count(from);
	ssize_t chars;

	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	__pipe_lock(pipe);

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/* We try to merge small writes */
	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
	if (pipe->nrbufs && chars != 0) {
		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
							(pipe->buffers - 1);
		struct pipe_buffer *buf = pipe->bufs + lastbuf;
		const struct pipe_buf_operations *ops = buf->ops;
		int offset = buf->offset + buf->len;

		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
			ret = ops->confirm(pipe, buf);
			if (ret)
				goto out;

			ret = copy_page_from_iter(buf->page, offset, chars, from);
			if (unlikely(ret < chars)) {
				ret = -EFAULT;
				goto out;
			}
			do_wakeup = 1;
			buf->len += ret;
			if (!iov_iter_count(from))
				goto out;
		}
	}

	for (;;) {
		int bufs;

		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}
		bufs = pipe->nrbufs;
		if (bufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;
			struct page *page = pipe->tmp_page;
			int copied;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}
			/* Always wake up, even if the copy fails. Otherwise
			 * we lock up (O_NONBLOCK-)readers that sleep due to
			 * syscall merging.
			 * FIXME! Is this really true?
			 */
			do_wakeup = 1;
			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += copied;

			/* Insert it into the buffer array */
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = copied;
			buf->flags = 0;
			if (is_packetized(filp)) {
				buf->ops = &packet_pipe_buf_ops;
				buf->flags = PIPE_BUF_FLAG_PACKET;
			}
			pipe->nrbufs = ++bufs;
			pipe->tmp_page = NULL;

			if (!iov_iter_count(from))
				break;
		}
		if (bufs < pipe->buffers)
			continue;
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}
out:
	__pipe_unlock(pipe);
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
		int err = file_update_time(filp);
		if (err)
			ret = err;
		sb_end_write(file_inode(filp)->i_sb);
	}
	return ret;
}
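/*
 * Worked example of the merge path above (illustrative numbers): with
 * PAGE_SIZE == 4096, two consecutive 100-byte write()s land in the same
 * page of the last buffer; buf->len simply grows from 100 to 200 and no
 * new page is allocated. Packetized (O_DIRECT) buffers set can_merge = 0,
 * so there every write() starts a fresh buffer instead.
 */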
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int count, buf, nrbufs;

	switch (cmd) {
		case FIONREAD:
			__pipe_lock(pipe);
			count = 0;
			buf = pipe->curbuf;
			nrbufs = pipe->nrbufs;
			while (--nrbufs >= 0) {
				count += pipe->bufs[buf].len;
				buf = (buf+1) & (pipe->buffers - 1);
			}
			__pipe_unlock(pipe);

			return put_user(count, (int __user *)arg);
		default:
			return -ENOIOCTLCMD;
	}
}
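/*
 * Example (userspace, illustrative): FIONREAD reports the number of
 * unread bytes currently buffered in the pipe, which is exactly the
 * per-buffer sum computed above:
 *
 *	int avail;
 *
 *	ioctl(fd[0], FIONREAD, &avail);
 */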
/* No kernel lock held - fine */
static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
	unsigned int mask;
	struct pipe_inode_info *pipe = filp->private_data;
	int nrbufs;

	poll_wait(filp, &pipe->wait, wait);

	/* Reading only -- no need for acquiring the semaphore. */
	nrbufs = pipe->nrbufs;
	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= POLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
		/*
		 * Most Unices do not set POLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= POLLERR;
	}

	return mask;
}

static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
{
	int kill = 0;

	spin_lock(&inode->i_lock);
	if (!--pipe->files) {
		inode->i_pipe = NULL;
		kill = 1;
	}
	spin_unlock(&inode->i_lock);

	if (kill)
		free_pipe_info(pipe);
}

static int
pipe_release(struct inode *inode, struct file *file)
{
	struct pipe_inode_info *pipe = file->private_data;

	__pipe_lock(pipe);
	if (file->f_mode & FMODE_READ)
		pipe->readers--;
	if (file->f_mode & FMODE_WRITE)
		pipe->writers--;

	if (pipe->readers || pipe->writers) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return 0;
}

static int
pipe_fasync(int fd, struct file *filp, int on)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int retval = 0;

	__pipe_lock(pipe);
	if (filp->f_mode & FMODE_READ)
		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0 && (filp->f_mode & FMODE_READ))
			/* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	__pipe_unlock(pipe);
	return retval;
}

static void account_pipe_buffers(struct pipe_inode_info *pipe,
				 unsigned long old, unsigned long new)
{
	atomic_long_add(new - old, &pipe->user->pipe_bufs);
}

static bool too_many_pipe_buffers_soft(struct user_struct *user)
{
	return pipe_user_pages_soft &&
	       atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft;
}

static bool too_many_pipe_buffers_hard(struct user_struct *user)
{
	return pipe_user_pages_hard &&
	       atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard;
}

struct pipe_inode_info *alloc_pipe_info(void)
{
	struct pipe_inode_info *pipe;

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
	if (pipe) {
		unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
		struct user_struct *user = get_current_user();

		if (!too_many_pipe_buffers_hard(user)) {
			if (too_many_pipe_buffers_soft(user))
				pipe_bufs = 1;
			pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL);
		}

		if (pipe->bufs) {
			init_waitqueue_head(&pipe->wait);
			pipe->r_counter = pipe->w_counter = 1;
			pipe->buffers = pipe_bufs;
			pipe->user = user;
			account_pipe_buffers(pipe, 0, pipe_bufs);
			mutex_init(&pipe->mutex);
			return pipe;
		}
		free_uid(user);
		kfree(pipe);
	}

	return NULL;
}

void free_pipe_info(struct pipe_inode_info *pipe)
{
	int i;

	account_pipe_buffers(pipe, pipe->buffers, 0);
	free_uid(pipe->user);
	for (i = 0; i < pipe->buffers; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			buf->ops->release(pipe, buf);
	}
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
				d_inode(dentry)->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};

static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info();
	if (!pipe)
		goto fail_iput;

	inode->i_pipe = pipe;
	pipe->files = 2;
	pipe->readers = pipe->writers = 1;
	inode->i_fop = &pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

int create_pipe_files(struct file **res, int flags)
{
	int err;
	struct inode *inode = get_pipe_inode();
	struct file *f;
	struct path path;
	static struct qstr name = { .name = "" };

	if (!inode)
		return -ENFILE;

	err = -ENOMEM;
	path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
	if (!path.dentry)
		goto err_inode;
	path.mnt = mntget(pipe_mnt);

	d_instantiate(path.dentry, inode);

	f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);
	if (IS_ERR(f)) {
		err = PTR_ERR(f);
		goto err_dentry;
	}

	f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
	f->private_data = inode->i_pipe;

	res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);
	if (IS_ERR(res[0])) {
		err = PTR_ERR(res[0]);
		goto err_file;
	}

	path_get(&path);
	res[0]->private_data = inode->i_pipe;
	res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK);
	res[1] = f;
	return 0;

err_file:
	put_filp(f);
err_dentry:
	free_pipe_info(inode->i_pipe);
	path_put(&path);
	return err;

err_inode:
	free_pipe_info(inode->i_pipe);
	iput(inode);
	return err;
}
static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
		return -EINVAL;

	error = create_pipe_files(files, flags);
	if (error)
		return error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd[0] = fdr;
	fd[1] = fdw;
	return 0;

 err_fdr:
	put_unused_fd(fdr);
 err_read_pipe:
	fput(files[0]);
	fput(files[1]);
	return error;
}

int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		fd_install(fd[0], files[0]);
		fd_install(fd[1], files[1]);
	}
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	struct file *files[2];
	int fd[2];
	int error;

	error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
			fput(files[0]);
			fput(files[1]);
			put_unused_fd(fd[0]);
			put_unused_fd(fd[1]);
			error = -EFAULT;
		} else {
			fd_install(fd[0], files[0]);
			fd_install(fd[1], files[1]);
		}
	}
	return error;
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return sys_pipe2(fildes, 0);
}
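/*
 * Example (userspace, illustrative): the two syscalls above back the C
 * library's pipe()/pipe2() wrappers; O_CLOEXEC keeps the descriptors
 * from leaking across exec():
 *
 *	int fd[2];
 *
 *	if (pipe2(fd, O_CLOEXEC | O_NONBLOCK) < 0)
 *		perror("pipe2");
 */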
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
	int cur = *cnt;

	while (cur == *cnt) {
		pipe_wait(pipe);
		if (signal_pending(current))
			break;
	}
	return cur == *cnt ? -ERESTARTSYS : 0;
}

static void wake_up_partner(struct pipe_inode_info *pipe)
{
	wake_up_interruptible(&pipe->wait);
}

static int fifo_open(struct inode *inode, struct file *filp)
{
	struct pipe_inode_info *pipe;
	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
	int ret;

	filp->f_version = 0;

	spin_lock(&inode->i_lock);
	if (inode->i_pipe) {
		pipe = inode->i_pipe;
		pipe->files++;
		spin_unlock(&inode->i_lock);
	} else {
		spin_unlock(&inode->i_lock);
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;
		pipe->files = 1;
		spin_lock(&inode->i_lock);
		if (unlikely(inode->i_pipe)) {
			inode->i_pipe->files++;
			spin_unlock(&inode->i_lock);
			free_pipe_info(pipe);
			pipe = inode->i_pipe;
		} else {
			inode->i_pipe = pipe;
			spin_unlock(&inode->i_lock);
		}
	}
	filp->private_data = pipe;
	/* OK, we have a pipe and it's pinned down */

	__pipe_lock(pipe);

	/* We can only do regular read/write on fifos */
	filp->f_mode &= (FMODE_READ | FMODE_WRITE);

	switch (filp->f_mode) {
	case FMODE_READ:
		/*
		 * O_RDONLY
		 * POSIX.1 says that O_NONBLOCK means return with the FIFO
		 * opened, even when there is no process writing the FIFO.
		 */
		pipe->r_counter++;
		if (pipe->readers++ == 0)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->writers) {
			if ((filp->f_flags & O_NONBLOCK)) {
				/* suppress POLLHUP until we have
				 * seen a writer */
				filp->f_version = pipe->w_counter;
			} else {
				if (wait_for_partner(pipe, &pipe->w_counter))
					goto err_rd;
			}
		}
		break;

	case FMODE_WRITE:
		/*
		 * O_WRONLY
		 * POSIX.1 says that O_NONBLOCK means return -1 with
		 * errno=ENXIO when there is no process reading the FIFO.
		 */
		ret = -ENXIO;
		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
			goto err;

		pipe->w_counter++;
		if (!pipe->writers++)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->readers) {
			if (wait_for_partner(pipe, &pipe->r_counter))
				goto err_wr;
		}
		break;

	case FMODE_READ | FMODE_WRITE:
		/*
		 * O_RDWR
		 * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
		 * This implementation will NEVER block on an O_RDWR open, since
		 * the process can at least talk to itself.
		 */

		pipe->readers++;
		pipe->writers++;
		pipe->r_counter++;
		pipe->w_counter++;
		if (pipe->readers == 1 || pipe->writers == 1)
			wake_up_partner(pipe);
		break;

	default:
		ret = -EINVAL;
		goto err;
	}

	/* Ok! */
	__pipe_unlock(pipe);
	return 0;

err_rd:
	if (!--pipe->readers)
		wake_up_interruptible(&pipe->wait);
	ret = -ERESTARTSYS;
	goto err;

err_wr:
	if (!--pipe->writers)
		wake_up_interruptible(&pipe->wait);
	ret = -ERESTARTSYS;
	goto err;

err:
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return ret;
}

const struct file_operations pipefifo_fops = {
	.open		= fifo_open,
	.llseek		= no_llseek,
	.read_iter	= pipe_read,
	.write_iter	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.release	= pipe_release,
	.fasync		= pipe_fasync,
};

/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or return -ERROR on error.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
{
	struct pipe_buffer *bufs;

	/*
	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
	 * expect a lot of shrink+grow operations, just free and allocate
	 * again like we would do for growing. If the pipe currently
	 * contains more buffers than arg, then return busy.
	 */
	if (nr_pages < pipe->nrbufs)
		return -EBUSY;

	bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
	if (unlikely(!bufs))
		return -ENOMEM;

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indexes.
	 */
	if (pipe->nrbufs) {
		unsigned int tail;
		unsigned int head;

		tail = pipe->curbuf + pipe->nrbufs;
		if (tail < pipe->buffers)
			tail = 0;
		else
			tail &= (pipe->buffers - 1);

		head = pipe->nrbufs - tail;
		if (head)
			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
		if (tail)
			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
	}

	account_pipe_buffers(pipe, pipe->buffers, nr_pages);
	pipe->curbuf = 0;
	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->buffers = nr_pages;
	return nr_pages * PAGE_SIZE;
}

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages.
 */
static inline unsigned int round_pipe_size(unsigned int size)
{
	unsigned long nr_pages;

	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}

/*
 * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
 * will return an error.
 */
int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
		 size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
	if (ret < 0 || !write)
		return ret;

	pipe_max_size = round_pipe_size(pipe_max_size);
	return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file)
{
	return file->f_op == &pipefifo_fops ? file->private_data : NULL;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	__pipe_lock(pipe);

	switch (cmd) {
	case F_SETPIPE_SZ: {
		unsigned int size, nr_pages;

		size = round_pipe_size(arg);
		nr_pages = size >> PAGE_SHIFT;

		ret = -EINVAL;
		if (!nr_pages)
			goto out;

		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
			ret = -EPERM;
			goto out;
		} else if ((too_many_pipe_buffers_hard(pipe->user) ||
			    too_many_pipe_buffers_soft(pipe->user)) &&
			   !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
			ret = -EPERM;
			goto out;
		}
		ret = pipe_set_size(pipe, nr_pages);
		break;
	}
	case F_GETPIPE_SZ:
		ret = pipe->buffers * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

out:
	__pipe_unlock(pipe);
	return ret;
}
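/*
 * Example (userspace, illustrative): resizing a pipe through fcntl().
 * The request is rounded up to a power-of-two number of pages by
 * round_pipe_size(), so F_GETPIPE_SZ may report more than was asked for:
 *
 *	fcntl(fd[1], F_SETPIPE_SZ, 100 * 1024);
 *	long sz = fcntl(fd[1], F_GETPIPE_SZ);	sz == 131072 (128 KiB)
 */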
static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */
static struct dentry *pipefs_mount(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
			&pipefs_dentry_operations, PIPEFS_MAGIC);
}

static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.mount		= pipefs_mount,
	.kill_sb	= kill_anon_super,
};

static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
	return err;
}

fs_initcall(init_pipe_fs);