// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/memcontrol.h>
#include <linux/watch_queue.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
unsigned int pipe_max_size = 1048576;

/* Maximum allocatable pages per user. Hard limit is unset by default, soft
 * matches default values.
 */
unsigned long pipe_user_pages_hard;
unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;

/*
 * We use head and tail indices that aren't masked off, except at the point of
 * dereference, but rather they're allowed to wrap naturally. This means there
 * isn't a dead spot in the buffer, but the ring has to be a power of two and
 * <= 2^31.
 * -- David Howells 2019-09-23.
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->files)
		mutex_lock_nested(&pipe->mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->files)
		mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

static inline void __pipe_lock(struct pipe_inode_info *pipe)
{
	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
}

static inline void __pipe_unlock(struct pipe_inode_info *pipe)
{
	mutex_unlock(&pipe->mutex);
}

void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}

/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(rdwait);
	DEFINE_WAIT(wrwait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
	prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->rd_wait, &rdwait);
	finish_wait(&pipe->wr_wait, &wrwait);
	pipe_lock(pipe);
}
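/*
 * Illustrative note on the unmasked head/tail scheme described above (the
 * numbers are made up): because the indices are only masked when a slot is
 * dereferenced, occupancy is simply head - tail, and ordinary unsigned
 * wrap-around keeps that difference correct as long as the ring size is a
 * power of two <= 2^31.  For example, with ring_size = 16:
 *
 *	head = 0x00000002, tail = 0xfffffffe
 *	occupancy = head - tail = 4		(unsigned wrap)
 *	slot for tail = tail & (16 - 1) = 14
 *
 * This is the arithmetic behind the pipe_empty()/pipe_full()/
 * pipe_occupancy() helpers used throughout this file.
 */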
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		put_page(page);
}

static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
			       struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	if (page_count(page) == 1) {
		memcg_kmem_uncharge_page(page, 0);
		__SetPageLocked(page);
		return 0;
	}
	return 1;
}

/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns 0 and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	he wishes; the typical use is insertion into a different file
 *	page cache.
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference of one is golden, that means that the owner of this
	 * page is the only one holding a reference to it. lock the page
	 * and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL(generic_pipe_buf_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	return try_get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info: the pipe that the buffer belongs to
 * @buf: the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	put_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);
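/*
 * Note on how the operation tables below are used (hedged sketch; the real
 * definitions live in <linux/pipe_fs_i.h>): callers do not invoke these
 * methods directly, they go through wrappers such as pipe_buf_confirm()
 * and pipe_buf_release(), which dispatch through buf->ops roughly like:
 *
 *	static inline int pipe_buf_confirm(struct pipe_inode_info *pipe,
 *					   struct pipe_buffer *buf)
 *	{
 *		return buf->ops->confirm(pipe, buf);
 *	}
 *
 * so swapping the ops pointer (as pipe_buf_mark_unmergeable() does below)
 * is enough to change a buffer's behaviour.
 */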
/*
 * New data written to a pipe may be appended to a buffer with this type.
 */
static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = anon_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = {
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = anon_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static const struct pipe_buf_operations packet_pipe_buf_ops = {
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = anon_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

/**
 * pipe_buf_mark_unmergeable - mark a &struct pipe_buffer as unmergeable
 * @buf: the buffer to mark
 *
 * Description:
 *	This function ensures that no future writes will be merged into the
 *	given &struct pipe_buffer. This is necessary when multiple pipe buffers
 *	share the same backing page.
 */
void pipe_buf_mark_unmergeable(struct pipe_buffer *buf)
{
	if (buf->ops == &anon_pipe_buf_ops)
		buf->ops = &anon_pipe_buf_nomerge_ops;
}

static bool pipe_buf_can_merge(struct pipe_buffer *buf)
{
	return buf->ops == &anon_pipe_buf_ops;
}

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_readable(const struct pipe_inode_info *pipe)
{
	unsigned int head = READ_ONCE(pipe->head);
	unsigned int tail = READ_ONCE(pipe->tail);
	unsigned int writers = READ_ONCE(pipe->writers);

	return !pipe_empty(head, tail) || !writers;
}

static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	size_t total_len = iov_iter_count(to);
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	bool was_full, wake_next_reader = false;
	ssize_t ret;

	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	ret = 0;
	__pipe_lock(pipe);

	/*
	 * We only wake up writers if the pipe was full when we started
	 * reading in order to avoid unnecessary wakeups.
	 *
	 * But when we do wake up writers, we do so using a sync wakeup
	 * (WF_SYNC), because we want them to get going and generate more
	 * data for us.
	 */
	was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
	for (;;) {
		unsigned int head = pipe->head;
		unsigned int tail = pipe->tail;
		unsigned int mask = pipe->ring_size - 1;

		if (!pipe_empty(head, tail)) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t chars = buf->len;
			size_t written;
			int error;

			if (chars > total_len)
				chars = total_len;

			error = pipe_buf_confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
			if (unlikely(written < chars)) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;
			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len) {
				pipe_buf_release(pipe, buf);
				spin_lock_irq(&pipe->rd_wait.lock);
				tail++;
				pipe->tail = tail;
				spin_unlock_irq(&pipe->rd_wait.lock);
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
			if (!pipe_empty(head, tail))	/* More to do? */
				continue;
		}

		if (!pipe->writers)
			break;
		if (ret)
			break;
		if (filp->f_flags & O_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		__pipe_unlock(pipe);

		/*
		 * We only get here if we didn't actually read anything.
		 *
		 * However, we could have seen (and removed) a zero-sized
		 * pipe buffer, and might have made space in the buffers
		 * that way.
		 *
		 * You can't make zero-sized pipe buffers by doing an empty
		 * write (not even in packet mode), but they can happen if
		 * the writer gets an EFAULT when trying to fill a buffer
		 * that already got allocated and inserted in the buffer
		 * array.
		 *
		 * So we still need to wake up any pending writers in the
		 * _very_ unlikely case that the pipe was full, but we got
		 * no data.
		 */
		if (unlikely(was_full)) {
			wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}

		/*
		 * But because we didn't read anything, at this point we can
		 * just return directly with -ERESTARTSYS if we're interrupted,
		 * since we've done any required wakeups and there's no need
		 * to mark anything accessed. And we've dropped the lock.
		 */
		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
			return -ERESTARTSYS;

		__pipe_lock(pipe);
		was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
		wake_next_reader = true;
	}
	if (pipe_empty(pipe->head, pipe->tail))
		wake_next_reader = false;
	__pipe_unlock(pipe);

	if (was_full) {
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (wake_next_reader)
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
	if (ret > 0)
		file_accessed(filp);
	return ret;
}

static inline int is_packetized(struct file *file)
{
	return (file->f_flags & O_DIRECT) != 0;
}

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_writable(const struct pipe_inode_info *pipe)
{
	unsigned int head = READ_ONCE(pipe->head);
	unsigned int tail = READ_ONCE(pipe->tail);
	unsigned int max_usage = READ_ONCE(pipe->max_usage);

	return !pipe_full(head, tail, max_usage) ||
		!READ_ONCE(pipe->readers);
}
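/*
 * Illustrative userspace sketch (not part of this file): with O_DIRECT the
 * pipe is "packetized" - each write() becomes one packet, and a read()
 * shorter than the packet returns what fits and discards the rest of that
 * packet (see the PIPE_BUF_FLAG_PACKET handling in pipe_read() above and
 * the flag being set in pipe_write() below).
 *
 *	int fds[2];
 *	char buf[4];
 *
 *	if (pipe2(fds, O_DIRECT) == 0) {
 *		write(fds[1], "abcdefgh", 8);	// queues one 8-byte packet
 *		read(fds[0], buf, sizeof(buf));	// returns 4, "efgh" is dropped
 *	}
 */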
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head;
	ssize_t ret = 0;
	size_t total_len = iov_iter_count(from);
	ssize_t chars;
	bool was_empty = false;
	bool wake_next_writer = false;

	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	__pipe_lock(pipe);

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue) {
		ret = -EXDEV;
		goto out;
	}
#endif

	/*
	 * Only wake up if the pipe started out empty, since
	 * otherwise there should be no readers waiting.
	 *
	 * If it wasn't empty we try to merge new data into
	 * the last buffer.
	 *
	 * That naturally merges small writes, but it also
	 * page-aligns the rest of the writes for large writes
	 * spanning multiple pages.
	 */
	head = pipe->head;
	was_empty = pipe_empty(head, pipe->tail);
	chars = total_len & (PAGE_SIZE-1);
	if (chars && !was_empty) {
		unsigned int mask = pipe->ring_size - 1;
		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
		int offset = buf->offset + buf->len;

		if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) {
			ret = pipe_buf_confirm(pipe, buf);
			if (ret)
				goto out;

			ret = copy_page_from_iter(buf->page, offset, chars, from);
			if (unlikely(ret < chars)) {
				ret = -EFAULT;
				goto out;
			}

			buf->len += ret;
			if (!iov_iter_count(from))
				goto out;
		}
	}

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		head = pipe->head;
		if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
			unsigned int mask = pipe->ring_size - 1;
			struct pipe_buffer *buf = &pipe->bufs[head & mask];
			struct page *page = pipe->tmp_page;
			int copied;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}

			/* Allocate a slot in the ring in advance and attach an
			 * empty buffer. If we fault or otherwise fail to use
			 * it, either the reader will consume it or it'll still
			 * be there for the next write.
			 */
			spin_lock_irq(&pipe->rd_wait.lock);

			head = pipe->head;
			if (pipe_full(head, pipe->tail, pipe->max_usage)) {
				spin_unlock_irq(&pipe->rd_wait.lock);
				continue;
			}

			pipe->head = head + 1;
			spin_unlock_irq(&pipe->rd_wait.lock);

			/* Insert it into the buffer array */
			buf = &pipe->bufs[head & mask];
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = 0;
			buf->flags = 0;
			if (is_packetized(filp)) {
				buf->ops = &packet_pipe_buf_ops;
				buf->flags = PIPE_BUF_FLAG_PACKET;
			}
			pipe->tmp_page = NULL;

			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += copied;
			buf->offset = 0;
			buf->len = copied;

			if (!iov_iter_count(from))
				break;
		}

		if (!pipe_full(head, pipe->tail, pipe->max_usage))
			continue;

		/* Wait for buffer space to become available. */
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/*
		 * We're going to release the pipe lock and wait for more
		 * space. We wake up any readers if necessary, and then
		 * after waiting we need to re-check whether the pipe
		 * became empty while we dropped the lock.
		 */
		__pipe_unlock(pipe);
		if (was_empty) {
			wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		}
		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
		__pipe_lock(pipe);
		was_empty = pipe_empty(pipe->head, pipe->tail);
		wake_next_writer = true;
	}
out:
	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
		wake_next_writer = false;
	__pipe_unlock(pipe);

	/*
	 * If we do a wakeup event, we do a 'sync' wakeup, because we
	 * want the reader to start processing things asap, rather than
	 * leave the data pending.
	 *
	 * This is particularly important for small writes, because of
	 * how (for example) the GNU make jobserver uses small writes to
	 * wake up pending jobs.
	 */
	if (was_empty) {
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (wake_next_writer)
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
		int err = file_update_time(filp);
		if (err)
			ret = err;
		sb_end_write(file_inode(filp)->i_sb);
	}
	return ret;
}

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int count, head, tail, mask;

	switch (cmd) {
	case FIONREAD:
		__pipe_lock(pipe);
		count = 0;
		head = pipe->head;
		tail = pipe->tail;
		mask = pipe->ring_size - 1;

		while (tail != head) {
			count += pipe->bufs[tail & mask].len;
			tail++;
		}
		__pipe_unlock(pipe);

		return put_user(count, (int __user *)arg);

#ifdef CONFIG_WATCH_QUEUE
	case IOC_WATCH_QUEUE_SET_SIZE: {
		int ret;
		__pipe_lock(pipe);
		ret = watch_queue_set_size(pipe, arg);
		__pipe_unlock(pipe);
		return ret;
	}

	case IOC_WATCH_QUEUE_SET_FILTER:
		return watch_queue_set_filter(
			pipe, (struct watch_notification_filter __user *)arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
}

/* No kernel lock held - fine */
static __poll_t
pipe_poll(struct file *filp, poll_table *wait)
{
	__poll_t mask;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head, tail;

	/*
	 * Reading pipe state only -- no need for acquiring the semaphore.
	 *
	 * But because this is racy, the code has to add the
	 * entry to the poll table _first_ ..
	 */
	if (filp->f_mode & FMODE_READ)
		poll_wait(filp, &pipe->rd_wait, wait);
	if (filp->f_mode & FMODE_WRITE)
		poll_wait(filp, &pipe->wr_wait, wait);

	/*
	 * .. and only then can you do the racy tests. That way,
	 * if something changes and you got it wrong, the poll
	 * table entry will wake you up and fix it.
	 */
	head = READ_ONCE(pipe->head);
	tail = READ_ONCE(pipe->tail);

	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		if (!pipe_empty(head, tail))
			mask |= EPOLLIN | EPOLLRDNORM;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= EPOLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		if (!pipe_full(head, tail, pipe->max_usage))
			mask |= EPOLLOUT | EPOLLWRNORM;
		/*
		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
714 */ 715 if (!pipe->readers) 716 mask |= EPOLLERR; 717 } 718 719 return mask; 720 } 721 722 static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe) 723 { 724 int kill = 0; 725 726 spin_lock(&inode->i_lock); 727 if (!--pipe->files) { 728 inode->i_pipe = NULL; 729 kill = 1; 730 } 731 spin_unlock(&inode->i_lock); 732 733 if (kill) 734 free_pipe_info(pipe); 735 } 736 737 static int 738 pipe_release(struct inode *inode, struct file *file) 739 { 740 struct pipe_inode_info *pipe = file->private_data; 741 742 __pipe_lock(pipe); 743 if (file->f_mode & FMODE_READ) 744 pipe->readers--; 745 if (file->f_mode & FMODE_WRITE) 746 pipe->writers--; 747 748 /* Was that the last reader or writer, but not the other side? */ 749 if (!pipe->readers != !pipe->writers) { 750 wake_up_interruptible_all(&pipe->rd_wait); 751 wake_up_interruptible_all(&pipe->wr_wait); 752 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 753 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 754 } 755 __pipe_unlock(pipe); 756 757 put_pipe_info(inode, pipe); 758 return 0; 759 } 760 761 static int 762 pipe_fasync(int fd, struct file *filp, int on) 763 { 764 struct pipe_inode_info *pipe = filp->private_data; 765 int retval = 0; 766 767 __pipe_lock(pipe); 768 if (filp->f_mode & FMODE_READ) 769 retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); 770 if ((filp->f_mode & FMODE_WRITE) && retval >= 0) { 771 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); 772 if (retval < 0 && (filp->f_mode & FMODE_READ)) 773 /* this can happen only if on == T */ 774 fasync_helper(-1, filp, 0, &pipe->fasync_readers); 775 } 776 __pipe_unlock(pipe); 777 return retval; 778 } 779 780 unsigned long account_pipe_buffers(struct user_struct *user, 781 unsigned long old, unsigned long new) 782 { 783 return atomic_long_add_return(new - old, &user->pipe_bufs); 784 } 785 786 bool too_many_pipe_buffers_soft(unsigned long user_bufs) 787 { 788 unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft); 789 790 return soft_limit && user_bufs > soft_limit; 791 } 792 793 bool too_many_pipe_buffers_hard(unsigned long user_bufs) 794 { 795 unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard); 796 797 return hard_limit && user_bufs > hard_limit; 798 } 799 800 bool pipe_is_unprivileged_user(void) 801 { 802 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); 803 } 804 805 struct pipe_inode_info *alloc_pipe_info(void) 806 { 807 struct pipe_inode_info *pipe; 808 unsigned long pipe_bufs = PIPE_DEF_BUFFERS; 809 struct user_struct *user = get_current_user(); 810 unsigned long user_bufs; 811 unsigned int max_size = READ_ONCE(pipe_max_size); 812 813 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT); 814 if (pipe == NULL) 815 goto out_free_uid; 816 817 if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE)) 818 pipe_bufs = max_size >> PAGE_SHIFT; 819 820 user_bufs = account_pipe_buffers(user, 0, pipe_bufs); 821 822 if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) { 823 user_bufs = account_pipe_buffers(user, pipe_bufs, 1); 824 pipe_bufs = 1; 825 } 826 827 if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user()) 828 goto out_revert_acct; 829 830 pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), 831 GFP_KERNEL_ACCOUNT); 832 833 if (pipe->bufs) { 834 init_waitqueue_head(&pipe->rd_wait); 835 init_waitqueue_head(&pipe->wr_wait); 836 pipe->r_counter = pipe->w_counter = 1; 837 pipe->max_usage = pipe_bufs; 838 pipe->ring_size = pipe_bufs; 839 
		pipe->nr_accounted = pipe_bufs;
		pipe->user = user;
		mutex_init(&pipe->mutex);
		return pipe;
	}

out_revert_acct:
	(void) account_pipe_buffers(user, pipe_bufs, 0);
	kfree(pipe);
out_free_uid:
	free_uid(user);
	return NULL;
}

void free_pipe_info(struct pipe_inode_info *pipe)
{
	int i;

#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue) {
		watch_queue_clear(pipe->watch_queue);
		put_watch_queue(pipe->watch_queue);
	}
#endif

	(void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
	free_uid(pipe->user);
	for (i = 0; i < pipe->ring_size; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
				d_inode(dentry)->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname = pipefs_dname,
};

static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info();
	if (!pipe)
		goto fail_iput;

	inode->i_pipe = pipe;
	pipe->files = 2;
	pipe->readers = pipe->writers = 1;
	inode->i_fop = &pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

int create_pipe_files(struct file **res, int flags)
{
	struct inode *inode = get_pipe_inode();
	struct file *f;

	if (!inode)
		return -ENFILE;

	if (flags & O_NOTIFICATION_PIPE) {
#ifdef CONFIG_WATCH_QUEUE
		if (watch_queue_init(inode->i_pipe) < 0) {
			iput(inode);
			return -ENOMEM;
		}
#else
		return -ENOPKG;
#endif
	}

	f = alloc_file_pseudo(inode, pipe_mnt, "",
				O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
				&pipefifo_fops);
	if (IS_ERR(f)) {
		free_pipe_info(inode->i_pipe);
		iput(inode);
		return PTR_ERR(f);
	}

	f->private_data = inode->i_pipe;

	res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
				  &pipefifo_fops);
	if (IS_ERR(res[0])) {
		put_pipe_info(inode, inode->i_pipe);
		fput(f);
		return PTR_ERR(res[0]);
	}
	res[0]->private_data = inode->i_pipe;
	res[1] = f;
	stream_open(inode, res[0]);
	stream_open(inode, res[1]);
	return 0;
}

static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
		return -EINVAL;

	error = create_pipe_files(files, flags);
	if (error)
		return error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd[0] = fdr;
	fd[1] = fdw;
	return 0;

err_fdr:
	put_unused_fd(fdr);
err_read_pipe:
	fput(files[0]);
	fput(files[1]);
	return error;
}

int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		fd_install(fd[0], files[0]);
		fd_install(fd[1], files[1]);
	}
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
static int do_pipe2(int __user *fildes, int flags)
{
	struct file *files[2];
	int fd[2];
	int error;

	error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
			fput(files[0]);
			fput(files[1]);
			put_unused_fd(fd[0]);
			put_unused_fd(fd[1]);
			error = -EFAULT;
		} else {
			fd_install(fd[0], files[0]);
			fd_install(fd[1], files[1]);
		}
	}
	return error;
}

SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	return do_pipe2(fildes, flags);
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return do_pipe2(fildes, 0);
}
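/*
 * Illustrative userspace sketch (not part of this file): pipe2() is the
 * flags-taking variant of pipe(); the accepted flags are exactly those
 * checked in __do_pipe_flags() above, and fd[0]/fd[1] are the read and
 * write ends respectively.
 *
 *	int fds[2];
 *
 *	if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) < 0)
 *		perror("pipe2");
 */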
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
	int cur = *cnt;

	while (cur == *cnt) {
		pipe_wait(pipe);
		if (signal_pending(current))
			break;
	}
	return cur == *cnt ? -ERESTARTSYS : 0;
}

static void wake_up_partner(struct pipe_inode_info *pipe)
{
	wake_up_interruptible_all(&pipe->rd_wait);
	wake_up_interruptible_all(&pipe->wr_wait);
}

static int fifo_open(struct inode *inode, struct file *filp)
{
	struct pipe_inode_info *pipe;
	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
	int ret;

	filp->f_version = 0;

	spin_lock(&inode->i_lock);
	if (inode->i_pipe) {
		pipe = inode->i_pipe;
		pipe->files++;
		spin_unlock(&inode->i_lock);
	} else {
		spin_unlock(&inode->i_lock);
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;
		pipe->files = 1;
		spin_lock(&inode->i_lock);
		if (unlikely(inode->i_pipe)) {
			inode->i_pipe->files++;
			spin_unlock(&inode->i_lock);
			free_pipe_info(pipe);
			pipe = inode->i_pipe;
		} else {
			inode->i_pipe = pipe;
			spin_unlock(&inode->i_lock);
		}
	}
	filp->private_data = pipe;
	/* OK, we have a pipe and it's pinned down */

	__pipe_lock(pipe);

	/* We can only do regular read/write on fifos */
	stream_open(inode, filp);

	switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
	case FMODE_READ:
	/*
	 *  O_RDONLY
	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
	 *  opened, even when there is no process writing the FIFO.
	 */
		pipe->r_counter++;
		if (pipe->readers++ == 0)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->writers) {
			if ((filp->f_flags & O_NONBLOCK)) {
				/* suppress EPOLLHUP until we have
				 * seen a writer */
				filp->f_version = pipe->w_counter;
			} else {
				if (wait_for_partner(pipe, &pipe->w_counter))
					goto err_rd;
			}
		}
		break;

	case FMODE_WRITE:
	/*
	 *  O_WRONLY
	 *  POSIX.1 says that O_NONBLOCK means return -1 with
	 *  errno=ENXIO when there is no process reading the FIFO.
	 */
		ret = -ENXIO;
		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
			goto err;

		pipe->w_counter++;
		if (!pipe->writers++)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->readers) {
			if (wait_for_partner(pipe, &pipe->r_counter))
				goto err_wr;
		}
		break;

	case FMODE_READ | FMODE_WRITE:
	/*
	 *  O_RDWR
	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
	 *  This implementation will NEVER block on a O_RDWR open, since
	 *  the process can at least talk to itself.
	 */

		pipe->readers++;
		pipe->writers++;
		pipe->r_counter++;
		pipe->w_counter++;
		if (pipe->readers == 1 || pipe->writers == 1)
			wake_up_partner(pipe);
		break;

	default:
		ret = -EINVAL;
		goto err;
	}

	/* Ok! */
	__pipe_unlock(pipe);
	return 0;

err_rd:
	if (!--pipe->readers)
		wake_up_interruptible(&pipe->wr_wait);
	ret = -ERESTARTSYS;
	goto err;

err_wr:
	if (!--pipe->writers)
		wake_up_interruptible_all(&pipe->rd_wait);
	ret = -ERESTARTSYS;
	goto err;

err:
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return ret;
}

const struct file_operations pipefifo_fops = {
	.open		= fifo_open,
	.llseek		= no_llseek,
	.read_iter	= pipe_read,
	.write_iter	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.release	= pipe_release,
	.fasync		= pipe_fasync,
};

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages. Returns 0 on error.
 */
unsigned int round_pipe_size(unsigned long size)
{
	if (size > (1U << 31))
		return 0;

	/* Minimum pipe size, as required by POSIX */
	if (size < PAGE_SIZE)
		return PAGE_SIZE;

	return roundup_pow_of_two(size);
}

/*
 * Resize the pipe ring to a number of slots.
 */
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
	struct pipe_buffer *bufs;
	unsigned int head, tail, mask, n;

	/*
	 * We can shrink the pipe, if arg is greater than the ring occupancy.
	 * Since we don't expect a lot of shrink+grow operations, just free and
	 * allocate again like we would do for growing. If the pipe currently
	 * contains more buffers than arg, then return busy.
	 */
	mask = pipe->ring_size - 1;
	head = pipe->head;
	tail = pipe->tail;
	n = pipe_occupancy(pipe->head, pipe->tail);
	if (nr_slots < n)
		return -EBUSY;

	bufs = kcalloc(nr_slots, sizeof(*bufs),
		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
	if (unlikely(!bufs))
		return -ENOMEM;

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indices.
	 */
	if (n > 0) {
		unsigned int h = head & mask;
		unsigned int t = tail & mask;
		if (h > t) {
			memcpy(bufs, pipe->bufs + t,
			       n * sizeof(struct pipe_buffer));
		} else {
			unsigned int tsize = pipe->ring_size - t;
			if (h > 0)
				memcpy(bufs + tsize, pipe->bufs,
				       h * sizeof(struct pipe_buffer));
			memcpy(bufs, pipe->bufs + t,
			       tsize * sizeof(struct pipe_buffer));
		}
	}

	head = n;
	tail = 0;

	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->ring_size = nr_slots;
	if (pipe->max_usage > nr_slots)
		pipe->max_usage = nr_slots;
	pipe->tail = tail;
	pipe->head = head;

	/* This might have made more room for writers */
	wake_up_interruptible(&pipe->wr_wait);
	return 0;
}
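/*
 * Illustrative userspace sketch (not part of this file): pipe capacity is
 * queried and changed through fcntl(), handled by pipe_fcntl() and
 * pipe_set_size() below.  F_SETPIPE_SZ rounds the request up via
 * round_pipe_size() above and returns the new capacity in bytes;
 * F_GETPIPE_SZ reports the current capacity; the FIONREAD ioctl (see
 * pipe_ioctl() earlier) reports how many bytes are queued right now.
 *
 *	long cap = fcntl(fds[1], F_SETPIPE_SZ, 1 << 20);  // ask for 1 MiB
 *	long cur = fcntl(fds[0], F_GETPIPE_SZ, 0);
 *	int queued;
 *	ioctl(fds[0], FIONREAD, &queued);
 */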
/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or return -ERROR on error.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
	unsigned long user_bufs;
	unsigned int nr_slots, size;
	long ret = 0;

#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue)
		return -EBUSY;
#endif

	size = round_pipe_size(arg);
	nr_slots = size >> PAGE_SHIFT;

	if (!nr_slots)
		return -EINVAL;

	/*
	 * If trying to increase the pipe capacity, check that an
	 * unprivileged user is not trying to exceed various limits
	 * (soft limit check here, hard limit check just below).
	 * Decreasing the pipe capacity is always permitted, even
	 * if the user is currently over a limit.
	 */
	if (nr_slots > pipe->max_usage &&
			size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);

	if (nr_slots > pipe->max_usage &&
			(too_many_pipe_buffers_hard(user_bufs) ||
			 too_many_pipe_buffers_soft(user_bufs)) &&
			pipe_is_unprivileged_user()) {
		ret = -EPERM;
		goto out_revert_acct;
	}

	ret = pipe_resize_ring(pipe, nr_slots);
	if (ret < 0)
		goto out_revert_acct;

	pipe->max_usage = nr_slots;
	pipe->nr_accounted = nr_slots;
	return pipe->max_usage * PAGE_SIZE;

out_revert_acct:
	(void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
	return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
{
	struct pipe_inode_info *pipe = file->private_data;

	if (file->f_op != &pipefifo_fops || !pipe)
		return NULL;
#ifdef CONFIG_WATCH_QUEUE
	if (for_splice && pipe->watch_queue)
		return NULL;
#endif
	return pipe;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file, false);
	if (!pipe)
		return -EBADF;

	__pipe_lock(pipe);

	switch (cmd) {
	case F_SETPIPE_SZ:
		ret = pipe_set_size(pipe, arg);
		break;
	case F_GETPIPE_SZ:
		ret = pipe->max_usage * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

	__pipe_unlock(pipe);
	return ret;
}

static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */

static int pipefs_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
	if (!ctx)
		return -ENOMEM;
	ctx->ops = &pipefs_ops;
	ctx->dops = &pipefs_dentry_operations;
	return 0;
}

static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.init_fs_context = pipefs_init_fs_context,
	.kill_sb	= kill_anon_super,
};

static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
	return err;
}

fs_initcall(init_pipe_fs);
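/*
 * Illustrative note (not part of this file): pipefs is only ever
 * kern_mount()ed here, but pipefs_dname() above is what gives pipe fds
 * their user-visible name - the /proc fd symlinks resolve through
 * d_path(), so e.g.
 *
 *	readlink("/proc/self/fd/3", buf, sizeof(buf));
 *
 * yields something like "pipe:[1234]" (fd number and inode number are
 * made up here).
 */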