// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/memcontrol.h>
#include <linux/watch_queue.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
unsigned int pipe_max_size = 1048576;

/* Maximum allocatable pages per user. Hard limit is unset by default, soft
 * matches default values.
 */
unsigned long pipe_user_pages_hard;
unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;

/*
 * We use head and tail indices that aren't masked off, except at the point of
 * dereference, but rather they're allowed to wrap naturally. This means there
 * isn't a dead spot in the buffer, but the ring has to be a power of two and
 * <= 2^31.
 * -- David Howells 2019-09-23.
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->files)
		mutex_lock_nested(&pipe->mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->files)
		mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

static inline void __pipe_lock(struct pipe_inode_info *pipe)
{
	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
}

static inline void __pipe_unlock(struct pipe_inode_info *pipe)
{
	mutex_unlock(&pipe->mutex);
}

void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}

/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(rdwait);
	DEFINE_WAIT(wrwait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
	prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->rd_wait, &rdwait);
	finish_wait(&pipe->wr_wait, &wrwait);
	pipe_lock(pipe);
}

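/*
 * Illustration of the unmasked head/tail scheme described above (a sketch,
 * not part of the build): the indices only get masked when the ring is
 * dereferenced, and unsigned wrap-around keeps the arithmetic honest.  The
 * helpers in <linux/pipe_fs_i.h> behave roughly like this:
 *
 *	occupancy = head - tail;		// correct even across 2^32 wrap
 *	empty     = (head == tail);
 *	full      = (head - tail) >= limit;
 *	slot      = &pipe->bufs[tail & (pipe->ring_size - 1)];
 *
 * e.g. head = 0x80000002 and tail = 0x7ffffffe give an occupancy of 4, and
 * with ring_size = 16 the next slot to read is index 0x7ffffffe & 15 == 14.
 */
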
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		put_page(page);
}

static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
			       struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	if (page_count(page) == 1) {
		memcg_kmem_uncharge_page(page, 0);
		__SetPageLocked(page);
		return 0;
	}
	return 1;
}

/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns 0 and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	he wishes; the typical use is insertion into a different file
 *	page cache.
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference of one is golden, that means that the owner of this
	 * page is the only one holding a reference to it. lock the page
	 * and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL(generic_pipe_buf_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	return try_get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info:	the pipe that the buffer belongs to
 * @buf:	the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	put_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

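/*
 * Illustration of the tee() path mentioned above (a minimal userspace
 * sketch, not part of the kernel build): tee() duplicates pipe buffers from
 * one pipe into another without consuming them, which is what the extra
 * page reference taken by generic_pipe_buf_get() makes possible.
 */
#if 0	/* illustrative userspace code only */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int a[2], b[2];
	char buf[64];

	if (pipe(a) || pipe(b))
		return 1;
	write(a[1], "hello", 5);
	/* Duplicate up to 64 bytes from pipe 'a' into pipe 'b'. */
	if (tee(a[0], b[1], sizeof(buf), SPLICE_F_NONBLOCK) < 0)
		return 1;
	/* The data remains readable from 'a' as well. */
	printf("a: %zd bytes, b: %zd bytes\n",
	       read(a[0], buf, sizeof(buf)), read(b[0], buf, sizeof(buf)));
	return 0;
}
#endif
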
/* New data written to a pipe may be appended to a buffer with this type. */
static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.confirm	= generic_pipe_buf_confirm,
	.release	= anon_pipe_buf_release,
	.steal		= anon_pipe_buf_steal,
	.get		= generic_pipe_buf_get,
};

static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = {
	.confirm	= generic_pipe_buf_confirm,
	.release	= anon_pipe_buf_release,
	.steal		= anon_pipe_buf_steal,
	.get		= generic_pipe_buf_get,
};

static const struct pipe_buf_operations packet_pipe_buf_ops = {
	.confirm	= generic_pipe_buf_confirm,
	.release	= anon_pipe_buf_release,
	.steal		= anon_pipe_buf_steal,
	.get		= generic_pipe_buf_get,
};

/**
 * pipe_buf_mark_unmergeable - mark a &struct pipe_buffer as unmergeable
 * @buf:	the buffer to mark
 *
 * Description:
 *	This function ensures that no future writes will be merged into the
 *	given &struct pipe_buffer. This is necessary when multiple pipe buffers
 *	share the same backing page.
 */
void pipe_buf_mark_unmergeable(struct pipe_buffer *buf)
{
	if (buf->ops == &anon_pipe_buf_ops)
		buf->ops = &anon_pipe_buf_nomerge_ops;
}

static bool pipe_buf_can_merge(struct pipe_buffer *buf)
{
	return buf->ops == &anon_pipe_buf_ops;
}

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_readable(const struct pipe_inode_info *pipe)
{
	unsigned int head = READ_ONCE(pipe->head);
	unsigned int tail = READ_ONCE(pipe->tail);
	unsigned int writers = READ_ONCE(pipe->writers);

	return !pipe_empty(head, tail) || !writers;
}

static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	size_t total_len = iov_iter_count(to);
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	bool was_full, wake_next_reader = false;
	ssize_t ret;

	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	ret = 0;
	__pipe_lock(pipe);

	/*
	 * We only wake up writers if the pipe was full when we started
	 * reading in order to avoid unnecessary wakeups.
	 *
	 * But when we do wake up writers, we do so using a sync wakeup
	 * (WF_SYNC), because we want them to get going and generate more
	 * data for us.
	 */
	was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
	for (;;) {
		unsigned int head = pipe->head;
		unsigned int tail = pipe->tail;
		unsigned int mask = pipe->ring_size - 1;

		if (!pipe_empty(head, tail)) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t chars = buf->len;
			size_t written;
			int error;

			if (chars > total_len) {
				if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
					if (ret == 0)
						ret = -ENOBUFS;
					break;
				}
				chars = total_len;
			}

			error = pipe_buf_confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
			if (unlikely(written < chars)) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;

			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len) {
				pipe_buf_release(pipe, buf);
				spin_lock_irq(&pipe->rd_wait.lock);
				tail++;
				pipe->tail = tail;
				spin_unlock_irq(&pipe->rd_wait.lock);
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
			if (!pipe_empty(head, tail))	/* More to do? */
				continue;
		}

		if (!pipe->writers)
			break;
		if (ret)
			break;
		if (filp->f_flags & O_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		__pipe_unlock(pipe);

		/*
		 * We only get here if we didn't actually read anything.
		 *
		 * However, we could have seen (and removed) a zero-sized
		 * pipe buffer, and might have made space in the buffers
		 * that way.
		 *
		 * You can't make zero-sized pipe buffers by doing an empty
		 * write (not even in packet mode), but they can happen if
		 * the writer gets an EFAULT when trying to fill a buffer
		 * that already got allocated and inserted in the buffer
		 * array.
		 *
		 * So we still need to wake up any pending writers in the
		 * _very_ unlikely case that the pipe was full, but we got
		 * no data.
		 */
		if (unlikely(was_full)) {
			wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}

		/*
		 * But because we didn't read anything, at this point we can
		 * just return directly with -ERESTARTSYS if we're interrupted,
		 * since we've done any required wakeups and there's no need
		 * to mark anything accessed. And we've dropped the lock.
		 */
		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
			return -ERESTARTSYS;

		__pipe_lock(pipe);
		was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
		wake_next_reader = true;
	}
	if (pipe_empty(pipe->head, pipe->tail))
		wake_next_reader = false;
	__pipe_unlock(pipe);

	if (was_full) {
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (wake_next_reader)
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
	if (ret > 0)
		file_accessed(filp);
	return ret;
}

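/*
 * Illustration of the PIPE_BUF_FLAG_PACKET handling above (a minimal
 * userspace sketch, not part of the kernel build): on a pipe created with
 * O_DIRECT, each write() becomes one packet, a read() returns at most one
 * packet, and whatever part of the packet does not fit in the read buffer
 * is discarded.
 */
#if 0	/* illustrative userspace code only */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];
	char buf[4];
	ssize_t n;

	if (pipe2(fds, O_DIRECT))
		return 1;
	write(fds[1], "first", 5);	/* one packet of 5 bytes */
	write(fds[1], "second", 6);	/* a separate packet */

	n = read(fds[0], buf, sizeof(buf));	/* returns 4; the trailing "t" is discarded */
	printf("got %zd bytes\n", n);
	n = read(fds[0], buf, sizeof(buf));	/* next packet: returns 4 bytes of "second" */
	printf("got %zd bytes\n", n);
	return 0;
}
#endif
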
static inline int is_packetized(struct file *file)
{
	return (file->f_flags & O_DIRECT) != 0;
}

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_writable(const struct pipe_inode_info *pipe)
{
	unsigned int head = READ_ONCE(pipe->head);
	unsigned int tail = READ_ONCE(pipe->tail);
	unsigned int max_usage = READ_ONCE(pipe->max_usage);

	return !pipe_full(head, tail, max_usage) ||
		!READ_ONCE(pipe->readers);
}

static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head;
	ssize_t ret = 0;
	size_t total_len = iov_iter_count(from);
	ssize_t chars;
	bool was_empty = false;
	bool wake_next_writer = false;

	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	__pipe_lock(pipe);

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue) {
		ret = -EXDEV;
		goto out;
	}
#endif

	/*
	 * Only wake up if the pipe started out empty, since
	 * otherwise there should be no readers waiting.
	 *
	 * If it wasn't empty we try to merge new data into
	 * the last buffer.
	 *
	 * That naturally merges small writes, but it also
	 * page-aligns the rest of the writes for large writes
	 * spanning multiple pages.
	 */
	head = pipe->head;
	was_empty = pipe_empty(head, pipe->tail);
	chars = total_len & (PAGE_SIZE-1);
	if (chars && !was_empty) {
		unsigned int mask = pipe->ring_size - 1;
		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
		int offset = buf->offset + buf->len;

		if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) {
			ret = pipe_buf_confirm(pipe, buf);
			if (ret)
				goto out;

			ret = copy_page_from_iter(buf->page, offset, chars, from);
			if (unlikely(ret < chars)) {
				ret = -EFAULT;
				goto out;
			}

			buf->len += ret;
			if (!iov_iter_count(from))
				goto out;
		}
	}

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		head = pipe->head;
		if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
			unsigned int mask = pipe->ring_size - 1;
			struct pipe_buffer *buf = &pipe->bufs[head & mask];
			struct page *page = pipe->tmp_page;
			int copied;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}

			/* Allocate a slot in the ring in advance and attach an
			 * empty buffer. If we fault or otherwise fail to use
			 * it, either the reader will consume it or it'll still
			 * be there for the next write.
			 */
			spin_lock_irq(&pipe->rd_wait.lock);

			head = pipe->head;
			if (pipe_full(head, pipe->tail, pipe->max_usage)) {
				spin_unlock_irq(&pipe->rd_wait.lock);
				continue;
			}

			pipe->head = head + 1;
			spin_unlock_irq(&pipe->rd_wait.lock);

			/* Insert it into the buffer array */
			buf = &pipe->bufs[head & mask];
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = 0;
			buf->flags = 0;
			if (is_packetized(filp)) {
				buf->ops = &packet_pipe_buf_ops;
				buf->flags = PIPE_BUF_FLAG_PACKET;
			}
			pipe->tmp_page = NULL;

			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += copied;
			buf->offset = 0;
			buf->len = copied;

			if (!iov_iter_count(from))
				break;
		}

		if (!pipe_full(head, pipe->tail, pipe->max_usage))
			continue;

		/* Wait for buffer space to become available. */
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/*
		 * We're going to release the pipe lock and wait for more
		 * space. We wake up any readers if necessary, and then
		 * after waiting we need to re-check whether the pipe
		 * became empty while we dropped the lock.
		 */
		__pipe_unlock(pipe);
		if (was_empty) {
			wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		}
		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
		__pipe_lock(pipe);
		was_empty = pipe_empty(pipe->head, pipe->tail);
		wake_next_writer = true;
	}
out:
	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
		wake_next_writer = false;
	__pipe_unlock(pipe);

	/*
	 * If we do do a wakeup event, we do a 'sync' wakeup, because we
	 * want the reader to start processing things asap, rather than
	 * leave the data pending.
	 *
	 * This is particularly important for small writes, because of
	 * how (for example) the GNU make jobserver uses small writes to
	 * wake up pending jobs
	 */
	if (was_empty) {
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (wake_next_writer)
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
		int err = file_update_time(filp);
		if (err)
			ret = err;
		sb_end_write(file_inode(filp)->i_sb);
	}
	return ret;
}

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int count, head, tail, mask;

	switch (cmd) {
	case FIONREAD:
		__pipe_lock(pipe);
		count = 0;
		head = pipe->head;
		tail = pipe->tail;
		mask = pipe->ring_size - 1;

		while (tail != head) {
			count += pipe->bufs[tail & mask].len;
			tail++;
		}
		__pipe_unlock(pipe);

		return put_user(count, (int __user *)arg);

#ifdef CONFIG_WATCH_QUEUE
	case IOC_WATCH_QUEUE_SET_SIZE: {
		int ret;
		__pipe_lock(pipe);
		ret = watch_queue_set_size(pipe, arg);
		__pipe_unlock(pipe);
		return ret;
	}

	case IOC_WATCH_QUEUE_SET_FILTER:
		return watch_queue_set_filter(
			pipe, (struct watch_notification_filter __user *)arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
}

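/*
 * Illustration of the FIONREAD case above (a minimal userspace sketch, not
 * part of the kernel build): FIONREAD reports how many bytes are currently
 * queued in the pipe, i.e. the sum of the buf->len values that pipe_ioctl()
 * walks.
 */
#if 0	/* illustrative userspace code only */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int fds[2], avail = 0;

	if (pipe(fds))
		return 1;
	write(fds[1], "abc", 3);
	write(fds[1], "de", 2);
	ioctl(fds[0], FIONREAD, &avail);
	printf("%d bytes queued\n", avail);	/* prints 5 */
	return 0;
}
#endif
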
/* No kernel lock held - fine */
static __poll_t
pipe_poll(struct file *filp, poll_table *wait)
{
	__poll_t mask;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head, tail;

	/*
	 * Reading pipe state only -- no need for acquiring the semaphore.
	 *
	 * But because this is racy, the code has to add the
	 * entry to the poll table _first_ ..
	 */
	if (filp->f_mode & FMODE_READ)
		poll_wait(filp, &pipe->rd_wait, wait);
	if (filp->f_mode & FMODE_WRITE)
		poll_wait(filp, &pipe->wr_wait, wait);

	/*
	 * .. and only then can you do the racy tests. That way,
	 * if something changes and you got it wrong, the poll
	 * table entry will wake you up and fix it.
	 */
	head = READ_ONCE(pipe->head);
	tail = READ_ONCE(pipe->tail);

	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		if (!pipe_empty(head, tail))
			mask |= EPOLLIN | EPOLLRDNORM;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= EPOLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		if (!pipe_full(head, tail, pipe->max_usage))
			mask |= EPOLLOUT | EPOLLWRNORM;
		/*
		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= EPOLLERR;
	}

	return mask;
}

static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
{
	int kill = 0;

	spin_lock(&inode->i_lock);
	if (!--pipe->files) {
		inode->i_pipe = NULL;
		kill = 1;
	}
	spin_unlock(&inode->i_lock);

	if (kill)
		free_pipe_info(pipe);
}

static int
pipe_release(struct inode *inode, struct file *file)
{
	struct pipe_inode_info *pipe = file->private_data;

	__pipe_lock(pipe);
	if (file->f_mode & FMODE_READ)
		pipe->readers--;
	if (file->f_mode & FMODE_WRITE)
		pipe->writers--;

	/* Was that the last reader or writer, but not the other side? */
	if (!pipe->readers != !pipe->writers) {
		wake_up_interruptible_all(&pipe->rd_wait);
		wake_up_interruptible_all(&pipe->wr_wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return 0;
}

static int
pipe_fasync(int fd, struct file *filp, int on)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int retval = 0;

	__pipe_lock(pipe);
	if (filp->f_mode & FMODE_READ)
		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0 && (filp->f_mode & FMODE_READ))
			/* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	__pipe_unlock(pipe);
	return retval;
}

unsigned long account_pipe_buffers(struct user_struct *user,
				   unsigned long old, unsigned long new)
{
	return atomic_long_add_return(new - old, &user->pipe_bufs);
}

bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
	unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);

	return soft_limit && user_bufs > soft_limit;
}

bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
	unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);

	return hard_limit && user_bufs > hard_limit;
}

bool pipe_is_unprivileged_user(void)
{
	return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
}

struct pipe_inode_info *alloc_pipe_info(void)
{
	struct pipe_inode_info *pipe;
	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
	struct user_struct *user = get_current_user();
	unsigned long user_bufs;
	unsigned int max_size = READ_ONCE(pipe_max_size);

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
	if (pipe == NULL)
		goto out_free_uid;

	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
		pipe_bufs = max_size >> PAGE_SHIFT;

	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);

	if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
		user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
		pipe_bufs = 1;
	}

	if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
		goto out_revert_acct;

	pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
			     GFP_KERNEL_ACCOUNT);

	if (pipe->bufs) {
		init_waitqueue_head(&pipe->rd_wait);
		init_waitqueue_head(&pipe->wr_wait);
		pipe->r_counter = pipe->w_counter = 1;
		pipe->max_usage = pipe_bufs;
		pipe->ring_size = pipe_bufs;
		pipe->nr_accounted = pipe_bufs;
		pipe->user = user;
		mutex_init(&pipe->mutex);
		return pipe;
	}

out_revert_acct:
	(void) account_pipe_buffers(user, pipe_bufs, 0);
	kfree(pipe);
out_free_uid:
	free_uid(user);
	return NULL;
}

void free_pipe_info(struct pipe_inode_info *pipe)
{
	int i;

#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue) {
		watch_queue_clear(pipe->watch_queue);
		put_watch_queue(pipe->watch_queue);
	}
#endif

	(void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
	free_uid(pipe->user);
	for (i = 0; i < pipe->ring_size; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
				d_inode(dentry)->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};

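/*
 * Illustration of the name generated above (a minimal userspace sketch, not
 * part of the kernel build): pipefs_dname() is what makes a pipe fd show up
 * as "pipe:[<inode>]" when its /proc/<pid>/fd entry is resolved.
 */
#if 0	/* illustrative userspace code only */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];
	char path[64], name[64];
	ssize_t n;

	if (pipe(fds))
		return 1;
	snprintf(path, sizeof(path), "/proc/self/fd/%d", fds[0]);
	n = readlink(path, name, sizeof(name) - 1);
	if (n < 0)
		return 1;
	name[n] = '\0';
	printf("%s -> %s\n", path, name);	/* e.g. "... -> pipe:[1234]" */
	return 0;
}
#endif
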
static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info();
	if (!pipe)
		goto fail_iput;

	inode->i_pipe = pipe;
	pipe->files = 2;
	pipe->readers = pipe->writers = 1;
	inode->i_fop = &pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

int create_pipe_files(struct file **res, int flags)
{
	struct inode *inode = get_pipe_inode();
	struct file *f;

	if (!inode)
		return -ENFILE;

	if (flags & O_NOTIFICATION_PIPE) {
#ifdef CONFIG_WATCH_QUEUE
		if (watch_queue_init(inode->i_pipe) < 0) {
			iput(inode);
			return -ENOMEM;
		}
#else
		return -ENOPKG;
#endif
	}

	f = alloc_file_pseudo(inode, pipe_mnt, "",
				O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
				&pipefifo_fops);
	if (IS_ERR(f)) {
		free_pipe_info(inode->i_pipe);
		iput(inode);
		return PTR_ERR(f);
	}

	f->private_data = inode->i_pipe;

	res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
				  &pipefifo_fops);
	if (IS_ERR(res[0])) {
		put_pipe_info(inode, inode->i_pipe);
		fput(f);
		return PTR_ERR(res[0]);
	}
	res[0]->private_data = inode->i_pipe;
	res[1] = f;
	stream_open(inode, res[0]);
	stream_open(inode, res[1]);
	return 0;
}

static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
		return -EINVAL;

	error = create_pipe_files(files, flags);
	if (error)
		return error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd[0] = fdr;
	fd[1] = fdw;
	return 0;

 err_fdr:
	put_unused_fd(fdr);
 err_read_pipe:
	fput(files[0]);
	fput(files[1]);
	return error;
}

int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		fd_install(fd[0], files[0]);
		fd_install(fd[1], files[1]);
	}
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
static int do_pipe2(int __user *fildes, int flags)
{
	struct file *files[2];
	int fd[2];
	int error;

	error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
			fput(files[0]);
			fput(files[1]);
			put_unused_fd(fd[0]);
			put_unused_fd(fd[1]);
			error = -EFAULT;
		} else {
			fd_install(fd[0], files[0]);
			fd_install(fd[1], files[1]);
		}
	}
	return error;
}

SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	return do_pipe2(fildes, flags);
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return do_pipe2(fildes, 0);
}

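/*
 * Illustration of the syscall entry points above (a minimal userspace
 * sketch, not part of the kernel build): pipe2() is pipe() plus a flags
 * argument, accepting the same O_CLOEXEC/O_NONBLOCK/O_DIRECT flags that
 * __do_pipe_flags() validates.
 */
#if 0	/* illustrative userspace code only */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];
	char c;

	if (pipe2(fds, O_CLOEXEC | O_NONBLOCK))
		return 1;
	/* Nothing has been written yet, so a non-blocking read fails with EAGAIN. */
	if (read(fds[0], &c, 1) < 0 && errno == EAGAIN)
		printf("pipe is empty, read would block\n");
	return 0;
}
#endif
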
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
	int cur = *cnt;

	while (cur == *cnt) {
		pipe_wait(pipe);
		if (signal_pending(current))
			break;
	}
	return cur == *cnt ? -ERESTARTSYS : 0;
}

static void wake_up_partner(struct pipe_inode_info *pipe)
{
	wake_up_interruptible_all(&pipe->rd_wait);
	wake_up_interruptible_all(&pipe->wr_wait);
}

static int fifo_open(struct inode *inode, struct file *filp)
{
	struct pipe_inode_info *pipe;
	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
	int ret;

	filp->f_version = 0;

	spin_lock(&inode->i_lock);
	if (inode->i_pipe) {
		pipe = inode->i_pipe;
		pipe->files++;
		spin_unlock(&inode->i_lock);
	} else {
		spin_unlock(&inode->i_lock);
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;
		pipe->files = 1;
		spin_lock(&inode->i_lock);
		if (unlikely(inode->i_pipe)) {
			inode->i_pipe->files++;
			spin_unlock(&inode->i_lock);
			free_pipe_info(pipe);
			pipe = inode->i_pipe;
		} else {
			inode->i_pipe = pipe;
			spin_unlock(&inode->i_lock);
		}
	}
	filp->private_data = pipe;
	/* OK, we have a pipe and it's pinned down */

	__pipe_lock(pipe);

	/* We can only do regular read/write on fifos */
	stream_open(inode, filp);

	switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
	case FMODE_READ:
	/*
	 *  O_RDONLY
	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
	 *  opened, even when there is no process writing the FIFO.
	 */
		pipe->r_counter++;
		if (pipe->readers++ == 0)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->writers) {
			if ((filp->f_flags & O_NONBLOCK)) {
				/* suppress EPOLLHUP until we have
				 * seen a writer */
				filp->f_version = pipe->w_counter;
			} else {
				if (wait_for_partner(pipe, &pipe->w_counter))
					goto err_rd;
			}
		}
		break;

	case FMODE_WRITE:
	/*
	 *  O_WRONLY
	 *  POSIX.1 says that O_NONBLOCK means return -1 with
	 *  errno=ENXIO when there is no process reading the FIFO.
	 */
		ret = -ENXIO;
		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
			goto err;

		pipe->w_counter++;
		if (!pipe->writers++)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->readers) {
			if (wait_for_partner(pipe, &pipe->r_counter))
				goto err_wr;
		}
		break;

	case FMODE_READ | FMODE_WRITE:
	/*
	 *  O_RDWR
	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
	 *  This implementation will NEVER block on a O_RDWR open, since
	 *  the process can at least talk to itself.
	 */

		pipe->readers++;
		pipe->writers++;
		pipe->r_counter++;
		pipe->w_counter++;
		if (pipe->readers == 1 || pipe->writers == 1)
			wake_up_partner(pipe);
		break;

	default:
		ret = -EINVAL;
		goto err;
	}

	/* Ok! */
	__pipe_unlock(pipe);
	return 0;

err_rd:
	if (!--pipe->readers)
		wake_up_interruptible(&pipe->wr_wait);
	ret = -ERESTARTSYS;
	goto err;

err_wr:
	if (!--pipe->writers)
		wake_up_interruptible_all(&pipe->rd_wait);
	ret = -ERESTARTSYS;
	goto err;

err:
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return ret;
}

const struct file_operations pipefifo_fops = {
	.open		= fifo_open,
	.llseek		= no_llseek,
	.read_iter	= pipe_read,
	.write_iter	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.release	= pipe_release,
	.fasync		= pipe_fasync,
};

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages. Returns 0 on error.
 */
unsigned int round_pipe_size(unsigned long size)
{
	if (size > (1U << 31))
		return 0;

	/* Minimum pipe size, as required by POSIX */
	if (size < PAGE_SIZE)
		return PAGE_SIZE;

	return roundup_pow_of_two(size);
}

/*
 * Resize the pipe ring to a number of slots.
 */
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
	struct pipe_buffer *bufs;
	unsigned int head, tail, mask, n;

	/*
	 * We can shrink the pipe, if arg is greater than the ring occupancy.
	 * Since we don't expect a lot of shrink+grow operations, just free and
	 * allocate again like we would do for growing. If the pipe currently
	 * contains more buffers than arg, then return busy.
	 */
	mask = pipe->ring_size - 1;
	head = pipe->head;
	tail = pipe->tail;
	n = pipe_occupancy(pipe->head, pipe->tail);
	if (nr_slots < n)
		return -EBUSY;

	bufs = kcalloc(nr_slots, sizeof(*bufs),
		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
	if (unlikely(!bufs))
		return -ENOMEM;

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indices.
	 */
	if (n > 0) {
		unsigned int h = head & mask;
		unsigned int t = tail & mask;
		if (h > t) {
			memcpy(bufs, pipe->bufs + t,
			       n * sizeof(struct pipe_buffer));
		} else {
			unsigned int tsize = pipe->ring_size - t;
			if (h > 0)
				memcpy(bufs + tsize, pipe->bufs,
				       h * sizeof(struct pipe_buffer));
			memcpy(bufs, pipe->bufs + t,
			       tsize * sizeof(struct pipe_buffer));
		}
	}

	head = n;
	tail = 0;

	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->ring_size = nr_slots;
	if (pipe->max_usage > nr_slots)
		pipe->max_usage = nr_slots;
	pipe->tail = tail;
	pipe->head = head;

	/* This might have made more room for writers */
	wake_up_interruptible(&pipe->wr_wait);
	return 0;
}

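/*
 * Illustration of the capacity interface implemented below (a minimal
 * userspace sketch, not part of the kernel build): F_SETPIPE_SZ requests a
 * new capacity and returns the actual value, which round_pipe_size() above
 * rounds up to a power-of-two number of pages; F_GETPIPE_SZ reads it back.
 */
#if 0	/* illustrative userspace code only */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];
	long asked = 100000, got;

	if (pipe(fds))
		return 1;
	got = fcntl(fds[1], F_SETPIPE_SZ, asked);	/* rounded up, e.g. 131072 */
	printf("asked for %ld, got %ld\n", asked, got);
	printf("F_GETPIPE_SZ reports %d\n", fcntl(fds[1], F_GETPIPE_SZ));
	return 0;
}
#endif
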
/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or -ERROR on error.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
	unsigned long user_bufs;
	unsigned int nr_slots, size;
	long ret = 0;

#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue)
		return -EBUSY;
#endif

	size = round_pipe_size(arg);
	nr_slots = size >> PAGE_SHIFT;

	if (!nr_slots)
		return -EINVAL;

	/*
	 * If trying to increase the pipe capacity, check that an
	 * unprivileged user is not trying to exceed various limits
	 * (soft limit check here, hard limit check just below).
	 * Decreasing the pipe capacity is always permitted, even
	 * if the user is currently over a limit.
	 */
	if (nr_slots > pipe->max_usage &&
			size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);

	if (nr_slots > pipe->max_usage &&
			(too_many_pipe_buffers_hard(user_bufs) ||
			 too_many_pipe_buffers_soft(user_bufs)) &&
			pipe_is_unprivileged_user()) {
		ret = -EPERM;
		goto out_revert_acct;
	}

	ret = pipe_resize_ring(pipe, nr_slots);
	if (ret < 0)
		goto out_revert_acct;

	pipe->max_usage = nr_slots;
	pipe->nr_accounted = nr_slots;
	return pipe->max_usage * PAGE_SIZE;

out_revert_acct:
	(void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
	return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
{
	struct pipe_inode_info *pipe = file->private_data;

	if (file->f_op != &pipefifo_fops || !pipe)
		return NULL;
#ifdef CONFIG_WATCH_QUEUE
	if (for_splice && pipe->watch_queue)
		return NULL;
#endif
	return pipe;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file, false);
	if (!pipe)
		return -EBADF;

	__pipe_lock(pipe);

	switch (cmd) {
	case F_SETPIPE_SZ:
		ret = pipe_set_size(pipe, arg);
		break;
	case F_GETPIPE_SZ:
		ret = pipe->max_usage * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

	__pipe_unlock(pipe);
	return ret;
}

static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */
1406 */ 1407 1408 static int pipefs_init_fs_context(struct fs_context *fc) 1409 { 1410 struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC); 1411 if (!ctx) 1412 return -ENOMEM; 1413 ctx->ops = &pipefs_ops; 1414 ctx->dops = &pipefs_dentry_operations; 1415 return 0; 1416 } 1417 1418 static struct file_system_type pipe_fs_type = { 1419 .name = "pipefs", 1420 .init_fs_context = pipefs_init_fs_context, 1421 .kill_sb = kill_anon_super, 1422 }; 1423 1424 static int __init init_pipe_fs(void) 1425 { 1426 int err = register_filesystem(&pipe_fs_type); 1427 1428 if (!err) { 1429 pipe_mnt = kern_mount(&pipe_fs_type); 1430 if (IS_ERR(pipe_mnt)) { 1431 err = PTR_ERR(pipe_mnt); 1432 unregister_filesystem(&pipe_fs_type); 1433 } 1434 } 1435 return err; 1436 } 1437 1438 fs_initcall(init_pipe_fs); 1439