1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/fs/pipe.c 4 * 5 * Copyright (C) 1991, 1992, 1999 Linus Torvalds 6 */ 7 8 #include <linux/mm.h> 9 #include <linux/file.h> 10 #include <linux/poll.h> 11 #include <linux/slab.h> 12 #include <linux/module.h> 13 #include <linux/init.h> 14 #include <linux/fs.h> 15 #include <linux/log2.h> 16 #include <linux/mount.h> 17 #include <linux/pseudo_fs.h> 18 #include <linux/magic.h> 19 #include <linux/pipe_fs_i.h> 20 #include <linux/uio.h> 21 #include <linux/highmem.h> 22 #include <linux/pagemap.h> 23 #include <linux/audit.h> 24 #include <linux/syscalls.h> 25 #include <linux/fcntl.h> 26 #include <linux/memcontrol.h> 27 #include <linux/watch_queue.h> 28 #include <linux/sysctl.h> 29 #include <linux/sort.h> 30 31 #include <linux/uaccess.h> 32 #include <asm/ioctls.h> 33 34 #include "internal.h" 35 36 /* 37 * New pipe buffers will be restricted to this size while the user is exceeding 38 * their pipe buffer quota. The general pipe use case needs at least two 39 * buffers: one for data yet to be read, and one for new data. If this is less 40 * than two, then a write to a non-empty pipe may block even if the pipe is not 41 * full. This can occur with GNU make jobserver or similar uses of pipes as 42 * semaphores: multiple processes may be waiting to write tokens back to the 43 * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/. 44 * 45 * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their 46 * own risk, namely: pipe writes to non-full pipes may block until the pipe is 47 * emptied. 48 */ 49 #define PIPE_MIN_DEF_BUFFERS 2 50 51 /* 52 * The max size that a non-root user is allowed to grow the pipe. Can 53 * be set by root in /proc/sys/fs/pipe-max-size 54 */ 55 static unsigned int pipe_max_size = 1048576; 56 57 /* Maximum allocatable pages per user. Hard limit is unset by default, soft 58 * matches default values. 
59 */ 60 static unsigned long pipe_user_pages_hard; 61 static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; 62 63 /* 64 * We use head and tail indices that aren't masked off, except at the point of 65 * dereference, but rather they're allowed to wrap naturally. This means there 66 * isn't a dead spot in the buffer, but the ring has to be a power of two and 67 * <= 2^31. 68 * -- David Howells 2019-09-23. 69 * 70 * Reads with count = 0 should always return 0. 71 * -- Julian Bradfield 1999-06-07. 72 * 73 * FIFOs and Pipes now generate SIGIO for both readers and writers. 74 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16 75 * 76 * pipe_read & write cleanup 77 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 78 */ 79 80 #ifdef CONFIG_PROVE_LOCKING 81 static int pipe_lock_cmp_fn(const struct lockdep_map *a, 82 const struct lockdep_map *b) 83 { 84 return cmp_int((unsigned long) a, (unsigned long) b); 85 } 86 #endif 87 88 void pipe_lock(struct pipe_inode_info *pipe) 89 { 90 if (pipe->files) 91 mutex_lock(&pipe->mutex); 92 } 93 EXPORT_SYMBOL(pipe_lock); 94 95 void pipe_unlock(struct pipe_inode_info *pipe) 96 { 97 if (pipe->files) 98 mutex_unlock(&pipe->mutex); 99 } 100 EXPORT_SYMBOL(pipe_unlock); 101 102 void pipe_double_lock(struct pipe_inode_info *pipe1, 103 struct pipe_inode_info *pipe2) 104 { 105 BUG_ON(pipe1 == pipe2); 106 107 if (pipe1 > pipe2) 108 swap(pipe1, pipe2); 109 110 pipe_lock(pipe1); 111 pipe_lock(pipe2); 112 } 113 114 #define PIPE_PREALLOC_MAX 8 115 116 struct anon_pipe_prealloc { 117 struct page *pages[PIPE_PREALLOC_MAX]; 118 unsigned int count; 119 }; 120 121 /* 122 * Pre-allocate pages outside pipe->mutex for multi-page writes. 123 * alloc_page() with GFP_HIGHUSER can sleep in reclaim and runs memcg 124 * charging; doing it under the mutex stalls a concurrent reader. 
125 * 126 * Loop alloc_page() instead of alloc_pages_bulk_*(): the bulk path refuses 127 * __GFP_ACCOUNT under memcg (see commit 8dcb3060d81d "memcg: page_alloc: 128 * skip bulk allocator for __GFP_ACCOUNT") and silently degrades to a single 129 * page. A per-page loop keeps memcg accounting and the task NUMA mempolicy 130 * honoured for every page; the per-call overhead is small compared to the 131 * pipe->mutex hold-time being shrunk. Any shortfall is covered by the 132 * in-lock alloc_page() fallback in anon_pipe_get_page(). 133 */ 134 static void anon_pipe_get_page_prealloc(struct anon_pipe_prealloc *prealloc, 135 size_t total_len) 136 { 137 unsigned int want, i; 138 struct page *page; 139 140 prealloc->count = 0; 141 if (total_len <= PAGE_SIZE) 142 return; 143 144 want = min_t(unsigned int, DIV_ROUND_UP(total_len, PAGE_SIZE), 145 PIPE_PREALLOC_MAX); 146 147 for (i = 0; i < want; i++) { 148 page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); 149 if (!page) 150 break; 151 prealloc->pages[prealloc->count++] = page; 152 } 153 } 154 155 static struct page *anon_pipe_prealloc_pop(struct anon_pipe_prealloc *prealloc) 156 { 157 if (!prealloc->count) 158 return NULL; 159 160 prealloc->count--; 161 162 return prealloc->pages[prealloc->count]; 163 } 164 165 static struct page *anon_pipe_get_page(struct pipe_inode_info *pipe, 166 struct anon_pipe_prealloc *prealloc) 167 { 168 struct page *page; 169 170 /* Drain prealloc first to keep tmp_page[] hot for later small writes. 
*/ 171 page = anon_pipe_prealloc_pop(prealloc); 172 if (page) 173 return page; 174 175 for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) { 176 if (pipe->tmp_page[i]) { 177 page = pipe->tmp_page[i]; 178 pipe->tmp_page[i] = NULL; 179 return page; 180 } 181 } 182 183 /* FWIW: This is called with pipe->mutex held */ 184 return alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); 185 } 186 187 static void anon_pipe_put_page(struct pipe_inode_info *pipe, 188 struct page *page) 189 { 190 if (page_count(page) == 1) { 191 for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) { 192 if (!pipe->tmp_page[i]) { 193 pipe->tmp_page[i] = page; 194 return; 195 } 196 } 197 } 198 199 put_page(page); 200 } 201 202 /* 203 * Stash leftover prealloc pages in tmp_page[] so the next write to this 204 * pipe gets a hot page without entering the allocator. 205 */ 206 static void anon_pipe_refill_tmp_pages(struct pipe_inode_info *pipe, 207 struct anon_pipe_prealloc *prealloc) 208 { 209 int i, idx; 210 211 if (!prealloc->count) 212 return; 213 214 for (i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) { 215 if (pipe->tmp_page[i]) 216 continue; 217 if (!prealloc->count) 218 return; 219 idx = --prealloc->count; 220 pipe->tmp_page[i] = prealloc->pages[idx]; 221 prealloc->pages[idx] = NULL; 222 } 223 } 224 225 /* Runs after mutex_unlock() to keep put_page() out of the critical section. 
*/ 226 static void anon_pipe_free_pages(struct anon_pipe_prealloc *prealloc) 227 { 228 while (prealloc->count) { 229 prealloc->count--; 230 put_page(prealloc->pages[prealloc->count]); 231 } 232 } 233 234 static void anon_pipe_buf_release(struct pipe_inode_info *pipe, 235 struct pipe_buffer *buf) 236 { 237 struct page *page = buf->page; 238 239 anon_pipe_put_page(pipe, page); 240 } 241 242 static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe, 243 struct pipe_buffer *buf) 244 { 245 struct page *page = buf->page; 246 247 if (page_count(page) != 1) 248 return false; 249 memcg_kmem_uncharge_page(page, 0); 250 __SetPageLocked(page); 251 return true; 252 } 253 254 /** 255 * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer 256 * @pipe: the pipe that the buffer belongs to 257 * @buf: the buffer to attempt to steal 258 * 259 * Description: 260 * This function attempts to steal the &struct page attached to 261 * @buf. If successful, this function returns 0 and returns with 262 * the page locked. The caller may then reuse the page for whatever 263 * he wishes; the typical use is insertion into a different file 264 * page cache. 265 */ 266 bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe, 267 struct pipe_buffer *buf) 268 { 269 struct page *page = buf->page; 270 271 /* 272 * A reference of one is golden, that means that the owner of this 273 * page is the only one holding a reference to it. lock the page 274 * and return OK. 275 */ 276 if (page_count(page) == 1) { 277 lock_page(page); 278 return true; 279 } 280 return false; 281 } 282 EXPORT_SYMBOL(generic_pipe_buf_try_steal); 283 284 /** 285 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer 286 * @pipe: the pipe that the buffer belongs to 287 * @buf: the buffer to get a reference to 288 * 289 * Description: 290 * This function grabs an extra reference to @buf. It's used in 291 * the tee() system call, when we duplicate the buffers in one 292 * pipe into another. 
*/
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	return try_get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	put_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

/* Buffer operations installed on every buffer filled by anon_pipe_write(). */
static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.release	= anon_pipe_buf_release,
	.try_steal	= anon_pipe_buf_try_steal,
	.get		= generic_pipe_buf_get,
};

/*
 * Wait condition for readers: there is data in the ring, or no writers
 * remain.
 *
 * Done while waiting without holding the pipe lock - thus the READ_ONCE()
 */
static inline bool pipe_readable(const struct pipe_inode_info *pipe)
{
	union pipe_index idx = { .head_tail = READ_ONCE(pipe->head_tail) };
	unsigned int writers = READ_ONCE(pipe->writers);

	return !pipe_empty(idx.head, idx.tail) || !writers;
}

/*
 * Release @buf and advance pipe->tail past it.
 *
 * @tail is the caller's copy of the current tail; the incremented value is
 * stored in pipe->tail and returned.
 */
static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
					    struct pipe_buffer *buf,
					    unsigned int tail)
{
	pipe_buf_release(pipe, buf);

	/*
	 * If the pipe has a watch_queue, we need additional protection
	 * by the spinlock because notifications get posted with only
	 * this spinlock, no mutex
	 */
	if (pipe_has_watch_queue(pipe)) {
		spin_lock_irq(&pipe->rd_wait.lock);
#ifdef CONFIG_WATCH_QUEUE
		/* Remember the loss so the next read reports it. */
		if (buf->flags & PIPE_BUF_FLAG_LOSS)
			pipe->note_loss = true;
#endif
		pipe->tail = ++tail;
		spin_unlock_irq(&pipe->rd_wait.lock);
		return tail;
	}

	/*
	 * Without a watch_queue, we can simply increment the tail
	 * without the spinlock - the mutex is enough.
	 */
	pipe->tail = ++tail;
	return tail;
}

/*
 * Read from a pipe into @to.
 *
 * Returns the number of bytes copied, 0 when nothing was read and there are
 * no writers, or a negative errno when nothing was copied: -EAGAIN for a
 * non-blocking read of an empty pipe that still has writers, -ERESTARTSYS
 * when the wait is interrupted, -EFAULT on a failed copy, -ENOBUFS when a
 * PIPE_BUF_FLAG_WHOLE buffer or a loss notification does not fit, or the
 * error from pipe_buf_confirm().  Once some bytes have been copied, a later
 * error ends the read but the byte count is what gets returned.
 */
static ssize_t
anon_pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	size_t total_len = iov_iter_count(to);
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	bool wake_writer = false, wake_next_reader = false;
	ssize_t ret;

	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	ret = 0;
	mutex_lock(&pipe->mutex);

	/*
	 * We only wake up writers if the pipe was full when we started reading
	 * and it is no longer full after reading to avoid unnecessary wakeups.
	 *
	 * But when we do wake up writers, we do so using a sync wakeup
	 * (WF_SYNC), because we want them to get going and generate more
	 * data for us.
	 */
	for (;;) {
		/* Read ->head with a barrier vs post_one_notification() */
		unsigned int head = smp_load_acquire(&pipe->head);
		unsigned int tail = pipe->tail;

#ifdef CONFIG_WATCH_QUEUE
		/*
		 * A notification was dropped earlier: hand the reader a
		 * synthetic loss record before any further data.
		 */
		if (pipe->note_loss) {
			struct watch_notification n;

			if (total_len < 8) {
				if (ret == 0)
					ret = -ENOBUFS;
				break;
			}

			n.type = WATCH_TYPE_META;
			n.subtype = WATCH_META_LOSS_NOTIFICATION;
			n.info = watch_sizeof(n);
			if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
				if (ret == 0)
					ret = -EFAULT;
				break;
			}
			ret += sizeof(n);
			total_len -= sizeof(n);
			pipe->note_loss = false;
		}
#endif

		if (!pipe_empty(head, tail)) {
			struct pipe_buffer *buf = pipe_buf(pipe, tail);
			size_t chars = buf->len;
			size_t written;
			int error;

			if (chars > total_len) {
				/* WHOLE buffers are never split across reads. */
				if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
					if (ret == 0)
						ret = -ENOBUFS;
					break;
				}
				chars = total_len;
			}

			error = pipe_buf_confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
			if (unlikely(written < chars)) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;

			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				/*
				 * Discard the unread rest of the packet and
				 * make the "total_len -= chars" below hit
				 * zero so the loop terminates.
				 */
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len) {
				/* Sample fullness before the slot is freed. */
				wake_writer |= pipe_full(head, tail, pipe->max_usage);
				tail = pipe_update_tail(pipe, buf, tail);
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
			if (!pipe_empty(head, tail))	/* More to do? */
				continue;
		}

		/* Nothing (more) to read: EOF, partial result, or wait. */
		if (!pipe->writers)
			break;
		if (ret)
			break;
		if ((filp->f_flags & O_NONBLOCK) ||
		    (iocb->ki_flags & IOCB_NOWAIT)) {
			ret = -EAGAIN;
			break;
		}
		mutex_unlock(&pipe->mutex);
		/*
		 * We only get here if we didn't actually read anything.
		 *
		 * But because we didn't read anything, at this point we can
		 * just return directly with -ERESTARTSYS if we're interrupted,
		 * since we've done any required wakeups and there's no need
		 * to mark anything accessed. And we've dropped the lock.
		 */
		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
			return -ERESTARTSYS;

		/*
		 * We slept as an exclusive waiter; remember to pass the
		 * wakeup on if data is still left when we are done.
		 */
		wake_next_reader = true;
		mutex_lock(&pipe->mutex);
	}
	if (pipe_is_empty(pipe))
		wake_next_reader = false;
	mutex_unlock(&pipe->mutex);

	if (wake_writer)
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
	if (wake_next_reader)
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	return ret;
}

/* anon_pipe_read() plus file_accessed() whenever some data was read. */
static ssize_t
fifo_pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	int ret = anon_pipe_read(iocb, to);
	if (ret > 0)
		file_accessed(iocb->ki_filp);
	return ret;
}

/* Non-zero when O_DIRECT is set on @file; writes then create packet buffers. */
static inline int is_packetized(struct file *file)
{
	return (file->f_flags & O_DIRECT) != 0;
}

/*
 * Wait condition for writers: the ring is not full, or no readers remain.
 *
 * Done while waiting without holding the pipe lock - thus the READ_ONCE()
 */
static inline bool pipe_writable(const struct pipe_inode_info *pipe)
{
	union pipe_index idx = { .head_tail = READ_ONCE(pipe->head_tail) };
	unsigned int max_usage = READ_ONCE(pipe->max_usage);

	return !pipe_full(idx.head, idx.tail, max_usage) ||
		!READ_ONCE(pipe->readers);
}

/*
 * Write the contents of @from into a pipe.
 *
 * Returns the number of bytes queued, or a negative errno: -EXDEV for a
 * watch-queue pipe, -EPIPE (raising SIGPIPE unless IOCB_NOSIGNAL) when there
 * are no readers, -EAGAIN for a non-blocking write to a full pipe,
 * -ERESTARTSYS on a pending signal, -ENOMEM when no page can be obtained,
 * -EFAULT on a failed copy, or the error from pipe_buf_confirm().  Inside
 * the main loop an error only replaces the return value if nothing has been
 * written yet.
 *
 * Pages for multi-page writes are pre-allocated before pipe->mutex is taken;
 * leftovers are parked in pipe->tmp_page[] or freed after the mutex is
 * dropped.
 */
static ssize_t
anon_pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	struct anon_pipe_prealloc prealloc;
	unsigned int head;
	ssize_t ret = 0;
	size_t total_len = iov_iter_count(from);
	ssize_t chars;
	bool was_empty = false;
	bool wake_next_writer = false;

	/*
	 * Reject writing to watch queue pipes before the point where we lock
	 * the pipe.
	 * Otherwise, lockdep would be unhappy if the caller already has another
	 * pipe locked.
	 * If we had to support locking a normal pipe and a notification pipe at
	 * the same time, we could set up lockdep annotations for that, but
	 * since we don't actually need that, it's simpler to just bail here.
	 */
	if (pipe_has_watch_queue(pipe))
		return -EXDEV;

	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	/* Must stay ahead of mutex_lock(): this may sleep in the allocator. */
	anon_pipe_get_page_prealloc(&prealloc, total_len);

	mutex_lock(&pipe->mutex);

	if (!pipe->readers) {
		if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0)
			send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/*
	 * If it wasn't empty we try to merge new data into
	 * the last buffer.
	 *
	 * That naturally merges small writes, but it also
	 * page-aligns the rest of the writes for large writes
	 * spanning multiple pages.
	 */
	head = pipe->head;
	was_empty = pipe_empty(head, pipe->tail);
	/* Only the sub-page remainder of the write is a merge candidate. */
	chars = total_len & (PAGE_SIZE-1);
	if (chars && !was_empty) {
		struct pipe_buffer *buf = pipe_buf(pipe, head - 1);
		int offset = buf->offset + buf->len;

		if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
		    offset + chars <= PAGE_SIZE) {
			ret = pipe_buf_confirm(pipe, buf);
			if (ret)
				goto out;

			ret = copy_page_from_iter(buf->page, offset, chars, from);
			if (unlikely(ret < chars)) {
				ret = -EFAULT;
				goto out;
			}

			buf->len += ret;
			if (!iov_iter_count(from))
				goto out;
		}
	}

	for (;;) {
		if (!pipe->readers) {
			if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0)
				send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		head = pipe->head;
		if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
			struct pipe_buffer *buf;
			struct page *page;
			int copied;

			page = anon_pipe_get_page(pipe, &prealloc);
			if (unlikely(!page)) {
				if (!ret)
					ret = -ENOMEM;
				break;
			}

			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
			/* A short copy with data still pending means a fault. */
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				anon_pipe_put_page(pipe, page);
				if (!ret)
					ret = -EFAULT;
				break;
			}

			pipe->head = head + 1;
			/* Insert it into the buffer array */
			buf = pipe_buf(pipe, head);
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			if (is_packetized(filp))
				buf->flags = PIPE_BUF_FLAG_PACKET;
			else
				buf->flags = PIPE_BUF_FLAG_CAN_MERGE;

			buf->len = copied;
			ret += copied;

			if (!iov_iter_count(from))
				break;

			continue;
		}

		/* Wait for buffer space to become available. */
		if ((filp->f_flags & O_NONBLOCK) ||
		    (iocb->ki_flags & IOCB_NOWAIT)) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/*
		 * We're going to release the pipe lock and wait for more
		 * space. We wake up any readers if necessary, and then
		 * after waiting we need to re-check whether the pipe
		 * became empty while we dropped the lock.
		 */
		mutex_unlock(&pipe->mutex);
		if (was_empty)
			wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
		mutex_lock(&pipe->mutex);
		was_empty = pipe_is_empty(pipe);
		wake_next_writer = true;
	}
out:
	/* Park unused prealloc pages in tmp_page[] while still locked ... */
	anon_pipe_refill_tmp_pages(pipe, &prealloc);
	if (pipe_is_full(pipe))
		wake_next_writer = false;
	mutex_unlock(&pipe->mutex);
	/* ... and free whatever did not fit once the lock is dropped. */
	anon_pipe_free_pages(&prealloc);

	/*
	 * If we do do a wakeup event, we do a 'sync' wakeup, because we
	 * want the reader to start processing things asap, rather than
	 * leave the data pending.
	 *
	 * This is particularly important for small writes, because of
	 * how (for example) the GNU make jobserver uses small writes to
	 * wake up pending jobs
	 *
	 * Epoll nonsensically wants a wakeup whether the pipe
	 * was already empty or not.
	 */
	if (was_empty || pipe->poll_usage)
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	if (wake_next_writer)
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
	return ret;
}

/*
 * anon_pipe_write() plus a timestamp update after a successful write.  The
 * update is skipped when sb_start_write_trylock() fails; if
 * file_update_time() itself fails, its error replaces the byte count.
 */
static ssize_t
fifo_pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	int ret = anon_pipe_write(iocb, from);
	if (ret > 0) {
		struct file *filp = iocb->ki_filp;
		if (sb_start_write_trylock(file_inode(filp)->i_sb)) {
			int err = file_update_time(filp);
			if (err)
				ret = err;
			sb_end_write(file_inode(filp)->i_sb);
		}
	}
	return ret;
}

/*
 * ioctl handler.  FIONREAD stores the number of buffered bytes at @arg; the
 * two IOC_WATCH_QUEUE_* commands are forwarded to the watch-queue code when
 * CONFIG_WATCH_QUEUE is enabled.  Anything else returns -ENOIOCTLCMD.
 */
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int count, head, tail;

	switch (cmd) {
	case FIONREAD:
		mutex_lock(&pipe->mutex);
		count = 0;
		head = pipe->head;
		tail = pipe->tail;

		/* Sum the lengths of all occupied slots, tail to head. */
		while (!pipe_empty(head, tail)) {
			count += pipe_buf(pipe, tail)->len;
			tail++;
		}
		mutex_unlock(&pipe->mutex);

		return put_user(count, (int __user *)arg);

#ifdef CONFIG_WATCH_QUEUE
	case IOC_WATCH_QUEUE_SET_SIZE: {
		int ret;
		mutex_lock(&pipe->mutex);
		ret = watch_queue_set_size(pipe, arg);
		mutex_unlock(&pipe->mutex);
		return ret;
	}

	case IOC_WATCH_QUEUE_SET_FILTER:
		return watch_queue_set_filter(
			pipe, (struct watch_notification_filter __user *)arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
}

/*
 * poll/select/epoll handler.  Returns the mask of events that are ready for
 * the directions @filp was opened in.
 *
 * No kernel lock held - fine
 */
static __poll_t
pipe_poll(struct file *filp, poll_table *wait)
{
	__poll_t mask;
	struct pipe_inode_info *pipe = filp->private_data;
	union pipe_index idx;

	/* Epoll has some historical nasty semantics, this enables them */
	if (unlikely(!READ_ONCE(pipe->poll_usage)))
		WRITE_ONCE(pipe->poll_usage, true);

	/*
	 * Reading pipe state only -- no need to take pipe->mutex.
	 *
	 * But because this is racy, the code has to add the
	 * entry to the poll table _first_ ..
	 */
	if (filp->f_mode & FMODE_READ)
		poll_wait(filp, &pipe->rd_wait, wait);
	if (filp->f_mode & FMODE_WRITE)
		poll_wait(filp, &pipe->wr_wait, wait);

	/*
	 * .. and only then can you do the racy tests. That way,
	 * if something changes and you got it wrong, the poll
	 * table entry will wake you up and fix it.
	 */
	idx.head_tail = READ_ONCE(pipe->head_tail);

	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		if (!pipe_empty(idx.head, idx.tail))
			mask |= EPOLLIN | EPOLLRDNORM;
		/*
		 * filp->f_pipe equals w_counter while a non-blocking FIFO
		 * reader has not yet seen a writer; fifo_open() sets it to
		 * suppress EPOLLHUP in that window.
		 */
		if (!pipe->writers && filp->f_pipe != pipe->w_counter)
			mask |= EPOLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		if (!pipe_full(idx.head, idx.tail, pipe->max_usage))
			mask |= EPOLLOUT | EPOLLWRNORM;
		/*
		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= EPOLLERR;
	}

	return mask;
}

/*
 * Drop one reference from pipe->files under inode->i_lock.  When the count
 * reaches zero the pipe is detached from the inode and freed.
 */
static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
{
	int kill = 0;

	spin_lock(&inode->i_lock);
	if (!--pipe->files) {
		inode->i_pipe = NULL;
		kill = 1;
	}
	spin_unlock(&inode->i_lock);

	/* Free outside the spinlock. */
	if (kill)
		free_pipe_info(pipe);
}

/*
 * ->release: drop this file's reader/writer counts, wake everybody if exactly
 * one side has now gone away, then drop the file's reference on the pipe.
 */
static int
pipe_release(struct inode *inode, struct file *file)
{
	struct pipe_inode_info *pipe = file->private_data;

	mutex_lock(&pipe->mutex);
	if (file->f_mode & FMODE_READ)
		pipe->readers--;
	if (file->f_mode & FMODE_WRITE)
		pipe->writers--;

	/* Was that the last reader or writer, but not the other side? */
	if (!pipe->readers != !pipe->writers) {
		wake_up_interruptible_all(&pipe->rd_wait);
		wake_up_interruptible_all(&pipe->wr_wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	mutex_unlock(&pipe->mutex);

	put_pipe_info(inode, pipe);
	return 0;
}

/*
 * ->fasync: register or unregister @filp on the reader and/or writer SIGIO
 * lists according to its open mode.  If the writer-side registration fails
 * after the reader side succeeded, the reader side is undone.
 */
static int
pipe_fasync(int fd, struct file *filp, int on)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int retval = 0;

	mutex_lock(&pipe->mutex);
	if (filp->f_mode & FMODE_READ)
		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0 && (filp->f_mode & FMODE_READ))
			/* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	mutex_unlock(&pipe->mutex);
	return retval;
}

/*
 * Adjust @user's pipe buffer count by (@new - @old) and return the updated
 * total.
 */
unsigned long account_pipe_buffers(struct user_struct *user,
				   unsigned long old, unsigned long new)
{
	return atomic_long_add_return(new - old, &user->pipe_bufs);
}

/* True when a soft limit is set (non-zero) and @user_bufs exceeds it. */
bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
	unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);

	return soft_limit && user_bufs > soft_limit;
}

/* True when a hard limit is set (non-zero) and @user_bufs exceeds it. */
bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
	unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);

	return hard_limit && user_bufs > hard_limit;
}

/* True when the caller has neither CAP_SYS_RESOURCE nor CAP_SYS_ADMIN. */
bool pipe_is_unprivileged_user(void)
{
	return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
}

/*
 * Allocate and initialise a pipe for the current user.
 *
 * The ring starts at PIPE_DEF_BUFFERS slots, is clamped to pipe_max_size for
 * callers without CAP_SYS_RESOURCE, and shrinks to PIPE_MIN_DEF_BUFFERS when
 * an unprivileged user is over the soft limit.  An unprivileged user over the
 * hard limit gets no pipe.  Returns NULL on any failure, with the accounting
 * and the user reference reverted.
 */
struct pipe_inode_info *alloc_pipe_info(void)
{
	struct pipe_inode_info *pipe;
	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
	struct user_struct *user = get_current_user();
	unsigned long user_bufs;
	unsigned int max_size = READ_ONCE(pipe_max_size);

	pipe = kzalloc_obj(struct pipe_inode_info, GFP_KERNEL_ACCOUNT);
	if (pipe == NULL)
		goto out_free_uid;

	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
		pipe_bufs = max_size >> PAGE_SHIFT;

	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);

	if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
		/* Re-account at the reduced size before shrinking pipe_bufs. */
		user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
		pipe_bufs = PIPE_MIN_DEF_BUFFERS;
	}

	if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
		goto out_revert_acct;

	pipe->bufs = kzalloc_objs(struct pipe_buffer, pipe_bufs,
				  GFP_KERNEL_ACCOUNT);

	if (pipe->bufs) {
		init_waitqueue_head(&pipe->rd_wait);
		init_waitqueue_head(&pipe->wr_wait);
		pipe->r_counter = pipe->w_counter = 1;
		pipe->max_usage = pipe_bufs;
		pipe->ring_size = pipe_bufs;
		pipe->nr_accounted = pipe_bufs;
		pipe->user = user;
		mutex_init(&pipe->mutex);
		lock_set_cmp_fn(&pipe->mutex, pipe_lock_cmp_fn, NULL);
		return pipe;
	}

	/* Ring allocation failed: fall through and undo the accounting. */
out_revert_acct:
	(void) account_pipe_buffers(user, pipe_bufs, 0);
	kfree(pipe);
out_free_uid:
	free_uid(user);
	return NULL;
}

/*
 * Tear down a pipe: clear any watch queue, undo the per-user accounting,
 * release every buffer that still has ops attached, free the cached
 * tmp_page[] pages, then free the ring and the pipe itself.
 */
void free_pipe_info(struct pipe_inode_info *pipe)
{
	unsigned int i;

#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue)
		watch_queue_clear(pipe->watch_queue);
#endif

	(void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
	free_uid(pipe->user);
	for (i = 0; i < pipe->ring_size; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}
#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue)
		put_watch_queue(pipe->watch_queue);
#endif
	for (i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) {
		if (pipe->tmp_page[i])
			__free_page(pipe->tmp_page[i]);
	}
	kfree(pipe->bufs);
	kfree(pipe);
}

static struct vfsmount *pipe_mnt __ro_after_init;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(buffer, buflen, "pipe:[%llu]",
				d_inode(dentry)->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};

/* Forward declaration; used below before the definition is visible. */
static const struct file_operations pipeanon_fops;

/*
 * Create a pipefs inode with a freshly allocated pipe attached.  The pipe
 * starts with one reader, one writer and a ->files count of 2, for the two
 * files create_pipe_files() opens on it.  Returns NULL on failure.
 */
static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info();
	if (!pipe)
		goto fail_iput;

	inode->i_pipe = pipe;
	pipe->files = 2;
	pipe->readers = pipe->writers = 1;
	inode->i_fop = &pipeanon_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode_state_assign_raw(inode, I_DIRTY);
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	simple_inode_init_ts(inode);

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

/*
 * Create the two files of an anonymous pipe: res[0] is the read end and
 * res[1] the write end.  @flags may carry O_NONBLOCK, O_DIRECT (applied to
 * the write end only) and O_NOTIFICATION_PIPE.  Returns 0 or a negative
 * errno; on failure nothing is left allocated.
 */
int create_pipe_files(struct file **res, int flags)
{
	struct inode *inode = get_pipe_inode();
	struct file *f;
	int error;

	if (!inode)
		return -ENFILE;

	if (flags & O_NOTIFICATION_PIPE) {
		error = watch_queue_init(inode->i_pipe);
		if (error) {
			free_pipe_info(inode->i_pipe);
			iput(inode);
			return error;
		}
	}

	f = alloc_file_pseudo(inode, pipe_mnt, "",
				O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
				&pipeanon_fops);
	if (IS_ERR(f)) {
		free_pipe_info(inode->i_pipe);
		iput(inode);
		return PTR_ERR(f);
	}

	f->private_data = inode->i_pipe;
	f->f_pipe = 0;

	res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
				  &pipeanon_fops);
	if (IS_ERR(res[0])) {
		/* Drop the read end's share of ->files, then the write file. */
		put_pipe_info(inode, inode->i_pipe);
		fput(f);
		return PTR_ERR(res[0]);
	}
	res[0]->private_data = inode->i_pipe;
	res[0]->f_pipe = 0;
	res[1] = f;
	stream_open(inode, res[0]);
	stream_open(inode, res[1]);

	/* pipe groks IOCB_NOWAIT */
	res[0]->f_mode |= FMODE_NOWAIT;
	res[1]->f_mode |= FMODE_NOWAIT;

	/*
	 * Disable permission and pre-content events, but enable legacy
	 * inotify events for legacy users.
	 */
	file_set_fsnotify_mode(res[0], FMODE_NONOTIFY_PERM);
	file_set_fsnotify_mode(res[1], FMODE_NONOTIFY_PERM);
	return 0;
}

/*
 * Validate @flags, create both pipe files and reserve two descriptors.
 * On success fd[0]/files[0] are the read end and fd[1]/files[1] the write
 * end; the descriptors are reserved but NOT yet installed.  On failure
 * everything acquired here is released and a negative errno is returned.
 */
static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
		return -EINVAL;

	error = create_pipe_files(files, flags);
	if (error)
		return error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd[0] = fdr;
	fd[1] = fdw;
	return 0;

 err_fdr:
	put_unused_fd(fdr);
 err_read_pipe:
	fput(files[0]);
	fput(files[1]);
	return error;
}

/*
 * In-kernel pipe creation: like __do_pipe_flags(), but also installs the two
 * files into the reserved descriptors on success.
 */
int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		fd_install(fd[0], files[0]);
		fd_install(fd[1], files[1]);
	}
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
1129 */ 1130 static int do_pipe2(int __user *fildes, int flags) 1131 { 1132 struct file *files[2]; 1133 int fd[2]; 1134 int error; 1135 1136 error = __do_pipe_flags(fd, files, flags); 1137 if (!error) { 1138 if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) { 1139 fput(files[0]); 1140 fput(files[1]); 1141 put_unused_fd(fd[0]); 1142 put_unused_fd(fd[1]); 1143 error = -EFAULT; 1144 } else { 1145 fd_install(fd[0], files[0]); 1146 fd_install(fd[1], files[1]); 1147 } 1148 } 1149 return error; 1150 } 1151 1152 SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) 1153 { 1154 return do_pipe2(fildes, flags); 1155 } 1156 1157 SYSCALL_DEFINE1(pipe, int __user *, fildes) 1158 { 1159 return do_pipe2(fildes, 0); 1160 } 1161 1162 /* 1163 * This is the stupid "wait for pipe to be readable or writable" 1164 * model. 1165 * 1166 * See pipe_read/write() for the proper kind of exclusive wait, 1167 * but that requires that we wake up any other readers/writers 1168 * if we then do not end up reading everything (ie the whole 1169 * "wake_next_reader/writer" logic in pipe_read/write()). 1170 */ 1171 void pipe_wait_readable(struct pipe_inode_info *pipe) 1172 { 1173 pipe_unlock(pipe); 1174 wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe)); 1175 pipe_lock(pipe); 1176 } 1177 1178 void pipe_wait_writable(struct pipe_inode_info *pipe) 1179 { 1180 pipe_unlock(pipe); 1181 wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe)); 1182 pipe_lock(pipe); 1183 } 1184 1185 /* 1186 * This depends on both the wait (here) and the wakeup (wake_up_partner) 1187 * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot 1188 * race with the count check and waitqueue prep. 1189 * 1190 * Normally in order to avoid races, you'd do the prepare_to_wait() first, 1191 * then check the condition you're waiting for, and only then sleep. But 1192 * because of the pipe lock, we can check the condition before being on 1193 * the wait queue. 
1194 * 1195 * We use the 'rd_wait' waitqueue for pipe partner waiting. 1196 */ 1197 static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) 1198 { 1199 DEFINE_WAIT(rdwait); 1200 int cur = *cnt; 1201 1202 while (cur == *cnt) { 1203 prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE); 1204 pipe_unlock(pipe); 1205 schedule(); 1206 finish_wait(&pipe->rd_wait, &rdwait); 1207 pipe_lock(pipe); 1208 if (signal_pending(current)) 1209 break; 1210 } 1211 return cur == *cnt ? -ERESTARTSYS : 0; 1212 } 1213 1214 static void wake_up_partner(struct pipe_inode_info *pipe) 1215 { 1216 wake_up_interruptible_all(&pipe->rd_wait); 1217 } 1218 1219 static int fifo_open(struct inode *inode, struct file *filp) 1220 { 1221 bool is_pipe = inode->i_fop == &pipeanon_fops; 1222 struct pipe_inode_info *pipe; 1223 int ret; 1224 1225 filp->f_pipe = 0; 1226 1227 spin_lock(&inode->i_lock); 1228 if (inode->i_pipe) { 1229 pipe = inode->i_pipe; 1230 pipe->files++; 1231 spin_unlock(&inode->i_lock); 1232 } else { 1233 spin_unlock(&inode->i_lock); 1234 pipe = alloc_pipe_info(); 1235 if (!pipe) 1236 return -ENOMEM; 1237 pipe->files = 1; 1238 spin_lock(&inode->i_lock); 1239 if (unlikely(inode->i_pipe)) { 1240 inode->i_pipe->files++; 1241 spin_unlock(&inode->i_lock); 1242 free_pipe_info(pipe); 1243 pipe = inode->i_pipe; 1244 } else { 1245 inode->i_pipe = pipe; 1246 spin_unlock(&inode->i_lock); 1247 } 1248 } 1249 filp->private_data = pipe; 1250 /* OK, we have a pipe and it's pinned down */ 1251 1252 mutex_lock(&pipe->mutex); 1253 1254 /* We can only do regular read/write on fifos */ 1255 stream_open(inode, filp); 1256 1257 switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) { 1258 case FMODE_READ: 1259 /* 1260 * O_RDONLY 1261 * POSIX.1 says that O_NONBLOCK means return with the FIFO 1262 * opened, even when there is no process writing the FIFO. 
1263 */ 1264 pipe->r_counter++; 1265 if (pipe->readers++ == 0) 1266 wake_up_partner(pipe); 1267 1268 if (!is_pipe && !pipe->writers) { 1269 if ((filp->f_flags & O_NONBLOCK)) { 1270 /* suppress EPOLLHUP until we have 1271 * seen a writer */ 1272 filp->f_pipe = pipe->w_counter; 1273 } else { 1274 if (wait_for_partner(pipe, &pipe->w_counter)) 1275 goto err_rd; 1276 } 1277 } 1278 break; 1279 1280 case FMODE_WRITE: 1281 /* 1282 * O_WRONLY 1283 * POSIX.1 says that O_NONBLOCK means return -1 with 1284 * errno=ENXIO when there is no process reading the FIFO. 1285 */ 1286 ret = -ENXIO; 1287 if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers) 1288 goto err; 1289 1290 pipe->w_counter++; 1291 if (!pipe->writers++) 1292 wake_up_partner(pipe); 1293 1294 if (!is_pipe && !pipe->readers) { 1295 if (wait_for_partner(pipe, &pipe->r_counter)) 1296 goto err_wr; 1297 } 1298 break; 1299 1300 case FMODE_READ | FMODE_WRITE: 1301 /* 1302 * O_RDWR 1303 * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set. 1304 * This implementation will NEVER block on a O_RDWR open, since 1305 * the process can at least talk to itself. 1306 */ 1307 1308 pipe->readers++; 1309 pipe->writers++; 1310 pipe->r_counter++; 1311 pipe->w_counter++; 1312 if (pipe->readers == 1 || pipe->writers == 1) 1313 wake_up_partner(pipe); 1314 break; 1315 1316 default: 1317 ret = -EINVAL; 1318 goto err; 1319 } 1320 1321 /* Ok! 
*/ 1322 mutex_unlock(&pipe->mutex); 1323 return 0; 1324 1325 err_rd: 1326 if (!--pipe->readers) 1327 wake_up_interruptible(&pipe->wr_wait); 1328 ret = -ERESTARTSYS; 1329 goto err; 1330 1331 err_wr: 1332 if (!--pipe->writers) 1333 wake_up_interruptible_all(&pipe->rd_wait); 1334 ret = -ERESTARTSYS; 1335 goto err; 1336 1337 err: 1338 mutex_unlock(&pipe->mutex); 1339 1340 put_pipe_info(inode, pipe); 1341 return ret; 1342 } 1343 1344 const struct file_operations pipefifo_fops = { 1345 .open = fifo_open, 1346 .read_iter = fifo_pipe_read, 1347 .write_iter = fifo_pipe_write, 1348 .poll = pipe_poll, 1349 .unlocked_ioctl = pipe_ioctl, 1350 .release = pipe_release, 1351 .fasync = pipe_fasync, 1352 .splice_write = iter_file_splice_write, 1353 }; 1354 1355 static const struct file_operations pipeanon_fops = { 1356 .open = fifo_open, 1357 .read_iter = anon_pipe_read, 1358 .write_iter = anon_pipe_write, 1359 .poll = pipe_poll, 1360 .unlocked_ioctl = pipe_ioctl, 1361 .release = pipe_release, 1362 .fasync = pipe_fasync, 1363 .splice_write = iter_file_splice_write, 1364 }; 1365 1366 /* 1367 * Currently we rely on the pipe array holding a power-of-2 number 1368 * of pages. Returns 0 on error. 1369 */ 1370 unsigned int round_pipe_size(unsigned int size) 1371 { 1372 if (size > (1U << 31)) 1373 return 0; 1374 1375 /* Minimum pipe size, as required by POSIX */ 1376 if (size < PAGE_SIZE) 1377 return PAGE_SIZE; 1378 1379 return roundup_pow_of_two(size); 1380 } 1381 1382 /* 1383 * Resize the pipe ring to a number of slots. 1384 * 1385 * Note the pipe can be reduced in capacity, but only if the current 1386 * occupancy doesn't exceed nr_slots; if it does, EBUSY will be 1387 * returned instead. 
1388 */ 1389 int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) 1390 { 1391 struct pipe_buffer *bufs; 1392 unsigned int head, tail, mask, n; 1393 1394 /* nr_slots larger than limits of pipe->{head,tail} */ 1395 if (unlikely(nr_slots > (pipe_index_t)-1u)) 1396 return -EINVAL; 1397 1398 bufs = kzalloc_objs(*bufs, nr_slots, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); 1399 if (unlikely(!bufs)) 1400 return -ENOMEM; 1401 1402 spin_lock_irq(&pipe->rd_wait.lock); 1403 mask = pipe->ring_size - 1; 1404 head = pipe->head; 1405 tail = pipe->tail; 1406 1407 n = pipe_occupancy(head, tail); 1408 if (nr_slots < n) { 1409 spin_unlock_irq(&pipe->rd_wait.lock); 1410 kfree(bufs); 1411 return -EBUSY; 1412 } 1413 1414 /* 1415 * The pipe array wraps around, so just start the new one at zero 1416 * and adjust the indices. 1417 */ 1418 if (n > 0) { 1419 unsigned int h = head & mask; 1420 unsigned int t = tail & mask; 1421 if (h > t) { 1422 memcpy(bufs, pipe->bufs + t, 1423 n * sizeof(struct pipe_buffer)); 1424 } else { 1425 unsigned int tsize = pipe->ring_size - t; 1426 if (h > 0) 1427 memcpy(bufs + tsize, pipe->bufs, 1428 h * sizeof(struct pipe_buffer)); 1429 memcpy(bufs, pipe->bufs + t, 1430 tsize * sizeof(struct pipe_buffer)); 1431 } 1432 } 1433 1434 head = n; 1435 tail = 0; 1436 1437 kfree(pipe->bufs); 1438 pipe->bufs = bufs; 1439 pipe->ring_size = nr_slots; 1440 if (pipe->max_usage > nr_slots) 1441 pipe->max_usage = nr_slots; 1442 pipe->tail = tail; 1443 pipe->head = head; 1444 1445 if (!pipe_has_watch_queue(pipe)) { 1446 pipe->max_usage = nr_slots; 1447 pipe->nr_accounted = nr_slots; 1448 } 1449 1450 spin_unlock_irq(&pipe->rd_wait.lock); 1451 1452 /* This might have made more room for writers */ 1453 wake_up_interruptible(&pipe->wr_wait); 1454 return 0; 1455 } 1456 1457 /* 1458 * Allocate a new array of pipe buffers and copy the info over. Returns the 1459 * pipe size if successful, or return -ERROR on error. 
1460 */ 1461 static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg) 1462 { 1463 unsigned long user_bufs; 1464 unsigned int nr_slots, size; 1465 long ret = 0; 1466 1467 if (pipe_has_watch_queue(pipe)) 1468 return -EBUSY; 1469 1470 size = round_pipe_size(arg); 1471 nr_slots = size >> PAGE_SHIFT; 1472 1473 if (!nr_slots) 1474 return -EINVAL; 1475 1476 /* 1477 * If trying to increase the pipe capacity, check that an 1478 * unprivileged user is not trying to exceed various limits 1479 * (soft limit check here, hard limit check just below). 1480 * Decreasing the pipe capacity is always permitted, even 1481 * if the user is currently over a limit. 1482 */ 1483 if (nr_slots > pipe->max_usage && 1484 size > pipe_max_size && !capable(CAP_SYS_RESOURCE)) 1485 return -EPERM; 1486 1487 user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots); 1488 1489 if (nr_slots > pipe->max_usage && 1490 (too_many_pipe_buffers_hard(user_bufs) || 1491 too_many_pipe_buffers_soft(user_bufs)) && 1492 pipe_is_unprivileged_user()) { 1493 ret = -EPERM; 1494 goto out_revert_acct; 1495 } 1496 1497 ret = pipe_resize_ring(pipe, nr_slots); 1498 if (ret < 0) 1499 goto out_revert_acct; 1500 1501 return pipe->max_usage * PAGE_SIZE; 1502 1503 out_revert_acct: 1504 (void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted); 1505 return ret; 1506 } 1507 1508 /* 1509 * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is 1510 * not enough to verify that this is a pipe. 
1511 */ 1512 struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) 1513 { 1514 struct pipe_inode_info *pipe = file->private_data; 1515 1516 if (!pipe) 1517 return NULL; 1518 if (file->f_op != &pipefifo_fops && file->f_op != &pipeanon_fops) 1519 return NULL; 1520 if (for_splice && pipe_has_watch_queue(pipe)) 1521 return NULL; 1522 return pipe; 1523 } 1524 1525 long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg) 1526 { 1527 struct pipe_inode_info *pipe; 1528 long ret; 1529 1530 pipe = get_pipe_info(file, false); 1531 if (!pipe) 1532 return -EBADF; 1533 1534 mutex_lock(&pipe->mutex); 1535 1536 switch (cmd) { 1537 case F_SETPIPE_SZ: 1538 ret = pipe_set_size(pipe, arg); 1539 break; 1540 case F_GETPIPE_SZ: 1541 ret = pipe->max_usage * PAGE_SIZE; 1542 break; 1543 default: 1544 ret = -EINVAL; 1545 break; 1546 } 1547 1548 mutex_unlock(&pipe->mutex); 1549 return ret; 1550 } 1551 1552 static const struct super_operations pipefs_ops = { 1553 .destroy_inode = free_inode_nonrcu, 1554 .statfs = simple_statfs, 1555 }; 1556 1557 /* 1558 * pipefs should _never_ be mounted by userland - too much of security hassle, 1559 * no real gain from having the whole file system mounted. So we don't need 1560 * any operations on the root directory. However, we need a non-trivial 1561 * d_name - pipe: will go nicely and kill the special-casing in procfs. 
1562 */ 1563 1564 static int pipefs_init_fs_context(struct fs_context *fc) 1565 { 1566 struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC); 1567 if (!ctx) 1568 return -ENOMEM; 1569 ctx->ops = &pipefs_ops; 1570 ctx->dops = &pipefs_dentry_operations; 1571 return 0; 1572 } 1573 1574 static struct file_system_type pipe_fs_type = { 1575 .name = "pipefs", 1576 .init_fs_context = pipefs_init_fs_context, 1577 .kill_sb = kill_anon_super, 1578 }; 1579 1580 #ifdef CONFIG_SYSCTL 1581 1582 static ulong round_pipe_size_ul(ulong size) 1583 { 1584 return round_pipe_size(size); 1585 } 1586 1587 static int u2k_pipe_maxsz(const ulong *u_ptr, uint *k_ptr) 1588 { 1589 return proc_uint_u2k_conv_uop(u_ptr, k_ptr, round_pipe_size_ul); 1590 } 1591 1592 static int do_proc_uint_conv_pipe_maxsz(ulong *u_ptr, uint *k_ptr, 1593 int dir, const struct ctl_table *table) 1594 { 1595 return proc_uint_conv(u_ptr, k_ptr, dir, table, true, 1596 u2k_pipe_maxsz, 1597 proc_uint_k2u_conv); 1598 } 1599 1600 static int proc_dopipe_max_size(const struct ctl_table *table, int write, 1601 void *buffer, size_t *lenp, loff_t *ppos) 1602 { 1603 return proc_douintvec_conv(table, write, buffer, lenp, ppos, 1604 do_proc_uint_conv_pipe_maxsz); 1605 } 1606 1607 static const struct ctl_table fs_pipe_sysctls[] = { 1608 { 1609 .procname = "pipe-max-size", 1610 .data = &pipe_max_size, 1611 .maxlen = sizeof(pipe_max_size), 1612 .mode = 0644, 1613 .proc_handler = proc_dopipe_max_size, 1614 .extra1 = SYSCTL_ONE, 1615 }, 1616 { 1617 .procname = "pipe-user-pages-hard", 1618 .data = &pipe_user_pages_hard, 1619 .maxlen = sizeof(pipe_user_pages_hard), 1620 .mode = 0644, 1621 .proc_handler = proc_doulongvec_minmax, 1622 }, 1623 { 1624 .procname = "pipe-user-pages-soft", 1625 .data = &pipe_user_pages_soft, 1626 .maxlen = sizeof(pipe_user_pages_soft), 1627 .mode = 0644, 1628 .proc_handler = proc_doulongvec_minmax, 1629 }, 1630 }; 1631 #endif 1632 1633 static int __init init_pipe_fs(void) 1634 { 1635 int err = 
register_filesystem(&pipe_fs_type); 1636 1637 if (!err) { 1638 pipe_mnt = kern_mount(&pipe_fs_type); 1639 if (IS_ERR(pipe_mnt)) { 1640 err = PTR_ERR(pipe_mnt); 1641 unregister_filesystem(&pipe_fs_type); 1642 } 1643 } 1644 #ifdef CONFIG_SYSCTL 1645 register_sysctl_init("fs", fs_pipe_sysctls); 1646 #endif 1647 return err; 1648 } 1649 1650 fs_initcall(init_pipe_fs); 1651