/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/uio.h>
#include <linux/security.h>
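
/*
 * For orientation, a minimal userspace sketch (not part of this file) of
 * the zero-copy pipeline these hooks enable, moving data from a file to
 * a socket through a pipe; the fds and sizes are illustrative only:
 *
 *	int pfd[2];
 *	pipe(pfd);
 *	ssize_t n = splice(file_fd, NULL, pfd[1], NULL, 65536, SPLICE_F_MORE);
 *	splice(pfd[0], NULL, sock_fd, NULL, n, SPLICE_F_MORE);
 */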

/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping;

	lock_page(page);

	mapping = page_mapping(page);
	if (mapping) {
		WARN_ON(!PageUptodate(page));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this page, since we'll remove it
		 * from the pagecache. Otherwise truncate won't wait on the
		 * page, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		wait_on_page_writeback(page);

		/*
		 * If private data can't be dropped, we can't steal the
		 * page: bail out and report failure rather than trying
		 * to remove a page that still has buffers attached.
		 */
		if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
			goto out_unlock;

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.
		 */
		if (remove_mapping(mapping, page)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return 0;
		}
	}

	/*
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
out_unlock:
	unlock_page(page);
	return 1;
}

static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
					struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
	buf->flags &= ~PIPE_BUF_FLAG_LRU;
}

/*
 * Check whether the contents of buf are OK to access. Since the content
 * is a page cache page, IO may be in flight.
 */
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
				       struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	if (!PageUptodate(page)) {
		lock_page(page);

		/*
		 * Page got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read-error from disk.
		 */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		/*
		 * Page is ok after all, we are done.
		 */
		unlock_page(page);
	}

	return 0;
error:
	unlock_page(page);
	return err;
}

static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = page_cache_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
		return 1;

	buf->flags |= PIPE_BUF_FLAG_LRU;
	return generic_pipe_buf_steal(pipe, buf);
}

static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
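
/*
 * Rough sketch of how a consumer drives the operations above (see
 * __splice_from_pipe() below); the body is illustrative, not a call
 * sequence lifted from this file:
 *
 *	err = buf->ops->confirm(pipe, buf);	// wait for/validate contents
 *	if (!err) {
 *		void *addr = buf->ops->map(pipe, buf, 1);
 *		// ... copy from addr + buf->offset, at most buf->len bytes
 *		buf->ops->unmap(pipe, buf, addr);
 *	}
 *	buf->ops->release(pipe, buf);		// done with this buffer
 */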

/**
 * splice_to_pipe - fill passed data into a pipe
 * @pipe:	pipe to fill
 * @spd:	data to fill
 *
 * Description:
 *    @spd contains a map of pages and len/offset tuples, along with
 *    the struct pipe_buf_operations associated with these pages. This
 *    function will link that data to the pipe.
 *
 */
ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
		       struct splice_pipe_desc *spd)
{
	unsigned int spd_pages = spd->nr_pages;
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < PIPE_BUFFERS) {
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->private = spd->partial[page_nr].private;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->inode)
				do_wakeup = 1;

			if (!--spd->nr_pages)
				break;
			if (pipe->nrbufs < PIPE_BUFFERS)
				continue;

			break;
		}

		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	if (pipe->inode) {
		mutex_unlock(&pipe->inode->i_mutex);

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		}
	}

	while (page_nr < spd_pages)
		page_cache_release(spd->pages[page_nr++]);

	return ret;
}
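
/*
 * Worked example of the ring arithmetic used above, assuming
 * PIPE_BUFFERS == 16: with curbuf == 14 and nrbufs == 3, the next free
 * slot is (14 + 3) & 15 == 1, i.e. the index wraps without a division.
 * This only works because PIPE_BUFFERS is a power of two.
 */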

static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int loff, nr_pages;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
	};

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	if (nr_pages > PIPE_BUFFERS)
		nr_pages = PIPE_BUFFERS;

	/*
	 * Don't try to second-guess the read-ahead logic, call into
	 * page_cache_readahead() like the page cache reads would do.
	 */
	page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);

	/*
	 * Lookup the (hopefully) full range of pages we need.
	 */
	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);

	/*
	 * If find_get_pages_contig() returned fewer pages than we needed,
	 * allocate the rest and fill in the holes.
	 */
	error = 0;
	index += spd.nr_pages;
	while (spd.nr_pages < nr_pages) {
		/*
		 * Page could be there, find_get_pages_contig() breaks on
		 * the first hole.
		 */
		page = find_get_page(mapping, index);
		if (!page) {
			/*
			 * Make sure the read-ahead engine is notified
			 * about this failure.
			 */
			handle_ra_miss(mapping, &in->f_ra, index);

			/*
			 * page didn't exist, allocate one.
			 */
			page = page_cache_alloc_cold(mapping);
			if (!page)
				break;

			error = add_to_page_cache_lru(page, mapping, index,
						      GFP_KERNEL);
			if (unlikely(error)) {
				page_cache_release(page);
				if (error == -EEXIST)
					continue;
				break;
			}
			/*
			 * add_to_page_cache() locks the page, unlock it
			 * to avoid convoluting the logic below even more.
			 */
			unlock_page(page);
		}

		pages[spd.nr_pages++] = page;
		index++;
	}

	/*
	 * Now loop over the map and see if we need to start IO on any
	 * pages, fill in the partial map, etc.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;
	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		/*
		 * this_len is the max we'll use from this page
		 */
		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = pages[page_nr];

		/*
		 * If the page isn't uptodate, we may need to start io on it
		 */
		if (!PageUptodate(page)) {
			/*
			 * If in nonblock mode then don't block on waiting
			 * for an in-flight io page
			 */
			if (flags & SPLICE_F_NONBLOCK) {
				if (TestSetPageLocked(page))
					break;
			} else
				lock_page(page);

			/*
			 * Page was truncated, stop here. If this isn't the
			 * first page, we'll just complete what we already
			 * added.
			 */
			if (!page->mapping) {
				unlock_page(page);
				break;
			}
			/*
			 * page was already under io and is now done, great
			 */
			if (PageUptodate(page)) {
				unlock_page(page);
				goto fill_it;
			}

			/*
			 * need to read in the page
			 */
			error = mapping->a_ops->readpage(in, page);
			if (unlikely(error)) {
				/*
				 * We really should re-lookup the page here,
				 * but it complicates things a lot. Instead
				 * let's just do what we already stored, and
				 * we'll get it the next time we are called.
				 */
				if (error == AOP_TRUNCATED_PAGE)
					error = 0;

				break;
			}
		}
fill_it:
		/*
		 * i_size must be checked after PageUptodate.
		 */
		isize = i_size_read(mapping->host);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index))
			break;

		/*
		 * if this is the last page, see if we need to shrink
		 * the length and stop
		 */
		if (end_index == index) {
			unsigned int plen;

			/*
			 * max good bytes in this page
			 */
			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (plen <= loff)
				break;

			/*
			 * force quit after adding this page
			 */
			this_len = min(this_len, plen - loff);
			len = this_len;
		}

		partial[page_nr].offset = loff;
		partial[page_nr].len = this_len;
		len -= this_len;
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	/*
	 * Release any pages at the end, if we quit early. 'page_nr' is how far
	 * we got, 'nr_pages' is how many pages are in the map.
	 */
	while (page_nr < nr_pages)
		page_cache_release(pages[page_nr++]);

	if (spd.nr_pages)
		return splice_to_pipe(pipe, &spd);

	return error;
}
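
/*
 * Worked example of the offset arithmetic above, assuming 4k pages:
 * *ppos == 10000 gives index == 10000 >> 12 == 2 and loff == 10000 & 4095
 * == 1808. If isize == 10100, then end_index == 2 and the good bytes in
 * the last page are plen == ((10100 - 1) & 4095) + 1 == 1908, so this_len
 * is clamped to 1908 - 1808 == 100, exactly the bytes left before EOF.
 */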

/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in:		file to splice from
 * @ppos:	position in @in
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given file and fill them into a pipe. Can be
 *    used as long as the address_space operations for the source implement
 *    a readpage() hook.
 *
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	ssize_t spliced;
	int ret;
	loff_t isize, left;

	isize = i_size_read(in->f_mapping->host);
	if (unlikely(*ppos >= isize))
		return 0;

	left = isize - *ppos;
	if (unlikely(left < len))
		len = left;

	ret = 0;
	spliced = 0;
	while (len && !spliced) {
		ret = __generic_file_splice_read(in, ppos, pipe, len, flags);

		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}

		*ppos += ret;
		len -= ret;
		spliced += ret;
	}

	if (spliced)
		return spliced;

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_read);
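
/*
 * A filesystem opts into this by wiring up its file_operations; a minimal
 * sketch (the struct name is illustrative):
 *
 *	const struct file_operations foo_fops = {
 *		...
 *		.splice_read	= generic_file_splice_read,
 *	};
 */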

/*
 * Send 'sd->len' bytes to socket from 'sd->u.file' at position 'sd->pos'
 * using sendpage(). Return the number of bytes sent.
 */
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	loff_t pos = sd->pos;
	int ret, more;

	ret = buf->ops->confirm(pipe, buf);
	if (!ret) {
		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;

		ret = file->f_op->sendpage(file, buf->page, buf->offset,
					   sd->len, &pos, more);
	}

	return ret;
}

/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no option other than
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	struct address_space *mapping = file->f_mapping;
	unsigned int offset, this_len;
	struct page *page;
	pgoff_t index;
	int ret;

	/*
	 * make sure the data in this buffer is uptodate
	 */
	ret = buf->ops->confirm(pipe, buf);
	if (unlikely(ret))
		return ret;

	index = sd->pos >> PAGE_CACHE_SHIFT;
	offset = sd->pos & ~PAGE_CACHE_MASK;

	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

find_page:
	page = find_lock_page(mapping, index);
	if (!page) {
		ret = -ENOMEM;
		page = page_cache_alloc_cold(mapping);
		if (unlikely(!page))
			goto out_ret;

		/*
		 * This will also lock the page
		 */
		ret = add_to_page_cache_lru(page, mapping, index,
					    GFP_KERNEL);
		if (unlikely(ret)) {
			/*
			 * The page was neither locked nor added, so just
			 * drop our reference instead of falling through
			 * to the unlock in the common exit path.
			 */
			page_cache_release(page);
			goto out_ret;
		}
	}

	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
	if (unlikely(ret)) {
		loff_t isize = i_size_read(mapping->host);

		if (ret != AOP_TRUNCATED_PAGE)
			unlock_page(page);
		page_cache_release(page);
		if (ret == AOP_TRUNCATED_PAGE)
			goto find_page;

		/*
		 * prepare_write() may have instantiated a few blocks
		 * outside i_size. Trim these off again.
		 */
		if (sd->pos + this_len > isize)
			vmtruncate(mapping->host, isize);

		goto out_ret;
	}

	if (buf->page != page) {
		/*
		 * Careful, ->map() uses KM_USER0!
		 */
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page, KM_USER1);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER1);
		buf->ops->unmap(pipe, buf, src);
	}

	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
	if (ret) {
		if (ret == AOP_TRUNCATED_PAGE) {
			page_cache_release(page);
			goto find_page;
		}
		if (ret < 0)
			goto out;
		/*
		 * A partial write has happened, so 'ret' is already set to
		 * the number of bytes written; there is nothing more we
		 * have to do here.
		 */
	} else
		ret = this_len;
	/*
	 * Return the number of bytes written and mark page as
	 * accessed, we are now done!
	 */
	mark_page_accessed(page);
out:
	/*
	 * Unlock before dropping our reference, in case that reference
	 * is the last thing keeping the page around.
	 */
	unlock_page(page);
	page_cache_release(page);
out_ret:
	return ret;
}

/**
 * __splice_from_pipe - splice data from a pipe to given actor
 * @pipe:	pipe to splice from
 * @sd:		information to pass to @actor
 * @actor:	handler that splices the data
 *
 * Description:
 *    This function does little more than loop over the pipe and call
 *    @actor to do the actual moving of a single struct pipe_buffer to
 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 *    pipe_to_user.
 *
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
			   splice_actor *actor)
{
	int ret, do_wakeup, err;

	ret = 0;
	do_wakeup = 0;

	for (;;) {
		if (pipe->nrbufs) {
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			const struct pipe_buf_operations *ops = buf->ops;

			sd->len = buf->len;
			if (sd->len > sd->total_len)
				sd->len = sd->total_len;

			err = actor(pipe, buf, sd);
			if (err <= 0) {
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			ret += err;
			buf->offset += err;
			buf->len -= err;

			sd->len -= err;
			sd->pos += err;
			sd->total_len -= err;
			if (sd->len)
				continue;

			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
				pipe->nrbufs--;
				if (pipe->inode)
					do_wakeup = 1;
			}

			if (!sd->total_len)
				break;
		}

		if (pipe->nrbufs)
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (ret)
				break;
		}

		if (sd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(pipe);
	}

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}

	return ret;
}
EXPORT_SYMBOL(__splice_from_pipe);

/**
 * splice_from_pipe - splice data from a pipe to a file
 * @pipe:	pipe to splice from
 * @out:	file to splice to
 * @ppos:	position in @out
 * @len:	how many bytes to splice
 * @flags:	splice modifier flags
 * @actor:	handler that splices the data
 *
 * Description:
 *    See __splice_from_pipe. This function locks the input and output inodes,
 *    otherwise it's identical to __splice_from_pipe().
 *
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	ssize_t ret;
	struct inode *inode = out->f_mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};

	/*
	 * The actor worker might be calling ->prepare_write and
	 * ->commit_write. Most of the time, these expect i_mutex to
	 * be held. Since this may result in an ABBA deadlock with
	 * pipe->inode, we have to order lock acquisition here.
	 */
	inode_double_lock(inode, pipe->inode);
	ret = __splice_from_pipe(pipe, &sd, actor);
	inode_double_unlock(inode, pipe->inode);

	return ret;
}
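
/*
 * For illustration, a hypothetical actor that discards pipe contents
 * could be plugged into __splice_from_pipe() like this (pipe_to_null is
 * not a real symbol, sketch only):
 *
 *	static int pipe_to_null(struct pipe_inode_info *pipe,
 *				struct pipe_buffer *buf,
 *				struct splice_desc *sd)
 *	{
 *		int ret = buf->ops->confirm(pipe, buf);
 *		return ret ? ret : sd->len;	// claim all offered bytes
 *	}
 */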

/**
 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file. The caller is responsible
 *    for acquiring i_mutex on both inodes.
 *
 */
ssize_t
generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
				 loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	ssize_t ret;
	int err;

	err = remove_suid(out->f_path.dentry);
	if (unlikely(err))
		return err;

	ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);

			if (err)
				ret = err;
		}
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write_nolock);

/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file.
 *
 */
ssize_t
generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;
	int err;

	err = should_remove_suid(out->f_path.dentry);
	if (unlikely(err)) {
		mutex_lock(&inode->i_mutex);
		err = __remove_suid(out->f_path.dentry, err);
		mutex_unlock(&inode->i_mutex);
		if (err)
			return err;
	}

	ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			mutex_lock(&inode->i_mutex);
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);
			mutex_unlock(&inode->i_mutex);

			if (err)
				ret = err;
		}
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);

/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe:	pipe to splice from
 * @out:	socket to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will send @len bytes from the pipe to a network socket. No data copying
 *    is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);
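
/*
 * Regular files typically point .splice_write at generic_file_splice_write,
 * while socket files use generic_splice_sendpage (see net/socket.c). A
 * minimal sketch, struct name illustrative:
 *
 *	const struct file_operations foo_fops = {
 *		...
 *		.splice_write	= generic_file_splice_write,
 *	};
 */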

/*
 * Attempt to initiate a splice from pipe to file.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	int ret;

	if (unlikely(!out->f_op || !out->f_op->splice_write))
		return -EINVAL;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	ret = rw_verify_area(WRITE, out, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	ret = security_file_permission(out, MAY_WRITE);
	if (unlikely(ret < 0))
		return ret;

	return out->f_op->splice_write(pipe, out, ppos, len, flags);
}

/*
 * Attempt to initiate a splice from a file to a pipe.
 */
static long do_splice_to(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe, size_t len,
			 unsigned int flags)
{
	int ret;

	if (unlikely(!in->f_op || !in->f_op->splice_read))
		return -EINVAL;

	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;

	ret = rw_verify_area(READ, in, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	ret = security_file_permission(in, MAY_READ);
	if (unlikely(ret < 0))
		return ret;

	return in->f_op->splice_read(in, ppos, pipe, len, flags);
}

/**
 * splice_direct_to_actor - splices data directly between two non-pipes
 * @in:		file to splice from
 * @sd:		actor information on where to splice to
 * @actor:	handles the data splicing
 *
 * Description:
 *    This is a special case helper to splice directly between two
 *    points, without requiring an explicit pipe. Internally an allocated
 *    pipe is cached in the process, and reused during the lifetime of
 *    that process.
 *
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	umode_t i_mode;
	size_t len;
	int i, flags;

	/*
	 * We require the input to be a regular file, as we don't want to
	 * randomly drop data for e.g. socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = in->f_path.dentry->d_inode->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, setup an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info(NULL);
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	len = sd->total_len;
	flags = sd->flags;

	/*
	 * Don't block on output, we have to drain the direct pipe.
	 */
	sd->flags &= ~SPLICE_F_NONBLOCK;

	while (len) {
		size_t read_len;
		loff_t pos = sd->pos;

		ret = do_splice_to(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto out_release;

		read_len = ret;
		sd->total_len = read_len;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret <= 0))
			goto out_release;

		bytes += ret;
		len -= ret;
		sd->pos = pos;

		if (ret < read_len)
			goto out_release;
	}

	pipe->nrbufs = pipe->curbuf = 0;
	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < PIPE_BUFFERS; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops) {
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
		}
	}
	pipe->nrbufs = pipe->curbuf = 0;

	/*
	 * If we transferred some data, return the number of bytes:
	 */
	if (bytes > 0)
		return bytes;

	return ret;
}
EXPORT_SYMBOL(splice_direct_to_actor);

static int direct_splice_actor(struct pipe_inode_info *pipe,
			       struct splice_desc *sd)
{
	struct file *file = sd->u.file;

	return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
}

/**
 * do_splice_direct - splices data directly between two files
 * @in:		file to splice from
 * @ppos:	input file offset
 * @out:	file to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    For use by do_sendfile(). splice can easily emulate sendfile, but
 *    doing it in the application would incur an extra system call
 *    (splice in + splice out, as compared to just sendfile()). So this helper
 *    can splice directly through a process-private pipe.
 *
 */
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      size_t len, unsigned int flags)
{
	struct splice_desc sd = {
		.len		= len,
		.total_len	= len,
		.flags		= flags,
		.pos		= *ppos,
		.u.file		= out,
	};
	long ret;

	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
	if (ret > 0)
		*ppos += ret;

	return ret;
}
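
/*
 * For reference, this is the machinery behind sendfile(2); from userspace
 * the direct path looks like (illustrative):
 *
 *	off_t off = 0;
 *	sendfile(sock_fd, file_fd, &off, count);
 */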

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
static inline struct pipe_inode_info *pipe_info(struct inode *inode)
{
	if (S_ISFIFO(inode->i_mode))
		return inode->i_pipe;

	return NULL;
}

/*
 * Determine where to splice to/from.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	loff_t offset, *off;
	long ret;

	pipe = pipe_info(in->f_path.dentry->d_inode);
	if (pipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (out->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &out->f_pos;

		ret = do_splice_from(pipe, out, off, len, flags);

		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	pipe = pipe_info(out->f_path.dentry->d_inode);
	if (pipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (in->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &in->f_pos;

		ret = do_splice_to(in, off, pipe, len, flags);

		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	return -EINVAL;
}
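
/*
 * The offset handling above mirrors pread/pwrite semantics: a non-NULL
 * offset pointer splices at that position and leaves f_pos untouched.
 * Userspace sketch (illustrative):
 *
 *	loff_t off = 4096;
 *	splice(file_fd, &off, pfd[1], NULL, 8192, 0);
 *	// file_fd's own file position is unchanged, off is advanced
 */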

/*
 * Map an iov into an array of pages and offset/length tuples. With the
 * partial_page structure, we can map several non-contiguous ranges into
 * one pages[] map instead of splitting that operation into pieces.
 * Could easily be exported as a generic helper for other users, in which
 * case one would probably want to add a 'max_nr_pages' parameter as well.
 */
static int get_iovec_page_array(const struct iovec __user *iov,
				unsigned int nr_vecs, struct page **pages,
				struct partial_page *partial, int aligned)
{
	int buffers = 0, error = 0;

	/*
	 * It's ok to take the mmap_sem for reading, even
	 * across a "get_user()".
	 */
	down_read(&current->mm->mmap_sem);

	while (nr_vecs) {
		unsigned long off, npages;
		void __user *base;
		size_t len;
		int i;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		error = -EFAULT;
		if (unlikely(!base))
			break;

		/*
		 * Get this base offset and number of pages, then map
		 * in the user pages.
		 */
		off = (unsigned long) base & ~PAGE_MASK;

		/*
		 * If asked for alignment, the offset must be zero and the
		 * length a multiple of the PAGE_SIZE.
		 */
		error = -EINVAL;
		if (aligned && (off || len & ~PAGE_MASK))
			break;

		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages > PIPE_BUFFERS - buffers)
			npages = PIPE_BUFFERS - buffers;

		error = get_user_pages(current, current->mm,
				       (unsigned long) base, npages, 0, 0,
				       &pages[buffers], NULL);

		if (unlikely(error <= 0))
			break;

		/*
		 * Fill this contiguous range into the partial page map.
		 */
		for (i = 0; i < error; i++) {
			const int plen = min_t(size_t, len, PAGE_SIZE - off);

			partial[buffers].offset = off;
			partial[buffers].len = plen;

			off = 0;
			len -= plen;
			buffers++;
		}

		/*
		 * We didn't complete this iov, stop here since it probably
		 * means we have to move some of this into a pipe to
		 * be able to continue.
		 */
		if (len)
			break;

		/*
		 * Don't continue if we mapped fewer pages than we asked for,
		 * or if we mapped the max number of pages that we have
		 * room for.
		 */
		if (error < npages || buffers == PIPE_BUFFERS)
			break;

		nr_vecs--;
		iov++;
	}

	up_read(&current->mm->mmap_sem);

	if (buffers)
		return buffers;

	return error;
}
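
/*
 * Worked example of the mapping arithmetic above, assuming 4k pages: an
 * iovec with base 0x1234 and len 8192 yields off == 0x234 (564) and
 * npages == (564 + 8192 + 4095) >> 12 == 3; the range straddles three
 * pages, and partial[0] covers the first page's 4096 - 564 == 3532 bytes.
 */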

static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	char *src;
	int ret;

	ret = buf->ops->confirm(pipe, buf);
	if (unlikely(ret))
		return ret;

	/*
	 * See if we can use the atomic maps, by prefaulting in the
	 * pages and doing an atomic copy
	 */
	if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
		src = buf->ops->map(pipe, buf, 1);
		ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
					      sd->len);
		buf->ops->unmap(pipe, buf, src);
		if (!ret) {
			ret = sd->len;
			goto out;
		}
	}

	/*
	 * No dice, use slow non-atomic map and copy
	 */
	src = buf->ops->map(pipe, buf, 0);

	ret = sd->len;
	if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
		ret = -EFAULT;

	/*
	 * Unmap here rather than at 'out': the atomic path above has
	 * already dropped its mapping before jumping there.
	 */
	buf->ops->unmap(pipe, buf, src);
out:
	if (ret > 0)
		sd->u.userptr += ret;
	return ret;
}

/*
 * For lack of a better implementation, implement vmsplice() to userspace
 * as a simple copy of the pipe's pages to the user iov.
 */
static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct splice_desc sd;
	ssize_t size;
	int error;
	long ret;

	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	error = ret = 0;
	while (nr_segs) {
		void __user *base;
		size_t len;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		if (unlikely(!base)) {
			error = -EFAULT;
			break;
		}

		sd.len = 0;
		sd.total_len = len;
		sd.flags = flags;
		sd.u.userptr = base;
		sd.pos = 0;

		size = __splice_from_pipe(pipe, &sd, pipe_to_user);
		if (size < 0) {
			if (!ret)
				ret = size;

			break;
		}

		ret += size;

		if (size < len)
			break;

		nr_segs--;
		iov++;
	}

	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);

	if (!ret)
		ret = error;

	return ret;
}

/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 */
static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &user_page_pipe_buf_ops,
	};

	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;

	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
					    flags & SPLICE_F_GIFT);
	if (spd.nr_pages <= 0)
		return spd.nr_pages;

	return splice_to_pipe(pipe, &spd);
}

/*
 * Note that vmsplice only really supports true splicing _from_ user memory
 * to a pipe, not the other way around. Splicing from user memory is a simple
 * operation that can be supported without any funky alignment restrictions
 * or nasty vm tricks. We simply map in the user memory and fill it into
 * a pipe. The reverse isn't quite as easy, though. There are two possible
 * solutions for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (they
 *	  impose restrictions on both ends of the pipe).
 *
 * Currently we punt and implement it as a normal copy, see pipe_to_user().
 *
 */
asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct file *file;
	long error;
	int fput;

	if (unlikely(nr_segs > UIO_MAXIOV))
		return -EINVAL;
	else if (unlikely(!nr_segs))
		return 0;

	error = -EBADF;
	file = fget_light(fd, &fput);
	if (file) {
		if (file->f_mode & FMODE_WRITE)
			error = vmsplice_to_pipe(file, iov, nr_segs, flags);
		else if (file->f_mode & FMODE_READ)
			error = vmsplice_to_user(file, iov, nr_segs, flags);

		fput_light(file, fput);
	}

	return error;
}
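
/*
 * Userspace sketch of gifting memory into a pipe (illustrative). Note that
 * SPLICE_F_GIFT requires a page-aligned base and a page-multiple length,
 * enforced by the 'aligned' argument to get_iovec_page_array() above:
 *
 *	struct iovec iov = { .iov_base = page_buf, .iov_len = len };
 *	vmsplice(pfd[1], &iov, 1, SPLICE_F_GIFT);
 */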

asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
			   int fd_out, loff_t __user *off_out,
			   size_t len, unsigned int flags)
{
	long error;
	struct file *in, *out;
	int fput_in, fput_out;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fd_in, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			out = fget_light(fd_out, &fput_out);
			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_splice(in, off_in,
							  out, off_out,
							  len, flags);
				fput_light(out, fput_out);
			}
		}

		fput_light(in, fput_in);
	}

	return error;
}

/*
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 */
static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (!pipe->nrbufs) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		pipe_wait(pipe);
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Make sure there's writeable room. Wait for room if we can, otherwise
 * return an appropriate error.
 */
static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs < PIPE_BUFFERS)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (pipe->nrbufs >= PIPE_BUFFERS) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			ret = -EPIPE;
			break;
		}
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Link contents of ipipe to opipe.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
		     struct pipe_inode_info *opipe,
		     size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	int ret = 0, i = 0, nbuf;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by inode address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	inode_double_lock(ipipe->inode, opipe->inode);

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		/*
		 * If we have iterated all input buffers or ran out of
		 * output room, break.
		 */
		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
			break;

		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		ibuf->ops->get(ipipe, ibuf);

		obuf = opipe->bufs + nbuf;
		*obuf = *ibuf;

		/*
		 * Don't inherit the gift flag, we need to
		 * prevent multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;

		if (obuf->len > len)
			obuf->len = len;

		opipe->nrbufs++;
		ret += obuf->len;
		len -= obuf->len;
		i++;
	} while (len);

	inode_double_unlock(ipipe->inode, opipe->inode);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0) {
		smp_mb();
		if (waitqueue_active(&opipe->wait))
			wake_up_interruptible(&opipe->wait);
		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
	}

	return ret;
}

/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants, currently the only
 * applicable one is SPLICE_F_NONBLOCK.
 */
static long do_tee(struct file *in, struct file *out, size_t len,
		   unsigned int flags)
{
	struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
	struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
	int ret = -EINVAL;

	/*
	 * Duplicate the contents of ipipe to opipe without actually
	 * copying the data.
	 */
	if (ipipe && opipe && ipipe != opipe) {
		/*
		 * Keep going, unless we encounter an error. The ipipe/opipe
		 * ordering doesn't really matter.
		 */
		ret = link_ipipe_prep(ipipe, flags);
		if (!ret) {
			ret = link_opipe_prep(opipe, flags);
			if (!ret) {
				ret = link_pipe(ipipe, opipe, len, flags);
				if (!ret && (flags & SPLICE_F_NONBLOCK))
					ret = -EAGAIN;
			}
		}
	}

	return ret;
}

asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
{
	struct file *in;
	int error, fput_in;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fdin, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			int fput_out;
			struct file *out = fget_light(fdout, &fput_out);

			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_tee(in, out, len, flags);
				fput_light(out, fput_out);
			}
		}
		fput_light(in, fput_in);
	}

	return error;
}
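
/*
 * Userspace sketch of tee(2) (illustrative): duplicate whatever is in
 * pfd_in into pfd_log without consuming or copying it, then splice the
 * original on to its real destination:
 *
 *	ssize_t n = tee(pfd_in[0], pfd_log[1], INT_MAX, SPLICE_F_NONBLOCK);
 *	if (n > 0)
 *		splice(pfd_in[0], NULL, out_fd, NULL, n, SPLICE_F_MOVE);
 */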