/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@suse.de>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
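
/*
 * Typical userspace usage, as an illustrative sketch only (the fd names
 * and the 64k chunk size are assumptions, error handling is trimmed):
 * move data from a file to a socket through a pipe, without ever copying
 * it through user memory.
 *
 *	int pfd[2];
 *
 *	pipe(pfd);
 *	for (;;) {
 *		ssize_t n = splice(file_fd, NULL, pfd[1], NULL, 65536,
 *				   SPLICE_F_MOVE);
 *		if (n <= 0)
 *			break;
 *		while (n > 0) {
 *			ssize_t m = splice(pfd[0], NULL, sock_fd, NULL, n,
 *					   SPLICE_F_MOVE | SPLICE_F_MORE);
 *			if (m < 0)
 *				return -1;
 *			n -= m;
 *		}
 *	}
 */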

#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/pipe_fs_i.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/uio.h>

struct partial_page {
	unsigned int offset;
	unsigned int len;
};

/*
 * Passed to splice_to_pipe
 */
struct splice_pipe_desc {
	struct page **pages;		/* page map */
	struct partial_page *partial;	/* pages[] may not be contig */
	int nr_pages;			/* number of pages in map */
	unsigned int flags;		/* splice flags */
	struct pipe_buf_operations *ops; /* ops associated with output pipe */
};

/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping;

	lock_page(page);

	mapping = page_mapping(page);
	if (mapping) {
		WARN_ON(!PageUptodate(page));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this page, since we'll remove it
		 * from the pagecache. Otherwise truncate won't wait on the
		 * page, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		wait_on_page_writeback(page);

		if (PagePrivate(page))
			try_to_release_page(page, mapping_gfp_mask(mapping));

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.
		 */
		if (remove_mapping(mapping, page)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return 0;
		}
	}

	/*
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
	unlock_page(page);
	return 1;
}

static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
					struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
	buf->flags &= ~PIPE_BUF_FLAG_LRU;
}

static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe,
				   struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	if (!PageUptodate(page)) {
		lock_page(page);

		/*
		 * Page got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read-error from disk.
		 */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		/*
		 * Page is ok after all, we are done.
		 */
		unlock_page(page);
	}

	return 0;
error:
	unlock_page(page);
	return err;
}

static struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.pin = page_cache_pipe_buf_pin,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
		return 1;

	buf->flags |= PIPE_BUF_FLAG_LRU;
	return generic_pipe_buf_steal(pipe, buf);
}

static struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.pin = generic_pipe_buf_pin,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

/*
 * Pipe output worker. This sets up our pipe format with the page cache
 * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
 */
static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
			      struct splice_pipe_desc *spd)
{
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < PIPE_BUFFERS) {
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->inode)
				do_wakeup = 1;

			if (!--spd->nr_pages)
				break;
			if (pipe->nrbufs < PIPE_BUFFERS)
				continue;

			break;
		}

		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}

	while (page_nr < spd->nr_pages)
		page_cache_release(spd->pages[page_nr++]);

	return ret;
}
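
/*
 * Illustrative only: a splice_to_pipe() caller builds a splice_pipe_desc
 * over pages it already holds references to, much as
 * __generic_file_splice_read() and do_vmsplice() below do. A minimal
 * sketch (the 'nr' and 'flags' variables are assumed):
 *
 *	struct page *pages[PIPE_BUFFERS];
 *	struct partial_page partial[PIPE_BUFFERS];
 *	struct splice_pipe_desc spd = {
 *		.pages = pages,
 *		.partial = partial,
 *		.nr_pages = nr,
 *		.flags = flags,
 *		.ops = &page_cache_pipe_buf_ops,
 *	};
 *
 *	ret = splice_to_pipe(pipe, &spd);
 *
 * On return, any pages that did not fit in the pipe have had their
 * references dropped by splice_to_pipe() itself.
 */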

static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int loff, nr_pages;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize;
	size_t total_len;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
	};

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	if (nr_pages > PIPE_BUFFERS)
		nr_pages = PIPE_BUFFERS;

	/*
	 * Initiate read-ahead on this page range. However, don't call into
	 * read-ahead if this is a non-zero offset (we are likely doing small
	 * chunk splice and the page is already there) for a single page.
	 */
	if (!loff || nr_pages > 1)
		page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);

	/*
	 * Now fill in the holes:
	 */
	error = 0;
	total_len = 0;

	/*
	 * Lookup the (hopefully) full range of pages we need.
	 */
	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);

	/*
	 * If find_get_pages_contig() returned fewer pages than we needed,
	 * allocate the rest.
	 */
	index += spd.nr_pages;
	while (spd.nr_pages < nr_pages) {
		/*
		 * Page could be there, find_get_pages_contig() breaks on
		 * the first hole.
		 */
		page = find_get_page(mapping, index);
		if (!page) {
			/*
			 * Make sure the read-ahead engine is notified
			 * about this failure.
			 */
			handle_ra_miss(mapping, &in->f_ra, index);

			/*
			 * page didn't exist, allocate one.
			 */
			page = page_cache_alloc_cold(mapping);
			if (!page)
				break;

			error = add_to_page_cache_lru(page, mapping, index,
						      mapping_gfp_mask(mapping));
			if (unlikely(error)) {
				page_cache_release(page);
				if (error == -EEXIST)
					continue;
				break;
			}
			/*
			 * add_to_page_cache() locks the page, unlock it
			 * to avoid convoluting the logic below even more.
			 */
			unlock_page(page);
		}

		pages[spd.nr_pages++] = page;
		index++;
	}

	/*
	 * Now loop over the map and see if we need to start IO on any
	 * pages, fill in the partial map, etc.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;
	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		/*
		 * this_len is the max we'll use from this page
		 */
		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = pages[page_nr];

		/*
		 * If the page isn't uptodate, we may need to start io on it
		 */
		if (!PageUptodate(page)) {
			/*
			 * If in nonblock mode then don't block on waiting
			 * for an in-flight io page
			 */
			if (flags & SPLICE_F_NONBLOCK)
				break;

			lock_page(page);

			/*
			 * page was truncated, stop here. if this isn't the
			 * first page, we'll just complete what we already
			 * added
			 */
			if (!page->mapping) {
				unlock_page(page);
				break;
			}
			/*
			 * page was already under io and is now done, great
			 */
			if (PageUptodate(page)) {
				unlock_page(page);
				goto fill_it;
			}

			/*
			 * need to read in the page
			 */
			error = mapping->a_ops->readpage(in, page);
			if (unlikely(error)) {
				/*
				 * We really should re-lookup the page here,
				 * but it complicates things a lot. Instead
				 * let's just do what we already stored, and
				 * we'll get it the next time we are called.
				 */
				if (error == AOP_TRUNCATED_PAGE)
					error = 0;

				break;
			}

			/*
			 * i_size must be checked after ->readpage().
			 */
			isize = i_size_read(mapping->host);
			end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
			if (unlikely(!isize || index > end_index))
				break;

			/*
			 * if this is the last page, see if we need to shrink
			 * the length and stop
			 */
			if (end_index == index) {
				loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
				if (total_len + loff > isize)
					break;
				/*
				 * force quit after adding this page
				 */
				len = this_len;
				this_len = min(this_len, loff);
				loff = 0;
			}
		}
fill_it:
		partial[page_nr].offset = loff;
		partial[page_nr].len = this_len;
		len -= this_len;
		total_len += this_len;
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	/*
	 * Release any pages at the end, if we quit early. 'page_nr' is how
	 * far we got, 'nr_pages' is how many pages are in the map.
	 */
	while (page_nr < nr_pages)
		page_cache_release(pages[page_nr++]);

	if (spd.nr_pages)
		return splice_to_pipe(pipe, &spd);

	return error;
}

/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in: file to splice from
 * @ppos: position in @in to read from, updated on return
 * @pipe: pipe to splice to
 * @len: number of bytes to splice
 * @flags: splice modifier flags
 *
 * Will read pages from given file and fill them into a pipe.
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	ssize_t spliced;
	int ret;

	ret = 0;
	spliced = 0;

	while (len) {
		ret = __generic_file_splice_read(in, ppos, pipe, len, flags);

		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}

		*ppos += ret;
		len -= ret;
		spliced += ret;
	}

	if (spliced)
		return spliced;

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_read);

/*
 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 * using sendpage(). Return the number of bytes sent.
 */
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->file;
	loff_t pos = sd->pos;
	int ret, more;

	ret = buf->ops->pin(pipe, buf);
	if (!ret) {
		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;

		ret = file->f_op->sendpage(file, buf->page, buf->offset,
					   sd->len, &pos, more);
	}

	return ret;
}
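
/*
 * The 'more' hint above is what makes chained socket splicing efficient:
 * the network layer may delay pushing out a segment when it knows more
 * data is on the way. From userspace (hedged sketch, fd names assumed),
 * the same hint is passed explicitly:
 *
 *	// not the last chunk: hint that more data follows
 *	splice(pfd[0], NULL, sock_fd, NULL, chunk_len, SPLICE_F_MORE);
 *	...
 *	// last chunk: no SPLICE_F_MORE, let the socket push the data out
 *	splice(pfd[0], NULL, sock_fd, NULL, last_len, 0);
 */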

/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option than
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	struct file *file = sd->file;
	struct address_space *mapping = file->f_mapping;
	gfp_t gfp_mask = mapping_gfp_mask(mapping);
	unsigned int offset, this_len;
	struct page *page;
	pgoff_t index;
	int ret;

	/*
	 * make sure the data in this buffer is uptodate
	 */
	ret = buf->ops->pin(pipe, buf);
	if (unlikely(ret))
		return ret;

	index = sd->pos >> PAGE_CACHE_SHIFT;
	offset = sd->pos & ~PAGE_CACHE_MASK;

	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

	/*
	 * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full
	 * page.
	 */
	if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) {
		/*
		 * If steal succeeds, buf->page is now pruned from the
		 * pagecache and we can reuse it. The page will also be
		 * locked on successful return.
		 */
		if (buf->ops->steal(pipe, buf))
			goto find_page;

		page = buf->page;
		if (add_to_page_cache(page, mapping, index, gfp_mask)) {
			unlock_page(page);
			goto find_page;
		}

		page_cache_get(page);

		if (!(buf->flags & PIPE_BUF_FLAG_LRU))
			lru_cache_add(page);
	} else {
find_page:
		page = find_lock_page(mapping, index);
		if (!page) {
			ret = -ENOMEM;
			page = page_cache_alloc_cold(mapping);
			if (unlikely(!page))
				goto out_nomem;

			/*
			 * This will also lock the page
			 */
			ret = add_to_page_cache_lru(page, mapping, index,
						    gfp_mask);
			if (unlikely(ret))
				goto out;
		}

		/*
		 * We get here with the page locked. If the page is also
		 * uptodate, we don't need to do more. If it isn't, we
		 * may need to bring it in if we are not going to overwrite
		 * the full page.
		 */
		if (!PageUptodate(page)) {
			if (this_len < PAGE_CACHE_SIZE) {
				ret = mapping->a_ops->readpage(file, page);
				if (unlikely(ret))
					goto out;

				lock_page(page);

				if (!PageUptodate(page)) {
					/*
					 * Page got invalidated, repeat.
					 */
					if (!page->mapping) {
						unlock_page(page);
						page_cache_release(page);
						goto find_page;
					}
					ret = -EIO;
					goto out;
				}
			} else
				SetPageUptodate(page);
		}
	}

	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
	if (unlikely(ret)) {
		loff_t isize = i_size_read(mapping->host);

		if (ret != AOP_TRUNCATED_PAGE)
			unlock_page(page);
		page_cache_release(page);
		if (ret == AOP_TRUNCATED_PAGE)
			goto find_page;

		/*
		 * prepare_write() may have instantiated a few blocks
		 * outside i_size. Trim these off again.
		 */
		if (sd->pos + this_len > isize)
			vmtruncate(mapping->host, isize);

		goto out;
	}

	if (buf->page != page) {
		/*
		 * Careful, ->map() uses KM_USER0!
		 */
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page, KM_USER1);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER1);
		buf->ops->unmap(pipe, buf, src);
	}

	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
	if (!ret) {
		/*
		 * Return the number of bytes written and mark page as
		 * accessed, we are now done!
		 */
		ret = this_len;
		mark_page_accessed(page);
		balance_dirty_pages_ratelimited(mapping);
	} else if (ret == AOP_TRUNCATED_PAGE) {
		page_cache_release(page);
		goto find_page;
	}
out:
	page_cache_release(page);
	unlock_page(page);
out_nomem:
	return ret;
}

/*
 * Pipe input worker. Most of this logic works like a regular pipe; the
 * key here is the 'actor' worker passed in that actually moves the data
 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	int ret, do_wakeup, err;
	struct splice_desc sd;

	ret = 0;
	do_wakeup = 0;

	sd.total_len = len;
	sd.flags = flags;
	sd.file = out;
	sd.pos = *ppos;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	for (;;) {
		if (pipe->nrbufs) {
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			struct pipe_buf_operations *ops = buf->ops;

			sd.len = buf->len;
			if (sd.len > sd.total_len)
				sd.len = sd.total_len;

			err = actor(pipe, buf, &sd);
			if (err <= 0) {
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			ret += err;
			buf->offset += err;
			buf->len -= err;

			sd.len -= err;
			sd.pos += err;
			sd.total_len -= err;
			if (sd.len)
				continue;

			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
				pipe->nrbufs--;
				if (pipe->inode)
					do_wakeup = 1;
			}

			if (!sd.total_len)
				break;
		}

		if (pipe->nrbufs)
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (ret)
				break;
		}

		if (flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(pipe);
	}

	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}

	return ret;
}
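
/*
 * For reference, a minimal actor shape (an illustrative sketch only,
 * pipe_to_null is not a real helper): splice_from_pipe() hands each
 * buffer to the actor with sd->len capped to what remains of the
 * request, and the actor returns how many bytes it consumed, or a
 * negative error.
 *
 *	static int pipe_to_null(struct pipe_inode_info *pipe,
 *				struct pipe_buffer *buf,
 *				struct splice_desc *sd)
 *	{
 *		int ret = buf->ops->pin(pipe, buf);
 *		if (unlikely(ret))
 *			return ret;
 *		return sd->len;
 *	}
 */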

/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @pipe: pipe info
 * @out: file to write to
 * @ppos: position in @out to write to, updated on return
 * @len: number of bytes to splice
 * @flags: splice modifier flags
 *
 * Will either move or copy pages (determined by @flags options) from
 * the given pipe inode to the given file.
 *
 */
ssize_t
generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	ssize_t ret;

	ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
	if (ret > 0) {
		struct inode *inode = mapping->host;

		*ppos += ret;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			int err;

			mutex_lock(&inode->i_mutex);
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);
			mutex_unlock(&inode->i_mutex);

			if (err)
				ret = err;
		}
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);

/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe: pipe to splice from
 * @out: socket to write to
 * @len: number of bytes to splice
 * @flags: splice modifier flags
 *
 * Will send @len bytes from the pipe to a network socket. No data copying
 * is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);

/*
 * Attempt to initiate a splice from pipe to file.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	int ret;

	if (unlikely(!out->f_op || !out->f_op->splice_write))
		return -EINVAL;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	ret = rw_verify_area(WRITE, out, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	return out->f_op->splice_write(pipe, out, ppos, len, flags);
}

/*
 * Attempt to initiate a splice from a file to a pipe.
 */
static long do_splice_to(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe, size_t len,
			 unsigned int flags)
{
	loff_t isize, left;
	int ret;

	if (unlikely(!in->f_op || !in->f_op->splice_read))
		return -EINVAL;

	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;

	ret = rw_verify_area(READ, in, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	isize = i_size_read(in->f_mapping->host);
	if (unlikely(*ppos >= isize))
		return 0;

	left = isize - *ppos;
	if (unlikely(left < len))
		len = left;

	return in->f_op->splice_read(in, ppos, pipe, len, flags);
}
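
/*
 * do_splice_direct() below chains the two helpers above through a
 * per-task internal pipe. The userspace equivalent (hedged sketch,
 * error handling omitted) needs an explicit pipe, since splice(2)
 * always wants a pipe on one side of the transfer:
 *
 *	int pfd[2];
 *
 *	pipe(pfd);
 *	while ((n = splice(in_fd, NULL, pfd[1], NULL, 65536, 0)) > 0)
 *		splice(pfd[0], NULL, out_fd, NULL, n, 0);
 */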

long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	loff_t out_off;
	umode_t i_mode;
	int i;

	/*
	 * We require the input to be a regular file, as we don't want to
	 * randomly drop data for eg socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = in->f_dentry->d_inode->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, set up an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info(NULL);
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * pipe->readers appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	out_off = 0;

	while (len) {
		size_t read_len, max_read_len;

		/*
		 * Do at most PIPE_BUFFERS pages worth of transfer:
		 */
		max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));

		ret = do_splice_to(in, ppos, pipe, max_read_len, flags);
		if (unlikely(ret < 0))
			goto out_release;

		read_len = ret;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = do_splice_from(pipe, out, &out_off, read_len,
				     flags & ~SPLICE_F_NONBLOCK);
		if (unlikely(ret < 0))
			goto out_release;

		bytes += ret;
		len -= ret;

		/*
		 * In nonblocking mode, if we got back a short read then
		 * that was due to either an IO error or due to the
		 * pagecache entry not being there. In the IO error case
		 * the _next_ splice attempt will produce a clean IO error
		 * return value (not a short read), so in both cases it's
		 * correct to break out of the loop here:
		 */
		if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
			break;
	}

	pipe->nrbufs = pipe->curbuf = 0;

	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < PIPE_BUFFERS; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops) {
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
		}
	}
	pipe->nrbufs = pipe->curbuf = 0;

	/*
	 * If we transferred some data, return the number of bytes:
	 */
	if (bytes > 0)
		return bytes;

	return ret;
}

EXPORT_SYMBOL(do_splice_direct);
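
/*
 * The offset rules implemented by do_splice() below, seen from userspace
 * (hedged sketch): a non-NULL offset pointer supplies the position for
 * the non-pipe side and is written back on return, leaving the file's
 * f_pos untouched; a NULL pointer uses and advances f_pos. An offset may
 * never be passed for the pipe side.
 *
 *	loff_t off = 4096;
 *
 *	splice(file_fd, &off, pfd[1], NULL, 8192, 0);
 *	// off has advanced by the bytes spliced; file_fd's f_pos has not
 */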

/*
 * Determine where to splice to/from.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	loff_t offset, *off;
	long ret;

	pipe = in->f_dentry->d_inode->i_pipe;
	if (pipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (out->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &out->f_pos;

		ret = do_splice_from(pipe, out, off, len, flags);

		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	pipe = out->f_dentry->d_inode->i_pipe;
	if (pipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (in->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &in->f_pos;

		ret = do_splice_to(in, off, pipe, len, flags);

		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	return -EINVAL;
}

/*
 * Map an iov into an array of pages and offset/length tuples. With the
 * partial_page structure, we can map several non-contiguous ranges into
 * our one pages[] map instead of splitting that operation into pieces.
 * Could easily be exported as a generic helper for other users, in which
 * case one would probably want to add a 'max_nr_pages' parameter as well.
 */
static int get_iovec_page_array(const struct iovec __user *iov,
				unsigned int nr_vecs, struct page **pages,
				struct partial_page *partial, int aligned)
{
	int buffers = 0, error = 0;

	/*
	 * It's ok to take the mmap_sem for reading, even
	 * across a "get_user()".
	 */
	down_read(&current->mm->mmap_sem);

	while (nr_vecs) {
		unsigned long off, npages;
		void __user *base;
		size_t len;
		int i;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		error = -EFAULT;
		if (unlikely(!base))
			break;

		/*
		 * Get this base offset and number of pages, then map
		 * in the user pages.
		 */
		off = (unsigned long) base & ~PAGE_MASK;

		/*
		 * If asked for alignment, the offset must be zero and the
		 * length a multiple of the PAGE_SIZE.
		 */
		error = -EINVAL;
		if (aligned && (off || len & ~PAGE_MASK))
			break;

		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages > PIPE_BUFFERS - buffers)
			npages = PIPE_BUFFERS - buffers;

		error = get_user_pages(current, current->mm,
				       (unsigned long) base, npages, 0, 0,
				       &pages[buffers], NULL);

		if (unlikely(error <= 0))
			break;

		/*
		 * Fill this contiguous range into the partial page map.
		 */
		for (i = 0; i < error; i++) {
			const int plen = min_t(size_t, len, PAGE_SIZE - off);

			partial[buffers].offset = off;
			partial[buffers].len = plen;

			off = 0;
			len -= plen;
			buffers++;
		}

		/*
		 * We didn't complete this iov, stop here since it probably
		 * means we have to move some of this into a pipe to
		 * be able to continue.
		 */
		if (len)
			break;

		/*
		 * Don't continue if we mapped fewer pages than we asked for,
		 * or if we mapped the max number of pages that we have
		 * room for.
		 */
		if (error < npages || buffers == PIPE_BUFFERS)
			break;

		nr_vecs--;
		iov++;
	}

	up_read(&current->mm->mmap_sem);

	if (buffers)
		return buffers;

	return error;
}

/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 *
 * Note that vmsplice only supports splicing _from_ user memory to a pipe,
 * not the other way around. Splicing from user memory is a simple operation
 * that can be supported without any funky alignment restrictions or nasty
 * vm tricks. We simply map in the user memory and fill it into a pipe.
 * The reverse isn't quite as easy, though. There are two possible solutions
 * for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
 *	  has restrictions on both ends of the pipe).
 *
 * Alas, it isn't here.
 *
 */
static long do_vmsplice(struct file *file, const struct iovec __user *iov,
			unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &user_page_pipe_buf_ops,
	};

	if (unlikely(!pipe))
		return -EBADF;
	if (unlikely(nr_segs > UIO_MAXIOV))
		return -EINVAL;
	else if (unlikely(!nr_segs))
		return 0;

	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
					    flags & SPLICE_F_GIFT);
	if (spd.nr_pages <= 0)
		return spd.nr_pages;

	return splice_to_pipe(pipe, &spd);
}

asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct file *file;
	long error;
	int fput;

	error = -EBADF;
	file = fget_light(fd, &fput);
	if (file) {
		if (file->f_mode & FMODE_WRITE)
			error = do_vmsplice(file, iov, nr_segs, flags);

		fput_light(file, fput);
	}

	return error;
}
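
/*
 * Userspace view of the vmsplice syscall above (hedged sketch): gifting
 * pages with SPLICE_F_GIFT lets a later SPLICE_F_MOVE splice steal them
 * instead of copying, which is why get_iovec_page_array() enforces page
 * alignment in that case. The gifted buffer must not be reused by the
 * caller afterwards.
 *
 *	struct iovec iov = {
 *		.iov_base = buf,	// page-aligned when gifting
 *		.iov_len = len,		// page-multiple when gifting
 *	};
 *
 *	vmsplice(pfd[1], &iov, 1, SPLICE_F_GIFT);
 */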

asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
			   int fd_out, loff_t __user *off_out,
			   size_t len, unsigned int flags)
{
	long error;
	struct file *in, *out;
	int fput_in, fput_out;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fd_in, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			out = fget_light(fd_out, &fput_out);
			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_splice(in, off_in,
							  out, off_out,
							  len, flags);
				fput_light(out, fput_out);
			}
		}

		fput_light(in, fput_in);
	}

	return error;
}

/*
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 */
static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (!pipe->nrbufs) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		pipe_wait(pipe);
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Make sure there's writeable room. Wait for room if we can, otherwise
 * return an appropriate error.
 */
static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs < PIPE_BUFFERS)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (pipe->nrbufs >= PIPE_BUFFERS) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			ret = -EPIPE;
			break;
		}
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Link contents of ipipe to opipe.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
		     struct pipe_inode_info *opipe,
		     size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	int ret = 0, i = 0, nbuf;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by inode address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	if (ipipe->inode < opipe->inode) {
		mutex_lock_nested(&ipipe->inode->i_mutex, I_MUTEX_PARENT);
		mutex_lock_nested(&opipe->inode->i_mutex, I_MUTEX_CHILD);
	} else {
		mutex_lock_nested(&opipe->inode->i_mutex, I_MUTEX_PARENT);
		mutex_lock_nested(&ipipe->inode->i_mutex, I_MUTEX_CHILD);
	}

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		/*
		 * If we have iterated all input buffers or ran out of
		 * output room, break.
		 */
		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
			break;

		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		ibuf->ops->get(ipipe, ibuf);

		obuf = opipe->bufs + nbuf;
		*obuf = *ibuf;

		/*
		 * Don't inherit the gift flag, we need to
		 * prevent multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;

		if (obuf->len > len)
			obuf->len = len;

		opipe->nrbufs++;
		ret += obuf->len;
		len -= obuf->len;
		i++;
	} while (len);

	mutex_unlock(&ipipe->inode->i_mutex);
	mutex_unlock(&opipe->inode->i_mutex);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0) {
		smp_mb();
		if (waitqueue_active(&opipe->wait))
			wake_up_interruptible(&opipe->wait);
		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
	}

	return ret;
}

/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants; currently the only
 * applicable one is SPLICE_F_NONBLOCK.
 */
static long do_tee(struct file *in, struct file *out, size_t len,
		   unsigned int flags)
{
	struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe;
	struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe;
	int ret = -EINVAL;

	/*
	 * Duplicate the contents of ipipe to opipe without actually
	 * copying the data.
	 */
	if (ipipe && opipe && ipipe != opipe) {
		/*
		 * Keep going, unless we encounter an error. The ipipe/opipe
		 * ordering doesn't really matter.
		 */
		ret = link_ipipe_prep(ipipe, flags);
		if (!ret) {
			ret = link_opipe_prep(opipe, flags);
			if (!ret) {
				ret = link_pipe(ipipe, opipe, len, flags);
				if (!ret && (flags & SPLICE_F_NONBLOCK))
					ret = -EAGAIN;
			}
		}
	}

	return ret;
}

asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
{
	struct file *in;
	int error, fput_in;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fdin, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			int fput_out;
			struct file *out = fget_light(fdout, &fput_out);

			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_tee(in, out, len, flags);
				fput_light(out, fput_out);
			}
		}
		fput_light(in, fput_in);
	}

	return error;
}
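
/*
 * Userspace view of the tee syscall above (hedged sketch, mirroring a
 * shell pipeline where both stdin and stdout are pipes): duplicate the
 * data to stdout without consuming it, then drain it into a file, which
 * is essentially tee(1):
 *
 *	for (;;) {
 *		ssize_t n = tee(STDIN_FILENO, STDOUT_FILENO, INT_MAX, 0);
 *		if (n <= 0)
 *			break;
 *		splice(STDIN_FILENO, NULL, file_fd, NULL, n,
 *		       SPLICE_F_MOVE);
 *	}
 */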