/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/pipe_fs_i.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/uio.h>

struct partial_page {
	unsigned int offset;
	unsigned int len;
};

/*
 * Passed to splice_to_pipe
 */
struct splice_pipe_desc {
	struct page **pages;		/* page map */
	struct partial_page *partial;	/* pages[] may not be contig */
	int nr_pages;			/* number of pages in map */
	unsigned int flags;		/* splice flags */
	const struct pipe_buf_operations *ops;/* ops associated with output pipe */
};

/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping;

	lock_page(page);

	mapping = page_mapping(page);
	if (mapping) {
		WARN_ON(!PageUptodate(page));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this page, since we'll remove it
		 * from the pagecache. Otherwise truncate won't wait on the
		 * page, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		wait_on_page_writeback(page);

		if (PagePrivate(page))
			try_to_release_page(page, GFP_KERNEL);

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.
		 */
		if (remove_mapping(mapping, page)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return 0;
		}
	}

	/*
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
	unlock_page(page);
	return 1;
}

static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
					struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
	buf->flags &= ~PIPE_BUF_FLAG_LRU;
}

static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe,
				   struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	if (!PageUptodate(page)) {
		lock_page(page);

		/*
		 * Page got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read-error from disk.
		 */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		/*
		 * Page is ok after all, we are done.
		 */
		unlock_page(page);
	}

	return 0;
error:
	unlock_page(page);
	return err;
}

static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.pin = page_cache_pipe_buf_pin,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
		return 1;

	buf->flags |= PIPE_BUF_FLAG_LRU;
	return generic_pipe_buf_steal(pipe, buf);
}

static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.pin = generic_pipe_buf_pin,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

/*
 * Pipe output worker. This sets up our pipe format with the page cache
 * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
 */
static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
			      struct splice_pipe_desc *spd)
{
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < PIPE_BUFFERS) {
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->inode)
				do_wakeup = 1;

			if (!--spd->nr_pages)
				break;
			if (pipe->nrbufs < PIPE_BUFFERS)
				continue;

			break;
		}

		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}

	while (page_nr < spd->nr_pages)
		page_cache_release(spd->pages[page_nr++]);

	return ret;
}

static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int loff, nr_pages;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize;
	size_t total_len;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
	};

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	if (nr_pages > PIPE_BUFFERS)
		nr_pages = PIPE_BUFFERS;

	/*
	 * Initiate read-ahead on this page range. However, don't call into
	 * read-ahead if this is a non-zero offset (we are likely doing small
	 * chunk splice and the page is already there) for a single page.
	 */
	if (!loff || nr_pages > 1)
		page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);

	/*
	 * Now fill in the holes:
	 */
	error = 0;
	total_len = 0;

	/*
	 * Lookup the (hopefully) full range of pages we need.
	 */
	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);

	/*
	 * If find_get_pages_contig() returned fewer pages than we needed,
	 * allocate the rest.
	 */
	index += spd.nr_pages;
	while (spd.nr_pages < nr_pages) {
		/*
		 * Page could be there, find_get_pages_contig() breaks on
		 * the first hole.
		 */
		page = find_get_page(mapping, index);
		if (!page) {
			/*
			 * Make sure the read-ahead engine is notified
			 * about this failure.
			 */
			handle_ra_miss(mapping, &in->f_ra, index);

			/*
			 * page didn't exist, allocate one.
			 */
			page = page_cache_alloc_cold(mapping);
			if (!page)
				break;

			error = add_to_page_cache_lru(page, mapping, index,
						      GFP_KERNEL);
			if (unlikely(error)) {
				page_cache_release(page);
				if (error == -EEXIST)
					continue;
				break;
			}
			/*
			 * add_to_page_cache() locks the page, unlock it
			 * to avoid convoluting the logic below even more.
			 */
			unlock_page(page);
		}

		pages[spd.nr_pages++] = page;
		index++;
	}

	/*
	 * Now loop over the map and see if we need to start IO on any
	 * pages, fill in the partial map, etc.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;
	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		/*
		 * this_len is the max we'll use from this page
		 */
		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = pages[page_nr];

		/*
		 * If the page isn't uptodate, we may need to start io on it
		 */
		if (!PageUptodate(page)) {
			/*
			 * If in nonblock mode then don't block on waiting
			 * for an in-flight io page
			 */
			if (flags & SPLICE_F_NONBLOCK)
				break;

			lock_page(page);

			/*
			 * page was truncated, stop here. if this isn't the
			 * first page, we'll just complete what we already
			 * added
			 */
			if (!page->mapping) {
				unlock_page(page);
				break;
			}
			/*
			 * page was already under io and is now done, great
			 */
			if (PageUptodate(page)) {
				unlock_page(page);
				goto fill_it;
			}

			/*
			 * need to read in the page
			 */
			error = mapping->a_ops->readpage(in, page);
			if (unlikely(error)) {
				/*
				 * We really should re-lookup the page here,
				 * but it complicates things a lot. Instead
				 * let's just do what we already stored, and
				 * we'll get it the next time we are called.
				 */
				if (error == AOP_TRUNCATED_PAGE)
					error = 0;

				break;
			}

			/*
			 * i_size must be checked after ->readpage().
			 */
			isize = i_size_read(mapping->host);
			end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
			if (unlikely(!isize || index > end_index))
				break;

			/*
			 * if this is the last page, see if we need to shrink
			 * the length and stop
			 */
			if (end_index == index) {
				loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
				if (total_len + loff > isize)
					break;
				/*
				 * force quit after adding this page
				 */
				len = this_len;
				this_len = min(this_len, loff);
				loff = 0;
			}
		}
fill_it:
		partial[page_nr].offset = loff;
		partial[page_nr].len = this_len;
		len -= this_len;
		total_len += this_len;
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	/*
	 * Release any pages at the end, if we quit early. 'page_nr' is how
	 * far we got, 'nr_pages' is how many pages are in the map.
	 */
	while (page_nr < nr_pages)
		page_cache_release(pages[page_nr++]);

	if (spd.nr_pages)
		return splice_to_pipe(pipe, &spd);

	return error;
}

/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in:		file to splice from
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will read pages from given file and fill them into a pipe.
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	ssize_t spliced;
	int ret;

	ret = 0;
	spliced = 0;

	while (len) {
		ret = __generic_file_splice_read(in, ppos, pipe, len, flags);

		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}

		*ppos += ret;
		len -= ret;
		spliced += ret;
	}

	if (spliced)
		return spliced;

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_read);

/*
 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 * using sendpage(). Return the number of bytes sent.
 */
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->file;
	loff_t pos = sd->pos;
	int ret, more;

	ret = buf->ops->pin(pipe, buf);
	if (!ret) {
		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;

		ret = file->f_op->sendpage(file, buf->page, buf->offset,
					   sd->len, &pos, more);
	}

	return ret;
}
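
/*
 * Illustrative userspace use of the file -> pipe -> socket path built from
 * the helpers above (a hedged sketch, not part of this file): splice() first
 * fills the pipe from the file (ending up in generic_file_splice_read()),
 * then drains it into the socket (ending up in the ->sendpage() path).
 * Assumes a userspace splice() wrapper is available (otherwise syscall(2)
 * would be used); 'file_fd' and 'sock_fd' are hypothetical descriptors.
 *
 *	static int send_file_over_socket(int file_fd, int sock_fd)
 *	{
 *		int pfd[2];
 *		ssize_t n, m;
 *
 *		if (pipe(pfd))
 *			return -1;
 *		while ((n = splice(file_fd, NULL, pfd[1], NULL, 65536,
 *				   SPLICE_F_MOVE | SPLICE_F_MORE)) > 0) {
 *			while (n > 0) {
 *				m = splice(pfd[0], NULL, sock_fd, NULL, n,
 *					   SPLICE_F_MOVE | SPLICE_F_MORE);
 *				if (m <= 0)
 *					return -1;
 *				n -= m;
 *			}
 *		}
 *		return n < 0 ? -1 : 0;
 *	}
 */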

/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option than
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	struct file *file = sd->file;
	struct address_space *mapping = file->f_mapping;
	unsigned int offset, this_len;
	struct page *page;
	pgoff_t index;
	int ret;

	/*
	 * make sure the data in this buffer is uptodate
	 */
	ret = buf->ops->pin(pipe, buf);
	if (unlikely(ret))
		return ret;

	index = sd->pos >> PAGE_CACHE_SHIFT;
	offset = sd->pos & ~PAGE_CACHE_MASK;

	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

find_page:
	page = find_lock_page(mapping, index);
	if (!page) {
		ret = -ENOMEM;
		page = page_cache_alloc_cold(mapping);
		if (unlikely(!page))
			goto out_ret;

		/*
		 * This will also lock the page
		 */
		ret = add_to_page_cache_lru(page, mapping, index,
					    GFP_KERNEL);
		if (unlikely(ret))
			goto out;
	}

	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
	if (unlikely(ret)) {
		loff_t isize = i_size_read(mapping->host);

		if (ret != AOP_TRUNCATED_PAGE)
			unlock_page(page);
		page_cache_release(page);
		if (ret == AOP_TRUNCATED_PAGE)
			goto find_page;

		/*
		 * prepare_write() may have instantiated a few blocks
		 * outside i_size. Trim these off again.
		 */
		if (sd->pos + this_len > isize)
			vmtruncate(mapping->host, isize);

		goto out_ret;
	}

	if (buf->page != page) {
		/*
		 * Careful, ->map() uses KM_USER0!
		 */
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page, KM_USER1);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER1);
		buf->ops->unmap(pipe, buf, src);
	}

	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
	if (ret) {
		if (ret == AOP_TRUNCATED_PAGE) {
			page_cache_release(page);
			goto find_page;
		}
		if (ret < 0)
			goto out;
		/*
		 * Partial write has happened, so 'ret' is already initialized
		 * to the number of bytes written; there is nothing we have to
		 * do here.
		 */
	} else
		ret = this_len;
	/*
	 * Return the number of bytes written and mark page as
	 * accessed, we are now done!
	 */
	mark_page_accessed(page);
	balance_dirty_pages_ratelimited(mapping);
out:
	page_cache_release(page);
	unlock_page(page);
out_ret:
	return ret;
}

/*
 * Pipe input worker. Most of this logic works like a regular pipe, the
 * key here is the 'actor' worker passed in that actually moves the data
 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
			   struct file *out, loff_t *ppos, size_t len,
			   unsigned int flags, splice_actor *actor)
{
	int ret, do_wakeup, err;
	struct splice_desc sd;

	ret = 0;
	do_wakeup = 0;

	sd.total_len = len;
	sd.flags = flags;
	sd.file = out;
	sd.pos = *ppos;

	for (;;) {
		if (pipe->nrbufs) {
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			const struct pipe_buf_operations *ops = buf->ops;

			sd.len = buf->len;
			if (sd.len > sd.total_len)
				sd.len = sd.total_len;

			err = actor(pipe, buf, &sd);
			if (err <= 0) {
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			ret += err;
			buf->offset += err;
			buf->len -= err;

			sd.len -= err;
			sd.pos += err;
			sd.total_len -= err;
			if (sd.len)
				continue;

			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
				pipe->nrbufs--;
				if (pipe->inode)
					do_wakeup = 1;
			}

			if (!sd.total_len)
				break;
		}

		if (pipe->nrbufs)
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (ret)
				break;
		}

		if (flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(pipe);
	}

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}

	return ret;
}
EXPORT_SYMBOL(__splice_from_pipe);

ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	ssize_t ret;
	struct inode *inode = out->f_mapping->host;

	/*
	 * The actor worker might be calling ->prepare_write and
	 * ->commit_write. Most of the time, these expect i_mutex to
	 * be held. Since this may result in an ABBA deadlock with
	 * pipe->inode, we have to order lock acquiry here.
	 */
	inode_double_lock(inode, pipe->inode);
	ret = __splice_from_pipe(pipe, out, ppos, len, flags, actor);
	inode_double_unlock(inode, pipe->inode);

	return ret;
}
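
/*
 * Note on the actor contract used by __splice_from_pipe() above: the actor
 * is handed one pipe buffer at a time, may consume up to sd->len bytes
 * starting at buf->offset, and returns the number of bytes consumed or a
 * negative error. The loop then advances buf->offset/buf->len and sd->pos
 * by that amount. A hypothetical, minimal actor that just discards data
 * (a sketch for illustration only, not used anywhere in this file):
 *
 *	static int pipe_to_null(struct pipe_inode_info *pipe,
 *				struct pipe_buffer *buf,
 *				struct splice_desc *sd)
 *	{
 *		int ret = buf->ops->pin(pipe, buf);
 *
 *		if (ret)
 *			return ret;
 *		return sd->len;
 *	}
 *
 * Such an actor would then be driven with
 * splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_null).
 */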

/**
 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
 * @pipe:	pipe info
 * @out:	file to write to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will either move or copy pages (determined by @flags options) from
 * the given pipe inode to the given file. The caller is responsible
 * for acquiring i_mutex on both inodes.
 *
 */
ssize_t
generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
				 loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;
	int err;

	err = remove_suid(out->f_path.dentry);
	if (unlikely(err))
		return err;

	ret = __splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
	if (ret > 0) {
		*ppos += ret;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);

			if (err)
				ret = err;
		}
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write_nolock);

/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @pipe:	pipe info
 * @out:	file to write to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will either move or copy pages (determined by @flags options) from
 * the given pipe inode to the given file.
 *
 */
ssize_t
generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;
	int err;

	err = should_remove_suid(out->f_path.dentry);
	if (unlikely(err)) {
		mutex_lock(&inode->i_mutex);
		err = __remove_suid(out->f_path.dentry, err);
		mutex_unlock(&inode->i_mutex);
		if (err)
			return err;
	}

	ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
	if (ret > 0) {
		*ppos += ret;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			mutex_lock(&inode->i_mutex);
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);
			mutex_unlock(&inode->i_mutex);

			if (err)
				ret = err;
		}
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);

/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe:	pipe to splice from
 * @out:	socket to write to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will send @len bytes from the pipe to a network socket. No data copying
 * is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);

/*
 * Attempt to initiate a splice from pipe to file.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	int ret;

	if (unlikely(!out->f_op || !out->f_op->splice_write))
		return -EINVAL;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	ret = rw_verify_area(WRITE, out, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	return out->f_op->splice_write(pipe, out, ppos, len, flags);
}

/*
 * Attempt to initiate a splice from a file to a pipe.
 */
static long do_splice_to(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe, size_t len,
			 unsigned int flags)
{
	loff_t isize, left;
	int ret;

	if (unlikely(!in->f_op || !in->f_op->splice_read))
		return -EINVAL;

	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;

	ret = rw_verify_area(READ, in, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	isize = i_size_read(in->f_mapping->host);
	if (unlikely(*ppos >= isize))
		return 0;

	left = isize - *ppos;
	if (unlikely(left < len))
		len = left;

	return in->f_op->splice_read(in, ppos, pipe, len, flags);
}

long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	loff_t out_off;
	umode_t i_mode;
	int i;

	/*
	 * We require the input being a regular file, as we don't want to
	 * randomly drop data for eg socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = in->f_path.dentry->d_inode->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, setup an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info(NULL);
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	out_off = 0;

	while (len) {
		size_t read_len, max_read_len;

		/*
		 * Do at most PIPE_BUFFERS pages worth of transfer:
		 */
		max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));

		ret = do_splice_to(in, ppos, pipe, max_read_len, flags);
		if (unlikely(ret < 0))
			goto out_release;

		read_len = ret;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = do_splice_from(pipe, out, &out_off, read_len,
				     flags & ~SPLICE_F_NONBLOCK);
		if (unlikely(ret < 0))
			goto out_release;

		bytes += ret;
		len -= ret;

		/*
		 * In nonblocking mode, if we got back a short read then
		 * that was due to either an IO error or due to the
		 * pagecache entry not being there.
		 * In the IO error case the _next_ splice attempt will
		 * produce a clean IO error return value (not a short read),
		 * so in both cases it's correct to break out of the loop
		 * here:
		 */
		if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
			break;
	}

	pipe->nrbufs = pipe->curbuf = 0;

	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < PIPE_BUFFERS; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops) {
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
		}
	}
	pipe->nrbufs = pipe->curbuf = 0;

	/*
	 * If we transferred some data, return the number of bytes:
	 */
	if (bytes > 0)
		return bytes;

	return ret;
}

EXPORT_SYMBOL(do_splice_direct);

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
static inline struct pipe_inode_info *pipe_info(struct inode *inode)
{
	if (S_ISFIFO(inode->i_mode))
		return inode->i_pipe;

	return NULL;
}

/*
 * Determine where to splice to/from.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	loff_t offset, *off;
	long ret;

	pipe = pipe_info(in->f_path.dentry->d_inode);
	if (pipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (out->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &out->f_pos;

		ret = do_splice_from(pipe, out, off, len, flags);

		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	pipe = pipe_info(out->f_path.dentry->d_inode);
	if (pipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (in->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &in->f_pos;

		ret = do_splice_to(in, off, pipe, len, flags);

		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	return -EINVAL;
}
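
/*
 * Illustrative userspace view of the off_in/off_out handling in do_splice()
 * above (a hedged sketch, not part of this file): when the non-pipe side is
 * given an explicit offset, the kernel copies it in, splices from/to that
 * position and copies the updated offset back out, leaving the file's own
 * f_pos untouched. 'file_fd' and 'pfd[1]' are hypothetical descriptors.
 *
 *	loff_t off = 4096;
 *	ssize_t n;
 *
 *	n = splice(file_fd, &off, pfd[1], NULL, 8192, 0);
 *	if (n > 0)
 *		printf("spliced %zd bytes, file offset now %lld\n",
 *		       n, (long long) off);
 */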

/*
 * Map an iov into an array of pages and offset/length tuples. With the
 * partial_page structure, we can map several non-contiguous ranges into
 * our one pages[] map instead of splitting that operation into pieces.
 * Could easily be exported as a generic helper for other users, in which
 * case one would probably want to add a 'max_nr_pages' parameter as well.
 */
static int get_iovec_page_array(const struct iovec __user *iov,
				unsigned int nr_vecs, struct page **pages,
				struct partial_page *partial, int aligned)
{
	int buffers = 0, error = 0;

	/*
	 * It's ok to take the mmap_sem for reading, even
	 * across a "get_user()".
	 */
	down_read(&current->mm->mmap_sem);

	while (nr_vecs) {
		unsigned long off, npages;
		void __user *base;
		size_t len;
		int i;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		error = -EFAULT;
		if (unlikely(!base))
			break;

		/*
		 * Get this base offset and number of pages, then map
		 * in the user pages.
		 */
		off = (unsigned long) base & ~PAGE_MASK;

		/*
		 * If asked for alignment, the offset must be zero and the
		 * length a multiple of the PAGE_SIZE.
		 */
		error = -EINVAL;
		if (aligned && (off || len & ~PAGE_MASK))
			break;

		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages > PIPE_BUFFERS - buffers)
			npages = PIPE_BUFFERS - buffers;

		error = get_user_pages(current, current->mm,
				       (unsigned long) base, npages, 0, 0,
				       &pages[buffers], NULL);

		if (unlikely(error <= 0))
			break;

		/*
		 * Fill this contiguous range into the partial page map.
		 */
		for (i = 0; i < error; i++) {
			const int plen = min_t(size_t, len, PAGE_SIZE - off);

			partial[buffers].offset = off;
			partial[buffers].len = plen;

			off = 0;
			len -= plen;
			buffers++;
		}

		/*
		 * We didn't complete this iov, stop here since it probably
		 * means we have to move some of this into a pipe to
		 * be able to continue.
		 */
		if (len)
			break;

		/*
		 * Don't continue if we mapped fewer pages than we asked for,
		 * or if we mapped the max number of pages that we have
		 * room for.
		 */
		if (error < npages || buffers == PIPE_BUFFERS)
			break;

		nr_vecs--;
		iov++;
	}

	up_read(&current->mm->mmap_sem);

	if (buffers)
		return buffers;

	return error;
}

/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 *
 * Note that vmsplice only supports splicing _from_ user memory to a pipe,
 * not the other way around. Splicing from user memory is a simple operation
 * that can be supported without any funky alignment restrictions or nasty
 * vm tricks. We simply map in the user memory and fill it into a pipe.
 * The reverse isn't quite as easy, though. There are two possible solutions
 * for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
 *	  has restrictions on both ends of the pipe).
 *
 * Alas, it isn't here.
 *
 */
static long do_vmsplice(struct file *file, const struct iovec __user *iov,
			unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &user_page_pipe_buf_ops,
	};

	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;
	if (unlikely(nr_segs > UIO_MAXIOV))
		return -EINVAL;
	else if (unlikely(!nr_segs))
		return 0;

	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
					    flags & SPLICE_F_GIFT);
	if (spd.nr_pages <= 0)
		return spd.nr_pages;

	return splice_to_pipe(pipe, &spd);
}

asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct file *file;
	long error;
	int fput;

	error = -EBADF;
	file = fget_light(fd, &fput);
	if (file) {
		if (file->f_mode & FMODE_WRITE)
			error = do_vmsplice(file, iov, nr_segs, flags);

		fput_light(file, fput);
	}

	return error;
}

asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
			   int fd_out, loff_t __user *off_out,
			   size_t len, unsigned int flags)
{
	long error;
	struct file *in, *out;
	int fput_in, fput_out;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fd_in, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			out = fget_light(fd_out, &fput_out);
			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_splice(in, off_in,
							  out, off_out,
							  len, flags);
				fput_light(out, fput_out);
			}
		}

		fput_light(in, fput_in);
	}

	return error;
}
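
/*
 * Illustrative userspace counterpart of the two syscalls above (a hedged
 * sketch, not part of this file): vmsplice() gathers user memory into the
 * pipe, a following splice() moves it on to a file or socket. Assumes
 * vmsplice()/splice() wrappers are available; 'hdr', 'body', 'pfd' and
 * 'out_fd' are hypothetical.
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr,  .iov_len = hdr_len  },
 *		{ .iov_base = body, .iov_len = body_len },
 *	};
 *	ssize_t n;
 *
 *	n = vmsplice(pfd[1], iov, 2, 0);
 *	if (n > 0)
 *		n = splice(pfd[0], NULL, out_fd, NULL, n, SPLICE_F_MOVE);
 */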

/*
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 */
static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (!pipe->nrbufs) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		pipe_wait(pipe);
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Make sure there's writable room. Wait for room if we can, otherwise
 * return an appropriate error.
 */
static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs < PIPE_BUFFERS)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (pipe->nrbufs >= PIPE_BUFFERS) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			ret = -EPIPE;
			break;
		}
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Link contents of ipipe to opipe.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
		     struct pipe_inode_info *opipe,
		     size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	int ret = 0, i = 0, nbuf;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by inode address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	inode_double_lock(ipipe->inode, opipe->inode);

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		/*
		 * If we have iterated all input buffers or ran out of
		 * output room, break.
		 */
		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
			break;

		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		ibuf->ops->get(ipipe, ibuf);

		obuf = opipe->bufs + nbuf;
		*obuf = *ibuf;

		/*
		 * Don't inherit the gift flag, we need to
		 * prevent multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;

		if (obuf->len > len)
			obuf->len = len;

		opipe->nrbufs++;
		ret += obuf->len;
		len -= obuf->len;
		i++;
	} while (len);

	inode_double_unlock(ipipe->inode, opipe->inode);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0) {
		smp_mb();
		if (waitqueue_active(&opipe->wait))
			wake_up_interruptible(&opipe->wait);
		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
	}

	return ret;
}

/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants, currently the only
 * applicable one is SPLICE_F_NONBLOCK.
 */
static long do_tee(struct file *in, struct file *out, size_t len,
		   unsigned int flags)
{
	struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
	struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
	int ret = -EINVAL;

	/*
	 * Duplicate the contents of ipipe to opipe without actually
	 * copying the data.
	 */
	if (ipipe && opipe && ipipe != opipe) {
		/*
		 * Keep going, unless we encounter an error. The ipipe/opipe
		 * ordering doesn't really matter.
		 */
		ret = link_ipipe_prep(ipipe, flags);
		if (!ret) {
			ret = link_opipe_prep(opipe, flags);
			if (!ret) {
				ret = link_pipe(ipipe, opipe, len, flags);
				if (!ret && (flags & SPLICE_F_NONBLOCK))
					ret = -EAGAIN;
			}
		}
	}

	return ret;
}

asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
{
	struct file *in;
	int error, fput_in;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fdin, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			int fput_out;
			struct file *out = fget_light(fdout, &fput_out);

			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_tee(in, out, len, flags);
				fput_light(out, fput_out);
			}
		}
		fput_light(in, fput_in);
	}

	return error;
}
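
/*
 * Illustrative userspace use of tee(2) together with splice(2) (a hedged
 * sketch, not part of this file), modelled on the classic "tee to a log
 * file" pattern: stdin and stdout are both pipes, 'log_fd' is a
 * hypothetical regular file. tee() duplicates the data to stdout without
 * consuming it, splice() then drains stdin into the log file.
 *
 *	for (;;) {
 *		ssize_t n = tee(STDIN_FILENO, STDOUT_FILENO, INT_MAX, 0);
 *
 *		if (n <= 0)
 *			break;
 *		while (n > 0) {
 *			ssize_t m = splice(STDIN_FILENO, NULL, log_fd,
 *					   NULL, n, SPLICE_F_MOVE);
 *			if (m <= 0)
 *				break;
 *			n -= m;
 *		}
 *	}
 */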