/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc. and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@suse.de>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/pipe_fs_i.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/uio.h>

struct partial_page {
	unsigned int offset;
	unsigned int len;
};

/*
 * Passed to splice_to_pipe
 */
struct splice_pipe_desc {
	struct page **pages;		/* page map */
	struct partial_page *partial;	/* pages[] may not be contig */
	int nr_pages;			/* number of pages in map */
	unsigned int flags;		/* splice flags */
	struct pipe_buf_operations *ops;/* ops associated with output pipe */
};

/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping = page_mapping(page);

	lock_page(page);

	WARN_ON(!PageUptodate(page));

	/*
	 * At least for ext2 with nobh option, we need to wait on writeback
	 * completing on this page, since we'll remove it from the pagecache.
	 * Otherwise truncate won't wait on the page, allowing the disk
	 * blocks to be reused by someone else before we actually wrote our
	 * data to them. Filesystem corruption ensues.
	 */
	wait_on_page_writeback(page);

	if (PagePrivate(page))
		try_to_release_page(page, mapping_gfp_mask(mapping));

	if (!remove_mapping(mapping, page)) {
		unlock_page(page);
		return 1;
	}

	buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU;
	return 0;
}

static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
					struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
	buf->page = NULL;
	buf->flags &= ~(PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU);
}

static void *page_cache_pipe_buf_map(struct file *file,
				     struct pipe_inode_info *info,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	if (!PageUptodate(page)) {
		lock_page(page);

		/*
		 * Page got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read error from disk.
		 */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		/*
		 * Page is ok after all, fall through to mapping.
		 */
		unlock_page(page);
	}

	return kmap(page);
error:
	unlock_page(page);
	return ERR_PTR(err);
}
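
/*
 * Illustration (hypothetical, not part of the original file): the
 * ->map()/->unmap() contract that the operations above implement. A
 * consumer maps the buffer, copies out of it, and unmaps it again;
 * 'dst' is assumed to hold at least buf->len bytes. This is a minimal
 * sketch of the pattern, pipe_to_file() further down is the real
 * in-tree user.
 */
static int example_copy_from_buf(struct file *file,
				 struct pipe_inode_info *info,
				 struct pipe_buffer *buf, char *dst)
{
	void *src = buf->ops->map(file, info, buf);

	if (IS_ERR(src))
		return PTR_ERR(src);

	memcpy(dst, src + buf->offset, buf->len);
	buf->ops->unmap(info, buf);
	return buf->len;
}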
static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
				      struct pipe_buffer *buf)
{
	kunmap(buf->page);
}

static void *user_page_pipe_buf_map(struct file *file,
				    struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	return kmap(buf->page);
}

static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	kunmap(buf->page);
}

static void page_cache_pipe_buf_get(struct pipe_inode_info *info,
				    struct pipe_buffer *buf)
{
	page_cache_get(buf->page);
}

static struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = page_cache_pipe_buf_map,
	.unmap = page_cache_pipe_buf_unmap,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = page_cache_pipe_buf_get,
};

static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	return 1;
}

static struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.map = user_page_pipe_buf_map,
	.unmap = user_page_pipe_buf_unmap,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = page_cache_pipe_buf_get,
};

/*
 * Pipe output worker. This sets up our pipe format with the page cache
 * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
 */
static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
			      struct splice_pipe_desc *spd)
{
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < PIPE_BUFFERS) {
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->ops = spd->ops;
			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->inode)
				do_wakeup = 1;

			if (!--spd->nr_pages)
				break;
			if (pipe->nrbufs < PIPE_BUFFERS)
				continue;

			break;
		}

		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}

	while (page_nr < spd->nr_pages)
		page_cache_release(spd->pages[page_nr++]);

	return ret;
}
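
/*
 * Illustration (hypothetical, not part of the original file): how a
 * caller would hand a single pinned page-cache page to splice_to_pipe().
 * The reference on 'page' is assumed to be already held and is handed
 * over to the pipe; __generic_file_splice_read() below is the real user
 * of this interface.
 */
static ssize_t example_single_page_to_pipe(struct pipe_inode_info *pipe,
					   struct page *page,
					   unsigned int offset,
					   unsigned int len,
					   unsigned int flags)
{
	struct page *pages[1] = { page };
	struct partial_page partial[1] = {
		{ .offset = offset, .len = len },
	};
	struct splice_pipe_desc spd = {
		.pages		= pages,
		.partial	= partial,
		.nr_pages	= 1,
		.flags		= flags,
		.ops		= &page_cache_pipe_buf_ops,
	};

	return splice_to_pipe(pipe, &spd);
}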
static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int loff, nr_pages;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize;
	size_t total_len;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
	};

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	if (nr_pages > PIPE_BUFFERS)
		nr_pages = PIPE_BUFFERS;

	/*
	 * Initiate read-ahead on this page range. However, don't call into
	 * read-ahead for a single page at a non-zero offset (we are likely
	 * doing a small-chunk splice and the page is already there).
	 */
	if (!loff || nr_pages > 1)
		page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);

	/*
	 * Now fill in the holes:
	 */
	error = 0;
	total_len = 0;

	/*
	 * Lookup the (hopefully) full range of pages we need.
	 */
	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);

	/*
	 * If find_get_pages_contig() returned fewer pages than we needed,
	 * allocate the rest.
	 */
	index += spd.nr_pages;
	while (spd.nr_pages < nr_pages) {
		/*
		 * Page could be there, find_get_pages_contig() breaks on
		 * the first hole.
		 */
		page = find_get_page(mapping, index);
		if (!page) {
			/*
			 * Page didn't exist, allocate one.
			 */
			page = page_cache_alloc_cold(mapping);
			if (!page)
				break;

			error = add_to_page_cache_lru(page, mapping, index,
						      mapping_gfp_mask(mapping));
			if (unlikely(error)) {
				page_cache_release(page);
				break;
			}
			/*
			 * add_to_page_cache() locks the page, unlock it
			 * to avoid convoluting the logic below even more.
			 */
			unlock_page(page);
		}

		pages[spd.nr_pages++] = page;
		index++;
	}

	/*
	 * Now loop over the map and see if we need to start IO on any
	 * pages, fill in the partial map, etc.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;
	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		/*
		 * this_len is the max we'll use from this page
		 */
		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = pages[page_nr];

		/*
		 * If the page isn't uptodate, we may need to start IO on it
		 */
		if (!PageUptodate(page)) {
			/*
			 * If in nonblocking mode, then don't block on
			 * waiting for an in-flight IO page.
			 */
			if (flags & SPLICE_F_NONBLOCK)
				break;

			lock_page(page);

			/*
			 * Page was truncated, stop here. If this isn't the
			 * first page, we'll just complete what we already
			 * added.
			 */
			if (!page->mapping) {
				unlock_page(page);
				break;
			}
			/*
			 * Page was already under IO and is now done, great.
			 */
			if (PageUptodate(page)) {
				unlock_page(page);
				goto fill_it;
			}

			/*
			 * Need to read in the page.
			 */
			error = mapping->a_ops->readpage(in, page);
			if (unlikely(error)) {
				/*
				 * We really should re-lookup the page here,
				 * but it complicates things a lot. Instead
				 * let's just do what we already stored, and
				 * we'll get it the next time we are called.
				 */
				if (error == AOP_TRUNCATED_PAGE)
					error = 0;

				break;
			}

			/*
			 * i_size must be checked after ->readpage().
			 */
			isize = i_size_read(mapping->host);
			end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
			if (unlikely(!isize || index > end_index))
				break;

			/*
			 * If this is the last page, see if we need to shrink
			 * the length and stop.
			 */
			if (end_index == index) {
				loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
				if (total_len + loff > isize)
					break;
				/*
				 * Force quit after adding this page.
				 */
				len = this_len;
				this_len = min(this_len, loff);
				loff = 0;
			}
		}
fill_it:
		partial[page_nr].offset = loff;
		partial[page_nr].len = this_len;
		len -= this_len;
		total_len += this_len;
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	/*
	 * Release any pages at the end, if we quit early. 'page_nr' is how
	 * far we got, 'nr_pages' is how many pages are in the map.
	 */
	while (page_nr < nr_pages)
		page_cache_release(pages[page_nr++]);

	if (spd.nr_pages)
		return splice_to_pipe(pipe, &spd);

	return error;
}
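
/*
 * Worked example of the index/offset arithmetic at the top of
 * __generic_file_splice_read(), assuming 4KB pages: splicing
 * len = 10000 bytes from *ppos = 5000 gives index = 1 (5000 >> 12),
 * loff = 904 (5000 & 4095) and nr_pages = 3 ((10000 + 904 + 4095) >> 12),
 * i.e. page cache pages 1..3, with only bytes 904 onward of the first
 * page going into the pipe.
 */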
/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in: file to splice from
 * @ppos: position in @in
 * @pipe: pipe to splice to
 * @len: number of bytes to splice
 * @flags: splice modifier flags
 *
 * Will read pages from the given file and fill them into a pipe.
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	ssize_t spliced;
	int ret;

	ret = 0;
	spliced = 0;

	while (len) {
		ret = __generic_file_splice_read(in, ppos, pipe, len, flags);

		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}

		*ppos += ret;
		len -= ret;
		spliced += ret;
	}

	if (spliced)
		return spliced;

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_read);

/*
 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 * using sendpage(). Return the number of bytes sent.
 */
static int pipe_to_sendpage(struct pipe_inode_info *info,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->file;
	loff_t pos = sd->pos;
	ssize_t ret;
	void *ptr;
	int more;

	/*
	 * Sub-optimal, but we are limited by the pipe ->map. We don't
	 * need a kmap'ed buffer here, we just want to make sure we
	 * have the page pinned if the pipe page originates from the
	 * page cache.
	 */
	ptr = buf->ops->map(file, info, buf);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;

	ret = file->f_op->sendpage(file, buf->page, buf->offset, sd->len,
				   &pos, more);

	buf->ops->unmap(info, buf);
	return ret;
}
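
/*
 * Userspace illustration (a sketch, not part of this file; assumes a
 * libc that exposes the splice(2) wrapper, otherwise invoke it via
 * syscall(2)). When feeding a socket in chunks, setting SPLICE_F_MORE
 * on all but the last chunk propagates the 'more' hint above, so the
 * network layer can defer pushing out a partial frame. 'left' is the
 * total number of bytes still to be sent, including this chunk:
 *
 *	ssize_t send_chunk(int pfd, int sock, size_t chunk, size_t left)
 *	{
 *		unsigned int flags = left > chunk ? SPLICE_F_MORE : 0;
 *
 *		return splice(pfd, NULL, sock, NULL, chunk, flags);
 *	}
 */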
/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option than
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	struct file *file = sd->file;
	struct address_space *mapping = file->f_mapping;
	gfp_t gfp_mask = mapping_gfp_mask(mapping);
	unsigned int offset, this_len;
	struct page *page;
	pgoff_t index;
	char *src;
	int ret;

	/*
	 * Make sure the data in this buffer is uptodate.
	 */
	src = buf->ops->map(file, info, buf);
	if (IS_ERR(src))
		return PTR_ERR(src);

	index = sd->pos >> PAGE_CACHE_SHIFT;
	offset = sd->pos & ~PAGE_CACHE_MASK;

	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

	/*
	 * Reuse buf page, if SPLICE_F_MOVE is set.
	 */
	if (sd->flags & SPLICE_F_MOVE) {
		/*
		 * If steal succeeds, buf->page is now pruned from the vm
		 * side (LRU and page cache) and we can reuse it. The page
		 * will also be locked on successful return.
		 */
		if (buf->ops->steal(info, buf))
			goto find_page;

		page = buf->page;
		if (add_to_page_cache(page, mapping, index, gfp_mask))
			goto find_page;

		if (!(buf->flags & PIPE_BUF_FLAG_LRU))
			lru_cache_add(page);
	} else {
find_page:
		page = find_lock_page(mapping, index);
		if (!page) {
			ret = -ENOMEM;
			page = page_cache_alloc_cold(mapping);
			if (unlikely(!page))
				goto out_nomem;

			/*
			 * This will also lock the page.
			 */
			ret = add_to_page_cache_lru(page, mapping, index,
						    gfp_mask);
			if (unlikely(ret))
				goto out;
		}

		/*
		 * We get here with the page locked. If the page is also
		 * uptodate, we don't need to do more. If it isn't, we
		 * may need to bring it in if we are not going to overwrite
		 * the full page.
		 */
		if (!PageUptodate(page)) {
			if (this_len < PAGE_CACHE_SIZE) {
				ret = mapping->a_ops->readpage(file, page);
				if (unlikely(ret))
					goto out;

				lock_page(page);

				if (!PageUptodate(page)) {
					/*
					 * Page got invalidated, repeat.
					 */
					if (!page->mapping) {
						unlock_page(page);
						page_cache_release(page);
						goto find_page;
					}
					ret = -EIO;
					goto out;
				}
			} else
				SetPageUptodate(page);
		}
	}

	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
	if (ret == AOP_TRUNCATED_PAGE) {
		page_cache_release(page);
		goto find_page;
	} else if (ret)
		goto out;

	if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) {
		char *dst = kmap_atomic(page, KM_USER0);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER0);
	}

	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
	if (ret == AOP_TRUNCATED_PAGE) {
		page_cache_release(page);
		goto find_page;
	} else if (ret)
		goto out;

	/*
	 * Return the number of bytes written.
	 */
	ret = this_len;
	mark_page_accessed(page);
	balance_dirty_pages_ratelimited(mapping);
out:
	/*
	 * Unlock before dropping our reference, so we never release a
	 * page that is still locked.
	 */
	unlock_page(page);
	if (!(buf->flags & PIPE_BUF_FLAG_STOLEN))
		page_cache_release(page);
out_nomem:
	buf->ops->unmap(info, buf);
	return ret;
}
/*
 * Pipe input worker. Most of this logic works like a regular pipe, the
 * key here is the 'actor' worker passed in that actually moves the data
 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	int ret, do_wakeup, err;
	struct splice_desc sd;

	ret = 0;
	do_wakeup = 0;

	sd.total_len = len;
	sd.flags = flags;
	sd.file = out;
	sd.pos = *ppos;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	for (;;) {
		if (pipe->nrbufs) {
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			struct pipe_buf_operations *ops = buf->ops;

			sd.len = buf->len;
			if (sd.len > sd.total_len)
				sd.len = sd.total_len;

			err = actor(pipe, buf, &sd);
			if (err <= 0) {
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			ret += err;
			buf->offset += err;
			buf->len -= err;

			sd.len -= err;
			sd.pos += err;
			sd.total_len -= err;
			if (sd.len)
				continue;

			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
				pipe->nrbufs--;
				if (pipe->inode)
					do_wakeup = 1;
			}

			if (!sd.total_len)
				break;
		}

		if (pipe->nrbufs)
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (ret)
				break;
		}

		if (flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(pipe);
	}

	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}

	return ret;
}
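
/*
 * Illustration (hypothetical, not part of the original file): the
 * smallest useful splice actor. It "writes" sd->len bytes by simply
 * discarding them, the splice equivalent of /dev/null, and makes the
 * actor contract explicit: return the number of bytes consumed, or a
 * negative error. It would be driven as
 * splice_from_pipe(pipe, out, ppos, len, flags, example_pipe_to_null).
 */
static int example_pipe_to_null(struct pipe_inode_info *info,
				struct pipe_buffer *buf,
				struct splice_desc *sd)
{
	/*
	 * Map and unmap purely to honor the buffer pinning rules; the
	 * data itself is thrown away.
	 */
	void *ptr = buf->ops->map(sd->file, info, buf);

	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	buf->ops->unmap(info, buf);
	return sd->len;
}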
/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @pipe: pipe info
 * @out: file to write to
 * @ppos: position in @out
 * @len: number of bytes to splice
 * @flags: splice modifier flags
 *
 * Will either move or copy pages (determined by @flags options) from
 * the given pipe inode to the given file.
 *
 */
ssize_t
generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	ssize_t ret;

	ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
	if (ret > 0) {
		struct inode *inode = mapping->host;

		*ppos += ret;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			int err;

			mutex_lock(&inode->i_mutex);
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);
			mutex_unlock(&inode->i_mutex);

			if (err)
				ret = err;
		}
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);

/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe: pipe to splice from
 * @out: socket to write to
 * @ppos: position in @out
 * @len: number of bytes to splice
 * @flags: splice modifier flags
 *
 * Will send @len bytes from the pipe to a network socket. No data copying
 * is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);

/*
 * Attempt to initiate a splice from pipe to file.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	int ret;

	if (unlikely(!out->f_op || !out->f_op->splice_write))
		return -EINVAL;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	ret = rw_verify_area(WRITE, out, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	return out->f_op->splice_write(pipe, out, ppos, len, flags);
}

/*
 * Attempt to initiate a splice from a file to a pipe.
 */
static long do_splice_to(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe, size_t len,
			 unsigned int flags)
{
	loff_t isize, left;
	int ret;

	if (unlikely(!in->f_op || !in->f_op->splice_read))
		return -EINVAL;

	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;

	ret = rw_verify_area(READ, in, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	isize = i_size_read(in->f_mapping->host);
	if (unlikely(*ppos >= isize))
		return 0;

	left = isize - *ppos;
	if (unlikely(left < len))
		len = left;

	return in->f_op->splice_read(in, ppos, pipe, len, flags);
}
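
/*
 * Illustration (a sketch, not part of the original file): a filesystem
 * typically advertises splice support by pointing its file_operations at
 * the generic helpers above. example_fops is a placeholder; the non-splice
 * methods shown are the usual generic ones a simple filesystem would use.
 */
static struct file_operations example_fops = {
	.read		= generic_file_read,
	.write		= generic_file_write,
	.mmap		= generic_file_mmap,
	.splice_read	= generic_file_splice_read,
	.splice_write	= generic_file_splice_write,
};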
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	loff_t out_off;
	umode_t i_mode;
	int i;

	/*
	 * We require the input to be a regular file, as we don't want to
	 * randomly drop data for eg socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = in->f_dentry->d_inode->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * Neither in nor out is a pipe, set up an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that.
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info(NULL);
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	out_off = 0;

	while (len) {
		size_t read_len, max_read_len;

		/*
		 * Do at most PIPE_BUFFERS pages worth of transfer:
		 */
		max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));

		ret = do_splice_to(in, ppos, pipe, max_read_len, flags);
		if (unlikely(ret < 0))
			goto out_release;

		read_len = ret;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = do_splice_from(pipe, out, &out_off, read_len,
				     flags & ~SPLICE_F_NONBLOCK);
		if (unlikely(ret < 0))
			goto out_release;

		bytes += ret;
		len -= ret;

		/*
		 * In nonblocking mode, if we got back a short read then
		 * that was due to either an IO error or due to the
		 * pagecache entry not being there. In the IO error case
		 * the _next_ splice attempt will produce a clean IO error
		 * return value (not a short read), so in both cases it's
		 * correct to break out of the loop here:
		 */
		if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
			break;
	}

	pipe->nrbufs = pipe->curbuf = 0;

	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < PIPE_BUFFERS; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops) {
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
		}
	}
	pipe->nrbufs = pipe->curbuf = 0;

	/*
	 * If we transferred some data, return the number of bytes:
	 */
	if (bytes > 0)
		return bytes;

	return ret;
}

EXPORT_SYMBOL(do_splice_direct);

/*
 * Determine where to splice to/from.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	loff_t offset, *off;
	long ret;

	pipe = in->f_dentry->d_inode->i_pipe;
	if (pipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (out->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &out->f_pos;

		ret = do_splice_from(pipe, out, off, len, flags);

		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	pipe = out->f_dentry->d_inode->i_pipe;
	if (pipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (in->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &in->f_pos;

		ret = do_splice_to(in, off, pipe, len, flags);

		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	return -EINVAL;
}
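
/*
 * Illustration (hypothetical in-kernel caller, not part of the original
 * file): do_splice_direct() can drive a file-to-file copy through the
 * per-task internal pipe, with no userspace-visible pipe involved:
 */
static long example_copy_range(struct file *in, struct file *out,
			       loff_t pos, size_t len)
{
	return do_splice_direct(in, &pos, out, len, 0);
}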
/*
 * Map an iov into an array of pages and offset/length tuples. With the
 * partial_page structure, we can map several non-contiguous ranges into
 * our one pages[] map instead of splitting that operation into pieces.
 * Could easily be exported as a generic helper for other users, in which
 * case one would probably want to add a 'max_nr_pages' parameter as well.
 */
static int get_iovec_page_array(const struct iovec __user *iov,
				unsigned int nr_vecs, struct page **pages,
				struct partial_page *partial)
{
	int buffers = 0, error = 0;

	/*
	 * It's ok to take the mmap_sem for reading, even
	 * across a "get_user()".
	 */
	down_read(&current->mm->mmap_sem);

	while (nr_vecs) {
		unsigned long off, npages;
		void __user *base;
		size_t len;
		int i;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. A zero-length read succeeds.
		 */
		if (unlikely(!len))
			break;
		error = -EFAULT;
		if (unlikely(!base))
			break;

		/*
		 * Get this base offset and number of pages, then map
		 * in the user pages.
		 */
		off = (unsigned long) base & ~PAGE_MASK;
		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages > PIPE_BUFFERS - buffers)
			npages = PIPE_BUFFERS - buffers;

		error = get_user_pages(current, current->mm,
				       (unsigned long) base, npages, 0, 0,
				       &pages[buffers], NULL);

		if (unlikely(error <= 0))
			break;

		/*
		 * Fill this contiguous range into the partial page map.
		 */
		for (i = 0; i < error; i++) {
			const int plen = min_t(size_t, len, PAGE_SIZE) - off;

			partial[buffers].offset = off;
			partial[buffers].len = plen;

			off = 0;
			len -= plen;
			buffers++;
		}

		/*
		 * We didn't complete this iov, stop here since it probably
		 * means we have to move some of this into a pipe to
		 * be able to continue.
		 */
		if (len)
			break;

		/*
		 * Don't continue if we mapped fewer pages than we asked for,
		 * or if we mapped the max number of pages that we have
		 * room for.
		 */
		if (error < npages || buffers == PIPE_BUFFERS)
			break;

		nr_vecs--;
		iov++;
	}

	up_read(&current->mm->mmap_sem);

	if (buffers)
		return buffers;

	return error;
}
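
/*
 * Worked example of the mapping above, assuming 4KB pages: an iovec of
 * iov_base = 0x1000f00, iov_len = 5000 gives off = 0xf00 (3840) and
 * npages = (3840 + 5000 + 4095) >> 12 = 3, producing the partial[]
 * entries {3840, 256}, {0, 4096} and {0, 648}, 5000 bytes in total.
 */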
/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 *
 * Note that vmsplice only supports splicing _from_ user memory to a pipe,
 * not the other way around. Splicing from user memory is a simple operation
 * that can be supported without any funky alignment restrictions or nasty
 * vm tricks. We simply map in the user memory and fill the pages into a
 * pipe. The reverse isn't quite as easy, though. There are two possible
 * solutions for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
 *	  has restrictions on both ends of the pipe).
 *
 * Alas, it isn't here.
 *
 */
static long do_vmsplice(struct file *file, const struct iovec __user *iov,
			unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &user_page_pipe_buf_ops,
	};

	if (unlikely(!pipe))
		return -EBADF;
	if (unlikely(nr_segs > UIO_MAXIOV))
		return -EINVAL;
	else if (unlikely(!nr_segs))
		return 0;

	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial);
	if (spd.nr_pages <= 0)
		return spd.nr_pages;

	return splice_to_pipe(pipe, &spd);
}

asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct file *file;
	long error;
	int fput;

	error = -EBADF;
	file = fget_light(fd, &fput);
	if (file) {
		if (file->f_mode & FMODE_WRITE)
			error = do_vmsplice(file, iov, nr_segs, flags);

		fput_light(file, fput);
	}

	return error;
}

asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
			   int fd_out, loff_t __user *off_out,
			   size_t len, unsigned int flags)
{
	long error;
	struct file *in, *out;
	int fput_in, fput_out;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fd_in, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			out = fget_light(fd_out, &fput_out);
			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_splice(in, off_in,
							  out, off_out,
							  len, flags);
				fput_light(out, fput_out);
			}
		}

		fput_light(in, fput_in);
	}

	return error;
}
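
/*
 * Userspace usage example (a sketch, not part of this file; assumes a
 * libc exposing the splice(2) wrapper, otherwise use syscall(2)). It
 * copies a file descriptor to stdout through an anonymous pipe:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int copy_to_stdout(int in_fd)
 *	{
 *		int pfd[2];
 *		long n;
 *
 *		if (pipe(pfd) < 0)
 *			return -1;
 *		do {
 *			n = splice(in_fd, NULL, pfd[1], NULL, 65536,
 *				   SPLICE_F_MOVE);
 *			if (n > 0)
 *				n = splice(pfd[0], NULL, STDOUT_FILENO,
 *					   NULL, n, SPLICE_F_MOVE);
 *		} while (n > 0);
 *		return n < 0 ? -1 : 0;
 *	}
 */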
/*
 * Link contents of ipipe to opipe.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
		     struct pipe_inode_info *opipe,
		     size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	int ret, do_wakeup, i, ipipe_first;

	ret = do_wakeup = ipipe_first = 0;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by inode address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	if (ipipe->inode < opipe->inode) {
		ipipe_first = 1;
		mutex_lock(&ipipe->inode->i_mutex);
		mutex_lock(&opipe->inode->i_mutex);
	} else {
		mutex_lock(&opipe->inode->i_mutex);
		mutex_lock(&ipipe->inode->i_mutex);
	}

	for (i = 0;; i++) {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}
		if (ipipe->nrbufs - i) {
			ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));

			/*
			 * If we have room, fill this buffer.
			 */
			if (opipe->nrbufs < PIPE_BUFFERS) {
				int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);

				/*
				 * Get a reference to this pipe buffer,
				 * so we can copy the contents over.
				 */
				ibuf->ops->get(ipipe, ibuf);

				obuf = opipe->bufs + nbuf;
				*obuf = *ibuf;

				if (obuf->len > len)
					obuf->len = len;

				opipe->nrbufs++;
				do_wakeup = 1;
				ret += obuf->len;
				len -= obuf->len;

				if (!len)
					break;
				if (opipe->nrbufs < PIPE_BUFFERS)
					continue;
			}

			/*
			 * We have input available, but no output room.
			 * If we already copied data, return that. If we
			 * need to drop the opipe lock, it must be ordered
			 * last to avoid deadlocks.
			 */
			if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) {
				if (!ret)
					ret = -EAGAIN;
				break;
			}
			if (signal_pending(current)) {
				if (!ret)
					ret = -ERESTARTSYS;
				break;
			}
			if (do_wakeup) {
				smp_mb();
				if (waitqueue_active(&opipe->wait))
					wake_up_interruptible(&opipe->wait);
				kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
				do_wakeup = 0;
			}

			opipe->waiting_writers++;
			pipe_wait(opipe);
			opipe->waiting_writers--;
			continue;
		}

		/*
		 * No input buffers, do the usual checks for available
		 * writers and blocking and wait if necessary.
		 */
		if (!ipipe->writers)
			break;
		if (!ipipe->waiting_writers) {
			if (ret)
				break;
		}
		/*
		 * pipe_wait() drops the ipipe mutex. To avoid deadlocks
		 * with another process, we can only safely do that if
		 * the ipipe lock is ordered last.
		 */
		if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (waitqueue_active(&ipipe->wait))
			wake_up_interruptible_sync(&ipipe->wait);
		kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT);

		pipe_wait(ipipe);
	}

	mutex_unlock(&ipipe->inode->i_mutex);
	mutex_unlock(&opipe->inode->i_mutex);

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&opipe->wait))
			wake_up_interruptible(&opipe->wait);
		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
	}

	return ret;
}

/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants, currently the only
 * applicable one is SPLICE_F_NONBLOCK.
 */
static long do_tee(struct file *in, struct file *out, size_t len,
		   unsigned int flags)
{
	struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe;
	struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe;

	/*
	 * Link ipipe to opipe, duplicating the buffer references without
	 * consuming the input.
	 */
	if (ipipe && opipe)
		return link_pipe(ipipe, opipe, len, flags);

	return -EINVAL;
}

asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
{
	struct file *in;
	int error, fput_in;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fdin, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			int fput_out;
			struct file *out = fget_light(fdout, &fput_out);

			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_tee(in, out, len, flags);
				fput_light(out, fput_out);
			}
		}
		fput_light(in, fput_in);
	}

	return error;
}
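
/*
 * Userspace usage example (a sketch, not part of this file; assumes a
 * libc exposing the tee(2) wrapper). With both stdin and stdout being
 * pipes, duplicate up to 'len' bytes from one to the other without
 * consuming the input and without copying it through userspace:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	long tee_stdin_to_stdout(size_t len)
 *	{
 *		return tee(STDIN_FILENO, STDOUT_FILENO, len,
 *			   SPLICE_F_NONBLOCK);
 *	}
 */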