1 /* 2 * "splice": joining two ropes together by interweaving their strands. 3 * 4 * This is the "extended pipe" functionality, where a pipe is used as 5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel 6 * buffer that you can use to transfer data from one end to the other. 7 * 8 * The traditional unix read/write is extended with a "splice()" operation 9 * that transfers data buffers to or from a pipe buffer. 10 * 11 * Named by Larry McVoy, original implementation from Linus, extended by 12 * Jens to support splicing to files and fixing the initial implementation 13 * bugs. 14 * 15 * Copyright (C) 2005 Jens Axboe <axboe@suse.de> 16 * Copyright (C) 2005 Linus Torvalds <torvalds@osdl.org> 17 * 18 */ 19 #include <linux/fs.h> 20 #include <linux/file.h> 21 #include <linux/pagemap.h> 22 #include <linux/pipe_fs_i.h> 23 #include <linux/mm_inline.h> 24 #include <linux/swap.h> 25 #include <linux/writeback.h> 26 #include <linux/buffer_head.h> 27 #include <linux/module.h> 28 #include <linux/syscalls.h> 29 30 /* 31 * Passed to the actors 32 */ 33 struct splice_desc { 34 unsigned int len, total_len; /* current and remaining length */ 35 unsigned int flags; /* splice flags */ 36 struct file *file; /* file to read/write */ 37 loff_t pos; /* file position */ 38 }; 39 40 /* 41 * Attempt to steal a page from a pipe buffer. This should perhaps go into 42 * a vm helper function, it's already simplified quite a bit by the 43 * addition of remove_mapping(). If success is returned, the caller may 44 * attempt to reuse this page for another destination. 45 */ 46 static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, 47 struct pipe_buffer *buf) 48 { 49 struct page *page = buf->page; 50 struct address_space *mapping = page_mapping(page); 51 52 WARN_ON(!PageLocked(page)); 53 WARN_ON(!PageUptodate(page)); 54 55 /* 56 * At least for ext2 with nobh option, we need to wait on writeback 57 * completing on this page, since we'll remove it from the pagecache. 
58 * Otherwise truncate wont wait on the page, allowing the disk 59 * blocks to be reused by someone else before we actually wrote our 60 * data to them. fs corruption ensues. 61 */ 62 wait_on_page_writeback(page); 63 64 if (PagePrivate(page)) 65 try_to_release_page(page, mapping_gfp_mask(mapping)); 66 67 if (!remove_mapping(mapping, page)) 68 return 1; 69 70 buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU; 71 return 0; 72 } 73 74 static void page_cache_pipe_buf_release(struct pipe_inode_info *info, 75 struct pipe_buffer *buf) 76 { 77 page_cache_release(buf->page); 78 buf->page = NULL; 79 buf->flags &= ~(PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU); 80 } 81 82 static void *page_cache_pipe_buf_map(struct file *file, 83 struct pipe_inode_info *info, 84 struct pipe_buffer *buf) 85 { 86 struct page *page = buf->page; 87 88 lock_page(page); 89 90 if (!PageUptodate(page)) { 91 unlock_page(page); 92 return ERR_PTR(-EIO); 93 } 94 95 if (!page->mapping) { 96 unlock_page(page); 97 return ERR_PTR(-ENODATA); 98 } 99 100 return kmap(buf->page); 101 } 102 103 static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, 104 struct pipe_buffer *buf) 105 { 106 unlock_page(buf->page); 107 kunmap(buf->page); 108 } 109 110 static struct pipe_buf_operations page_cache_pipe_buf_ops = { 111 .can_merge = 0, 112 .map = page_cache_pipe_buf_map, 113 .unmap = page_cache_pipe_buf_unmap, 114 .release = page_cache_pipe_buf_release, 115 .steal = page_cache_pipe_buf_steal, 116 }; 117 118 /* 119 * Pipe output worker. This sets up our pipe format with the page cache 120 * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). 
 */
static ssize_t move_to_pipe(struct inode *inode, struct page **pages,
			    int nr_pages, unsigned long offset,
			    unsigned long len, unsigned int flags)
{
	struct pipe_inode_info *info;
	int ret, do_wakeup, i;

	ret = 0;		/* bytes queued, or first error */
	do_wakeup = 0;		/* readers need waking once we add buffers */
	i = 0;			/* index of next page to consume from pages[] */

	mutex_lock(PIPE_MUTEX(*inode));

	info = inode->i_pipe;
	for (;;) {
		int bufs;

		/* no readers left: raise SIGPIPE like a regular pipe write */
		if (!PIPE_READERS(*inode)) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		bufs = info->nrbufs;
		if (bufs < PIPE_BUFFERS) {
			/* slot after the last occupied buffer (ring of PIPE_BUFFERS) */
			int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1);
			struct pipe_buffer *buf = info->bufs + newbuf;
			struct page *page = pages[i++];
			unsigned long this_len;

			/* first page may start at a non-zero offset */
			this_len = PAGE_CACHE_SIZE - offset;
			if (this_len > len)
				this_len = len;

			/* hand the page (and its reference) to the pipe */
			buf->page = page;
			buf->offset = offset;
			buf->len = this_len;
			buf->ops = &page_cache_pipe_buf_ops;
			info->nrbufs = ++bufs;
			do_wakeup = 1;

			ret += this_len;
			len -= this_len;
			offset = 0;	/* only the first page is partial */
			if (!--nr_pages)
				break;
			if (!len)
				break;
			if (bufs < PIPE_BUFFERS)
				continue;

			break;
		}

		/* pipe full: either bail out or wait for room */
		if (flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/* wake readers before sleeping so they can drain the pipe */
		if (do_wakeup) {
			wake_up_interruptible_sync(PIPE_WAIT(*inode));
			kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO,
				    POLL_IN);
			do_wakeup = 0;
		}

		PIPE_WAITING_WRITERS(*inode)++;
		pipe_wait(inode);
		PIPE_WAITING_WRITERS(*inode)--;
	}

	mutex_unlock(PIPE_MUTEX(*inode));

	/* final wakeup for any buffers added since the last one */
	if (do_wakeup) {
		wake_up_interruptible(PIPE_WAIT(*inode));
		kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
	}

	/* drop references on pages we never handed to the pipe */
	while (i < nr_pages)
		page_cache_release(pages[i++]);

	return ret;
}

/*
 * Read up to len bytes worth of pages (at most PIPE_BUFFERS pages) from
 * the file's page cache starting at the current file position, and feed
 * them to the pipe via move_to_pipe(). Missing pages are created and
 * read in.
 */
static int __generic_file_splice_read(struct file *in, struct inode *pipe,
				      size_t len, unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int offset, nr_pages;
	struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS];
	struct page *page;
	pgoff_t index, pidx;
	int i, j;

	index = in->f_pos >> PAGE_CACHE_SHIFT;
	offset = in->f_pos & ~PAGE_CACHE_MASK;
	/* round the byte range up to whole pages */
	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	if (nr_pages > PIPE_BUFFERS)
		nr_pages = PIPE_BUFFERS;

	/*
	 * initiate read-ahead on this page range
	 */
	do_page_cache_readahead(mapping, in, index, nr_pages);

	/*
	 * Get as many pages from the page cache as possible..
	 * Start IO on the page cache entries we create (we
	 * can assume that any pre-existing ones we find have
	 * already had IO started on them).
	 */
	i = find_get_pages(mapping, index, nr_pages, pages);

	/*
	 * common case - we found all pages and they are contiguous,
	 * kick them off
	 */
	if (i && (pages[i - 1]->index == index + i - 1))
		goto splice_them;

	/*
	 * fill shadow[] with pages at the right locations, so we only
	 * have to fill holes
	 */
	memset(shadow, 0, nr_pages * sizeof(struct page *));
	for (j = 0; j < i; j++)
		shadow[pages[j]->index - index] = pages[j];

	/*
	 * now fill in the holes
	 */
	for (i = 0, pidx = index; i < nr_pages; pidx++, i++) {
		int error;

		if (shadow[i])
			continue;

		/*
		 * no page there, look one up / create it
		 */
		page = find_or_create_page(mapping, pidx,
					   mapping_gfp_mask(mapping));
		if (!page)
			break;

		if (PageUptodate(page))
			unlock_page(page);
		else {
			error = mapping->a_ops->readpage(in, page);

			if (unlikely(error)) {
				page_cache_release(page);
				break;
			}
		}
		/*
		 * NOTE(review): if this loop breaks early at some i > 0,
		 * pages stashed in shadow[] beyond i keep their references
		 * and are never released below -- looks like a page
		 * reference leak; TODO confirm.
		 */
		shadow[i] = page;
	}

	if (!i) {
		/* nothing usable: drop any pages we did find and give up */
		for (i = 0; i < nr_pages; i++) {
			if (shadow[i])
				page_cache_release(shadow[i]);
		}
		return 0;
	}

	memcpy(pages, shadow, i * sizeof(struct page *));

	/*
	 * Now we splice them into the pipe..
	 */
splice_them:
	return move_to_pipe(pipe, pages, i, offset, len, flags);
}

/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in:		file to splice from
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will read pages from given file and fill them into a pipe.
 * Loops until @len bytes are spliced or a short/zero transfer occurs;
 * with SPLICE_F_NONBLOCK only a single attempt is made.
 */
ssize_t generic_file_splice_read(struct file *in, struct inode *pipe,
				 size_t len, unsigned int flags)
{
	ssize_t spliced;
	int ret;

	ret = 0;
	spliced = 0;
	while (len) {
		ret = __generic_file_splice_read(in, pipe, len, flags);

		if (ret <= 0)
			break;

		in->f_pos += ret;
		len -= ret;
		spliced += ret;

		/* non-blocking: don't loop, report EAGAIN if nothing was done */
		if (!(flags & SPLICE_F_NONBLOCK))
			continue;
		ret = -EAGAIN;
		break;
	}

	/* partial progress wins over a late error */
	if (spliced)
		return spliced;

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_read);

/*
 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 * using sendpage(). Returns 0 on full transfer, -EIO on short sends.
 */
static int pipe_to_sendpage(struct pipe_inode_info *info,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->file;
	loff_t pos = sd->pos;
	unsigned int offset;
	ssize_t ret;
	void *ptr;
	int more;

	/*
	 * sub-optimal, but we are limited by the pipe ->map. we don't
	 * need a kmap'ed buffer here, we just want to make sure we
	 * have the page pinned if the pipe page originates from the
	 * page cache
	 */
	ptr = buf->ops->map(file, info, buf);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	offset = pos & ~PAGE_CACHE_MASK;
	/* hint more data is coming if caller said so or bytes remain */
	more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;

	ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos, more);

	buf->ops->unmap(info, buf);
	if (ret == sd->len)
		return 0;

	return -EIO;
}

/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option that
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
403 */ 404 static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, 405 struct splice_desc *sd) 406 { 407 struct file *file = sd->file; 408 struct address_space *mapping = file->f_mapping; 409 gfp_t gfp_mask = mapping_gfp_mask(mapping); 410 unsigned int offset; 411 struct page *page; 412 pgoff_t index; 413 char *src; 414 int ret; 415 416 /* 417 * after this, page will be locked and unmapped 418 */ 419 src = buf->ops->map(file, info, buf); 420 if (IS_ERR(src)) 421 return PTR_ERR(src); 422 423 index = sd->pos >> PAGE_CACHE_SHIFT; 424 offset = sd->pos & ~PAGE_CACHE_MASK; 425 426 /* 427 * reuse buf page, if SPLICE_F_MOVE is set 428 */ 429 if (sd->flags & SPLICE_F_MOVE) { 430 /* 431 * If steal succeeds, buf->page is now pruned from the vm 432 * side (LRU and page cache) and we can reuse it. 433 */ 434 if (buf->ops->steal(info, buf)) 435 goto find_page; 436 437 page = buf->page; 438 if (add_to_page_cache(page, mapping, index, gfp_mask)) 439 goto find_page; 440 441 if (!(buf->flags & PIPE_BUF_FLAG_LRU)) 442 lru_cache_add(page); 443 } else { 444 find_page: 445 ret = -ENOMEM; 446 page = find_or_create_page(mapping, index, gfp_mask); 447 if (!page) 448 goto out; 449 450 /* 451 * If the page is uptodate, it is also locked. If it isn't 452 * uptodate, we can mark it uptodate if we are filling the 453 * full page. Otherwise we need to read it in first... 
454 */ 455 if (!PageUptodate(page)) { 456 if (sd->len < PAGE_CACHE_SIZE) { 457 ret = mapping->a_ops->readpage(file, page); 458 if (unlikely(ret)) 459 goto out; 460 461 lock_page(page); 462 463 if (!PageUptodate(page)) { 464 /* 465 * page got invalidated, repeat 466 */ 467 if (!page->mapping) { 468 unlock_page(page); 469 page_cache_release(page); 470 goto find_page; 471 } 472 ret = -EIO; 473 goto out; 474 } 475 } else { 476 WARN_ON(!PageLocked(page)); 477 SetPageUptodate(page); 478 } 479 } 480 } 481 482 ret = mapping->a_ops->prepare_write(file, page, 0, sd->len); 483 if (ret == AOP_TRUNCATED_PAGE) { 484 page_cache_release(page); 485 goto find_page; 486 } else if (ret) 487 goto out; 488 489 if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { 490 char *dst = kmap_atomic(page, KM_USER0); 491 492 memcpy(dst + offset, src + buf->offset, sd->len); 493 flush_dcache_page(page); 494 kunmap_atomic(dst, KM_USER0); 495 } 496 497 ret = mapping->a_ops->commit_write(file, page, 0, sd->len); 498 if (ret == AOP_TRUNCATED_PAGE) { 499 page_cache_release(page); 500 goto find_page; 501 } else if (ret) 502 goto out; 503 504 balance_dirty_pages_ratelimited(mapping); 505 out: 506 if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { 507 page_cache_release(page); 508 unlock_page(page); 509 } 510 buf->ops->unmap(info, buf); 511 return ret; 512 } 513 514 typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, 515 struct splice_desc *); 516 517 /* 518 * Pipe input worker. Most of this logic works like a regular pipe, the 519 * key here is the 'actor' worker passed in that actually moves the data 520 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. 
 */
static ssize_t move_from_pipe(struct inode *inode, struct file *out,
			      size_t len, unsigned int flags,
			      splice_actor *actor)
{
	struct pipe_inode_info *info;
	int ret, do_wakeup, err;
	struct splice_desc sd;

	ret = 0;		/* bytes consumed, or first error */
	do_wakeup = 0;		/* writers need waking once buffers free up */

	sd.total_len = len;
	sd.flags = flags;
	sd.file = out;
	sd.pos = out->f_pos;

	mutex_lock(PIPE_MUTEX(*inode));

	info = inode->i_pipe;
	for (;;) {
		int bufs = info->nrbufs;

		if (bufs) {
			int curbuf = info->curbuf;
			struct pipe_buffer *buf = info->bufs + curbuf;
			struct pipe_buf_operations *ops = buf->ops;

			/* never hand the actor more than it asked for */
			sd.len = buf->len;
			if (sd.len > sd.total_len)
				sd.len = sd.total_len;

			err = actor(info, buf, &sd);
			if (err) {
				/* -ENODATA is a benign "try next buffer" */
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			ret += sd.len;
			buf->offset += sd.len;
			buf->len -= sd.len;
			/* buffer fully consumed: release it and advance the ring */
			if (!buf->len) {
				buf->ops = NULL;
				ops->release(info, buf);
				curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1);
				info->curbuf = curbuf;
				info->nrbufs = --bufs;
				do_wakeup = 1;
			}

			sd.pos += sd.len;
			sd.total_len -= sd.len;
			if (!sd.total_len)
				break;
		}

		/* more buffers queued: keep draining */
		if (bufs)
			continue;
		/* pipe empty and no writers: EOF */
		if (!PIPE_WRITERS(*inode))
			break;
		/* no writer about to refill: return what we have, if anything */
		if (!PIPE_WAITING_WRITERS(*inode)) {
			if (ret)
				break;
		}

		if (flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/* wake writers before sleeping so they can refill the pipe */
		if (do_wakeup) {
			wake_up_interruptible_sync(PIPE_WAIT(*inode));
			kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(inode);
	}

	mutex_unlock(PIPE_MUTEX(*inode));

	if (do_wakeup) {
		wake_up_interruptible(PIPE_WAIT(*inode));
		kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
	}

	/* publish the new file position under the target inode's mutex */
	mutex_lock(&out->f_mapping->host->i_mutex);
	out->f_pos = sd.pos;
	mutex_unlock(&out->f_mapping->host->i_mutex);
	return ret;

}

/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @inode:	pipe inode
 * @out:	file to write to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will either move or copy pages (determined by @flags options) from
 * the given pipe inode to the given file.
 */
ssize_t generic_file_splice_write(struct inode *inode, struct file *out,
				  size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	ssize_t ret = move_from_pipe(inode, out, len, flags, pipe_to_file);

	/*
	 * if file or inode is SYNC and we actually wrote some data, sync it
	 */
	if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host))
	    && ret > 0) {
		/*
		 * NOTE(review): this local 'inode' shadows the pipe-inode
		 * parameter of the same name; it refers to the OUTPUT
		 * file's inode here. Consider renaming.
		 */
		struct inode *inode = mapping->host;
		int err;

		mutex_lock(&inode->i_mutex);
		err = generic_osync_inode(mapping->host, mapping,
					  OSYNC_METADATA|OSYNC_DATA);
		mutex_unlock(&inode->i_mutex);

		if (err)
			ret = err;
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);

/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @inode:	pipe inode
 * @out:	socket to write to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will send @len bytes from the pipe to a network socket. No data copying
 * is involved.
 */
ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
				size_t len, unsigned int flags)
{
	return move_from_pipe(inode, out, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);

/*
 * Attempt to initiate a splice from pipe to file.
683 */ 684 static long do_splice_from(struct inode *pipe, struct file *out, size_t len, 685 unsigned int flags) 686 { 687 loff_t pos; 688 int ret; 689 690 if (!out->f_op || !out->f_op->splice_write) 691 return -EINVAL; 692 693 if (!(out->f_mode & FMODE_WRITE)) 694 return -EBADF; 695 696 pos = out->f_pos; 697 ret = rw_verify_area(WRITE, out, &pos, len); 698 if (unlikely(ret < 0)) 699 return ret; 700 701 return out->f_op->splice_write(pipe, out, len, flags); 702 } 703 704 /* 705 * Attempt to initiate a splice from a file to a pipe. 706 */ 707 static long do_splice_to(struct file *in, struct inode *pipe, size_t len, 708 unsigned int flags) 709 { 710 loff_t pos, isize, left; 711 int ret; 712 713 if (!in->f_op || !in->f_op->splice_read) 714 return -EINVAL; 715 716 if (!(in->f_mode & FMODE_READ)) 717 return -EBADF; 718 719 pos = in->f_pos; 720 ret = rw_verify_area(READ, in, &pos, len); 721 if (unlikely(ret < 0)) 722 return ret; 723 724 isize = i_size_read(in->f_mapping->host); 725 if (unlikely(in->f_pos >= isize)) 726 return 0; 727 728 left = isize - in->f_pos; 729 if (left < len) 730 len = left; 731 732 return in->f_op->splice_read(in, pipe, len, flags); 733 } 734 735 /* 736 * Determine where to splice to/from. 
737 */ 738 static long do_splice(struct file *in, struct file *out, size_t len, 739 unsigned int flags) 740 { 741 struct inode *pipe; 742 743 pipe = in->f_dentry->d_inode; 744 if (pipe->i_pipe) 745 return do_splice_from(pipe, out, len, flags); 746 747 pipe = out->f_dentry->d_inode; 748 if (pipe->i_pipe) 749 return do_splice_to(in, pipe, len, flags); 750 751 return -EINVAL; 752 } 753 754 asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags) 755 { 756 long error; 757 struct file *in, *out; 758 int fput_in, fput_out; 759 760 if (unlikely(!len)) 761 return 0; 762 763 error = -EBADF; 764 in = fget_light(fdin, &fput_in); 765 if (in) { 766 if (in->f_mode & FMODE_READ) { 767 out = fget_light(fdout, &fput_out); 768 if (out) { 769 if (out->f_mode & FMODE_WRITE) 770 error = do_splice(in, out, len, flags); 771 fput_light(out, fput_out); 772 } 773 } 774 775 fput_light(in, fput_in); 776 } 777 778 return error; 779 } 780