/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/uio.h>
#include <linux/security.h>
#include <linux/gfp.h>
#include <linux/socket.h>
#include "internal.h"

/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping;

	lock_page(page);

	mapping = page_mapping(page);
	if (mapping) {
		WARN_ON(!PageUptodate(page));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this page, since we'll remove it
		 * from the pagecache. Otherwise truncate won't wait on the
		 * page, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		wait_on_page_writeback(page);

		if (page_has_private(page) &&
		    !try_to_release_page(page, GFP_KERNEL))
			goto out_unlock;

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.
		 */
		if (remove_mapping(mapping, page)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return 0;
		}
	}

	/*
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
out_unlock:
	unlock_page(page);
	return 1;
}

static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
					struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
	buf->flags &= ~PIPE_BUF_FLAG_LRU;
}

/*
 * Check whether the contents of buf are OK to access. Since the content
 * is a page cache page, IO may be in flight.
 */
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
				       struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	if (!PageUptodate(page)) {
		lock_page(page);

		/*
		 * Page got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read-error from disk.
		 */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		/*
		 * Page is ok after all, we are done.
		 */
		unlock_page(page);
	}

	return 0;
error:
	unlock_page(page);
	return err;
}

const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = page_cache_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
		return 1;

	buf->flags |= PIPE_BUF_FLAG_LRU;
	return generic_pipe_buf_steal(pipe, buf);
}

static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
	smp_mb();
	if (waitqueue_active(&pipe->wait))
		wake_up_interruptible(&pipe->wait);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}

/**
 * splice_to_pipe - fill passed data into a pipe
 * @pipe:	pipe to fill
 * @spd:	data to fill
 *
 * Description:
 *    @spd contains a map of pages and len/offset tuples, along with
 *    the struct pipe_buf_operations associated with these pages. This
 *    function will link that data to the pipe.
 *
 */
ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
		       struct splice_pipe_desc *spd)
{
	unsigned int spd_pages = spd->nr_pages;
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	pipe_lock(pipe);

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->private = spd->partial[page_nr].private;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->inode)
				do_wakeup = 1;

			if (!--spd->nr_pages)
				break;
			if (pipe->nrbufs < pipe->buffers)
				continue;

			break;
		}

		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	pipe_unlock(pipe);

	if (do_wakeup)
		wakeup_pipe_readers(pipe);

	while (page_nr < spd_pages)
		spd->spd_release(spd, page_nr++);

	return ret;
}

void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
{
	page_cache_release(spd->pages[i]);
}
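
/*
 * Example: the typical calling pattern for splice_to_pipe(). This is an
 * illustrative, simplified sketch of what __generic_file_splice_read()
 * below does; page lookup, partial lengths and error handling are
 * omitted, and 'some_page'/'flags' are stand-ins supplied by the caller:
 *
 *	struct page *pages[PIPE_DEF_BUFFERS];
 *	struct partial_page partial[PIPE_DEF_BUFFERS];
 *	struct splice_pipe_desc spd = {
 *		.pages = pages,
 *		.partial = partial,
 *		.nr_pages_max = PIPE_DEF_BUFFERS,
 *		.flags = flags,
 *		.ops = &page_cache_pipe_buf_ops,
 *		.spd_release = spd_release_page,
 *	};
 *
 *	pages[0] = some_page;		(caller holds a page reference)
 *	partial[0].offset = 0;
 *	partial[0].len = PAGE_CACHE_SIZE;
 *	spd.nr_pages = 1;
 *	ret = splice_to_pipe(pipe, &spd);
 *
 * On success the pipe owns the page references; any pages that did not
 * fit are handed back through spd->spd_release().
 */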

/*
 * Check if we need to grow the arrays holding pages and partial page
 * descriptions.
 */
int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
{
	unsigned int buffers = ACCESS_ONCE(pipe->buffers);

	spd->nr_pages_max = buffers;
	if (buffers <= PIPE_DEF_BUFFERS)
		return 0;

	spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL);
	spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL);

	if (spd->pages && spd->partial)
		return 0;

	kfree(spd->pages);
	kfree(spd->partial);
	return -ENOMEM;
}

void splice_shrink_spd(struct splice_pipe_desc *spd)
{
	if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
		return;

	kfree(spd->pages);
	kfree(spd->partial);
}

static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int loff, nr_pages, req_pages;
	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.nr_pages_max = PIPE_DEF_BUFFERS,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
		.spd_release = spd_release_page,
	};

	if (splice_grow_spd(pipe, &spd))
		return -ENOMEM;

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	nr_pages = min(req_pages, spd.nr_pages_max);

	/*
	 * Look up the (hopefully) full range of pages we need.
	 */
	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
	index += spd.nr_pages;

	/*
	 * If find_get_pages_contig() returned fewer pages than we needed,
	 * readahead/allocate the rest and fill in the holes.
	 */
	if (spd.nr_pages < nr_pages)
		page_cache_sync_readahead(mapping, &in->f_ra, in,
				index, req_pages - spd.nr_pages);

	error = 0;
	while (spd.nr_pages < nr_pages) {
		/*
		 * Page could be there, find_get_pages_contig() breaks on
		 * the first hole.
		 */
		page = find_get_page(mapping, index);
		if (!page) {
			/*
			 * page didn't exist, allocate one.
			 */
			page = page_cache_alloc_cold(mapping);
			if (!page)
				break;

			error = add_to_page_cache_lru(page, mapping, index,
						GFP_KERNEL);
			if (unlikely(error)) {
				page_cache_release(page);
				if (error == -EEXIST)
					continue;
				break;
			}
			/*
			 * add_to_page_cache() locks the page, unlock it
			 * to avoid convoluting the logic below even more.
			 */
			unlock_page(page);
		}

		spd.pages[spd.nr_pages++] = page;
		index++;
	}

	/*
	 * Now loop over the map and see if we need to start IO on any
	 * pages, fill in the partial map, etc.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;
	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		/*
		 * this_len is the max we'll use from this page
		 */
		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = spd.pages[page_nr];

		if (PageReadahead(page))
			page_cache_async_readahead(mapping, &in->f_ra, in,
					page, index, req_pages - page_nr);

		/*
		 * If the page isn't uptodate, we may need to start io on it
		 */
		if (!PageUptodate(page)) {
			lock_page(page);

			/*
			 * Page was truncated, or invalidated by the
			 * filesystem. Redo the find/create, but this time the
			 * page is kept locked, so there's no chance of another
			 * race with truncate/invalidate.
			 */
			if (!page->mapping) {
				unlock_page(page);
				page = find_or_create_page(mapping, index,
						mapping_gfp_mask(mapping));

				if (!page) {
					error = -ENOMEM;
					break;
				}
				page_cache_release(spd.pages[page_nr]);
				spd.pages[page_nr] = page;
			}
			/*
			 * page was already under io and is now done, great
			 */
			if (PageUptodate(page)) {
				unlock_page(page);
				goto fill_it;
			}

			/*
			 * need to read in the page
			 */
			error = mapping->a_ops->readpage(in, page);
			if (unlikely(error)) {
				/*
				 * We really should re-lookup the page here,
				 * but it complicates things a lot. Instead
				 * let's just do what we already stored, and
				 * we'll get it the next time we are called.
				 */
				if (error == AOP_TRUNCATED_PAGE)
					error = 0;

				break;
			}
		}
fill_it:
		/*
		 * i_size must be checked after PageUptodate.
		 */
		isize = i_size_read(mapping->host);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index))
			break;

		/*
		 * if this is the last page, see if we need to shrink
		 * the length and stop
		 */
		if (end_index == index) {
			unsigned int plen;

			/*
			 * max good bytes in this page
			 */
			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (plen <= loff)
				break;

			/*
			 * force quit after adding this page
			 */
			this_len = min(this_len, plen - loff);
			len = this_len;
		}

		spd.partial[page_nr].offset = loff;
		spd.partial[page_nr].len = this_len;
		len -= this_len;
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	/*
	 * Release any pages at the end, if we quit early. 'page_nr' is how far
	 * we got, 'nr_pages' is how many pages are in the map.
	 */
	while (page_nr < nr_pages)
		page_cache_release(spd.pages[page_nr++]);
	in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;

	if (spd.nr_pages)
		error = splice_to_pipe(pipe, &spd);

	splice_shrink_spd(&spd);
	return error;
}

/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in:		file to splice from
 * @ppos:	position in @in
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given file and fill them into a pipe. Can be
 *    used as long as the address_space operations for the source implement
 *    a readpage() hook.
 *
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	loff_t isize, left;
	int ret;

	isize = i_size_read(in->f_mapping->host);
	if (unlikely(*ppos >= isize))
		return 0;

	left = isize - *ppos;
	if (unlikely(left < len))
		len = left;

	ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
	if (ret > 0) {
		*ppos += ret;
		file_accessed(in);
	}

	return ret;
}
EXPORT_SYMBOL(generic_file_splice_read);

static const struct pipe_buf_operations default_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = generic_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
			    unsigned long vlen, loff_t offset)
{
	mm_segment_t old_fs;
	loff_t pos = offset;
	ssize_t res;

	old_fs = get_fs();
	set_fs(get_ds());
	/* The cast to a user pointer is valid due to the set_fs() */
	res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
	set_fs(old_fs);

	return res;
}

ssize_t kernel_write(struct file *file, const char *buf, size_t count,
		     loff_t pos)
{
	mm_segment_t old_fs;
	ssize_t res;

	old_fs = get_fs();
	set_fs(get_ds());
	/* The cast to a user pointer is valid due to the set_fs() */
	res = vfs_write(file, (__force const char __user *)buf, count, &pos);
	set_fs(old_fs);

	return res;
}
EXPORT_SYMBOL(kernel_write);
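
/*
 * Example: a filesystem normally opts into splice by pointing its
 * file_operations at the generic helpers. An illustrative sketch
 * ('myfs' is a made-up filesystem; other hooks omitted):
 *
 *	const struct file_operations myfs_file_operations = {
 *		.read		= do_sync_read,
 *		.write		= do_sync_write,
 *		.splice_read	= generic_file_splice_read,
 *		.splice_write	= generic_file_splice_write,
 *	};
 *
 * Files that provide no ->splice_read()/->splice_write() at all are
 * handled by default_file_splice_read() below and
 * default_file_splice_write() further down, which fall back to plain
 * reads/writes through temporary pages.
 */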

ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	unsigned int nr_pages;
	unsigned int nr_freed;
	size_t offset;
	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
	ssize_t res;
	size_t this_len;
	int error;
	int i;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.nr_pages_max = PIPE_DEF_BUFFERS,
		.flags = flags,
		.ops = &default_pipe_buf_ops,
		.spd_release = spd_release_page,
	};

	if (splice_grow_spd(pipe, &spd))
		return -ENOMEM;

	res = -ENOMEM;
	vec = __vec;
	if (spd.nr_pages_max > PIPE_DEF_BUFFERS) {
		vec = kmalloc(spd.nr_pages_max * sizeof(struct iovec), GFP_KERNEL);
		if (!vec)
			goto shrink_ret;
	}

	offset = *ppos & ~PAGE_CACHE_MASK;
	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) {
		struct page *page;

		page = alloc_page(GFP_USER);
		error = -ENOMEM;
		if (!page)
			goto err;

		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
		vec[i].iov_base = (void __user *) page_address(page);
		vec[i].iov_len = this_len;
		spd.pages[i] = page;
		spd.nr_pages++;
		len -= this_len;
		offset = 0;
	}

	res = kernel_readv(in, vec, spd.nr_pages, *ppos);
	if (res < 0) {
		error = res;
		goto err;
	}

	error = 0;
	if (!res)
		goto err;

	nr_freed = 0;
	for (i = 0; i < spd.nr_pages; i++) {
		this_len = min_t(size_t, vec[i].iov_len, res);
		spd.partial[i].offset = 0;
		spd.partial[i].len = this_len;
		if (!this_len) {
			__free_page(spd.pages[i]);
			spd.pages[i] = NULL;
			nr_freed++;
		}
		res -= this_len;
	}
	spd.nr_pages -= nr_freed;

	res = splice_to_pipe(pipe, &spd);
	if (res > 0)
		*ppos += res;

shrink_ret:
	if (vec != __vec)
		kfree(vec);
	splice_shrink_spd(&spd);
	return res;

err:
	for (i = 0; i < spd.nr_pages; i++)
		__free_page(spd.pages[i]);

	res = error;
	goto shrink_ret;
}
EXPORT_SYMBOL(default_file_splice_read);

/*
 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 * using sendpage(). Return the number of bytes sent.
 */
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	loff_t pos = sd->pos;
	int more;

	if (!likely(file->f_op && file->f_op->sendpage))
		return -EINVAL;

	more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;

	if (sd->len < sd->total_len && pipe->nrbufs > 1)
		more |= MSG_SENDPAGE_NOTLAST;

	return file->f_op->sendpage(file, buf->page, buf->offset,
				    sd->len, &pos, more);
}
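
/*
 * Example (userspace, illustrative): pipe_to_sendpage() above maps
 * SPLICE_F_MORE to MSG_MORE, so an application splicing a file to a TCP
 * socket in chunks can hint that more data follows and let the stack
 * coalesce frames. A sketch with error handling omitted:
 *
 *	while (left) {
 *		size_t chunk = left < CHUNK ? left : CHUNK;
 *		unsigned int fl = left > chunk ? SPLICE_F_MORE : 0;
 *		ssize_t n = splice(pipe_rfd, NULL, sock_fd, NULL, chunk, fl);
 *		if (n <= 0)
 *			break;
 *		left -= n;
 *	}
 */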

/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option than
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
		 struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	struct address_space *mapping = file->f_mapping;
	unsigned int offset, this_len;
	struct page *page;
	void *fsdata;
	int ret;

	offset = sd->pos & ~PAGE_CACHE_MASK;

	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

	ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
	if (unlikely(ret))
		goto out;

	if (buf->page != page) {
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst);
		buf->ops->unmap(pipe, buf, src);
	}
	ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
				page, fsdata);
out:
	return ret;
}
EXPORT_SYMBOL(pipe_to_file);

static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
	smp_mb();
	if (waitqueue_active(&pipe->wait))
		wake_up_interruptible(&pipe->wait);
	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}

/**
 * splice_from_pipe_feed - feed available data from a pipe to a file
 * @pipe:	pipe to splice from
 * @sd:		information to @actor
 * @actor:	handler that splices the data
 *
 * Description:
 *    This function loops over the pipe and calls @actor to do the
 *    actual moving of a single struct pipe_buffer to the desired
 *    destination. It returns when there are no more buffers left in
 *    the pipe or if the requested number of bytes (@sd->total_len)
 *    have been copied. It returns a positive number (one) if the
 *    pipe needs to be filled with more data, zero if the required
 *    number of bytes have been copied and -errno on error.
 *
 *    This, together with splice_from_pipe_{begin,end,next}, may be
 *    used to implement the functionality of __splice_from_pipe() when
 *    locking is required around copying the pipe buffers to the
 *    destination.
 */
int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
			  splice_actor *actor)
{
	int ret;

	while (pipe->nrbufs) {
		struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
		const struct pipe_buf_operations *ops = buf->ops;

		sd->len = buf->len;
		if (sd->len > sd->total_len)
			sd->len = sd->total_len;

		ret = buf->ops->confirm(pipe, buf);
		if (unlikely(ret)) {
			if (ret == -ENODATA)
				ret = 0;
			return ret;
		}

		ret = actor(pipe, buf, sd);
		if (ret <= 0)
			return ret;

		buf->offset += ret;
		buf->len -= ret;

		sd->num_spliced += ret;
		sd->len -= ret;
		sd->pos += ret;
		sd->total_len -= ret;

		if (!buf->len) {
			buf->ops = NULL;
			ops->release(pipe, buf);
			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
			pipe->nrbufs--;
			if (pipe->inode)
				sd->need_wakeup = true;
		}

		if (!sd->total_len)
			return 0;
	}

	return 1;
}
EXPORT_SYMBOL(splice_from_pipe_feed);
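
/*
 * Example: the locked-destination loop that the comment above alludes
 * to. This is the shape generic_file_splice_write() below uses; a
 * minimal sketch, with error handling and the suid/mtime updates left
 * out:
 *
 *	splice_from_pipe_begin(&sd);
 *	do {
 *		ret = splice_from_pipe_next(pipe, &sd);
 *		if (ret <= 0)
 *			break;
 *		mutex_lock(&inode->i_mutex);
 *		ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
 *		mutex_unlock(&inode->i_mutex);
 *	} while (ret > 0);
 *	splice_from_pipe_end(pipe, &sd);
 */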

/**
 * splice_from_pipe_next - wait for some data to splice from
 * @pipe:	pipe to splice from
 * @sd:		information about the splice operation
 *
 * Description:
 *    This function will wait for some data and return a positive
 *    value (one) if pipe buffers are available. It will return zero
 *    or -errno if no more data needs to be spliced.
 */
int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
	while (!pipe->nrbufs) {
		if (!pipe->writers)
			return 0;

		if (!pipe->waiting_writers && sd->num_spliced)
			return 0;

		if (sd->flags & SPLICE_F_NONBLOCK)
			return -EAGAIN;

		if (signal_pending(current))
			return -ERESTARTSYS;

		if (sd->need_wakeup) {
			wakeup_pipe_writers(pipe);
			sd->need_wakeup = false;
		}

		pipe_wait(pipe);
	}

	return 1;
}
EXPORT_SYMBOL(splice_from_pipe_next);

/**
 * splice_from_pipe_begin - start splicing from pipe
 * @sd:		information about the splice operation
 *
 * Description:
 *    This function should be called before a loop containing
 *    splice_from_pipe_next() and splice_from_pipe_feed() to
 *    initialize the necessary fields of @sd.
 */
void splice_from_pipe_begin(struct splice_desc *sd)
{
	sd->num_spliced = 0;
	sd->need_wakeup = false;
}
EXPORT_SYMBOL(splice_from_pipe_begin);

/**
 * splice_from_pipe_end - finish splicing from pipe
 * @pipe:	pipe to splice from
 * @sd:		information about the splice operation
 *
 * Description:
 *    This function will wake up pipe writers if necessary. It should
 *    be called after a loop containing splice_from_pipe_next() and
 *    splice_from_pipe_feed().
 */
void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
	if (sd->need_wakeup)
		wakeup_pipe_writers(pipe);
}
EXPORT_SYMBOL(splice_from_pipe_end);

/**
 * __splice_from_pipe - splice data from a pipe to given actor
 * @pipe:	pipe to splice from
 * @sd:		information to @actor
 * @actor:	handler that splices the data
 *
 * Description:
 *    This function does little more than loop over the pipe and call
 *    @actor to do the actual moving of a single struct pipe_buffer to
 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 *    pipe_to_user.
 *
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
			   splice_actor *actor)
{
	int ret;

	splice_from_pipe_begin(sd);
	do {
		ret = splice_from_pipe_next(pipe, sd);
		if (ret > 0)
			ret = splice_from_pipe_feed(pipe, sd, actor);
	} while (ret > 0);
	splice_from_pipe_end(pipe, sd);

	return sd->num_spliced ? sd->num_spliced : ret;
}
EXPORT_SYMBOL(__splice_from_pipe);

/**
 * splice_from_pipe - splice data from a pipe to a file
 * @pipe:	pipe to splice from
 * @out:	file to splice to
 * @ppos:	position in @out
 * @len:	how many bytes to splice
 * @flags:	splice modifier flags
 * @actor:	handler that splices the data
 *
 * Description:
 *    See __splice_from_pipe. This function locks the pipe inode,
 *    otherwise it's identical to __splice_from_pipe().
 *
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	ssize_t ret;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};

	pipe_lock(pipe);
	ret = __splice_from_pipe(pipe, &sd, actor);
	pipe_unlock(pipe);

	return ret;
}

/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file.
 *
 */
ssize_t
generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	ssize_t ret;

	sb_start_write(inode->i_sb);

	pipe_lock(pipe);

	splice_from_pipe_begin(&sd);
	do {
		ret = splice_from_pipe_next(pipe, &sd);
		if (ret <= 0)
			break;

		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
		ret = file_remove_suid(out);
		if (!ret) {
			ret = file_update_time(out);
			if (!ret)
				ret = splice_from_pipe_feed(pipe, &sd,
							    pipe_to_file);
		}
		mutex_unlock(&inode->i_mutex);
	} while (ret > 0);
	splice_from_pipe_end(pipe, &sd);

	pipe_unlock(pipe);

	if (sd.num_spliced)
		ret = sd.num_spliced;

	if (ret > 0) {
		int err;

		err = generic_write_sync(out, *ppos, ret);
		if (err)
			ret = err;
		else
			*ppos += ret;
		balance_dirty_pages_ratelimited(mapping);
	}
	sb_end_write(inode->i_sb);

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);

static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			  struct splice_desc *sd)
{
	int ret;
	void *data;
	loff_t tmp = sd->pos;

	data = buf->ops->map(pipe, buf, 0);
	ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
	buf->ops->unmap(pipe, buf, data);

	return ret;
}

static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
					 struct file *out, loff_t *ppos,
					 size_t len, unsigned int flags)
{
	ssize_t ret;

	ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
	if (ret > 0)
		*ppos += ret;

	return ret;
}
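
/*
 * Example: adding a new splice destination mostly means writing a
 * splice_actor and handing it to splice_from_pipe(), as
 * default_file_splice_write() above does with write_pipe_buf. An
 * illustrative, hypothetical actor that just discards pipe data (not an
 * existing kernel actor):
 *
 *	static int pipe_to_null(struct pipe_inode_info *pipe,
 *				struct pipe_buffer *buf,
 *				struct splice_desc *sd)
 *	{
 *		return sd->len;		(report sd->len bytes consumed)
 *	}
 *
 *	static ssize_t null_splice_write(struct pipe_inode_info *pipe,
 *					 struct file *out, loff_t *ppos,
 *					 size_t len, unsigned int flags)
 *	{
 *		return splice_from_pipe(pipe, out, ppos, len, flags,
 *					pipe_to_null);
 *	}
 */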

/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe:	pipe to splice from
 * @out:	socket to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will send @len bytes from the pipe to a network socket. No data copying
 *    is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);

/*
 * Attempt to initiate a splice from pipe to file.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
				loff_t *, size_t, unsigned int);
	int ret;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	if (unlikely(out->f_flags & O_APPEND))
		return -EINVAL;

	ret = rw_verify_area(WRITE, out, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	if (out->f_op && out->f_op->splice_write)
		splice_write = out->f_op->splice_write;
	else
		splice_write = default_file_splice_write;

	return splice_write(pipe, out, ppos, len, flags);
}

/*
 * Attempt to initiate a splice from a file to a pipe.
 */
static long do_splice_to(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe, size_t len,
			 unsigned int flags)
{
	ssize_t (*splice_read)(struct file *, loff_t *,
			       struct pipe_inode_info *, size_t, unsigned int);
	int ret;

	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;

	ret = rw_verify_area(READ, in, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	if (in->f_op && in->f_op->splice_read)
		splice_read = in->f_op->splice_read;
	else
		splice_read = default_file_splice_read;

	return splice_read(in, ppos, pipe, len, flags);
}

/**
 * splice_direct_to_actor - splices data directly between two non-pipes
 * @in:		file to splice from
 * @sd:		actor information on where to splice to
 * @actor:	handles the data splicing
 *
 * Description:
 *    This is a special case helper to splice directly between two
 *    points, without requiring an explicit pipe. Internally an allocated
 *    pipe is cached in the process, and reused during the lifetime of
 *    that process.
 *
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	umode_t i_mode;
	size_t len;
	int i, flags;

	/*
	 * We require the input to be a regular file, as we don't want to
	 * randomly drop data for eg socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = file_inode(in)->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, set up an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info(NULL);
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	len = sd->total_len;
	flags = sd->flags;

	/*
	 * Don't block on output, we have to drain the direct pipe.
	 */
	sd->flags &= ~SPLICE_F_NONBLOCK;

	while (len) {
		size_t read_len;
		loff_t pos = sd->pos, prev_pos = pos;

		ret = do_splice_to(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto out_release;

		read_len = ret;
		sd->total_len = read_len;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret <= 0)) {
			sd->pos = prev_pos;
			goto out_release;
		}

		bytes += ret;
		len -= ret;
		sd->pos = pos;

		if (ret < read_len) {
			sd->pos = prev_pos + ret;
			goto out_release;
		}
	}

done:
	pipe->nrbufs = pipe->curbuf = 0;
	file_accessed(in);
	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < pipe->buffers; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops) {
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
		}
	}

	if (!bytes)
		bytes = ret;

	goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);

static int direct_splice_actor(struct pipe_inode_info *pipe,
			       struct splice_desc *sd)
{
	struct file *file = sd->u.file;

	return do_splice_from(pipe, file, &file->f_pos, sd->total_len,
			      sd->flags);
}

/**
 * do_splice_direct - splices data directly between two files
 * @in:		file to splice from
 * @ppos:	input file offset
 * @out:	file to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    For use by do_sendfile(). splice can easily emulate sendfile, but
 *    doing it in the application would incur an extra system call
 *    (splice in + splice out, as compared to just sendfile()). So this helper
 *    can splice directly through a process-private pipe.
 *
 */
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      size_t len, unsigned int flags)
{
	struct splice_desc sd = {
		.len = len,
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	long ret;

	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
	if (ret > 0)
		*ppos = sd.pos;

	return ret;
}
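
/*
 * Example (userspace, illustrative): without do_splice_direct(), an
 * application emulating sendfile() needs an explicit pipe and two
 * splice() calls per chunk, which is exactly the extra syscall overhead
 * this helper avoids. Error handling omitted:
 *
 *	int p[2], ok = 1;
 *	pipe(p);
 *	while (ok) {
 *		ssize_t n = splice(file_fd, NULL, p[1], NULL, 65536, 0);
 *		if (n <= 0)
 *			break;
 *		while (n > 0) {
 *			ssize_t m = splice(p[0], NULL, sock_fd, NULL, n, 0);
 *			if (m <= 0) {
 *				ok = 0;
 *				break;
 *			}
 *			n -= m;
 *		}
 *	}
 */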

static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
			       struct pipe_inode_info *opipe,
			       size_t len, unsigned int flags);

/*
 * Determine where to splice to/from.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *ipipe;
	struct pipe_inode_info *opipe;
	loff_t offset, *off;
	long ret;

	ipipe = get_pipe_info(in);
	opipe = get_pipe_info(out);

	if (ipipe && opipe) {
		if (off_in || off_out)
			return -ESPIPE;

		if (!(in->f_mode & FMODE_READ))
			return -EBADF;

		if (!(out->f_mode & FMODE_WRITE))
			return -EBADF;

		/* Splicing to self would be fun, but... */
		if (ipipe == opipe)
			return -EINVAL;

		return splice_pipe_to_pipe(ipipe, opipe, len, flags);
	}

	if (ipipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (!(out->f_mode & FMODE_PWRITE))
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &out->f_pos;

		ret = do_splice_from(ipipe, out, off, len, flags);

		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	if (opipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (!(in->f_mode & FMODE_PREAD))
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &in->f_pos;

		ret = do_splice_to(in, off, opipe, len, flags);

		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	return -EINVAL;
}
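
/*
 * Example (userspace, illustrative): do_splice() above copies a caller
 * supplied offset in and writes the updated value back out, leaving the
 * file's own f_pos untouched. So reading from a fixed file offset into
 * a pipe looks like:
 *
 *	loff_t off = 4096;
 *	ssize_t n = splice(file_fd, &off, pipe_wfd, NULL, 8192, 0);
 *	(on success 'off' has advanced by n; file_fd's position is unchanged)
 */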

/*
 * Map an iov into an array of pages and offset/length tuples. With the
 * partial_page structure, we can map several non-contiguous ranges into
 * our one pages[] map instead of splitting that operation into pieces.
 * Could easily be exported as a generic helper for other users, in which
 * case one would probably want to add a 'max_nr_pages' parameter as well.
 */
static int get_iovec_page_array(const struct iovec __user *iov,
				unsigned int nr_vecs, struct page **pages,
				struct partial_page *partial, bool aligned,
				unsigned int pipe_buffers)
{
	int buffers = 0, error = 0;

	while (nr_vecs) {
		unsigned long off, npages;
		struct iovec entry;
		void __user *base;
		size_t len;
		int i;

		error = -EFAULT;
		if (copy_from_user(&entry, iov, sizeof(entry)))
			break;

		base = entry.iov_base;
		len = entry.iov_len;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		error = 0;
		if (unlikely(!len))
			break;
		error = -EFAULT;
		if (!access_ok(VERIFY_READ, base, len))
			break;

		/*
		 * Get this base offset and number of pages, then map
		 * in the user pages.
		 */
		off = (unsigned long) base & ~PAGE_MASK;

		/*
		 * If asked for alignment, the offset must be zero and the
		 * length a multiple of the PAGE_SIZE.
		 */
		error = -EINVAL;
		if (aligned && (off || len & ~PAGE_MASK))
			break;

		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages > pipe_buffers - buffers)
			npages = pipe_buffers - buffers;

		error = get_user_pages_fast((unsigned long)base, npages,
					0, &pages[buffers]);

		if (unlikely(error <= 0))
			break;

		/*
		 * Fill this contiguous range into the partial page map.
		 */
		for (i = 0; i < error; i++) {
			const int plen = min_t(size_t, len, PAGE_SIZE - off);

			partial[buffers].offset = off;
			partial[buffers].len = plen;

			off = 0;
			len -= plen;
			buffers++;
		}

		/*
		 * We didn't complete this iov, stop here since it probably
		 * means we have to move some of this into a pipe to
		 * be able to continue.
		 */
		if (len)
			break;

		/*
		 * Don't continue if we mapped fewer pages than we asked for,
		 * or if we mapped the max number of pages that we have
		 * room for.
		 */
		if (error < npages || buffers == pipe_buffers)
			break;

		nr_vecs--;
		iov++;
	}

	if (buffers)
		return buffers;

	return error;
}

static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	char *src;
	int ret;

	/*
	 * See if we can use the atomic maps, by prefaulting in the
	 * pages and doing an atomic copy
	 */
	if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
		src = buf->ops->map(pipe, buf, 1);
		ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
							sd->len);
		buf->ops->unmap(pipe, buf, src);
		if (!ret) {
			ret = sd->len;
			goto out;
		}
	}

	/*
	 * No dice, use slow non-atomic map and copy
	 */
	src = buf->ops->map(pipe, buf, 0);

	ret = sd->len;
	if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
		ret = -EFAULT;

	buf->ops->unmap(pipe, buf, src);
out:
	if (ret > 0)
		sd->u.userptr += ret;
	return ret;
}

/*
 * For lack of a better implementation, implement vmsplice() to userspace
 * as a simple copy of the pipe's pages to the user iov.
 */
static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct splice_desc sd;
	ssize_t size;
	int error;
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	pipe_lock(pipe);

	error = ret = 0;
	while (nr_segs) {
		void __user *base;
		size_t len;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		if (unlikely(!base)) {
			error = -EFAULT;
			break;
		}

		if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
			error = -EFAULT;
			break;
		}

		sd.len = 0;
		sd.total_len = len;
		sd.flags = flags;
		sd.u.userptr = base;
		sd.pos = 0;

		size = __splice_from_pipe(pipe, &sd, pipe_to_user);
		if (size < 0) {
			if (!ret)
				ret = size;

			break;
		}

		ret += size;

		if (size < len)
			break;

		nr_segs--;
		iov++;
	}

	pipe_unlock(pipe);

	if (!ret)
		ret = error;

	return ret;
}
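
/*
 * Example (userspace, illustrative): calling vmsplice() on the read end
 * of a pipe takes the vmsplice_to_user() path above and behaves much
 * like readv() on the pipe:
 *
 *	char a[4096], b[4096];
 *	struct iovec iov[2] = {
 *		{ .iov_base = a, .iov_len = sizeof(a) },
 *		{ .iov_base = b, .iov_len = sizeof(b) },
 *	};
 *	ssize_t n = vmsplice(pipe_rfd, iov, 2, 0);
 */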

/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 */
static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.nr_pages_max = PIPE_DEF_BUFFERS,
		.flags = flags,
		.ops = &user_page_pipe_buf_ops,
		.spd_release = spd_release_page,
	};
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	if (splice_grow_spd(pipe, &spd))
		return -ENOMEM;

	spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
					    spd.partial, false,
					    spd.nr_pages_max);
	if (spd.nr_pages <= 0)
		ret = spd.nr_pages;
	else
		ret = splice_to_pipe(pipe, &spd);

	splice_shrink_spd(&spd);
	return ret;
}

/*
 * Note that vmsplice only really supports true splicing _from_ user memory
 * to a pipe, not the other way around. Splicing from user memory is a simple
 * operation that can be supported without any funky alignment restrictions
 * or nasty vm tricks. We simply map in the user memory and fill it into
 * a pipe. The reverse isn't quite as easy, though. There are two possible
 * solutions for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
 *	  has restriction limitations on both ends of the pipe).
 *
 * Currently we punt and implement it as a normal copy, see pipe_to_user().
 *
 */
SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
		unsigned long, nr_segs, unsigned int, flags)
{
	struct fd f;
	long error;

	if (unlikely(nr_segs > UIO_MAXIOV))
		return -EINVAL;
	else if (unlikely(!nr_segs))
		return 0;

	error = -EBADF;
	f = fdget(fd);
	if (f.file) {
		if (f.file->f_mode & FMODE_WRITE)
			error = vmsplice_to_pipe(f.file, iov, nr_segs, flags);
		else if (f.file->f_mode & FMODE_READ)
			error = vmsplice_to_user(f.file, iov, nr_segs, flags);

		fdput(f);
	}

	return error;
}

SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
		int, fd_out, loff_t __user *, off_out,
		size_t, len, unsigned int, flags)
{
	struct fd in, out;
	long error;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fdget(fd_in);
	if (in.file) {
		if (in.file->f_mode & FMODE_READ) {
			out = fdget(fd_out);
			if (out.file) {
				if (out.file->f_mode & FMODE_WRITE)
					error = do_splice(in.file, off_in,
							  out.file, off_out,
							  len, flags);
				fdput(out);
			}
		}
		fdput(in);
	}
	return error;
}
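
/*
 * Example (userspace, illustrative): vmsplice() on the write end of a
 * pipe goes through vmsplice_to_pipe() above. With SPLICE_F_GIFT the
 * application promises not to touch the memory again, which is what
 * later allows user_page_pipe_buf_steal() to let a SPLICE_F_MOVE splice
 * steal the page instead of copying it. A sketch, assuming page-aligned,
 * page-sized buffers; error handling omitted:
 *
 *	void *buf;
 *	posix_memalign(&buf, 4096, 4096);
 *	memcpy(buf, data, 4096);
 *	struct iovec iov = { .iov_base = buf, .iov_len = 4096 };
 *	vmsplice(pipe_wfd, &iov, 1, SPLICE_F_GIFT);
 *	(buf must not be reused by the application after the gift)
 */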

/*
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 */
static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs)
		return 0;

	ret = 0;
	pipe_lock(pipe);

	while (!pipe->nrbufs) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		pipe_wait(pipe);
	}

	pipe_unlock(pipe);
	return ret;
}

/*
 * Make sure there's writable room. Wait for room if we can, otherwise
 * return an appropriate error.
 */
static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs < pipe->buffers)
		return 0;

	ret = 0;
	pipe_lock(pipe);

	while (pipe->nrbufs >= pipe->buffers) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			ret = -EPIPE;
			break;
		}
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	pipe_unlock(pipe);
	return ret;
}

/*
 * Splice contents of ipipe to opipe.
 */
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
			       struct pipe_inode_info *opipe,
			       size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	int ret = 0, nbuf;
	bool input_wakeup = false;


retry:
	ret = ipipe_prep(ipipe, flags);
	if (ret)
		return ret;

	ret = opipe_prep(opipe, flags);
	if (ret)
		return ret;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by pipe info address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	pipe_double_lock(ipipe, opipe);

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (!ipipe->nrbufs && !ipipe->writers)
			break;

		/*
		 * Cannot make any progress, because either the input
		 * pipe is empty or the output pipe is full.
		 */
		if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
			/* Already processed some buffers, break */
			if (ret)
				break;

			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}

			/*
			 * We raced with another reader/writer and haven't
			 * managed to process any buffers. A zero return
			 * value means EOF, so retry instead.
			 */
			pipe_unlock(ipipe);
			pipe_unlock(opipe);
			goto retry;
		}

		ibuf = ipipe->bufs + ipipe->curbuf;
		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
		obuf = opipe->bufs + nbuf;

		if (len >= ibuf->len) {
			/*
			 * Simply move the whole buffer from ipipe to opipe
			 */
			*obuf = *ibuf;
			ibuf->ops = NULL;
			opipe->nrbufs++;
			ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
			ipipe->nrbufs--;
			input_wakeup = true;
		} else {
			/*
			 * Get a reference to this pipe buffer,
			 * so we can copy the contents over.
			 */
			ibuf->ops->get(ipipe, ibuf);
			*obuf = *ibuf;

			/*
			 * Don't inherit the gift flag, we need to
			 * prevent multiple steals of this page.
			 */
			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;

			obuf->len = len;
			opipe->nrbufs++;
			ibuf->offset += obuf->len;
			ibuf->len -= obuf->len;
		}
		ret += obuf->len;
		len -= obuf->len;
	} while (len);

	pipe_unlock(ipipe);
	pipe_unlock(opipe);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0)
		wakeup_pipe_readers(opipe);

	if (input_wakeup)
		wakeup_pipe_writers(ipipe);

	return ret;
}

/*
 * Link contents of ipipe to opipe.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
		     struct pipe_inode_info *opipe,
		     size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	int ret = 0, i = 0, nbuf;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by pipe info address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	pipe_double_lock(ipipe, opipe);

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		/*
		 * If we have iterated all input buffers or run out of
		 * output room, break.
		 */
		if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
			break;

		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		ibuf->ops->get(ipipe, ibuf);

		obuf = opipe->bufs + nbuf;
		*obuf = *ibuf;

		/*
		 * Don't inherit the gift flag, we need to
		 * prevent multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;

		if (obuf->len > len)
			obuf->len = len;

		opipe->nrbufs++;
		ret += obuf->len;
		len -= obuf->len;
		i++;
	} while (len);

	/*
	 * return EAGAIN if we have the potential of some data in the
	 * future, otherwise just return 0
	 */
	if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
		ret = -EAGAIN;

	pipe_unlock(ipipe);
	pipe_unlock(opipe);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0)
		wakeup_pipe_readers(opipe);

	return ret;
}
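
/*
 * Example (userspace, illustrative): tee() duplicates pipe contents
 * without consuming them, so a process can fan the same data out to a
 * log and to its real destination. Error handling omitted:
 *
 *	ssize_t n = tee(pipe_rfd, log_pipe_wfd, INT_MAX, SPLICE_F_NONBLOCK);
 *	if (n > 0)
 *		splice(pipe_rfd, NULL, out_fd, NULL, n, 0);
 */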

/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants, currently the only
 * applicable one is SPLICE_F_NONBLOCK.
 */
static long do_tee(struct file *in, struct file *out, size_t len,
		   unsigned int flags)
{
	struct pipe_inode_info *ipipe = get_pipe_info(in);
	struct pipe_inode_info *opipe = get_pipe_info(out);
	int ret = -EINVAL;

	/*
	 * Duplicate the contents of ipipe to opipe without actually
	 * copying the data.
	 */
	if (ipipe && opipe && ipipe != opipe) {
		/*
		 * Keep going, unless we encounter an error. The ipipe/opipe
		 * ordering doesn't really matter.
		 */
		ret = ipipe_prep(ipipe, flags);
		if (!ret) {
			ret = opipe_prep(opipe, flags);
			if (!ret)
				ret = link_pipe(ipipe, opipe, len, flags);
		}
	}

	return ret;
}

SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
{
	struct fd in;
	int error;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fdget(fdin);
	if (in.file) {
		if (in.file->f_mode & FMODE_READ) {
			struct fd out = fdget(fdout);
			if (out.file) {
				if (out.file->f_mode & FMODE_WRITE)
					error = do_tee(in.file, out.file,
						       len, flags);
				fdput(out);
			}
		}
		fdput(in);
	}

	return error;
}