/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/uio.h>
#include <linux/security.h>
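
/*
 * Illustrative userspace sketch (not part of this file): moving data
 * from a file to a socket through a pipe with two splice() calls.
 * 'file_fd' and 'sock_fd' are hypothetical, already-open descriptors;
 * error handling is omitted:
 *
 *	int pfd[2];
 *	pipe(pfd);
 *	for (;;) {
 *		ssize_t n = splice(file_fd, NULL, pfd[1], NULL,
 *				   65536, SPLICE_F_MOVE);
 *		if (n <= 0)
 *			break;
 *		while (n > 0) {
 *			ssize_t m = splice(pfd[0], NULL, sock_fd, NULL,
 *					   n, SPLICE_F_MOVE | SPLICE_F_MORE);
 *			if (m <= 0)
 *				break;
 *			n -= m;
 *		}
 *	}
 */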

/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function; it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping;

	lock_page(page);

	mapping = page_mapping(page);
	if (mapping) {
		WARN_ON(!PageUptodate(page));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this page, since we'll remove it
		 * from the pagecache. Otherwise truncate won't wait on the
		 * page, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		wait_on_page_writeback(page);

		if (PagePrivate(page))
			try_to_release_page(page, GFP_KERNEL);

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.
		 */
		if (remove_mapping(mapping, page)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return 0;
		}
	}

	/*
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
	unlock_page(page);
	return 1;
}

static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
					struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
	buf->flags &= ~PIPE_BUF_FLAG_LRU;
}

/*
 * Check whether the contents of buf are OK to access. Since the content
 * is a page cache page, IO may be in flight.
 */
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
				       struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	if (!PageUptodate(page)) {
		lock_page(page);

		/*
		 * Page got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read-error from disk.
		 */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		/*
		 * Page is OK after all, we are done.
		 */
		unlock_page(page);
	}

	return 0;
error:
	unlock_page(page);
	return err;
}

static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = page_cache_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
		return 1;

	buf->flags |= PIPE_BUF_FLAG_LRU;
	return generic_pipe_buf_steal(pipe, buf);
}

static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
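
/*
 * Note the difference between the two ops tables above: page cache
 * buffers must revalidate their page in ->confirm() (IO may still be
 * in flight), while user pages are only stealable if they were gifted
 * to the pipe with SPLICE_F_GIFT. Both set .can_merge = 0, since
 * appending new data to a page that is merely referenced here would
 * scribble on page cache or user memory.
 */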

/**
 * splice_to_pipe - fill passed data into a pipe
 * @pipe:	pipe to fill
 * @spd:	data to fill
 *
 * Description:
 *    @spd contains a map of pages and len/offset tuples, along with
 *    the struct pipe_buf_operations associated with these pages. This
 *    function will link that data to the pipe.
 *
 */
ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
		       struct splice_pipe_desc *spd)
{
	unsigned int spd_pages = spd->nr_pages;
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < PIPE_BUFFERS) {
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->private = spd->partial[page_nr].private;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->inode)
				do_wakeup = 1;

			if (!--spd->nr_pages)
				break;
			if (pipe->nrbufs < PIPE_BUFFERS)
				continue;

			break;
		}

		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	if (pipe->inode) {
		mutex_unlock(&pipe->inode->i_mutex);

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		}
	}

	while (page_nr < spd_pages)
		page_cache_release(spd->pages[page_nr++]);

	return ret;
}
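
/*
 * Minimal caller sketch (assumptions: 'page' is a referenced page and
 * 'pipe' a valid pipe; error handling omitted). This mirrors what
 * __generic_file_splice_read() below does for a whole range:
 *
 *	struct page *pages[1] = { page };
 *	struct partial_page partial[1] = {
 *		{ .offset = 0, .len = PAGE_CACHE_SIZE },
 *	};
 *	struct splice_pipe_desc spd = {
 *		.pages = pages,
 *		.partial = partial,
 *		.nr_pages = 1,
 *		.flags = 0,
 *		.ops = &page_cache_pipe_buf_ops,
 *	};
 *	ssize_t ret = splice_to_pipe(pipe, &spd);
 *
 * splice_to_pipe() consumes the references on the pages it queues and
 * drops the references on any pages it could not queue, so the caller
 * must hold one reference per page passed in.
 */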

static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int loff, nr_pages, req_pages;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
	};

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);
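
	/*
	 * Worked example for the math above, assuming 4k pages:
	 * *ppos = 5000 and len = 10000 give index = 1 (first page
	 * touched), loff = 904 (offset within that page) and
	 * req_pages = (10000 + 904 + 4095) >> 12 = 3, i.e. the
	 * request spans pages 1..3 of the file.
	 */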

	/*
	 * Look up the (hopefully) full range of pages we need.
	 */
	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
	index += spd.nr_pages;

	/*
	 * If find_get_pages_contig() returned fewer pages than we needed,
	 * readahead/allocate the rest and fill in the holes.
	 */
	if (spd.nr_pages < nr_pages)
		page_cache_sync_readahead(mapping, &in->f_ra, in,
					  index, req_pages - spd.nr_pages);

	error = 0;
	while (spd.nr_pages < nr_pages) {
		/*
		 * Page could be there, find_get_pages_contig() breaks on
		 * the first hole.
		 */
		page = find_get_page(mapping, index);
		if (!page) {
			/*
			 * page didn't exist, allocate one.
			 */
			page = page_cache_alloc_cold(mapping);
			if (!page)
				break;

			error = add_to_page_cache_lru(page, mapping, index,
						      GFP_KERNEL);
			if (unlikely(error)) {
				page_cache_release(page);
				if (error == -EEXIST)
					continue;
				break;
			}
			/*
			 * add_to_page_cache() locks the page, unlock it
			 * to avoid convoluting the logic below even more.
			 */
			unlock_page(page);
		}

		pages[spd.nr_pages++] = page;
		index++;
	}

	/*
	 * Now loop over the map and see if we need to start IO on any
	 * pages, fill in the partial map, etc.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;
	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		/*
		 * this_len is the max we'll use from this page
		 */
		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = pages[page_nr];

		if (PageReadahead(page))
			page_cache_async_readahead(mapping, &in->f_ra, in,
						   page, index, req_pages - page_nr);

		/*
		 * If the page isn't uptodate, we may need to start io on it
		 */
		if (!PageUptodate(page)) {
			/*
			 * If in nonblock mode then don't block on waiting
			 * for an in-flight io page
			 */
			if (flags & SPLICE_F_NONBLOCK) {
				if (TestSetPageLocked(page))
					break;
			} else
				lock_page(page);

			/*
			 * Page was truncated, stop here. If this isn't the
			 * first page, we'll just complete what we already
			 * added
			 */
			if (!page->mapping) {
				unlock_page(page);
				break;
			}
			/*
			 * Page was already under io and is now done, great
			 */
			if (PageUptodate(page)) {
				unlock_page(page);
				goto fill_it;
			}

			/*
			 * need to read in the page
			 */
			error = mapping->a_ops->readpage(in, page);
			if (unlikely(error)) {
				/*
				 * We really should re-lookup the page here,
				 * but it complicates things a lot. Instead
				 * let's just do what we already stored, and
				 * we'll get it the next time we are called.
				 */
				if (error == AOP_TRUNCATED_PAGE)
					error = 0;

				break;
			}
		}
fill_it:
		/*
		 * i_size must be checked after PageUptodate.
		 */
		isize = i_size_read(mapping->host);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index))
			break;

		/*
		 * if this is the last page, see if we need to shrink
		 * the length and stop
		 */
		if (end_index == index) {
			unsigned int plen;

			/*
			 * max good bytes in this page
			 */
			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (plen <= loff)
				break;

			/*
			 * force quit after adding this page
			 */
			this_len = min(this_len, plen - loff);
			len = this_len;
		}

		partial[page_nr].offset = loff;
		partial[page_nr].len = this_len;
		len -= this_len;
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	/*
	 * Release any pages at the end, if we quit early. 'page_nr' is how far
	 * we got, 'nr_pages' is how many pages are in the map.
	 */
	while (page_nr < nr_pages)
		page_cache_release(pages[page_nr++]);
	in->f_ra.prev_index = index;

	if (spd.nr_pages)
		return splice_to_pipe(pipe, &spd);

	return error;
}
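
/*
 * Note that __generic_file_splice_read() returns the number of bytes
 * actually queued into the pipe, which may be less than 'len' if the
 * pipe fills up or an error hits after some pages were queued; it does
 * not advance *ppos. The wrapper below handles clamping to i_size and
 * advancing the offset.
 */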

/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in:		file to splice from
 * @ppos:	position in @in
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given file and fill them into a pipe. Can be
 *    used as long as the address_space operations for the source implement
 *    a readpage() hook.
 *
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	ssize_t spliced;
	int ret;
	loff_t isize, left;

	isize = i_size_read(in->f_mapping->host);
	if (unlikely(*ppos >= isize))
		return 0;

	left = isize - *ppos;
	if (unlikely(left < len))
		len = left;

	ret = 0;
	spliced = 0;
	while (len && !spliced) {
		ret = __generic_file_splice_read(in, ppos, pipe, len, flags);

		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}

		*ppos += ret;
		len -= ret;
		spliced += ret;
	}

	if (spliced)
		return spliced;

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_read);
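
/*
 * Filesystems pick this helper up via their file_operations; a typical
 * hookup (a sketch, details vary per filesystem) looks like:
 *
 *	const struct file_operations ext2_file_operations = {
 *		...
 *		.splice_read	= generic_file_splice_read,
 *		.splice_write	= generic_file_splice_write,
 *	};
 *
 * generic_file_splice_write() is defined further down in this file.
 */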

/*
 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 * using sendpage(). Return the number of bytes sent.
 */
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	loff_t pos = sd->pos;
	int ret, more;

	ret = buf->ops->confirm(pipe, buf);
	if (!ret) {
		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;

		ret = file->f_op->sendpage(file, buf->page, buf->offset,
					   sd->len, &pos, more);
	}

	return ret;
}

/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option than
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	struct address_space *mapping = file->f_mapping;
	unsigned int offset, this_len;
	struct page *page;
	pgoff_t index;
	int ret;

	/*
	 * make sure the data in this buffer is uptodate
	 */
	ret = buf->ops->confirm(pipe, buf);
	if (unlikely(ret))
		return ret;

	index = sd->pos >> PAGE_CACHE_SHIFT;
	offset = sd->pos & ~PAGE_CACHE_MASK;

	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

find_page:
	page = find_lock_page(mapping, index);
	if (!page) {
		ret = -ENOMEM;
		page = page_cache_alloc_cold(mapping);
		if (unlikely(!page))
			goto out_ret;

		/*
		 * This will also lock the page
		 */
		ret = add_to_page_cache_lru(page, mapping, index,
					    GFP_KERNEL);
		if (unlikely(ret))
			goto out_release;
	}

	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
	if (unlikely(ret)) {
		loff_t isize = i_size_read(mapping->host);

		if (ret != AOP_TRUNCATED_PAGE)
			unlock_page(page);
		page_cache_release(page);
		if (ret == AOP_TRUNCATED_PAGE)
			goto find_page;

		/*
		 * prepare_write() may have instantiated a few blocks
		 * outside i_size. Trim these off again.
		 */
		if (sd->pos + this_len > isize)
			vmtruncate(mapping->host, isize);

		goto out_ret;
	}

	if (buf->page != page) {
		/*
		 * Careful, ->map() uses KM_USER0!
		 */
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page, KM_USER1);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER1);
		buf->ops->unmap(pipe, buf, src);
	}

	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
	if (ret) {
		if (ret == AOP_TRUNCATED_PAGE) {
			page_cache_release(page);
			goto find_page;
		}
		if (ret < 0)
			goto out;
		/*
		 * A partial write happened, so 'ret' is already initialized
		 * to the number of bytes written; there is nothing more we
		 * have to do here.
		 */
	} else
		ret = this_len;
	/*
	 * Return the number of bytes written and mark page as
	 * accessed, we are now done!
	 */
	mark_page_accessed(page);
out:
	unlock_page(page);
out_release:
	page_cache_release(page);
out_ret:
	return ret;
}
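
/*
 * pipe_to_sendpage() and pipe_to_file() above (and pipe_to_user()
 * below) are all "actors" for __splice_from_pipe(). A minimal sketch
 * of a custom actor, one that simply discards sd->len bytes, would be
 * (hypothetical, not used in this file):
 *
 *	static int pipe_to_null(struct pipe_inode_info *pipe,
 *				struct pipe_buffer *buf,
 *				struct splice_desc *sd)
 *	{
 *		int ret = buf->ops->confirm(pipe, buf);
 *		return ret ? ret : sd->len;
 *	}
 *
 * Returning a positive byte count consumes that much of the buffer;
 * zero or negative stops the loop in __splice_from_pipe().
 */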

/**
 * __splice_from_pipe - splice data from a pipe to given actor
 * @pipe:	pipe to splice from
 * @sd:		information to @actor
 * @actor:	handler that splices the data
 *
 * Description:
 *    This function does little more than loop over the pipe and call
 *    @actor to do the actual moving of a single struct pipe_buffer to
 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 *    pipe_to_user.
 *
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
			   splice_actor *actor)
{
	int ret, do_wakeup, err;

	ret = 0;
	do_wakeup = 0;

	for (;;) {
		if (pipe->nrbufs) {
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			const struct pipe_buf_operations *ops = buf->ops;

			sd->len = buf->len;
			if (sd->len > sd->total_len)
				sd->len = sd->total_len;

			err = actor(pipe, buf, sd);
			if (err <= 0) {
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			ret += err;
			buf->offset += err;
			buf->len -= err;

			sd->len -= err;
			sd->pos += err;
			sd->total_len -= err;
			if (sd->len)
				continue;

			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
				pipe->nrbufs--;
				if (pipe->inode)
					do_wakeup = 1;
			}

			if (!sd->total_len)
				break;
		}

		if (pipe->nrbufs)
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (ret)
				break;
		}

		if (sd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(pipe);
	}

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}

	return ret;
}
EXPORT_SYMBOL(__splice_from_pipe);
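
/*
 * Callers of __splice_from_pipe() are expected to serialize access to
 * the pipe themselves: splice_from_pipe() below takes the pipe inode
 * mutex via inode_double_lock(), and vmsplice_to_user() takes
 * pipe->inode->i_mutex directly.
 */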

/**
 * splice_from_pipe - splice data from a pipe to a file
 * @pipe:	pipe to splice from
 * @out:	file to splice to
 * @ppos:	position in @out
 * @len:	how many bytes to splice
 * @flags:	splice modifier flags
 * @actor:	handler that splices the data
 *
 * Description:
 *    See __splice_from_pipe. This function locks the output inode and
 *    the pipe inode, otherwise it's identical to __splice_from_pipe().
 *
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	ssize_t ret;
	struct inode *inode = out->f_mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};

	/*
	 * The actor worker might be calling ->prepare_write and
	 * ->commit_write. Most of the time, these expect i_mutex to
	 * be held. Since this may result in an ABBA deadlock with
	 * pipe->inode, we have to order lock acquisition here.
	 */
	inode_double_lock(inode, pipe->inode);
	ret = __splice_from_pipe(pipe, &sd, actor);
	inode_double_unlock(inode, pipe->inode);

	return ret;
}
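
/*
 * inode_double_lock() avoids the ABBA scenario by always taking the
 * two i_mutexes in ascending inode address order, so any two tasks
 * splicing between the same pair of inodes, in either direction,
 * acquire the locks in the same order.
 */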

/**
 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file. The caller is responsible
 *    for acquiring i_mutex on both inodes.
 *
 */
ssize_t
generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
				 loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	ssize_t ret;
	int err;

	err = remove_suid(out->f_path.dentry);
	if (unlikely(err))
		return err;

	ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);

			if (err)
				ret = err;
		}
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write_nolock);

/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file.
 *
 */
ssize_t
generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;
	int err;

	err = should_remove_suid(out->f_path.dentry);
	if (unlikely(err)) {
		mutex_lock(&inode->i_mutex);
		err = __remove_suid(out->f_path.dentry, err);
		mutex_unlock(&inode->i_mutex);
		if (err)
			return err;
	}

	ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			mutex_lock(&inode->i_mutex);
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);
			mutex_unlock(&inode->i_mutex);

			if (err)
				ret = err;
		}
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);

/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe:	pipe to splice from
 * @out:	socket to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will send @len bytes from the pipe to a network socket. No data copying
 *    is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);
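
/*
 * For a socket, the ->sendpage() call in pipe_to_sendpage() goes
 * through the networking core (sock_sendpage(), dispatching to the
 * protocol), so protocols that implement it, e.g. TCP, can transmit
 * the page reference without copying the data.
 */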

/*
 * Attempt to initiate a splice from pipe to file.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	int ret;

	if (unlikely(!out->f_op || !out->f_op->splice_write))
		return -EINVAL;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	ret = rw_verify_area(WRITE, out, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	ret = security_file_permission(out, MAY_WRITE);
	if (unlikely(ret < 0))
		return ret;

	return out->f_op->splice_write(pipe, out, ppos, len, flags);
}

/*
 * Attempt to initiate a splice from a file to a pipe.
 */
static long do_splice_to(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe, size_t len,
			 unsigned int flags)
{
	int ret;

	if (unlikely(!in->f_op || !in->f_op->splice_read))
		return -EINVAL;

	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;

	ret = rw_verify_area(READ, in, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	ret = security_file_permission(in, MAY_READ);
	if (unlikely(ret < 0))
		return ret;

	return in->f_op->splice_read(in, ppos, pipe, len, flags);
}

/**
 * splice_direct_to_actor - splices data directly between two non-pipes
 * @in:		file to splice from
 * @sd:		actor information on where to splice to
 * @actor:	handles the data splicing
 *
 * Description:
 *    This is a special case helper to splice directly between two
 *    points, without requiring an explicit pipe. Internally an allocated
 *    pipe is cached in the process, and reused during the lifetime of
 *    that process.
 *
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	umode_t i_mode;
	size_t len;
	int i, flags;

	/*
	 * We require the input to be a regular file, as we don't want to
	 * randomly drop data for eg socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = in->f_path.dentry->d_inode->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, setup an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info(NULL);
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	len = sd->total_len;
	flags = sd->flags;

	/*
	 * The output must not be nonblocking: we have to drain the
	 * internal pipe, otherwise data could get stuck in it.
	 */
	sd->flags &= ~SPLICE_F_NONBLOCK;

	while (len) {
		size_t read_len;
		loff_t pos = sd->pos;

		ret = do_splice_to(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto out_release;

		read_len = ret;
		sd->total_len = read_len;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret <= 0))
			goto out_release;

		bytes += ret;
		len -= ret;
		sd->pos = pos;

		if (ret < read_len)
			goto out_release;
	}

	pipe->nrbufs = pipe->curbuf = 0;
	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < PIPE_BUFFERS; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops) {
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
		}
	}
	pipe->nrbufs = pipe->curbuf = 0;

	/*
	 * If we transferred some data, return the number of bytes:
	 */
	if (bytes > 0)
		return bytes;

	return ret;
}
EXPORT_SYMBOL(splice_direct_to_actor);

static int direct_splice_actor(struct pipe_inode_info *pipe,
			       struct splice_desc *sd)
{
	struct file *file = sd->u.file;

	return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
}

/**
 * do_splice_direct - splices data directly between two files
 * @in:		file to splice from
 * @ppos:	input file offset
 * @out:	file to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    For use by do_sendfile(). splice can easily emulate sendfile, but
 *    doing it in the application would incur an extra system call
 *    (splice in + splice out, as compared to just sendfile()). So this helper
 *    can splice directly through a process-private pipe.
 *
 */
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      size_t len, unsigned int flags)
{
	struct splice_desc sd = {
		.len		= len,
		.total_len	= len,
		.flags		= flags,
		.pos		= *ppos,
		.u.file		= out,
	};
	long ret;

	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
	if (ret > 0)
		*ppos += ret;

	return ret;
}
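
/*
 * Sketch of the intended caller, do_sendfile() in fs/read_write.c
 * (heavily simplified; the real code also validates the file ranges):
 *
 *	loff_t pos = ...;	// input offset, from the sendfile() args
 *	ssize_t ret;
 *
 *	ret = do_splice_direct(in_file, &pos, out_file, count, 0);
 *	// on success, 'pos' has advanced by the number of bytes moved
 */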

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
static inline struct pipe_inode_info *pipe_info(struct inode *inode)
{
	if (S_ISFIFO(inode->i_mode))
		return inode->i_pipe;

	return NULL;
}

/*
 * Determine where to splice to/from.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	loff_t offset, *off;
	long ret;

	pipe = pipe_info(in->f_path.dentry->d_inode);
	if (pipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (out->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &out->f_pos;

		ret = do_splice_from(pipe, out, off, len, flags);

		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	pipe = pipe_info(out->f_path.dentry->d_inode);
	if (pipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (in->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &in->f_pos;

		ret = do_splice_to(in, off, pipe, len, flags);

		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	return -EINVAL;
}

/*
 * Do a copy-from-user while holding the mmap_sem for reading, in a
 * manner safe from deadlocking with simultaneous mmap() (grabbing mmap_sem
 * for writing) and page faulting on the user memory pointed to by src.
 * This assumes that we will very rarely hit the partial != 0 path, or this
 * will not be a win.
 */
static int copy_from_user_mmap_sem(void *dst, const void __user *src, size_t n)
{
	int partial;

	pagefault_disable();
	partial = __copy_from_user_inatomic(dst, src, n);
	pagefault_enable();

	/*
	 * Didn't copy everything, drop the mmap_sem and do a faulting copy
	 */
	if (unlikely(partial)) {
		up_read(&current->mm->mmap_sem);
		partial = copy_from_user(dst, src, n);
		down_read(&current->mm->mmap_sem);
	}

	return partial;
}

/*
 * Map an iov into an array of pages and offset/length tuples. With the
 * partial_page structure, we can map several non-contiguous ranges into
 * our one pages[] map instead of splitting that operation into pieces.
 * Could easily be exported as a generic helper for other users, in which
 * case one would probably want to add a 'max_nr_pages' parameter as well.
 */
static int get_iovec_page_array(const struct iovec __user *iov,
				unsigned int nr_vecs, struct page **pages,
				struct partial_page *partial, int aligned)
{
	int buffers = 0, error = 0;

	down_read(&current->mm->mmap_sem);

	while (nr_vecs) {
		unsigned long off, npages;
		struct iovec entry;
		void __user *base;
		size_t len;
		int i;

		error = -EFAULT;
		if (copy_from_user_mmap_sem(&entry, iov, sizeof(entry)))
			break;

		base = entry.iov_base;
		len = entry.iov_len;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		error = 0;
		if (unlikely(!len))
			break;
		error = -EFAULT;
		if (unlikely(!base))
			break;

		/*
		 * Get this base offset and number of pages, then map
		 * in the user pages.
		 */
		off = (unsigned long) base & ~PAGE_MASK;

		/*
		 * If asked for alignment, the offset must be zero and the
		 * length a multiple of PAGE_SIZE.
		 */
		error = -EINVAL;
		if (aligned && (off || len & ~PAGE_MASK))
			break;

		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages > PIPE_BUFFERS - buffers)
			npages = PIPE_BUFFERS - buffers;

		error = get_user_pages(current, current->mm,
				       (unsigned long) base, npages, 0, 0,
				       &pages[buffers], NULL);

		if (unlikely(error <= 0))
			break;

		/*
		 * Fill this contiguous range into the partial page map.
		 */
		for (i = 0; i < error; i++) {
			const int plen = min_t(size_t, len, PAGE_SIZE - off);

			partial[buffers].offset = off;
			partial[buffers].len = plen;

			off = 0;
			len -= plen;
			buffers++;
		}

		/*
		 * We didn't complete this iov, stop here since it probably
		 * means we have to move some of this into a pipe to
		 * be able to continue.
		 */
		if (len)
			break;

		/*
		 * Don't continue if we mapped fewer pages than we asked for,
		 * or if we mapped the max number of pages that we have
		 * room for.
		 */
		if (error < npages || buffers == PIPE_BUFFERS)
			break;

		nr_vecs--;
		iov++;
	}

	up_read(&current->mm->mmap_sem);

	if (buffers)
		return buffers;

	return error;
}
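
/*
 * Worked example for the page math above, assuming 4k pages and a
 * hypothetical iovec with iov_base = 0x8004234 and iov_len = 8192:
 * off = 0x234 (564), npages = (564 + 8192 + 4095) >> 12 = 3, and the
 * partial map gets lengths 3532, 4096 and 564 for the three pages.
 */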

static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	char *src;
	int ret;

	ret = buf->ops->confirm(pipe, buf);
	if (unlikely(ret))
		return ret;

	/*
	 * See if we can use the atomic maps, by prefaulting in the
	 * pages and doing an atomic copy
	 */
	if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
		src = buf->ops->map(pipe, buf, 1);
		ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
					      sd->len);
		buf->ops->unmap(pipe, buf, src);
		if (!ret) {
			ret = sd->len;
			goto out;
		}
	}

	/*
	 * No dice, use slow non-atomic map and copy
	 */
	src = buf->ops->map(pipe, buf, 0);

	ret = sd->len;
	if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
		ret = -EFAULT;

	/*
	 * Unmap before the 'out' label: the atomic path above has
	 * already dropped its mapping before jumping here.
	 */
	buf->ops->unmap(pipe, buf, src);
out:
	if (ret > 0)
		sd->u.userptr += ret;

	return ret;
}

/*
 * For lack of a better implementation, implement vmsplice() to userspace
 * as a simple copy of the pipe's pages to the user iov.
 */
static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct splice_desc sd;
	ssize_t size;
	int error;
	long ret;

	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	error = ret = 0;
	while (nr_segs) {
		void __user *base;
		size_t len;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		if (unlikely(!base)) {
			error = -EFAULT;
			break;
		}

		sd.len = 0;
		sd.total_len = len;
		sd.flags = flags;
		sd.u.userptr = base;
		sd.pos = 0;

		size = __splice_from_pipe(pipe, &sd, pipe_to_user);
		if (size < 0) {
			if (!ret)
				ret = size;

			break;
		}

		ret += size;

		if (size < len)
			break;

		nr_segs--;
		iov++;
	}

	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);

	if (!ret)
		ret = error;

	return ret;
}

/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 */
static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &user_page_pipe_buf_ops,
	};

	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;

	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
					    flags & SPLICE_F_GIFT);
	if (spd.nr_pages <= 0)
		return spd.nr_pages;

	return splice_to_pipe(pipe, &spd);
}

/*
 * Note that vmsplice only really supports true splicing _from_ user memory
 * to a pipe, not the other way around. Splicing from user memory is a simple
 * operation that can be supported without any funky alignment restrictions
 * or nasty vm tricks. We simply map in the user pages and fill them into
 * a pipe. The reverse isn't quite as easy, though. There are two possible
 * solutions for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (they
 *	  impose restrictions on both ends of the pipe).
 *
 * Currently we punt and implement it as a normal copy, see pipe_to_user().
 *
 */
asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct file *file;
	long error;
	int fput;

	if (unlikely(nr_segs > UIO_MAXIOV))
		return -EINVAL;
	else if (unlikely(!nr_segs))
		return 0;

	error = -EBADF;
	file = fget_light(fd, &fput);
	if (file) {
		if (file->f_mode & FMODE_WRITE)
			error = vmsplice_to_pipe(file, iov, nr_segs, flags);
		else if (file->f_mode & FMODE_READ)
			error = vmsplice_to_user(file, iov, nr_segs, flags);

		fput_light(file, fput);
	}

	return error;
}

asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
			   int fd_out, loff_t __user *off_out,
			   size_t len, unsigned int flags)
{
	long error;
	struct file *in, *out;
	int fput_in, fput_out;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fd_in, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			out = fget_light(fd_out, &fput_out);
			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_splice(in, off_in,
							  out, off_out,
							  len, flags);
				fput_light(out, fput_out);
			}
		}

		fput_light(in, fput_in);
	}

	return error;
}

/*
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 */
static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is OK.
	 */
	if (pipe->nrbufs)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (!pipe->nrbufs) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		pipe_wait(pipe);
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Make sure there's writable room. Wait for room if we can, otherwise
 * return an appropriate error.
 */
static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is OK.
	 */
	if (pipe->nrbufs < PIPE_BUFFERS)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (pipe->nrbufs >= PIPE_BUFFERS) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			ret = -EPIPE;
			break;
		}
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Link contents of ipipe to opipe.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
		     struct pipe_inode_info *opipe,
		     size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	int ret = 0, i = 0, nbuf;

	/*
	 * Potential ABBA deadlock; work around it by ordering lock
	 * grabbing by inode address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	inode_double_lock(ipipe->inode, opipe->inode);

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		/*
		 * If we have iterated all input buffers or run out of
		 * output room, break.
		 */
		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
			break;

		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		ibuf->ops->get(ipipe, ibuf);

		obuf = opipe->bufs + nbuf;
		*obuf = *ibuf;

		/*
		 * Don't inherit the gift flag, we need to
		 * prevent multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;

		if (obuf->len > len)
			obuf->len = len;

		opipe->nrbufs++;
		ret += obuf->len;
		len -= obuf->len;
		i++;
	} while (len);

	inode_double_unlock(ipipe->inode, opipe->inode);

	/*
	 * If we put data in the output pipe, wake up any potential readers.
	 */
	if (ret > 0) {
		smp_mb();
		if (waitqueue_active(&opipe->wait))
			wake_up_interruptible(&opipe->wait);
		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
	}

	return ret;
}

/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data; it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants; currently the only
 * applicable one is SPLICE_F_NONBLOCK.
 */
static long do_tee(struct file *in, struct file *out, size_t len,
		   unsigned int flags)
{
	struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
	struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
	int ret = -EINVAL;

	/*
	 * Duplicate the contents of ipipe to opipe without actually
	 * copying the data.
	 */
	if (ipipe && opipe && ipipe != opipe) {
		/*
		 * Keep going, unless we encounter an error. The ipipe/opipe
		 * ordering doesn't really matter.
		 */
		ret = link_ipipe_prep(ipipe, flags);
		if (!ret) {
			ret = link_opipe_prep(opipe, flags);
			if (!ret) {
				ret = link_pipe(ipipe, opipe, len, flags);
				if (!ret && (flags & SPLICE_F_NONBLOCK))
					ret = -EAGAIN;
			}
		}
	}

	return ret;
}

asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
{
	struct file *in;
	int error, fput_in;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fdin, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			int fput_out;
			struct file *out = fget_light(fdout, &fput_out);

			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_tee(in, out, len, flags);
				fput_light(out, fput_out);
			}
		}
		fput_light(in, fput_in);
	}

	return error;
}
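
/*
 * Illustrative userspace sketch for tee(2) (hypothetical, already-open
 * descriptors; error handling omitted): duplicate up to 64k from one
 * pipe into another without consuming the input, then splice the input
 * to a file:
 *
 *	ssize_t n = tee(pipe_a_rd, pipe_b_wr, 65536, SPLICE_F_NONBLOCK);
 *	if (n > 0)
 *		splice(pipe_a_rd, NULL, file_fd, NULL, n, SPLICE_F_MOVE);
 */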