1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * "splice": joining two ropes together by interweaving their strands. 4 * 5 * This is the "extended pipe" functionality, where a pipe is used as 6 * an arbitrary in-memory buffer. Think of a pipe as a small kernel 7 * buffer that you can use to transfer data from one end to the other. 8 * 9 * The traditional unix read/write is extended with a "splice()" operation 10 * that transfers data buffers to or from a pipe buffer. 11 * 12 * Named by Larry McVoy, original implementation from Linus, extended by 13 * Jens to support splicing to files, network, direct splicing, etc and 14 * fixing lots of bugs. 15 * 16 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> 17 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> 18 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> 19 * 20 */ 21 #include <linux/bvec.h> 22 #include <linux/fs.h> 23 #include <linux/file.h> 24 #include <linux/pagemap.h> 25 #include <linux/splice.h> 26 #include <linux/memcontrol.h> 27 #include <linux/mm_inline.h> 28 #include <linux/swap.h> 29 #include <linux/writeback.h> 30 #include <linux/export.h> 31 #include <linux/syscalls.h> 32 #include <linux/uio.h> 33 #include <linux/fsnotify.h> 34 #include <linux/security.h> 35 #include <linux/gfp.h> 36 #include <linux/net.h> 37 #include <linux/socket.h> 38 #include <linux/sched/signal.h> 39 40 #include "internal.h" 41 42 /* 43 * Splice doesn't support FMODE_NOWAIT. Since pipes may set this flag to 44 * indicate they support non-blocking reads or writes, we must clear it 45 * here if set to avoid blocking other users of this pipe if splice is 46 * being done on it. 47 */ 48 static noinline void noinline pipe_clear_nowait(struct file *file) 49 { 50 fmode_t fmode = READ_ONCE(file->f_mode); 51 52 do { 53 if (!(fmode & FMODE_NOWAIT)) 54 break; 55 } while (!try_cmpxchg(&file->f_mode, &fmode, fmode & ~FMODE_NOWAIT)); 56 } 57 58 /* 59 * Attempt to steal a page from a pipe buffer. 
This should perhaps go into 60 * a vm helper function, it's already simplified quite a bit by the 61 * addition of remove_mapping(). If success is returned, the caller may 62 * attempt to reuse this page for another destination. 63 */ 64 static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe, 65 struct pipe_buffer *buf) 66 { 67 struct folio *folio = page_folio(buf->page); 68 struct address_space *mapping; 69 70 folio_lock(folio); 71 72 mapping = folio_mapping(folio); 73 if (mapping) { 74 WARN_ON(!folio_test_uptodate(folio)); 75 76 /* 77 * At least for ext2 with nobh option, we need to wait on 78 * writeback completing on this folio, since we'll remove it 79 * from the pagecache. Otherwise truncate wont wait on the 80 * folio, allowing the disk blocks to be reused by someone else 81 * before we actually wrote our data to them. fs corruption 82 * ensues. 83 */ 84 folio_wait_writeback(folio); 85 86 if (!filemap_release_folio(folio, GFP_KERNEL)) 87 goto out_unlock; 88 89 /* 90 * If we succeeded in removing the mapping, set LRU flag 91 * and return good. 92 */ 93 if (remove_mapping(mapping, folio)) { 94 buf->flags |= PIPE_BUF_FLAG_LRU; 95 return true; 96 } 97 } 98 99 /* 100 * Raced with truncate or failed to remove folio from current 101 * address space, unlock and return failure. 102 */ 103 out_unlock: 104 folio_unlock(folio); 105 return false; 106 } 107 108 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, 109 struct pipe_buffer *buf) 110 { 111 put_page(buf->page); 112 buf->flags &= ~PIPE_BUF_FLAG_LRU; 113 } 114 115 /* 116 * Check whether the contents of buf is OK to access. Since the content 117 * is a page cache page, IO may be in flight. 118 */ 119 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, 120 struct pipe_buffer *buf) 121 { 122 struct folio *folio = page_folio(buf->page); 123 int err; 124 125 if (!folio_test_uptodate(folio)) { 126 folio_lock(folio); 127 128 /* 129 * Folio got truncated/unhashed. 
This will cause a 0-byte 130 * splice, if this is the first page. 131 */ 132 if (!folio->mapping) { 133 err = -ENODATA; 134 goto error; 135 } 136 137 /* 138 * Uh oh, read-error from disk. 139 */ 140 if (!folio_test_uptodate(folio)) { 141 err = -EIO; 142 goto error; 143 } 144 145 /* Folio is ok after all, we are done */ 146 folio_unlock(folio); 147 } 148 149 return 0; 150 error: 151 folio_unlock(folio); 152 return err; 153 } 154 155 const struct pipe_buf_operations page_cache_pipe_buf_ops = { 156 .confirm = page_cache_pipe_buf_confirm, 157 .release = page_cache_pipe_buf_release, 158 .try_steal = page_cache_pipe_buf_try_steal, 159 .get = generic_pipe_buf_get, 160 }; 161 162 static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe, 163 struct pipe_buffer *buf) 164 { 165 if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) 166 return false; 167 168 buf->flags |= PIPE_BUF_FLAG_LRU; 169 return generic_pipe_buf_try_steal(pipe, buf); 170 } 171 172 static const struct pipe_buf_operations user_page_pipe_buf_ops = { 173 .release = page_cache_pipe_buf_release, 174 .try_steal = user_page_pipe_buf_try_steal, 175 .get = generic_pipe_buf_get, 176 }; 177 178 static void wakeup_pipe_readers(struct pipe_inode_info *pipe) 179 { 180 smp_mb(); 181 if (waitqueue_active(&pipe->rd_wait)) 182 wake_up_interruptible(&pipe->rd_wait); 183 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 184 } 185 186 /** 187 * splice_to_pipe - fill passed data into a pipe 188 * @pipe: pipe to fill 189 * @spd: data to fill 190 * 191 * Description: 192 * @spd contains a map of pages and len/offset tuples, along with 193 * the struct pipe_buf_operations associated with these pages. This 194 * function will link that data to the pipe. 
195 * 196 */ 197 ssize_t splice_to_pipe(struct pipe_inode_info *pipe, 198 struct splice_pipe_desc *spd) 199 { 200 unsigned int spd_pages = spd->nr_pages; 201 unsigned int tail = pipe->tail; 202 unsigned int head = pipe->head; 203 unsigned int mask = pipe->ring_size - 1; 204 int ret = 0, page_nr = 0; 205 206 if (!spd_pages) 207 return 0; 208 209 if (unlikely(!pipe->readers)) { 210 send_sig(SIGPIPE, current, 0); 211 ret = -EPIPE; 212 goto out; 213 } 214 215 while (!pipe_full(head, tail, pipe->max_usage)) { 216 struct pipe_buffer *buf = &pipe->bufs[head & mask]; 217 218 buf->page = spd->pages[page_nr]; 219 buf->offset = spd->partial[page_nr].offset; 220 buf->len = spd->partial[page_nr].len; 221 buf->private = spd->partial[page_nr].private; 222 buf->ops = spd->ops; 223 buf->flags = 0; 224 225 head++; 226 pipe->head = head; 227 page_nr++; 228 ret += buf->len; 229 230 if (!--spd->nr_pages) 231 break; 232 } 233 234 if (!ret) 235 ret = -EAGAIN; 236 237 out: 238 while (page_nr < spd_pages) 239 spd->spd_release(spd, page_nr++); 240 241 return ret; 242 } 243 EXPORT_SYMBOL_GPL(splice_to_pipe); 244 245 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) 246 { 247 unsigned int head = pipe->head; 248 unsigned int tail = pipe->tail; 249 unsigned int mask = pipe->ring_size - 1; 250 int ret; 251 252 if (unlikely(!pipe->readers)) { 253 send_sig(SIGPIPE, current, 0); 254 ret = -EPIPE; 255 } else if (pipe_full(head, tail, pipe->max_usage)) { 256 ret = -EAGAIN; 257 } else { 258 pipe->bufs[head & mask] = *buf; 259 pipe->head = head + 1; 260 return buf->len; 261 } 262 pipe_buf_release(pipe, buf); 263 return ret; 264 } 265 EXPORT_SYMBOL(add_to_pipe); 266 267 /* 268 * Check if we need to grow the arrays holding pages and partial page 269 * descriptions. 
270 */ 271 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) 272 { 273 unsigned int max_usage = READ_ONCE(pipe->max_usage); 274 275 spd->nr_pages_max = max_usage; 276 if (max_usage <= PIPE_DEF_BUFFERS) 277 return 0; 278 279 spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL); 280 spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page), 281 GFP_KERNEL); 282 283 if (spd->pages && spd->partial) 284 return 0; 285 286 kfree(spd->pages); 287 kfree(spd->partial); 288 return -ENOMEM; 289 } 290 291 void splice_shrink_spd(struct splice_pipe_desc *spd) 292 { 293 if (spd->nr_pages_max <= PIPE_DEF_BUFFERS) 294 return; 295 296 kfree(spd->pages); 297 kfree(spd->partial); 298 } 299 300 /** 301 * copy_splice_read - Copy data from a file and splice the copy into a pipe 302 * @in: The file to read from 303 * @ppos: Pointer to the file position to read from 304 * @pipe: The pipe to splice into 305 * @len: The amount to splice 306 * @flags: The SPLICE_F_* flags 307 * 308 * This function allocates a bunch of pages sufficient to hold the requested 309 * amount of data (but limited by the remaining pipe capacity), passes it to 310 * the file's ->read_iter() to read into and then splices the used pages into 311 * the pipe. 312 * 313 * Return: On success, the number of bytes read will be returned and *@ppos 314 * will be updated if appropriate; 0 will be returned if there is no more data 315 * to be read; -EAGAIN will be returned if the pipe had no space, and some 316 * other negative error code will be returned on error. A short read may occur 317 * if the pipe has insufficient space, we reach the end of the data or we hit a 318 * hole. 
*/
ssize_t copy_splice_read(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe,
			 size_t len, unsigned int flags)
{
	struct iov_iter to;
	struct bio_vec *bv;
	struct kiocb kiocb;
	struct page **pages;
	ssize_t ret;
	size_t used, npages, chunk, remain, keep = 0;
	int i;

	/* Work out how much data we can actually add into the pipe */
	used = pipe_occupancy(pipe->head, pipe->tail);
	npages = max_t(ssize_t, pipe->max_usage - used, 0);
	len = min_t(size_t, len, npages * PAGE_SIZE);
	npages = DIV_ROUND_UP(len, PAGE_SIZE);

	/*
	 * One allocation holds both tables: npages bio_vecs followed
	 * directly by npages page pointers.
	 */
	bv = kzalloc(array_size(npages, sizeof(bv[0])) +
		     array_size(npages, sizeof(struct page *)), GFP_KERNEL);
	if (!bv)
		return -ENOMEM;

	/* The page pointer table starts right behind the last bio_vec. */
	pages = (struct page **)(bv + npages);
	npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
	if (!npages) {
		kfree(bv);
		return -ENOMEM;
	}

	/* We may have got fewer pages than asked for; shrink len to match. */
	remain = len = min_t(size_t, len, npages * PAGE_SIZE);

	/* Describe each page as a full-page segment, the last one maybe short. */
	for (i = 0; i < npages; i++) {
		chunk = min_t(size_t, PAGE_SIZE, remain);
		bv[i].bv_page = pages[i];
		bv[i].bv_offset = 0;
		bv[i].bv_len = chunk;
		remain -= chunk;
	}

	/* Do the I/O */
	iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
	init_sync_kiocb(&kiocb, in);
	kiocb.ki_pos = *ppos;
	ret = call_read_iter(in, &kiocb, &to);

	if (ret > 0) {
		/* Number of pages that received at least one byte. */
		keep = DIV_ROUND_UP(ret, PAGE_SIZE);
		*ppos = kiocb.ki_pos;
	}

	/*
	 * Callers of ->splice_read() expect -EAGAIN on "can't put anything in
	 * there", rather than -EFAULT.
	 */
	if (ret == -EFAULT)
		ret = -EAGAIN;

	/* Free any pages that didn't get touched at all. */
	if (keep < npages)
		release_pages(pages + keep, npages - keep);

	/* Push the remaining pages into the pipe. */
	remain = ret;	/* only consumed when keep > 0, i.e. ret > 0 */
	for (i = 0; i < keep; i++) {
		struct pipe_buffer *buf = pipe_head_buf(pipe);

		chunk = min_t(size_t, remain, PAGE_SIZE);
		*buf = (struct pipe_buffer) {
			.ops	= &default_pipe_buf_ops,
			.page	= bv[i].bv_page,
			.offset	= 0,
			.len	= chunk,
		};
		pipe->head++;
		remain -= chunk;
	}

	/* bv and the pages[] table live in the same allocation. */
	kfree(bv);
	return ret;
}
EXPORT_SYMBOL(copy_splice_read);

/* Buffer operations used for the pages queued by copy_splice_read(). */
const struct pipe_buf_operations default_pipe_buf_ops = {
	.release	= generic_pipe_buf_release,
	.try_steal	= generic_pipe_buf_try_steal,
	.get		= generic_pipe_buf_get,
};

/* Pipe buffer operations for a socket and similar. */
const struct pipe_buf_operations nosteal_pipe_buf_ops = {
	/* No ->try_steal: these buffers are never handed over. */
	.release	= generic_pipe_buf_release,
	.get		= generic_pipe_buf_get,
};
EXPORT_SYMBOL(nosteal_pipe_buf_ops);

/*
 * Wake anybody sleeping on the pipe's write wait queue and send SIGIO to
 * the async writers.
 */
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
	/* Barrier is issued before the unlocked waitqueue_active() test. */
	smp_mb();
	if (waitqueue_active(&pipe->wr_wait))
		wake_up_interruptible(&pipe->wr_wait);
	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}

/**
 * splice_from_pipe_feed - feed available data from a pipe to a file
 * @pipe:	pipe to splice from
 * @sd:		information to @actor
 * @actor:	handler that splices the data
 *
 * Description:
 *    This function loops over the pipe and calls @actor to do the
 *    actual moving of a single struct pipe_buffer to the desired
 *    destination.  It returns when there's no more buffers left in
 *    the pipe or if the requested number of bytes (@sd->total_len)
 *    have been copied.  It returns a positive number (one) if the
 *    pipe needs to be filled with more data, zero if the required
 *    number of bytes have been copied and -errno on error.
*
 *    This, together with splice_from_pipe_{begin,end,next}, may be
 *    used to implement the functionality of __splice_from_pipe() when
 *    locking is required around copying the pipe buffers to the
 *    destination.
 */
static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
			  splice_actor *actor)
{
	unsigned int head = pipe->head;
	unsigned int tail = pipe->tail;
	unsigned int mask = pipe->ring_size - 1;
	int ret;

	while (!pipe_empty(head, tail)) {
		struct pipe_buffer *buf = &pipe->bufs[tail & mask];

		/* Offer the actor this buffer, capped at what is still wanted. */
		sd->len = buf->len;
		if (sd->len > sd->total_len)
			sd->len = sd->total_len;

		ret = pipe_buf_confirm(pipe, buf);
		if (unlikely(ret)) {
			/* -ENODATA is reported as "done" (0), not as an error. */
			if (ret == -ENODATA)
				ret = 0;
			return ret;
		}

		ret = actor(pipe, buf, sd);
		if (ret <= 0)
			return ret;

		/* The actor consumed ret bytes: advance buffer and descriptor. */
		buf->offset += ret;
		buf->len -= ret;

		sd->num_spliced += ret;
		sd->len -= ret;
		sd->pos += ret;
		sd->total_len -= ret;

		/* Buffer fully drained: release it and free its ring slot. */
		if (!buf->len) {
			pipe_buf_release(pipe, buf);
			tail++;
			pipe->tail = tail;
			if (pipe->files)
				sd->need_wakeup = true;
		}

		if (!sd->total_len)
			return 0;
	}

	/* Ring is empty but the request is not complete yet. */
	return 1;
}

/* We know we have a pipe buffer, but maybe it's empty? */
static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
{
	unsigned int tail = pipe->tail;
	unsigned int mask = pipe->ring_size - 1;
	struct pipe_buffer *buf = &pipe->bufs[tail & mask];

	/* A zero-length buffer at the tail is released and skipped. */
	if (unlikely(!buf->len)) {
		pipe_buf_release(pipe, buf);
		pipe->tail = tail+1;
		return true;
	}

	return false;
}

/**
 * splice_from_pipe_next - wait for some data to splice from
 * @pipe:	pipe to splice from
 * @sd:		information about the splice operation
 *
 * Description:
 *    This function will wait for some data and return a positive
 *    value (one) if pipe buffers are available.
It will return zero 517 * or -errno if no more data needs to be spliced. 518 */ 519 static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) 520 { 521 /* 522 * Check for signal early to make process killable when there are 523 * always buffers available 524 */ 525 if (signal_pending(current)) 526 return -ERESTARTSYS; 527 528 repeat: 529 while (pipe_empty(pipe->head, pipe->tail)) { 530 if (!pipe->writers) 531 return 0; 532 533 if (sd->num_spliced) 534 return 0; 535 536 if (sd->flags & SPLICE_F_NONBLOCK) 537 return -EAGAIN; 538 539 if (signal_pending(current)) 540 return -ERESTARTSYS; 541 542 if (sd->need_wakeup) { 543 wakeup_pipe_writers(pipe); 544 sd->need_wakeup = false; 545 } 546 547 pipe_wait_readable(pipe); 548 } 549 550 if (eat_empty_buffer(pipe)) 551 goto repeat; 552 553 return 1; 554 } 555 556 /** 557 * splice_from_pipe_begin - start splicing from pipe 558 * @sd: information about the splice operation 559 * 560 * Description: 561 * This function should be called before a loop containing 562 * splice_from_pipe_next() and splice_from_pipe_feed() to 563 * initialize the necessary fields of @sd. 564 */ 565 static void splice_from_pipe_begin(struct splice_desc *sd) 566 { 567 sd->num_spliced = 0; 568 sd->need_wakeup = false; 569 } 570 571 /** 572 * splice_from_pipe_end - finish splicing from pipe 573 * @pipe: pipe to splice from 574 * @sd: information about the splice operation 575 * 576 * Description: 577 * This function will wake up pipe writers if necessary. It should 578 * be called after a loop containing splice_from_pipe_next() and 579 * splice_from_pipe_feed(). 
580 */ 581 static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) 582 { 583 if (sd->need_wakeup) 584 wakeup_pipe_writers(pipe); 585 } 586 587 /** 588 * __splice_from_pipe - splice data from a pipe to given actor 589 * @pipe: pipe to splice from 590 * @sd: information to @actor 591 * @actor: handler that splices the data 592 * 593 * Description: 594 * This function does little more than loop over the pipe and call 595 * @actor to do the actual moving of a single struct pipe_buffer to 596 * the desired destination. See pipe_to_file, pipe_to_sendmsg, or 597 * pipe_to_user. 598 * 599 */ 600 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, 601 splice_actor *actor) 602 { 603 int ret; 604 605 splice_from_pipe_begin(sd); 606 do { 607 cond_resched(); 608 ret = splice_from_pipe_next(pipe, sd); 609 if (ret > 0) 610 ret = splice_from_pipe_feed(pipe, sd, actor); 611 } while (ret > 0); 612 splice_from_pipe_end(pipe, sd); 613 614 return sd->num_spliced ? sd->num_spliced : ret; 615 } 616 EXPORT_SYMBOL(__splice_from_pipe); 617 618 /** 619 * splice_from_pipe - splice data from a pipe to a file 620 * @pipe: pipe to splice from 621 * @out: file to splice to 622 * @ppos: position in @out 623 * @len: how many bytes to splice 624 * @flags: splice modifier flags 625 * @actor: handler that splices the data 626 * 627 * Description: 628 * See __splice_from_pipe. This function locks the pipe inode, 629 * otherwise it's identical to __splice_from_pipe(). 
630 * 631 */ 632 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, 633 loff_t *ppos, size_t len, unsigned int flags, 634 splice_actor *actor) 635 { 636 ssize_t ret; 637 struct splice_desc sd = { 638 .total_len = len, 639 .flags = flags, 640 .pos = *ppos, 641 .u.file = out, 642 }; 643 644 pipe_lock(pipe); 645 ret = __splice_from_pipe(pipe, &sd, actor); 646 pipe_unlock(pipe); 647 648 return ret; 649 } 650 651 /** 652 * iter_file_splice_write - splice data from a pipe to a file 653 * @pipe: pipe info 654 * @out: file to write to 655 * @ppos: position in @out 656 * @len: number of bytes to splice 657 * @flags: splice modifier flags 658 * 659 * Description: 660 * Will either move or copy pages (determined by @flags options) from 661 * the given pipe inode to the given file. 662 * This one is ->write_iter-based. 663 * 664 */ 665 ssize_t 666 iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, 667 loff_t *ppos, size_t len, unsigned int flags) 668 { 669 struct splice_desc sd = { 670 .total_len = len, 671 .flags = flags, 672 .pos = *ppos, 673 .u.file = out, 674 }; 675 int nbufs = pipe->max_usage; 676 struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), 677 GFP_KERNEL); 678 ssize_t ret; 679 680 if (unlikely(!array)) 681 return -ENOMEM; 682 683 pipe_lock(pipe); 684 685 splice_from_pipe_begin(&sd); 686 while (sd.total_len) { 687 struct iov_iter from; 688 unsigned int head, tail, mask; 689 size_t left; 690 int n; 691 692 ret = splice_from_pipe_next(pipe, &sd); 693 if (ret <= 0) 694 break; 695 696 if (unlikely(nbufs < pipe->max_usage)) { 697 kfree(array); 698 nbufs = pipe->max_usage; 699 array = kcalloc(nbufs, sizeof(struct bio_vec), 700 GFP_KERNEL); 701 if (!array) { 702 ret = -ENOMEM; 703 break; 704 } 705 } 706 707 head = pipe->head; 708 tail = pipe->tail; 709 mask = pipe->ring_size - 1; 710 711 /* build the vector */ 712 left = sd.total_len; 713 for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) { 714 struct 
pipe_buffer *buf = &pipe->bufs[tail & mask]; 715 size_t this_len = buf->len; 716 717 /* zero-length bvecs are not supported, skip them */ 718 if (!this_len) 719 continue; 720 this_len = min(this_len, left); 721 722 ret = pipe_buf_confirm(pipe, buf); 723 if (unlikely(ret)) { 724 if (ret == -ENODATA) 725 ret = 0; 726 goto done; 727 } 728 729 bvec_set_page(&array[n], buf->page, this_len, 730 buf->offset); 731 left -= this_len; 732 n++; 733 } 734 735 iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); 736 ret = vfs_iter_write(out, &from, &sd.pos, 0); 737 if (ret <= 0) 738 break; 739 740 sd.num_spliced += ret; 741 sd.total_len -= ret; 742 *ppos = sd.pos; 743 744 /* dismiss the fully eaten buffers, adjust the partial one */ 745 tail = pipe->tail; 746 while (ret) { 747 struct pipe_buffer *buf = &pipe->bufs[tail & mask]; 748 if (ret >= buf->len) { 749 ret -= buf->len; 750 buf->len = 0; 751 pipe_buf_release(pipe, buf); 752 tail++; 753 pipe->tail = tail; 754 if (pipe->files) 755 sd.need_wakeup = true; 756 } else { 757 buf->offset += ret; 758 buf->len -= ret; 759 ret = 0; 760 } 761 } 762 } 763 done: 764 kfree(array); 765 splice_from_pipe_end(pipe, &sd); 766 767 pipe_unlock(pipe); 768 769 if (sd.num_spliced) 770 ret = sd.num_spliced; 771 772 return ret; 773 } 774 775 EXPORT_SYMBOL(iter_file_splice_write); 776 777 #ifdef CONFIG_NET 778 /** 779 * splice_to_socket - splice data from a pipe to a socket 780 * @pipe: pipe to splice from 781 * @out: socket to write to 782 * @ppos: position in @out 783 * @len: number of bytes to splice 784 * @flags: splice modifier flags 785 * 786 * Description: 787 * Will send @len bytes from the pipe to a network socket. No data copying 788 * is involved. 
*
 */
ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags)
{
	struct socket *sock = sock_from_file(out);
	struct bio_vec bvec[16];	/* at most 16 segments per sendmsg */
	struct msghdr msg = {};
	ssize_t ret = 0;
	size_t spliced = 0;
	bool need_wakeup = false;

	pipe_lock(pipe);

	while (len > 0) {
		unsigned int head, tail, mask, bc = 0;
		size_t remain = len;

		/*
		 * Check for signal early to make process killable when there
		 * are always buffers available
		 */
		ret = -ERESTARTSYS;
		if (signal_pending(current))
			break;

		/*
		 * Wait for data. ret is preset before each test so that the
		 * matching "goto out" returns the right status.
		 */
		while (pipe_empty(pipe->head, pipe->tail)) {
			ret = 0;
			if (!pipe->writers)
				goto out;

			if (spliced)
				goto out;

			ret = -EAGAIN;
			if (flags & SPLICE_F_NONBLOCK)
				goto out;

			ret = -ERESTARTSYS;
			if (signal_pending(current))
				goto out;

			if (need_wakeup) {
				wakeup_pipe_writers(pipe);
				need_wakeup = false;
			}

			pipe_wait_readable(pipe);
		}

		head = pipe->head;
		tail = pipe->tail;
		mask = pipe->ring_size - 1;

		/* Gather up to ARRAY_SIZE(bvec) segments, at most len bytes. */
		while (!pipe_empty(head, tail)) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t seg;

			/* Skip zero-length buffers. */
			if (!buf->len) {
				tail++;
				continue;
			}

			seg = min_t(size_t, remain, buf->len);

			ret = pipe_buf_confirm(pipe, buf);
			if (unlikely(ret)) {
				if (ret == -ENODATA)
					ret = 0;
				break;
			}

			bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset);
			remain -= seg;
			if (remain == 0 || bc >= ARRAY_SIZE(bvec))
				break;
			tail++;
		}

		/*
		 * NOTE(review): if every buffer scanned above had len == 0,
		 * ret still holds the -ERESTARTSYS preset from the top of the
		 * loop when we break here - confirm that is intended.
		 */
		if (!bc)
			break;

		msg.msg_flags = MSG_SPLICE_PAGES;
		if (flags & SPLICE_F_MORE)
			msg.msg_flags |= MSG_MORE;
		/* More wanted and more queued behind what we gathered. */
		if (remain && pipe_occupancy(pipe->head, tail) > 0)
			msg.msg_flags |= MSG_MORE;
		if (out->f_flags & O_NONBLOCK)
			msg.msg_flags |= MSG_DONTWAIT;

		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc,
			      len - remain);
		ret = sock_sendmsg(sock, &msg);
		if (ret <= 0)
			break;

		spliced += ret;
		len -= ret;
		/* Consume the ret bytes that were sent from the ring tail. */
		tail = pipe->tail;
		while (ret > 0) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t seg = min_t(size_t, ret, buf->len);

			buf->offset += seg;
			buf->len -= seg;
			ret -= seg;

			if (!buf->len) {
				pipe_buf_release(pipe, buf);
				tail++;
			}
		}

		/* At least one slot was freed: publish it, note the wakeup. */
		if (tail != pipe->tail) {
			pipe->tail = tail;
			if (pipe->files)
				need_wakeup = true;
		}
	}

out:
	pipe_unlock(pipe);
	if (need_wakeup)
		wakeup_pipe_writers(pipe);
	/* Bytes sent win over the last status. */
	return spliced ?: ret;
}
#endif

/*
 * Emit a rate-limited debug message naming the unsupported direction
 * ("read"/"write"), the file and the current task, and return -EINVAL.
 */
static int warn_unsupported(struct file *file, const char *op)
{
	pr_debug_ratelimited(
		"splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
		op, file, current->pid, current->comm);
	return -EINVAL;
}

/*
 * Attempt to initiate a splice from pipe to file.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	/* Files without ->splice_write get -EINVAL via warn_unsupported(). */
	if (unlikely(!out->f_op->splice_write))
		return warn_unsupported(out, "write");
	return out->f_op->splice_write(pipe, out, ppos, len, flags);
}

/*
 * Indicate to the caller that there was a premature EOF when reading from the
 * source and the caller didn't indicate they would be sending more data after
 * this.
 */
static void do_splice_eof(struct splice_desc *sd)
{
	/* The callback is optional. */
	if (sd->splice_eof)
		sd->splice_eof(sd);
}

/**
 * vfs_splice_read - Read data from a file and splice it into a pipe
 * @in:		File to splice from
 * @ppos:	Input file offset
 * @pipe:	Pipe to splice to
 * @len:	Number of bytes to splice
 * @flags:	Splice modifier flags (SPLICE_F_*)
 *
 * Splice the requested amount of data from the input file to the pipe.
This 956 * is synchronous as the caller must hold the pipe lock across the entire 957 * operation. 958 * 959 * If successful, it returns the amount of data spliced, 0 if it hit the EOF or 960 * a hole and a negative error code otherwise. 961 */ 962 long vfs_splice_read(struct file *in, loff_t *ppos, 963 struct pipe_inode_info *pipe, size_t len, 964 unsigned int flags) 965 { 966 unsigned int p_space; 967 int ret; 968 969 if (unlikely(!(in->f_mode & FMODE_READ))) 970 return -EBADF; 971 if (!len) 972 return 0; 973 974 /* Don't try to read more the pipe has space for. */ 975 p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail); 976 len = min_t(size_t, len, p_space << PAGE_SHIFT); 977 978 ret = rw_verify_area(READ, in, ppos, len); 979 if (unlikely(ret < 0)) 980 return ret; 981 982 if (unlikely(len > MAX_RW_COUNT)) 983 len = MAX_RW_COUNT; 984 985 if (unlikely(!in->f_op->splice_read)) 986 return warn_unsupported(in, "read"); 987 /* 988 * O_DIRECT and DAX don't deal with the pagecache, so we allocate a 989 * buffer, copy into it and splice that into the pipe. 990 */ 991 if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host)) 992 return copy_splice_read(in, ppos, pipe, len, flags); 993 return in->f_op->splice_read(in, ppos, pipe, len, flags); 994 } 995 EXPORT_SYMBOL_GPL(vfs_splice_read); 996 997 /** 998 * splice_direct_to_actor - splices data directly between two non-pipes 999 * @in: file to splice from 1000 * @sd: actor information on where to splice to 1001 * @actor: handles the data splicing 1002 * 1003 * Description: 1004 * This is a special case helper to splice directly between two 1005 * points, without requiring an explicit pipe. Internally an allocated 1006 * pipe is cached in the process, and reused during the lifetime of 1007 * that process. 
*
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	size_t len;
	int i, flags, more;

	/*
	 * We require the input to be seekable, as we don't want to randomly
	 * drop data for eg socket -> socket splicing. Use the piped splicing
	 * for that!
	 */
	if (unlikely(!(in->f_mode & FMODE_LSEEK)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, setup an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		/* Cache the pipe on the task for later calls. */
		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	bytes = 0;
	len = sd->total_len;

	/* Don't block on output, we have to drain the direct pipe. */
	flags = sd->flags;	/* the input side keeps the caller's flags */
	sd->flags &= ~SPLICE_F_NONBLOCK;

	/*
	 * We signal MORE until we've read sufficient data to fulfill the
	 * request and we keep signalling it if the caller set it.
	 */
	more = sd->flags & SPLICE_F_MORE;
	sd->flags |= SPLICE_F_MORE;

	WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));

	while (len) {
		size_t read_len;
		/* pos is a scratch copy; sd->pos only moves after the actor ran. */
		loff_t pos = sd->pos, prev_pos = pos;

		ret = vfs_splice_read(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto read_failure;

		read_len = ret;
		/* The actor is asked to move exactly what was just read. */
		sd->total_len = read_len;

		/*
		 * If we now have sufficient data to fulfill the request then
		 * we clear SPLICE_F_MORE if it was not set initially.
		 */
		if (read_len >= len && !more)
			sd->flags &= ~SPLICE_F_MORE;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret <= 0)) {
			/* Nothing was written: leave the input position alone. */
			sd->pos = prev_pos;
			goto out_release;
		}

		bytes += ret;
		len -= ret;
		sd->pos = pos;

		/* Short write: advance the input only by what got written. */
		if (ret < read_len) {
			sd->pos = prev_pos + ret;
			goto out_release;
		}
	}

done:
	/* Reset the ring indices of the internal pipe. */
	pipe->tail = pipe->head = 0;
	file_accessed(in);
	return bytes;

read_failure:
	/*
	 * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that
	 * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a
	 * "->splice_in()" that returned EOF (ie zero) *and* we have sent at
	 * least 1 byte *then* we will also do the ->splice_eof() call.
	 */
	if (ret == 0 && !more && len > 0 && bytes)
		do_splice_eof(sd);
out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < pipe->ring_size; i++) {
		struct pipe_buffer *buf = &pipe->bufs[i];

		/* Only slots with ->ops set are released. */
		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}

	/* With no progress at all, report the failing status instead. */
	if (!bytes)
		bytes = ret;

	goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);

/*
 * Actor for do_splice_direct(): push the internal pipe's contents to the
 * output file recorded in @sd, at the output offset @sd->opos.
 */
static int direct_splice_actor(struct pipe_inode_info *pipe,
			       struct splice_desc *sd)
{
	struct file *file = sd->u.file;

	return do_splice_from(pipe, file, sd->opos, sd->total_len,
			      sd->flags);
}

/*
 * ->splice_eof callback for do_splice_direct(): forward the event to the
 * output file's optional ->splice_eof() method.
 */
static void direct_file_splice_eof(struct splice_desc *sd)
{
	struct file *file = sd->u.file;

	if (file->f_op->splice_eof)
		file->f_op->splice_eof(file);
}

/**
 * do_splice_direct - splices data directly between two files
 * @in:		file to splice from
 * @ppos:	input file
*		offset
 * @out:	file to splice to
 * @opos:	output file offset
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    For use by do_sendfile(). splice can easily emulate sendfile, but
 *    doing it in the application would incur an extra system call
 *    (splice in + splice out, as compared to just sendfile()). So this helper
 *    can splice directly through a process-private pipe.
 *
 */
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      loff_t *opos, size_t len, unsigned int flags)
{
	struct splice_desc sd = {
		.len		= len,
		.total_len	= len,
		.flags		= flags,
		.pos		= *ppos,
		.u.file		= out,
		.splice_eof	= direct_file_splice_eof,
		.opos		= opos,
	};
	long ret;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	/* Append-mode outputs are rejected. */
	if (unlikely(out->f_flags & O_APPEND))
		return -EINVAL;

	ret = rw_verify_area(WRITE, out, opos, len);
	if (unlikely(ret < 0))
		return ret;

	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
	/* The input offset is only reported back when data was moved. */
	if (ret > 0)
		*ppos = sd.pos;

	return ret;
}
EXPORT_SYMBOL(do_splice_direct);

/*
 * Wait until @pipe has a free slot.
 *
 * Returns 0 once the pipe is not full. Returns -EPIPE (after sending
 * SIGPIPE) if there are no readers, -EAGAIN if the pipe is full and
 * SPLICE_F_NONBLOCK is set, and -ERESTARTSYS if a signal is pending.
 */
static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
{
	for (;;) {
		if (unlikely(!pipe->readers)) {
			send_sig(SIGPIPE, current, 0);
			return -EPIPE;
		}
		if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
			return 0;
		if (flags & SPLICE_F_NONBLOCK)
			return -EAGAIN;
		if (signal_pending(current))
			return -ERESTARTSYS;
		pipe_wait_writable(pipe);
	}
}

/* Forward declaration; the definition is not part of this section. */
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
			       struct pipe_inode_info *opipe,
			       size_t len, unsigned int flags);

/*
 * Splice from file @in into @opipe: take the pipe lock, wait for room and
 * then read. Pipe readers are woken (after the lock is dropped) when any
 * data was added.
 */
long splice_file_to_pipe(struct file *in,
			 struct pipe_inode_info *opipe,
			 loff_t *offset,
			 size_t len, unsigned int flags)
{
	long ret;

	pipe_lock(opipe);
	ret = wait_for_space(opipe, flags);
	if (!ret)
		ret = vfs_splice_read(in, offset, opipe, len, flags);
	pipe_unlock(opipe);
	if (ret > 0)
		wakeup_pipe_readers(opipe);
	return ret;
}

/*
 * Determine where to splice to/from.
 */
long do_splice(struct file *in, loff_t *off_in, struct file *out,
	       loff_t *off_out, size_t len, unsigned int flags)
{
	struct pipe_inode_info *ipipe;
	struct pipe_inode_info *opipe;
	loff_t offset;
	long ret;

	if (unlikely(!(in->f_mode & FMODE_READ) ||
		     !(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	ipipe = get_pipe_info(in, true);
	opipe = get_pipe_info(out, true);

	if (ipipe && opipe) {
		if (off_in || off_out)
			return -ESPIPE;

		/* Splicing to self would be fun, but... */
		if (ipipe == opipe)
			return -EINVAL;

		if ((in->f_flags | out->f_flags) & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		ret = splice_pipe_to_pipe(ipipe, opipe, len, flags);
	} else if (ipipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (!(out->f_mode & FMODE_PWRITE))
				return -EINVAL;
			offset = *off_out;
		} else {
			offset = out->f_pos;
		}

		if (unlikely(out->f_flags & O_APPEND))
			return -EINVAL;

		ret = rw_verify_area(WRITE, out, &offset, len);
		if (unlikely(ret < 0))
			return ret;

		if (in->f_flags & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		file_start_write(out);
		ret = do_splice_from(ipipe, out, &offset, len, flags);
		file_end_write(out);

		if (!off_out)
			out->f_pos = offset;
		else
			*off_out = offset;
	} else if (opipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (!(in->f_mode & FMODE_PREAD))
				return -EINVAL;
			offset = *off_in;
		} else {
			offset = in->f_pos;
		}

		if (out->f_flags & O_NONBLOCK)
			flags |=
SPLICE_F_NONBLOCK; 1312 1313 ret = splice_file_to_pipe(in, opipe, &offset, len, flags); 1314 1315 if (!off_in) 1316 in->f_pos = offset; 1317 else 1318 *off_in = offset; 1319 } else { 1320 ret = -EINVAL; 1321 } 1322 1323 if (ret > 0) { 1324 /* 1325 * Generate modify out before access in: 1326 * do_splice_from() may've already sent modify out, 1327 * and this ensures the events get merged. 1328 */ 1329 fsnotify_modify(out); 1330 fsnotify_access(in); 1331 } 1332 1333 return ret; 1334 } 1335 1336 static long __do_splice(struct file *in, loff_t __user *off_in, 1337 struct file *out, loff_t __user *off_out, 1338 size_t len, unsigned int flags) 1339 { 1340 struct pipe_inode_info *ipipe; 1341 struct pipe_inode_info *opipe; 1342 loff_t offset, *__off_in = NULL, *__off_out = NULL; 1343 long ret; 1344 1345 ipipe = get_pipe_info(in, true); 1346 opipe = get_pipe_info(out, true); 1347 1348 if (ipipe) { 1349 if (off_in) 1350 return -ESPIPE; 1351 pipe_clear_nowait(in); 1352 } 1353 if (opipe) { 1354 if (off_out) 1355 return -ESPIPE; 1356 pipe_clear_nowait(out); 1357 } 1358 1359 if (off_out) { 1360 if (copy_from_user(&offset, off_out, sizeof(loff_t))) 1361 return -EFAULT; 1362 __off_out = &offset; 1363 } 1364 if (off_in) { 1365 if (copy_from_user(&offset, off_in, sizeof(loff_t))) 1366 return -EFAULT; 1367 __off_in = &offset; 1368 } 1369 1370 ret = do_splice(in, __off_in, out, __off_out, len, flags); 1371 if (ret < 0) 1372 return ret; 1373 1374 if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t))) 1375 return -EFAULT; 1376 if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t))) 1377 return -EFAULT; 1378 1379 return ret; 1380 } 1381 1382 static int iter_to_pipe(struct iov_iter *from, 1383 struct pipe_inode_info *pipe, 1384 unsigned flags) 1385 { 1386 struct pipe_buffer buf = { 1387 .ops = &user_page_pipe_buf_ops, 1388 .flags = flags 1389 }; 1390 size_t total = 0; 1391 int ret = 0; 1392 1393 while (iov_iter_count(from)) { 1394 struct page *pages[16]; 1395 ssize_t 
left; 1396 size_t start; 1397 int i, n; 1398 1399 left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start); 1400 if (left <= 0) { 1401 ret = left; 1402 break; 1403 } 1404 1405 n = DIV_ROUND_UP(left + start, PAGE_SIZE); 1406 for (i = 0; i < n; i++) { 1407 int size = min_t(int, left, PAGE_SIZE - start); 1408 1409 buf.page = pages[i]; 1410 buf.offset = start; 1411 buf.len = size; 1412 ret = add_to_pipe(pipe, &buf); 1413 if (unlikely(ret < 0)) { 1414 iov_iter_revert(from, left); 1415 // this one got dropped by add_to_pipe() 1416 while (++i < n) 1417 put_page(pages[i]); 1418 goto out; 1419 } 1420 total += ret; 1421 left -= size; 1422 start = 0; 1423 } 1424 } 1425 out: 1426 return total ? total : ret; 1427 } 1428 1429 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 1430 struct splice_desc *sd) 1431 { 1432 int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data); 1433 return n == sd->len ? n : -EFAULT; 1434 } 1435 1436 /* 1437 * For lack of a better implementation, implement vmsplice() to userspace 1438 * as a simple copy of the pipes pages to the user iov. 1439 */ 1440 static long vmsplice_to_user(struct file *file, struct iov_iter *iter, 1441 unsigned int flags) 1442 { 1443 struct pipe_inode_info *pipe = get_pipe_info(file, true); 1444 struct splice_desc sd = { 1445 .total_len = iov_iter_count(iter), 1446 .flags = flags, 1447 .u.data = iter 1448 }; 1449 long ret = 0; 1450 1451 if (!pipe) 1452 return -EBADF; 1453 1454 pipe_clear_nowait(file); 1455 1456 if (sd.total_len) { 1457 pipe_lock(pipe); 1458 ret = __splice_from_pipe(pipe, &sd, pipe_to_user); 1459 pipe_unlock(pipe); 1460 } 1461 1462 if (ret > 0) 1463 fsnotify_access(file); 1464 1465 return ret; 1466 } 1467 1468 /* 1469 * vmsplice splices a user address range into a pipe. It can be thought of 1470 * as splice-from-memory, where the regular splice is splice-from-file (or 1471 * to file). In both cases the output is a pipe, naturally. 
1472 */ 1473 static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter, 1474 unsigned int flags) 1475 { 1476 struct pipe_inode_info *pipe; 1477 long ret = 0; 1478 unsigned buf_flag = 0; 1479 1480 if (flags & SPLICE_F_GIFT) 1481 buf_flag = PIPE_BUF_FLAG_GIFT; 1482 1483 pipe = get_pipe_info(file, true); 1484 if (!pipe) 1485 return -EBADF; 1486 1487 pipe_clear_nowait(file); 1488 1489 pipe_lock(pipe); 1490 ret = wait_for_space(pipe, flags); 1491 if (!ret) 1492 ret = iter_to_pipe(iter, pipe, buf_flag); 1493 pipe_unlock(pipe); 1494 if (ret > 0) { 1495 wakeup_pipe_readers(pipe); 1496 fsnotify_modify(file); 1497 } 1498 return ret; 1499 } 1500 1501 static int vmsplice_type(struct fd f, int *type) 1502 { 1503 if (!f.file) 1504 return -EBADF; 1505 if (f.file->f_mode & FMODE_WRITE) { 1506 *type = ITER_SOURCE; 1507 } else if (f.file->f_mode & FMODE_READ) { 1508 *type = ITER_DEST; 1509 } else { 1510 fdput(f); 1511 return -EBADF; 1512 } 1513 return 0; 1514 } 1515 1516 /* 1517 * Note that vmsplice only really supports true splicing _from_ user memory 1518 * to a pipe, not the other way around. Splicing from user memory is a simple 1519 * operation that can be supported without any funky alignment restrictions 1520 * or nasty vm tricks. We simply map in the user memory and fill them into 1521 * a pipe. The reverse isn't quite as easy, though. There are two possible 1522 * solutions for that: 1523 * 1524 * - memcpy() the data internally, at which point we might as well just 1525 * do a regular read() on the buffer anyway. 1526 * - Lots of nasty vm tricks, that are neither fast nor flexible (it 1527 * has restriction limitations on both ends of the pipe). 1528 * 1529 * Currently we punt and implement it as a normal copy, see pipe_to_user(). 
1530 * 1531 */ 1532 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, 1533 unsigned long, nr_segs, unsigned int, flags) 1534 { 1535 struct iovec iovstack[UIO_FASTIOV]; 1536 struct iovec *iov = iovstack; 1537 struct iov_iter iter; 1538 ssize_t error; 1539 struct fd f; 1540 int type; 1541 1542 if (unlikely(flags & ~SPLICE_F_ALL)) 1543 return -EINVAL; 1544 1545 f = fdget(fd); 1546 error = vmsplice_type(f, &type); 1547 if (error) 1548 return error; 1549 1550 error = import_iovec(type, uiov, nr_segs, 1551 ARRAY_SIZE(iovstack), &iov, &iter); 1552 if (error < 0) 1553 goto out_fdput; 1554 1555 if (!iov_iter_count(&iter)) 1556 error = 0; 1557 else if (type == ITER_SOURCE) 1558 error = vmsplice_to_pipe(f.file, &iter, flags); 1559 else 1560 error = vmsplice_to_user(f.file, &iter, flags); 1561 1562 kfree(iov); 1563 out_fdput: 1564 fdput(f); 1565 return error; 1566 } 1567 1568 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, 1569 int, fd_out, loff_t __user *, off_out, 1570 size_t, len, unsigned int, flags) 1571 { 1572 struct fd in, out; 1573 long error; 1574 1575 if (unlikely(!len)) 1576 return 0; 1577 1578 if (unlikely(flags & ~SPLICE_F_ALL)) 1579 return -EINVAL; 1580 1581 error = -EBADF; 1582 in = fdget(fd_in); 1583 if (in.file) { 1584 out = fdget(fd_out); 1585 if (out.file) { 1586 error = __do_splice(in.file, off_in, out.file, off_out, 1587 len, flags); 1588 fdput(out); 1589 } 1590 fdput(in); 1591 } 1592 return error; 1593 } 1594 1595 /* 1596 * Make sure there's data to read. Wait for input if we can, otherwise 1597 * return an appropriate error. 1598 */ 1599 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1600 { 1601 int ret; 1602 1603 /* 1604 * Check the pipe occupancy without the inode lock first. This function 1605 * is speculative anyways, so missing one is ok. 
1606 */ 1607 if (!pipe_empty(pipe->head, pipe->tail)) 1608 return 0; 1609 1610 ret = 0; 1611 pipe_lock(pipe); 1612 1613 while (pipe_empty(pipe->head, pipe->tail)) { 1614 if (signal_pending(current)) { 1615 ret = -ERESTARTSYS; 1616 break; 1617 } 1618 if (!pipe->writers) 1619 break; 1620 if (flags & SPLICE_F_NONBLOCK) { 1621 ret = -EAGAIN; 1622 break; 1623 } 1624 pipe_wait_readable(pipe); 1625 } 1626 1627 pipe_unlock(pipe); 1628 return ret; 1629 } 1630 1631 /* 1632 * Make sure there's writeable room. Wait for room if we can, otherwise 1633 * return an appropriate error. 1634 */ 1635 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1636 { 1637 int ret; 1638 1639 /* 1640 * Check pipe occupancy without the inode lock first. This function 1641 * is speculative anyways, so missing one is ok. 1642 */ 1643 if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) 1644 return 0; 1645 1646 ret = 0; 1647 pipe_lock(pipe); 1648 1649 while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { 1650 if (!pipe->readers) { 1651 send_sig(SIGPIPE, current, 0); 1652 ret = -EPIPE; 1653 break; 1654 } 1655 if (flags & SPLICE_F_NONBLOCK) { 1656 ret = -EAGAIN; 1657 break; 1658 } 1659 if (signal_pending(current)) { 1660 ret = -ERESTARTSYS; 1661 break; 1662 } 1663 pipe_wait_writable(pipe); 1664 } 1665 1666 pipe_unlock(pipe); 1667 return ret; 1668 } 1669 1670 /* 1671 * Splice contents of ipipe to opipe. 
*/
/*
 * Moves up to @len bytes of buffers from @ipipe to @opipe without copying
 * the data itself.
 *
 * Both pipes are prepared with ipipe_prep()/opipe_prep() and then locked
 * together. A buffer that fits entirely within the remaining @len is moved:
 * its slot is copied, the input slot's ops pointer is cleared and
 * ipipe->tail advances. A buffer larger than the remaining @len is split:
 * an extra reference is taken with pipe_buf_get(), the output slot gets the
 * first @len bytes, and the input slot's offset/len are adjusted to describe
 * the rest.
 *
 * Returns the number of bytes transferred if any were. Otherwise: the error
 * from ipipe_prep()/opipe_prep(), -EPIPE (after SIGPIPE) if @opipe has no
 * readers, -EAGAIN if no progress is possible and SPLICE_F_NONBLOCK is set,
 * -EFAULT if pipe_buf_get() fails, or 0 if @ipipe is empty and has no
 * writers. Without SPLICE_F_NONBLOCK, a pass that made no progress because
 * the input was empty or the output full restarts from the top.
 */
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
			       struct pipe_inode_info *opipe,
			       size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	unsigned int i_head, o_head;
	unsigned int i_tail, o_tail;
	unsigned int i_mask, o_mask;
	int ret = 0;
	bool input_wakeup = false;


retry:
	ret = ipipe_prep(ipipe, flags);
	if (ret)
		return ret;

	ret = opipe_prep(opipe, flags);
	if (ret)
		return ret;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by pipe info address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	pipe_double_lock(ipipe, opipe);

	/* Local cursors; ring_size - 1 is used as the slot index mask. */
	i_tail = ipipe->tail;
	i_mask = ipipe->ring_size - 1;
	o_head = opipe->head;
	o_mask = opipe->ring_size - 1;

	do {
		size_t o_len;

		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		i_head = ipipe->head;
		o_tail = opipe->tail;

		/* Nothing left to read and nobody left to write more. */
		if (pipe_empty(i_head, i_tail) && !ipipe->writers)
			break;

		/*
		 * Cannot make any progress, because either the input
		 * pipe is empty or the output pipe is full.
		 */
		if (pipe_empty(i_head, i_tail) ||
		    pipe_full(o_head, o_tail, opipe->max_usage)) {
			/* Already processed some buffers, break */
			if (ret)
				break;

			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}

			/*
			 * We raced with another reader/writer and haven't
			 * managed to process any buffers.  A zero return
			 * value means EOF, so retry instead.
			 */
			pipe_unlock(ipipe);
			pipe_unlock(opipe);
			goto retry;
		}

		ibuf = &ipipe->bufs[i_tail & i_mask];
		obuf = &opipe->bufs[o_head & o_mask];

		if (len >= ibuf->len) {
			/*
			 * Simply move the whole buffer from ipipe to opipe
			 */
			*obuf = *ibuf;
			ibuf->ops = NULL;
			i_tail++;
			ipipe->tail = i_tail;
			/* A slot was freed in ipipe: wake its writers later. */
			input_wakeup = true;
			o_len = obuf->len;
			o_head++;
			opipe->head = o_head;
		} else {
			/*
			 * Get a reference to this pipe buffer,
			 * so we can copy the contents over.
			 */
			if (!pipe_buf_get(ipipe, ibuf)) {
				if (ret == 0)
					ret = -EFAULT;
				break;
			}
			*obuf = *ibuf;

			/*
			 * Don't inherit the gift and merge flags, we need to
			 * prevent multiple steals of this page.
			 */
			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
			obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;

			/* Output gets the first 'len' bytes, input keeps the rest. */
			obuf->len = len;
			ibuf->offset += len;
			ibuf->len -= len;
			o_len = len;
			o_head++;
			opipe->head = o_head;
		}
		ret += o_len;
		len -= o_len;
	} while (len);

	pipe_unlock(ipipe);
	pipe_unlock(opipe);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0)
		wakeup_pipe_readers(opipe);

	if (input_wakeup)
		wakeup_pipe_writers(ipipe);

	return ret;
}

/*
 * Link contents of ipipe to opipe.
 *
 * Walks the buffers of @ipipe from its tail and, for each one, takes a
 * reference with pipe_buf_get() and copies the slot to the head of @opipe,
 * trimming the last copy so that at most @len bytes are linked in total.
 * Only a local cursor advances over the input: ipipe->tail is never written,
 * so the input pipe keeps all of its buffers.
 *
 * Stops when the input has been walked, the output is full, or @len is
 * exhausted. Returns the number of bytes linked if any were; otherwise
 * -EPIPE (after SIGPIPE) if @opipe has no readers, -EFAULT if
 * pipe_buf_get() fails, or 0. @flags is not used in the body.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
		     struct pipe_inode_info *opipe,
		     size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	unsigned int i_head, o_head;
	unsigned int i_tail, o_tail;
	unsigned int i_mask, o_mask;
	int ret = 0;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by pipe info address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	pipe_double_lock(ipipe, opipe);

	/* Local cursors; ring_size - 1 is used as the slot index mask. */
	i_tail = ipipe->tail;
	i_mask = ipipe->ring_size - 1;
	o_head = opipe->head;
	o_mask = opipe->ring_size - 1;

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		i_head = ipipe->head;
		o_tail = opipe->tail;

		/*
		 * If we have iterated all input buffers or run out of
		 * output room, break.
		 */
		if (pipe_empty(i_head, i_tail) ||
		    pipe_full(o_head, o_tail, opipe->max_usage))
			break;

		ibuf = &ipipe->bufs[i_tail & i_mask];
		obuf = &opipe->bufs[o_head & o_mask];

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		if (!pipe_buf_get(ipipe, ibuf)) {
			if (ret == 0)
				ret = -EFAULT;
			break;
		}

		*obuf = *ibuf;

		/*
		 * Don't inherit the gift and merge flag, we need to prevent
		 * multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
		obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;

		/* Trim the copy so that no more than 'len' bytes are linked. */
		if (obuf->len > len)
			obuf->len = len;
		ret += obuf->len;
		len -= obuf->len;

		o_head++;
		opipe->head = o_head;
		/* Local cursor only: the input pipe itself is not consumed. */
		i_tail++;
	} while (len);

	pipe_unlock(ipipe);
	pipe_unlock(opipe);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0)
		wakeup_pipe_readers(opipe);

	return ret;
}

/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants, currently the only
 * applicable one is SPLICE_F_NONBLOCK.
1900 */ 1901 long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) 1902 { 1903 struct pipe_inode_info *ipipe = get_pipe_info(in, true); 1904 struct pipe_inode_info *opipe = get_pipe_info(out, true); 1905 int ret = -EINVAL; 1906 1907 if (unlikely(!(in->f_mode & FMODE_READ) || 1908 !(out->f_mode & FMODE_WRITE))) 1909 return -EBADF; 1910 1911 /* 1912 * Duplicate the contents of ipipe to opipe without actually 1913 * copying the data. 1914 */ 1915 if (ipipe && opipe && ipipe != opipe) { 1916 if ((in->f_flags | out->f_flags) & O_NONBLOCK) 1917 flags |= SPLICE_F_NONBLOCK; 1918 1919 /* 1920 * Keep going, unless we encounter an error. The ipipe/opipe 1921 * ordering doesn't really matter. 1922 */ 1923 ret = ipipe_prep(ipipe, flags); 1924 if (!ret) { 1925 ret = opipe_prep(opipe, flags); 1926 if (!ret) 1927 ret = link_pipe(ipipe, opipe, len, flags); 1928 } 1929 } 1930 1931 if (ret > 0) { 1932 fsnotify_access(in); 1933 fsnotify_modify(out); 1934 } 1935 1936 return ret; 1937 } 1938 1939 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) 1940 { 1941 struct fd in, out; 1942 int error; 1943 1944 if (unlikely(flags & ~SPLICE_F_ALL)) 1945 return -EINVAL; 1946 1947 if (unlikely(!len)) 1948 return 0; 1949 1950 error = -EBADF; 1951 in = fdget(fdin); 1952 if (in.file) { 1953 out = fdget(fdout); 1954 if (out.file) { 1955 error = do_tee(in.file, out.file, len, flags); 1956 fdput(out); 1957 } 1958 fdput(in); 1959 } 1960 1961 return error; 1962 } 1963