1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * "splice": joining two ropes together by interweaving their strands.
4 *
5 * This is the "extended pipe" functionality, where a pipe is used as
6 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
7 * buffer that you can use to transfer data from one end to the other.
8 *
9 * The traditional unix read/write is extended with a "splice()" operation
10 * that transfers data buffers to or from a pipe buffer.
11 *
12 * Named by Larry McVoy, original implementation from Linus, extended by
13 * Jens to support splicing to files, network, direct splicing, etc and
14 * fixing lots of bugs.
15 *
16 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
17 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
18 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
19 *
20 */
21 #include <linux/bvec.h>
22 #include <linux/fs.h>
23 #include <linux/file.h>
24 #include <linux/pagemap.h>
25 #include <linux/splice.h>
26 #include <linux/memcontrol.h>
27 #include <linux/mm_inline.h>
28 #include <linux/swap.h>
29 #include <linux/writeback.h>
30 #include <linux/export.h>
31 #include <linux/syscalls.h>
32 #include <linux/uio.h>
33 #include <linux/fsnotify.h>
34 #include <linux/security.h>
35 #include <linux/gfp.h>
36 #include <linux/net.h>
37 #include <linux/socket.h>
38 #include <linux/sched/signal.h>
39
40 #include "internal.h"
41
42 /*
43 * Splice doesn't support FMODE_NOWAIT. Since pipes may set this flag to
44 * indicate they support non-blocking reads or writes, we must clear it
45 * here if set to avoid blocking other users of this pipe if splice is
46 * being done on it.
47 */
pipe_clear_nowait(struct file * file)48 static noinline void pipe_clear_nowait(struct file *file)
49 {
50 fmode_t fmode = READ_ONCE(file->f_mode);
51
52 do {
53 if (!(fmode & FMODE_NOWAIT))
54 break;
55 } while (!try_cmpxchg(&file->f_mode, &fmode, fmode & ~FMODE_NOWAIT));
56 }
57
58 /*
59 * Attempt to steal a page from a pipe buffer. This should perhaps go into
60 * a vm helper function, it's already simplified quite a bit by the
61 * addition of remove_mapping(). If success is returned, the caller may
62 * attempt to reuse this page for another destination.
63 */
page_cache_pipe_buf_try_steal(struct pipe_inode_info * pipe,struct pipe_buffer * buf)64 static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
65 struct pipe_buffer *buf)
66 {
67 struct folio *folio = page_folio(buf->page);
68 struct address_space *mapping;
69
70 folio_lock(folio);
71
72 mapping = folio_mapping(folio);
73 if (mapping) {
74 WARN_ON(!folio_test_uptodate(folio));
75
76 /*
77 * At least for ext2 with nobh option, we need to wait on
78 * writeback completing on this folio, since we'll remove it
79 * from the pagecache. Otherwise truncate wont wait on the
80 * folio, allowing the disk blocks to be reused by someone else
81 * before we actually wrote our data to them. fs corruption
82 * ensues.
83 */
84 folio_wait_writeback(folio);
85
86 if (!filemap_release_folio(folio, GFP_KERNEL))
87 goto out_unlock;
88
89 /*
90 * If we succeeded in removing the mapping, set LRU flag
91 * and return good.
92 */
93 if (remove_mapping(mapping, folio)) {
94 buf->flags |= PIPE_BUF_FLAG_LRU;
95 return true;
96 }
97 }
98
99 /*
100 * Raced with truncate or failed to remove folio from current
101 * address space, unlock and return failure.
102 */
103 out_unlock:
104 folio_unlock(folio);
105 return false;
106 }
107
page_cache_pipe_buf_release(struct pipe_inode_info * pipe,struct pipe_buffer * buf)108 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
109 struct pipe_buffer *buf)
110 {
111 put_page(buf->page);
112 buf->flags &= ~PIPE_BUF_FLAG_LRU;
113 }
114
115 /*
116 * Check whether the contents of buf is OK to access. Since the content
117 * is a page cache page, IO may be in flight.
118 */
page_cache_pipe_buf_confirm(struct pipe_inode_info * pipe,struct pipe_buffer * buf)119 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
120 struct pipe_buffer *buf)
121 {
122 struct folio *folio = page_folio(buf->page);
123 int err;
124
125 if (!folio_test_uptodate(folio)) {
126 folio_lock(folio);
127
128 /*
129 * Folio got truncated/unhashed. This will cause a 0-byte
130 * splice, if this is the first page.
131 */
132 if (!folio->mapping) {
133 err = -ENODATA;
134 goto error;
135 }
136
137 /*
138 * Uh oh, read-error from disk.
139 */
140 if (!folio_test_uptodate(folio)) {
141 err = -EIO;
142 goto error;
143 }
144
145 /* Folio is ok after all, we are done */
146 folio_unlock(folio);
147 }
148
149 return 0;
150 error:
151 folio_unlock(folio);
152 return err;
153 }
154
155 const struct pipe_buf_operations page_cache_pipe_buf_ops = {
156 .confirm = page_cache_pipe_buf_confirm,
157 .release = page_cache_pipe_buf_release,
158 .try_steal = page_cache_pipe_buf_try_steal,
159 .get = generic_pipe_buf_get,
160 };
161
user_page_pipe_buf_try_steal(struct pipe_inode_info * pipe,struct pipe_buffer * buf)162 static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
163 struct pipe_buffer *buf)
164 {
165 if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
166 return false;
167
168 buf->flags |= PIPE_BUF_FLAG_LRU;
169 return generic_pipe_buf_try_steal(pipe, buf);
170 }
171
172 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
173 .release = page_cache_pipe_buf_release,
174 .try_steal = user_page_pipe_buf_try_steal,
175 .get = generic_pipe_buf_get,
176 };
177
wakeup_pipe_readers(struct pipe_inode_info * pipe)178 static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
179 {
180 smp_mb();
181 if (waitqueue_active(&pipe->rd_wait))
182 wake_up_interruptible(&pipe->rd_wait);
183 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
184 }
185
186 /**
187 * splice_to_pipe - fill passed data into a pipe
188 * @pipe: pipe to fill
189 * @spd: data to fill
190 *
191 * Description:
192 * @spd contains a map of pages and len/offset tuples, along with
193 * the struct pipe_buf_operations associated with these pages. This
194 * function will link that data to the pipe.
195 *
196 */
splice_to_pipe(struct pipe_inode_info * pipe,struct splice_pipe_desc * spd)197 ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
198 struct splice_pipe_desc *spd)
199 {
200 unsigned int spd_pages = spd->nr_pages;
201 unsigned int tail = pipe->tail;
202 unsigned int head = pipe->head;
203 ssize_t ret = 0;
204 int page_nr = 0;
205
206 if (!spd_pages)
207 return 0;
208
209 if (unlikely(!pipe->readers)) {
210 send_sig(SIGPIPE, current, 0);
211 ret = -EPIPE;
212 goto out;
213 }
214
215 while (!pipe_full(head, tail, pipe->max_usage)) {
216 struct pipe_buffer *buf = pipe_buf(pipe, head);
217
218 buf->page = spd->pages[page_nr];
219 buf->offset = spd->partial[page_nr].offset;
220 buf->len = spd->partial[page_nr].len;
221 buf->private = spd->partial[page_nr].private;
222 buf->ops = spd->ops;
223 buf->flags = 0;
224
225 head++;
226 pipe->head = head;
227 page_nr++;
228 ret += buf->len;
229
230 if (!--spd->nr_pages)
231 break;
232 }
233
234 if (!ret)
235 ret = -EAGAIN;
236
237 out:
238 while (page_nr < spd_pages)
239 spd->spd_release(spd, page_nr++);
240
241 return ret;
242 }
243 EXPORT_SYMBOL_GPL(splice_to_pipe);
244
add_to_pipe(struct pipe_inode_info * pipe,struct pipe_buffer * buf)245 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
246 {
247 unsigned int head = pipe->head;
248 unsigned int tail = pipe->tail;
249 int ret;
250
251 if (unlikely(!pipe->readers)) {
252 send_sig(SIGPIPE, current, 0);
253 ret = -EPIPE;
254 } else if (pipe_full(head, tail, pipe->max_usage)) {
255 ret = -EAGAIN;
256 } else {
257 *pipe_buf(pipe, head) = *buf;
258 pipe->head = head + 1;
259 return buf->len;
260 }
261 pipe_buf_release(pipe, buf);
262 return ret;
263 }
264 EXPORT_SYMBOL(add_to_pipe);
265
266 /*
267 * Check if we need to grow the arrays holding pages and partial page
268 * descriptions.
269 */
splice_grow_spd(const struct pipe_inode_info * pipe,struct splice_pipe_desc * spd)270 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
271 {
272 unsigned int max_usage = READ_ONCE(pipe->max_usage);
273
274 spd->nr_pages_max = max_usage;
275 if (max_usage <= PIPE_DEF_BUFFERS)
276 return 0;
277
278 spd->pages = kmalloc_objs(struct page *, max_usage);
279 spd->partial = kmalloc_objs(struct partial_page, max_usage);
280
281 if (spd->pages && spd->partial)
282 return 0;
283
284 kfree(spd->pages);
285 kfree(spd->partial);
286 return -ENOMEM;
287 }
288
splice_shrink_spd(struct splice_pipe_desc * spd)289 void splice_shrink_spd(struct splice_pipe_desc *spd)
290 {
291 if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
292 return;
293
294 kfree(spd->pages);
295 kfree(spd->partial);
296 }
297
298 /**
299 * copy_splice_read - Copy data from a file and splice the copy into a pipe
300 * @in: The file to read from
301 * @ppos: Pointer to the file position to read from
302 * @pipe: The pipe to splice into
303 * @len: The amount to splice
304 * @flags: The SPLICE_F_* flags
305 *
306 * This function allocates a bunch of pages sufficient to hold the requested
307 * amount of data (but limited by the remaining pipe capacity), passes it to
308 * the file's ->read_iter() to read into and then splices the used pages into
309 * the pipe.
310 *
311 * Return: On success, the number of bytes read will be returned and *@ppos
312 * will be updated if appropriate; 0 will be returned if there is no more data
313 * to be read; -EAGAIN will be returned if the pipe had no space, and some
314 * other negative error code will be returned on error. A short read may occur
315 * if the pipe has insufficient space, we reach the end of the data or we hit a
316 * hole.
317 */
copy_splice_read(struct file * in,loff_t * ppos,struct pipe_inode_info * pipe,size_t len,unsigned int flags)318 ssize_t copy_splice_read(struct file *in, loff_t *ppos,
319 struct pipe_inode_info *pipe,
320 size_t len, unsigned int flags)
321 {
322 struct iov_iter to;
323 struct bio_vec *bv;
324 struct kiocb kiocb;
325 struct page **pages;
326 ssize_t ret;
327 size_t used, npages, chunk, remain, keep = 0;
328 int i;
329
330 /* Work out how much data we can actually add into the pipe */
331 used = pipe_buf_usage(pipe);
332 npages = max_t(ssize_t, pipe->max_usage - used, 0);
333 len = min_t(size_t, len, npages * PAGE_SIZE);
334 npages = DIV_ROUND_UP(len, PAGE_SIZE);
335
336 bv = kzalloc(array_size(npages, sizeof(bv[0])) +
337 array_size(npages, sizeof(struct page *)), GFP_KERNEL);
338 if (!bv)
339 return -ENOMEM;
340
341 pages = (struct page **)(bv + npages);
342 npages = alloc_pages_bulk(GFP_USER, npages, pages);
343 if (!npages) {
344 kfree(bv);
345 return -ENOMEM;
346 }
347
348 remain = len = min_t(size_t, len, npages * PAGE_SIZE);
349
350 for (i = 0; i < npages; i++) {
351 chunk = min_t(size_t, PAGE_SIZE, remain);
352 bv[i].bv_page = pages[i];
353 bv[i].bv_offset = 0;
354 bv[i].bv_len = chunk;
355 remain -= chunk;
356 }
357
358 /* Do the I/O */
359 iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
360 init_sync_kiocb(&kiocb, in);
361 kiocb.ki_pos = *ppos;
362 ret = in->f_op->read_iter(&kiocb, &to);
363
364 if (ret > 0) {
365 keep = DIV_ROUND_UP(ret, PAGE_SIZE);
366 *ppos = kiocb.ki_pos;
367 }
368
369 /*
370 * Callers of ->splice_read() expect -EAGAIN on "can't put anything in
371 * there", rather than -EFAULT.
372 */
373 if (ret == -EFAULT)
374 ret = -EAGAIN;
375
376 /* Free any pages that didn't get touched at all. */
377 if (keep < npages)
378 release_pages(pages + keep, npages - keep);
379
380 /* Push the remaining pages into the pipe. */
381 remain = ret;
382 for (i = 0; i < keep; i++) {
383 struct pipe_buffer *buf = pipe_head_buf(pipe);
384
385 chunk = min_t(size_t, remain, PAGE_SIZE);
386 *buf = (struct pipe_buffer) {
387 .ops = &default_pipe_buf_ops,
388 .page = bv[i].bv_page,
389 .offset = 0,
390 .len = chunk,
391 };
392 pipe->head++;
393 remain -= chunk;
394 }
395
396 kfree(bv);
397 return ret;
398 }
399 EXPORT_SYMBOL(copy_splice_read);
400
401 const struct pipe_buf_operations default_pipe_buf_ops = {
402 .release = generic_pipe_buf_release,
403 .try_steal = generic_pipe_buf_try_steal,
404 .get = generic_pipe_buf_get,
405 };
406
407 /* Pipe buffer operations for a socket and similar. */
408 const struct pipe_buf_operations nosteal_pipe_buf_ops = {
409 .release = generic_pipe_buf_release,
410 .get = generic_pipe_buf_get,
411 };
412 EXPORT_SYMBOL(nosteal_pipe_buf_ops);
413
wakeup_pipe_writers(struct pipe_inode_info * pipe)414 static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
415 {
416 smp_mb();
417 if (waitqueue_active(&pipe->wr_wait))
418 wake_up_interruptible(&pipe->wr_wait);
419 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
420 }
421
422 /**
423 * splice_from_pipe_feed - feed available data from a pipe to a file
424 * @pipe: pipe to splice from
425 * @sd: information to @actor
426 * @actor: handler that splices the data
427 *
428 * Description:
429 * This function loops over the pipe and calls @actor to do the
430 * actual moving of a single struct pipe_buffer to the desired
431 * destination. It returns when there's no more buffers left in
432 * the pipe or if the requested number of bytes (@sd->total_len)
433 * have been copied. It returns a positive number (one) if the
434 * pipe needs to be filled with more data, zero if the required
435 * number of bytes have been copied and -errno on error.
436 *
437 * This, together with splice_from_pipe_{begin,end,next}, may be
438 * used to implement the functionality of __splice_from_pipe() when
439 * locking is required around copying the pipe buffers to the
440 * destination.
441 */
splice_from_pipe_feed(struct pipe_inode_info * pipe,struct splice_desc * sd,splice_actor * actor)442 static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
443 splice_actor *actor)
444 {
445 unsigned int head = pipe->head;
446 unsigned int tail = pipe->tail;
447 int ret;
448
449 while (!pipe_empty(head, tail)) {
450 struct pipe_buffer *buf = pipe_buf(pipe, tail);
451
452 sd->len = buf->len;
453 if (sd->len > sd->total_len)
454 sd->len = sd->total_len;
455
456 ret = pipe_buf_confirm(pipe, buf);
457 if (unlikely(ret)) {
458 if (ret == -ENODATA)
459 ret = 0;
460 return ret;
461 }
462
463 ret = actor(pipe, buf, sd);
464 if (ret <= 0)
465 return ret;
466
467 buf->offset += ret;
468 buf->len -= ret;
469
470 sd->num_spliced += ret;
471 sd->len -= ret;
472 sd->pos += ret;
473 sd->total_len -= ret;
474
475 if (!buf->len) {
476 pipe_buf_release(pipe, buf);
477 tail++;
478 pipe->tail = tail;
479 if (pipe->files)
480 sd->need_wakeup = true;
481 }
482
483 if (!sd->total_len)
484 return 0;
485 }
486
487 return 1;
488 }
489
490 /* We know we have a pipe buffer, but maybe it's empty? */
eat_empty_buffer(struct pipe_inode_info * pipe)491 static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
492 {
493 unsigned int tail = pipe->tail;
494 struct pipe_buffer *buf = pipe_buf(pipe, tail);
495
496 if (unlikely(!buf->len)) {
497 pipe_buf_release(pipe, buf);
498 pipe->tail = tail+1;
499 return true;
500 }
501
502 return false;
503 }
504
505 /**
506 * splice_from_pipe_next - wait for some data to splice from
507 * @pipe: pipe to splice from
508 * @sd: information about the splice operation
509 *
510 * Description:
511 * This function will wait for some data and return a positive
512 * value (one) if pipe buffers are available. It will return zero
513 * or -errno if no more data needs to be spliced.
514 */
splice_from_pipe_next(struct pipe_inode_info * pipe,struct splice_desc * sd)515 static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
516 {
517 /*
518 * Check for signal early to make process killable when there are
519 * always buffers available
520 */
521 if (signal_pending(current))
522 return -ERESTARTSYS;
523
524 repeat:
525 while (pipe_is_empty(pipe)) {
526 if (!pipe->writers)
527 return 0;
528
529 if (sd->num_spliced)
530 return 0;
531
532 if (sd->flags & SPLICE_F_NONBLOCK)
533 return -EAGAIN;
534
535 if (signal_pending(current))
536 return -ERESTARTSYS;
537
538 if (sd->need_wakeup) {
539 wakeup_pipe_writers(pipe);
540 sd->need_wakeup = false;
541 }
542
543 pipe_wait_readable(pipe);
544 }
545
546 if (eat_empty_buffer(pipe))
547 goto repeat;
548
549 return 1;
550 }
551
552 /**
553 * splice_from_pipe_begin - start splicing from pipe
554 * @sd: information about the splice operation
555 *
556 * Description:
557 * This function should be called before a loop containing
558 * splice_from_pipe_next() and splice_from_pipe_feed() to
559 * initialize the necessary fields of @sd.
560 */
splice_from_pipe_begin(struct splice_desc * sd)561 static void splice_from_pipe_begin(struct splice_desc *sd)
562 {
563 sd->num_spliced = 0;
564 sd->need_wakeup = false;
565 }
566
567 /**
568 * splice_from_pipe_end - finish splicing from pipe
569 * @pipe: pipe to splice from
570 * @sd: information about the splice operation
571 *
572 * Description:
573 * This function will wake up pipe writers if necessary. It should
574 * be called after a loop containing splice_from_pipe_next() and
575 * splice_from_pipe_feed().
576 */
splice_from_pipe_end(struct pipe_inode_info * pipe,struct splice_desc * sd)577 static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
578 {
579 if (sd->need_wakeup)
580 wakeup_pipe_writers(pipe);
581 }
582
583 /**
584 * __splice_from_pipe - splice data from a pipe to given actor
585 * @pipe: pipe to splice from
586 * @sd: information to @actor
587 * @actor: handler that splices the data
588 *
589 * Description:
590 * This function does little more than loop over the pipe and call
591 * @actor to do the actual moving of a single struct pipe_buffer to
592 * the desired destination. See pipe_to_file, pipe_to_sendmsg, or
593 * pipe_to_user.
594 *
595 */
__splice_from_pipe(struct pipe_inode_info * pipe,struct splice_desc * sd,splice_actor * actor)596 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
597 splice_actor *actor)
598 {
599 int ret;
600
601 splice_from_pipe_begin(sd);
602 do {
603 cond_resched();
604 ret = splice_from_pipe_next(pipe, sd);
605 if (ret > 0)
606 ret = splice_from_pipe_feed(pipe, sd, actor);
607 } while (ret > 0);
608 splice_from_pipe_end(pipe, sd);
609
610 return sd->num_spliced ? sd->num_spliced : ret;
611 }
612 EXPORT_SYMBOL(__splice_from_pipe);
613
614 /**
615 * splice_from_pipe - splice data from a pipe to a file
616 * @pipe: pipe to splice from
617 * @out: file to splice to
618 * @ppos: position in @out
619 * @len: how many bytes to splice
620 * @flags: splice modifier flags
621 * @actor: handler that splices the data
622 *
623 * Description:
624 * See __splice_from_pipe. This function locks the pipe inode,
625 * otherwise it's identical to __splice_from_pipe().
626 *
627 */
splice_from_pipe(struct pipe_inode_info * pipe,struct file * out,loff_t * ppos,size_t len,unsigned int flags,splice_actor * actor)628 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
629 loff_t *ppos, size_t len, unsigned int flags,
630 splice_actor *actor)
631 {
632 ssize_t ret;
633 struct splice_desc sd = {
634 .total_len = len,
635 .flags = flags,
636 .pos = *ppos,
637 .u.file = out,
638 };
639
640 pipe_lock(pipe);
641 ret = __splice_from_pipe(pipe, &sd, actor);
642 pipe_unlock(pipe);
643
644 return ret;
645 }
646
647 /**
648 * iter_file_splice_write - splice data from a pipe to a file
649 * @pipe: pipe info
650 * @out: file to write to
651 * @ppos: position in @out
652 * @len: number of bytes to splice
653 * @flags: splice modifier flags
654 *
655 * Description:
656 * Will either move or copy pages (determined by @flags options) from
657 * the given pipe inode to the given file.
658 * This one is ->write_iter-based.
659 *
660 */
661 ssize_t
iter_file_splice_write(struct pipe_inode_info * pipe,struct file * out,loff_t * ppos,size_t len,unsigned int flags)662 iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
663 loff_t *ppos, size_t len, unsigned int flags)
664 {
665 struct splice_desc sd = {
666 .total_len = len,
667 .flags = flags,
668 .pos = *ppos,
669 .u.file = out,
670 };
671 int nbufs = pipe->max_usage;
672 struct bio_vec *array;
673 ssize_t ret;
674
675 if (!out->f_op->write_iter)
676 return -EINVAL;
677
678 array = kzalloc_objs(struct bio_vec, nbufs);
679 if (unlikely(!array))
680 return -ENOMEM;
681
682 pipe_lock(pipe);
683
684 splice_from_pipe_begin(&sd);
685 while (sd.total_len) {
686 struct kiocb kiocb;
687 struct iov_iter from;
688 unsigned int head, tail;
689 size_t left;
690 int n;
691
692 ret = splice_from_pipe_next(pipe, &sd);
693 if (ret <= 0)
694 break;
695
696 if (unlikely(nbufs < pipe->max_usage)) {
697 kfree(array);
698 nbufs = pipe->max_usage;
699 array = kzalloc_objs(struct bio_vec, nbufs);
700 if (!array) {
701 ret = -ENOMEM;
702 break;
703 }
704 }
705
706 head = pipe->head;
707 tail = pipe->tail;
708
709 /* build the vector */
710 left = sd.total_len;
711 for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
712 struct pipe_buffer *buf = pipe_buf(pipe, tail);
713 size_t this_len = buf->len;
714
715 /* zero-length bvecs are not supported, skip them */
716 if (!this_len)
717 continue;
718 this_len = min(this_len, left);
719
720 ret = pipe_buf_confirm(pipe, buf);
721 if (unlikely(ret)) {
722 if (ret == -ENODATA)
723 ret = 0;
724 goto done;
725 }
726
727 bvec_set_page(&array[n], buf->page, this_len,
728 buf->offset);
729 left -= this_len;
730 n++;
731 }
732
733 iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
734 init_sync_kiocb(&kiocb, out);
735 kiocb.ki_pos = sd.pos;
736 ret = out->f_op->write_iter(&kiocb, &from);
737 sd.pos = kiocb.ki_pos;
738 if (ret <= 0)
739 break;
740 WARN_ONCE(ret > sd.total_len - left,
741 "Splice Exceeded! ret=%zd tot=%zu left=%zu\n",
742 ret, sd.total_len, left);
743
744 sd.num_spliced += ret;
745 sd.total_len -= ret;
746 *ppos = sd.pos;
747
748 /* dismiss the fully eaten buffers, adjust the partial one */
749 tail = pipe->tail;
750 while (ret) {
751 struct pipe_buffer *buf = pipe_buf(pipe, tail);
752 if (ret >= buf->len) {
753 ret -= buf->len;
754 buf->len = 0;
755 pipe_buf_release(pipe, buf);
756 tail++;
757 pipe->tail = tail;
758 if (pipe->files)
759 sd.need_wakeup = true;
760 } else {
761 buf->offset += ret;
762 buf->len -= ret;
763 ret = 0;
764 }
765 }
766 }
767 done:
768 kfree(array);
769 splice_from_pipe_end(pipe, &sd);
770
771 pipe_unlock(pipe);
772
773 if (sd.num_spliced)
774 ret = sd.num_spliced;
775
776 return ret;
777 }
778
779 EXPORT_SYMBOL(iter_file_splice_write);
780
781 #ifdef CONFIG_NET
782 /**
783 * splice_to_socket - splice data from a pipe to a socket
784 * @pipe: pipe to splice from
785 * @out: socket to write to
786 * @ppos: position in @out
787 * @len: number of bytes to splice
788 * @flags: splice modifier flags
789 *
790 * Description:
791 * Will send @len bytes from the pipe to a network socket. No data copying
792 * is involved.
793 *
 * The pipe lock is held for the whole operation. Each round gathers up to
 * ARRAY_SIZE(bvec) non-empty pipe buffers and hands them to one
 * sock_sendmsg() call with MSG_SPLICE_PAGES set.
 *
 * Return: the number of bytes sent if that is non-zero; otherwise the last
 * value of the local status: 0, -EAGAIN (pipe empty and SPLICE_F_NONBLOCK
 * set), -ERESTARTSYS (signal pending), or whatever pipe_buf_confirm() /
 * sock_sendmsg() reported (-ENODATA from confirm is turned into 0).
794 */
splice_to_socket(struct pipe_inode_info * pipe,struct file * out,loff_t * ppos,size_t len,unsigned int flags)795 ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
796 loff_t *ppos, size_t len, unsigned int flags)
797 {
798 struct socket *sock = sock_from_file(out);
/* Upper bound on the number of pipe buffers sent per sendmsg call. */
799 struct bio_vec bvec[16];
800 struct msghdr msg = {};
801 ssize_t ret = 0;
802 size_t spliced = 0;
803 bool need_wakeup = false;
804
805 pipe_lock(pipe);
806
807 while (len > 0) {
/* bc counts the bvec[] entries filled in this round. */
808 unsigned int head, tail, bc = 0;
809 size_t remain = len;
810
811 /*
812 * Check for signal early to make process killable when there
813 * are always buffers available
814 */
/* ret is preloaded with the error each following check would report. */
815 ret = -ERESTARTSYS;
816 if (signal_pending(current))
817 break;
818
/* Wait for data unless there is a reason to give up first. */
819 while (pipe_is_empty(pipe)) {
820 ret = 0;
821 if (!pipe->writers)
822 goto out;
823
824 if (spliced)
825 goto out;
826
827 ret = -EAGAIN;
828 if (flags & SPLICE_F_NONBLOCK)
829 goto out;
830
831 ret = -ERESTARTSYS;
832 if (signal_pending(current))
833 goto out;
834
835 if (need_wakeup) {
836 wakeup_pipe_writers(pipe);
837 need_wakeup = false;
838 }
839
840 pipe_wait_readable(pipe);
841 }
842
843 head = pipe->head;
844 tail = pipe->tail;
845
/*
 * Gather buffers, starting at the tail, into bvec[] until @remain is
 * covered, bvec[] is full, the pipe runs out or a confirm fails.
 * Zero-length buffers are stepped over without being released here.
 */
846 while (!pipe_empty(head, tail)) {
847 struct pipe_buffer *buf = pipe_buf(pipe, tail);
848 size_t seg;
849
850 if (!buf->len) {
851 tail++;
852 continue;
853 }
854
855 seg = min_t(size_t, remain, buf->len);
856
857 ret = pipe_buf_confirm(pipe, buf);
858 if (unlikely(ret)) {
859 if (ret == -ENODATA)
860 ret = 0;
861 break;
862 }
863
864 bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset);
865 remain -= seg;
866 if (remain == 0 || bc >= ARRAY_SIZE(bvec))
867 break;
868 tail++;
869 }
870
/*
 * Nothing gathered: stop with whatever ret holds.
 * NOTE(review): if every buffer visited above had zero length, ret is
 * still the -ERESTARTSYS preloaded earlier even though no signal is
 * pending - confirm this is intended.
 */
871 if (!bc)
872 break;
873
874 msg.msg_flags = MSG_SPLICE_PAGES;
875 if (flags & SPLICE_F_MORE)
876 msg.msg_flags |= MSG_MORE;
/* More is wanted and buffers remain past the gathered range. */
877 if (remain && pipe_occupancy(pipe->head, tail) > 0)
878 msg.msg_flags |= MSG_MORE;
879 if (out->f_flags & O_NONBLOCK)
880 msg.msg_flags |= MSG_DONTWAIT;
881
882 iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc,
883 len - remain);
884 ret = sock_sendmsg(sock, &msg);
885 if (ret <= 0)
886 break;
887
888 spliced += ret;
889 len -= ret;
/*
 * Consume what was sent: walk again from the real tail, advance the
 * offsets and release every buffer that has been drained.
 */
890 tail = pipe->tail;
891 while (ret > 0) {
892 struct pipe_buffer *buf = pipe_buf(pipe, tail);
893 size_t seg = min_t(size_t, ret, buf->len);
894
895 buf->offset += seg;
896 buf->len -= seg;
897 ret -= seg;
898
899 if (!buf->len) {
900 pipe_buf_release(pipe, buf);
901 tail++;
902 }
903 }
904
/* Publish the new tail; slots were freed, so writers may need a wakeup. */
905 if (tail != pipe->tail) {
906 pipe->tail = tail;
907 if (pipe->files)
908 need_wakeup = true;
909 }
910 }
911
912 out:
913 pipe_unlock(pipe);
914 if (need_wakeup)
915 wakeup_pipe_writers(pipe);
/* Progress takes precedence over the status that ended the loop. */
916 return spliced ?: ret;
917 }
918 #endif
919
warn_unsupported(struct file * file,const char * op)920 static int warn_unsupported(struct file *file, const char *op)
921 {
922 pr_debug_ratelimited(
923 "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
924 op, file, current->pid, current->comm);
925 return -EINVAL;
926 }
927
928 /*
929 * Attempt to initiate a splice from pipe to file.
930 */
do_splice_from(struct pipe_inode_info * pipe,struct file * out,loff_t * ppos,size_t len,unsigned int flags)931 static ssize_t do_splice_from(struct pipe_inode_info *pipe, struct file *out,
932 loff_t *ppos, size_t len, unsigned int flags)
933 {
934 if (unlikely(!out->f_op->splice_write))
935 return warn_unsupported(out, "write");
936 return out->f_op->splice_write(pipe, out, ppos, len, flags);
937 }
938
939 /*
940 * Indicate to the caller that there was a premature EOF when reading from the
941 * source and the caller didn't indicate they would be sending more data after
942 * this.
943 */
do_splice_eof(struct splice_desc * sd)944 static void do_splice_eof(struct splice_desc *sd)
945 {
946 if (sd->splice_eof)
947 sd->splice_eof(sd);
948 }
949
950 /*
951 * Callers already called rw_verify_area() on the entire range.
952 * No need to call it for sub ranges.
953 */
do_splice_read(struct file * in,loff_t * ppos,struct pipe_inode_info * pipe,size_t len,unsigned int flags)954 static ssize_t do_splice_read(struct file *in, loff_t *ppos,
955 struct pipe_inode_info *pipe, size_t len,
956 unsigned int flags)
957 {
958 unsigned int p_space;
959
960 if (unlikely(!(in->f_mode & FMODE_READ)))
961 return -EBADF;
962 if (!len)
963 return 0;
964
965 /* Don't try to read more the pipe has space for. */
966 p_space = pipe->max_usage - pipe_buf_usage(pipe);
967 len = min_t(size_t, len, p_space << PAGE_SHIFT);
968
969 if (unlikely(len > MAX_RW_COUNT))
970 len = MAX_RW_COUNT;
971
972 if (unlikely(!in->f_op->splice_read))
973 return warn_unsupported(in, "read");
974 /*
975 * O_DIRECT and DAX don't deal with the pagecache, so we allocate a
976 * buffer, copy into it and splice that into the pipe.
977 */
978 if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host))
979 return copy_splice_read(in, ppos, pipe, len, flags);
980 return in->f_op->splice_read(in, ppos, pipe, len, flags);
981 }
982
983 /**
984 * vfs_splice_read - Read data from a file and splice it into a pipe
985 * @in: File to splice from
986 * @ppos: Input file offset
987 * @pipe: Pipe to splice to
988 * @len: Number of bytes to splice
989 * @flags: Splice modifier flags (SPLICE_F_*)
990 *
991 * Splice the requested amount of data from the input file to the pipe. This
992 * is synchronous as the caller must hold the pipe lock across the entire
993 * operation.
994 *
995 * If successful, it returns the amount of data spliced, 0 if it hit the EOF or
996 * a hole and a negative error code otherwise.
997 */
vfs_splice_read(struct file * in,loff_t * ppos,struct pipe_inode_info * pipe,size_t len,unsigned int flags)998 ssize_t vfs_splice_read(struct file *in, loff_t *ppos,
999 struct pipe_inode_info *pipe, size_t len,
1000 unsigned int flags)
1001 {
1002 ssize_t ret;
1003
1004 ret = rw_verify_area(READ, in, ppos, len);
1005 if (unlikely(ret < 0))
1006 return ret;
1007
1008 return do_splice_read(in, ppos, pipe, len, flags);
1009 }
1010 EXPORT_SYMBOL_GPL(vfs_splice_read);
1011
1012 /**
1013 * splice_direct_to_actor - splices data directly between two non-pipes
1014 * @in: file to splice from
1015 * @sd: actor information on where to splice to
1016 * @actor: handles the data splicing
1017 *
1018 * Description:
1019 * This is a special case helper to splice directly between two
1020 * points, without requiring an explicit pipe. Internally an allocated
1021 * pipe is cached in the process, and reused during the lifetime of
1022 * that process.
1023 *
 * @sd->total_len is the number of bytes requested on entry and is
 * overwritten with the size of each chunk read; @sd->pos is advanced as
 * data is passed on, and @sd->flags is modified (SPLICE_F_NONBLOCK
 * cleared, SPLICE_F_MORE managed as described in the body).
 *
 * Return: the number of bytes passed on by @actor if that is non-zero;
 * otherwise the zero or negative value last returned by do_splice_read()
 * or @actor, -EINVAL if @in lacks FMODE_LSEEK, or -ENOMEM if the internal
 * pipe cannot be allocated.
1024 */
splice_direct_to_actor(struct file * in,struct splice_desc * sd,splice_direct_actor * actor)1025 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1026 splice_direct_actor *actor)
1027 {
1028 struct pipe_inode_info *pipe;
1029 ssize_t ret, bytes;
1030 size_t len;
1031 int i, flags, more;
1032
1033 /*
1034 * We require the input to be seekable, as we don't want to randomly
1035 * drop data for eg socket -> socket splicing. Use the piped splicing
1036 * for that!
1037 */
1038 if (unlikely(!(in->f_mode & FMODE_LSEEK)))
1039 return -EINVAL;
1040
1041 /*
1042 * neither in nor out is a pipe, setup an internal pipe attached to
1043 * 'out' and transfer the wanted data from 'in' to 'out' through that
1044 */
1045 pipe = current->splice_pipe;
1046 if (unlikely(!pipe)) {
1047 pipe = alloc_pipe_info();
1048 if (!pipe)
1049 return -ENOMEM;
1050
1051 /*
1052 * We don't have an immediate reader, but we'll read the stuff
1053 * out of the pipe right after the splice_to_pipe(). So set
1054 * PIPE_READERS appropriately.
1055 */
1056 pipe->readers = 1;
1057
1058 current->splice_pipe = pipe;
1059 }
1060
1061 /*
1062 * Do the splice.
1063 */
1064 bytes = 0;
1065 len = sd->total_len;
1066
1067 /* Don't block on output, we have to drain the direct pipe. */
/* The caller's original flags are kept in 'flags' for the read side. */
1068 flags = sd->flags;
1069 sd->flags &= ~SPLICE_F_NONBLOCK;
1070
1071 /*
1072 * We signal MORE until we've read sufficient data to fulfill the
1073 * request and we keep signalling it if the caller set it.
1074 */
1075 more = sd->flags & SPLICE_F_MORE;
1076 sd->flags |= SPLICE_F_MORE;
1077
1078 WARN_ON_ONCE(!pipe_is_empty(pipe));
1079
1080 while (len) {
1081 size_t read_len;
/* pos is advanced by the read; prev_pos allows rewinding sd->pos. */
1082 loff_t pos = sd->pos, prev_pos = pos;
1083
1084 ret = do_splice_read(in, &pos, pipe, len, flags);
1085 if (unlikely(ret <= 0))
1086 goto read_failure;
1087
1088 read_len = ret;
/* The actor is asked to move exactly what was just read. */
1089 sd->total_len = read_len;
1090
1091 /*
1092 * If we now have sufficient data to fulfill the request then
1093 * we clear SPLICE_F_MORE if it was not set initially.
1094 */
1095 if (read_len >= len && !more)
1096 sd->flags &= ~SPLICE_F_MORE;
1097
1098 /*
1099 * NOTE: nonblocking mode only applies to the input. We
1100 * must not do the output in nonblocking mode as then we
1101 * could get stuck data in the internal pipe:
1102 */
1103 ret = actor(pipe, sd);
1104 if (unlikely(ret <= 0)) {
/* Nothing was passed on: leave the position where this round started. */
1105 sd->pos = prev_pos;
1106 goto out_release;
1107 }
1108
1109 bytes += ret;
1110 len -= ret;
1111 sd->pos = pos;
1112
/* Short output: only account for what the actor took, then clean up. */
1113 if (ret < read_len) {
1114 sd->pos = prev_pos + ret;
1115 goto out_release;
1116 }
1117 }
1118
/* Common exit; the cleanup path below jumps back here when it is done. */
1119 done:
1120 pipe->tail = pipe->head = 0;
1121 file_accessed(in);
1122 return bytes;
1123
1124 read_failure:
1125 /*
1126 * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that
1127 * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a
1128 * "->splice_in()" that returned EOF (ie zero) *and* we have sent at
1129 * least 1 byte *then* we will also do the ->splice_eof() call.
1130 */
1131 if (ret == 0 && !more && len > 0 && bytes)
1132 do_splice_eof(sd);
1133 out_release:
1134 /*
1135 * If we did an incomplete transfer we must release
1136 * the pipe buffers in question:
1137 */
/* Every ring slot that still has ops set is released, not just head..tail. */
1138 for (i = 0; i < pipe->ring_size; i++) {
1139 struct pipe_buffer *buf = &pipe->bufs[i];
1140
1141 if (buf->ops)
1142 pipe_buf_release(pipe, buf);
1143 }
1144
/* With no progress at all, report the status that stopped us instead. */
1145 if (!bytes)
1146 bytes = ret;
1147
1148 goto done;
1149 }
1150 EXPORT_SYMBOL(splice_direct_to_actor);
1151
direct_splice_actor(struct pipe_inode_info * pipe,struct splice_desc * sd)1152 static int direct_splice_actor(struct pipe_inode_info *pipe,
1153 struct splice_desc *sd)
1154 {
1155 struct file *file = sd->u.file;
1156 long ret;
1157
1158 file_start_write(file);
1159 ret = do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
1160 file_end_write(file);
1161 return ret;
1162 }
1163
splice_file_range_actor(struct pipe_inode_info * pipe,struct splice_desc * sd)1164 static int splice_file_range_actor(struct pipe_inode_info *pipe,
1165 struct splice_desc *sd)
1166 {
1167 struct file *file = sd->u.file;
1168
1169 return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
1170 }
1171
direct_file_splice_eof(struct splice_desc * sd)1172 static void direct_file_splice_eof(struct splice_desc *sd)
1173 {
1174 struct file *file = sd->u.file;
1175
1176 if (file->f_op->splice_eof)
1177 file->f_op->splice_eof(file);
1178 }
1179
do_splice_direct_actor(struct file * in,loff_t * ppos,struct file * out,loff_t * opos,size_t len,unsigned int flags,splice_direct_actor * actor)1180 static ssize_t do_splice_direct_actor(struct file *in, loff_t *ppos,
1181 struct file *out, loff_t *opos,
1182 size_t len, unsigned int flags,
1183 splice_direct_actor *actor)
1184 {
1185 struct splice_desc sd = {
1186 .len = len,
1187 .total_len = len,
1188 .flags = flags,
1189 .pos = *ppos,
1190 .u.file = out,
1191 .splice_eof = direct_file_splice_eof,
1192 .opos = opos,
1193 };
1194 ssize_t ret;
1195
1196 if (unlikely(!(out->f_mode & FMODE_WRITE)))
1197 return -EBADF;
1198
1199 if (unlikely(out->f_flags & O_APPEND))
1200 return -EINVAL;
1201
1202 ret = splice_direct_to_actor(in, &sd, actor);
1203 if (ret > 0)
1204 *ppos = sd.pos;
1205
1206 return ret;
1207 }
1208 /**
1209 * do_splice_direct - splices data directly between two files
1210 * @in: file to splice from
1211 * @ppos: input file offset
1212 * @out: file to splice to
1213 * @opos: output file offset
1214 * @len: number of bytes to splice
1215 * @flags: splice modifier flags
1216 *
1217 * Description:
1218 * For use by do_sendfile(). splice can easily emulate sendfile, but
1219 * doing it in the application would incur an extra system call
1220 * (splice in + splice out, as compared to just sendfile()). So this helper
1221 * can splice directly through a process-private pipe.
1222 *
1223 * Callers already called rw_verify_area() on the entire range.
1224 */
do_splice_direct(struct file * in,loff_t * ppos,struct file * out,loff_t * opos,size_t len,unsigned int flags)1225 ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1226 loff_t *opos, size_t len, unsigned int flags)
1227 {
1228 return do_splice_direct_actor(in, ppos, out, opos, len, flags,
1229 direct_splice_actor);
1230 }
1231 EXPORT_SYMBOL(do_splice_direct);
1232
1233 /**
1234 * splice_file_range - splices data between two files for copy_file_range()
1235 * @in: file to splice from
1236 * @ppos: input file offset
1237 * @out: file to splice to
1238 * @opos: output file offset
1239 * @len: number of bytes to splice
1240 *
1241 * Description:
1242 * For use by ->copy_file_range() methods.
1243 * Like do_splice_direct(), but vfs_copy_file_range() already holds
1244 * start_file_write() on @out file.
1245 *
1246 * Callers already called rw_verify_area() on the entire range.
1247 */
splice_file_range(struct file * in,loff_t * ppos,struct file * out,loff_t * opos,size_t len)1248 ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out,
1249 loff_t *opos, size_t len)
1250 {
1251 lockdep_assert(file_write_started(out));
1252
1253 return do_splice_direct_actor(in, ppos, out, opos,
1254 min_t(size_t, len, MAX_RW_COUNT),
1255 0, splice_file_range_actor);
1256 }
1257 EXPORT_SYMBOL(splice_file_range);
1258
wait_for_space(struct pipe_inode_info * pipe,unsigned flags)1259 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
1260 {
1261 for (;;) {
1262 if (unlikely(!pipe->readers)) {
1263 send_sig(SIGPIPE, current, 0);
1264 return -EPIPE;
1265 }
1266 if (!pipe_is_full(pipe))
1267 return 0;
1268 if (flags & SPLICE_F_NONBLOCK)
1269 return -EAGAIN;
1270 if (signal_pending(current))
1271 return -ERESTARTSYS;
1272 pipe_wait_writable(pipe);
1273 }
1274 }
1275
1276 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1277 struct pipe_inode_info *opipe,
1278 size_t len, unsigned int flags);
1279
splice_file_to_pipe(struct file * in,struct pipe_inode_info * opipe,loff_t * offset,size_t len,unsigned int flags)1280 ssize_t splice_file_to_pipe(struct file *in,
1281 struct pipe_inode_info *opipe,
1282 loff_t *offset,
1283 size_t len, unsigned int flags)
1284 {
1285 ssize_t ret;
1286
1287 pipe_lock(opipe);
1288 ret = wait_for_space(opipe, flags);
1289 if (!ret)
1290 ret = do_splice_read(in, offset, opipe, len, flags);
1291 pipe_unlock(opipe);
1292 if (ret > 0)
1293 wakeup_pipe_readers(opipe);
1294 return ret;
1295 }
1296
1297 /*
1298 * Determine where to splice to/from.
1299 */
do_splice(struct file * in,loff_t * off_in,struct file * out,loff_t * off_out,size_t len,unsigned int flags)1300 ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out,
1301 loff_t *off_out, size_t len, unsigned int flags)
1302 {
1303 struct pipe_inode_info *ipipe;
1304 struct pipe_inode_info *opipe;
1305 loff_t offset;
1306 ssize_t ret;
1307
1308 if (unlikely(!(in->f_mode & FMODE_READ) ||
1309 !(out->f_mode & FMODE_WRITE)))
1310 return -EBADF;
1311
1312 ipipe = get_pipe_info(in, true);
1313 opipe = get_pipe_info(out, true);
1314
1315 if (ipipe && opipe) {
1316 if (off_in || off_out)
1317 return -ESPIPE;
1318
1319 /* Splicing to self would be fun, but... */
1320 if (ipipe == opipe)
1321 return -EINVAL;
1322
1323 if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1324 flags |= SPLICE_F_NONBLOCK;
1325
1326 ret = splice_pipe_to_pipe(ipipe, opipe, len, flags);
1327 } else if (ipipe) {
1328 if (off_in)
1329 return -ESPIPE;
1330 if (off_out) {
1331 if (!(out->f_mode & FMODE_PWRITE))
1332 return -EINVAL;
1333 offset = *off_out;
1334 } else {
1335 offset = out->f_pos;
1336 }
1337
1338 if (unlikely(out->f_flags & O_APPEND))
1339 return -EINVAL;
1340
1341 ret = rw_verify_area(WRITE, out, &offset, len);
1342 if (unlikely(ret < 0))
1343 return ret;
1344
1345 if (in->f_flags & O_NONBLOCK)
1346 flags |= SPLICE_F_NONBLOCK;
1347
1348 file_start_write(out);
1349 ret = do_splice_from(ipipe, out, &offset, len, flags);
1350 file_end_write(out);
1351
1352 if (!off_out)
1353 out->f_pos = offset;
1354 else
1355 *off_out = offset;
1356 } else if (opipe) {
1357 if (off_out)
1358 return -ESPIPE;
1359 if (off_in) {
1360 if (!(in->f_mode & FMODE_PREAD))
1361 return -EINVAL;
1362 offset = *off_in;
1363 } else {
1364 offset = in->f_pos;
1365 }
1366
1367 ret = rw_verify_area(READ, in, &offset, len);
1368 if (unlikely(ret < 0))
1369 return ret;
1370
1371 if (out->f_flags & O_NONBLOCK)
1372 flags |= SPLICE_F_NONBLOCK;
1373
1374 ret = splice_file_to_pipe(in, opipe, &offset, len, flags);
1375
1376 if (!off_in)
1377 in->f_pos = offset;
1378 else
1379 *off_in = offset;
1380 } else {
1381 ret = -EINVAL;
1382 }
1383
1384 if (ret > 0) {
1385 /*
1386 * Generate modify out before access in:
1387 * do_splice_from() may've already sent modify out,
1388 * and this ensures the events get merged.
1389 */
1390 fsnotify_modify(out);
1391 fsnotify_access(in);
1392 }
1393
1394 return ret;
1395 }
1396
__do_splice(struct file * in,loff_t __user * off_in,struct file * out,loff_t __user * off_out,size_t len,unsigned int flags)1397 static ssize_t __do_splice(struct file *in, loff_t __user *off_in,
1398 struct file *out, loff_t __user *off_out,
1399 size_t len, unsigned int flags)
1400 {
1401 struct pipe_inode_info *ipipe;
1402 struct pipe_inode_info *opipe;
1403 loff_t offset, *__off_in = NULL, *__off_out = NULL;
1404 ssize_t ret;
1405
1406 ipipe = get_pipe_info(in, true);
1407 opipe = get_pipe_info(out, true);
1408
1409 if (ipipe) {
1410 if (off_in)
1411 return -ESPIPE;
1412 pipe_clear_nowait(in);
1413 }
1414 if (opipe) {
1415 if (off_out)
1416 return -ESPIPE;
1417 pipe_clear_nowait(out);
1418 }
1419
1420 if (off_out) {
1421 if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1422 return -EFAULT;
1423 __off_out = &offset;
1424 }
1425 if (off_in) {
1426 if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1427 return -EFAULT;
1428 __off_in = &offset;
1429 }
1430
1431 ret = do_splice(in, __off_in, out, __off_out, len, flags);
1432 if (ret < 0)
1433 return ret;
1434
1435 if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
1436 return -EFAULT;
1437 if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
1438 return -EFAULT;
1439
1440 return ret;
1441 }
1442
iter_to_pipe(struct iov_iter * from,struct pipe_inode_info * pipe,unsigned int flags)1443 static ssize_t iter_to_pipe(struct iov_iter *from,
1444 struct pipe_inode_info *pipe,
1445 unsigned int flags)
1446 {
1447 struct pipe_buffer buf = {
1448 .ops = &user_page_pipe_buf_ops,
1449 .flags = flags
1450 };
1451 size_t total = 0;
1452 ssize_t ret = 0;
1453
1454 while (iov_iter_count(from)) {
1455 struct page *pages[16];
1456 ssize_t left;
1457 size_t start;
1458 int i, n;
1459
1460 left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
1461 if (left <= 0) {
1462 ret = left;
1463 break;
1464 }
1465
1466 n = DIV_ROUND_UP(left + start, PAGE_SIZE);
1467 for (i = 0; i < n; i++) {
1468 int size = umin(left, PAGE_SIZE - start);
1469
1470 buf.page = pages[i];
1471 buf.offset = start;
1472 buf.len = size;
1473 ret = add_to_pipe(pipe, &buf);
1474 if (unlikely(ret < 0)) {
1475 iov_iter_revert(from, left);
1476 // this one got dropped by add_to_pipe()
1477 while (++i < n)
1478 put_page(pages[i]);
1479 goto out;
1480 }
1481 total += ret;
1482 left -= size;
1483 start = 0;
1484 }
1485 }
1486 out:
1487 return total ? total : ret;
1488 }
1489
pipe_to_user(struct pipe_inode_info * pipe,struct pipe_buffer * buf,struct splice_desc * sd)1490 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1491 struct splice_desc *sd)
1492 {
1493 int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1494 return n == sd->len ? n : -EFAULT;
1495 }
1496
1497 /*
1498 * For lack of a better implementation, implement vmsplice() to userspace
1499 * as a simple copy of the pipe's pages to the user iov.
1500 */
vmsplice_to_user(struct file * file,struct iov_iter * iter,unsigned int flags)1501 static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter,
1502 unsigned int flags)
1503 {
1504 struct pipe_inode_info *pipe = get_pipe_info(file, true);
1505 struct splice_desc sd = {
1506 .total_len = iov_iter_count(iter),
1507 .flags = flags,
1508 .u.data = iter
1509 };
1510 ssize_t ret = 0;
1511
1512 if (!pipe)
1513 return -EBADF;
1514
1515 pipe_clear_nowait(file);
1516
1517 if (sd.total_len) {
1518 pipe_lock(pipe);
1519 ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1520 pipe_unlock(pipe);
1521 }
1522
1523 if (ret > 0)
1524 fsnotify_access(file);
1525
1526 return ret;
1527 }
1528
1529 /*
1530 * vmsplice splices a user address range into a pipe. It can be thought of
1531 * as splice-from-memory, where the regular splice is splice-from-file (or
1532 * to file). In both cases the output is a pipe, naturally.
1533 */
vmsplice_to_pipe(struct file * file,struct iov_iter * iter,unsigned int flags)1534 static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
1535 unsigned int flags)
1536 {
1537 struct pipe_inode_info *pipe;
1538 ssize_t ret = 0;
1539 unsigned buf_flag = 0;
1540
1541 if (flags & SPLICE_F_GIFT)
1542 buf_flag = PIPE_BUF_FLAG_GIFT;
1543
1544 pipe = get_pipe_info(file, true);
1545 if (!pipe)
1546 return -EBADF;
1547
1548 pipe_clear_nowait(file);
1549
1550 pipe_lock(pipe);
1551 ret = wait_for_space(pipe, flags);
1552 if (!ret)
1553 ret = iter_to_pipe(iter, pipe, buf_flag);
1554 pipe_unlock(pipe);
1555 if (ret > 0) {
1556 wakeup_pipe_readers(pipe);
1557 fsnotify_modify(file);
1558 }
1559 return ret;
1560 }
1561
1562 /*
1563 * Note that vmsplice only really supports true splicing _from_ user memory
1564 * to a pipe, not the other way around. Splicing from user memory is a simple
1565 * operation that can be supported without any funky alignment restrictions
1566 * or nasty vm tricks. We simply map in the user memory and fill them into
1567 * a pipe. The reverse isn't quite as easy, though. There are two possible
1568 * solutions for that:
1569 *
1570 * - memcpy() the data internally, at which point we might as well just
1571 * do a regular read() on the buffer anyway.
1572 * - Lots of nasty vm tricks, that are neither fast nor flexible (it
1573 * has restriction limitations on both ends of the pipe).
1574 *
1575 * Currently we punt and implement it as a normal copy, see pipe_to_user().
1576 *
1577 */
SYSCALL_DEFINE4(vmsplice,int,fd,const struct iovec __user *,uiov,unsigned long,nr_segs,unsigned int,flags)1578 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
1579 unsigned long, nr_segs, unsigned int, flags)
1580 {
1581 struct iovec iovstack[UIO_FASTIOV];
1582 struct iovec *iov = iovstack;
1583 struct iov_iter iter;
1584 ssize_t error;
1585 int type;
1586
1587 if (unlikely(flags & ~SPLICE_F_ALL))
1588 return -EINVAL;
1589
1590 CLASS(fd, f)(fd);
1591 if (fd_empty(f))
1592 return -EBADF;
1593 if (fd_file(f)->f_mode & FMODE_WRITE)
1594 type = ITER_SOURCE;
1595 else if (fd_file(f)->f_mode & FMODE_READ)
1596 type = ITER_DEST;
1597 else
1598 return -EBADF;
1599
1600 error = import_iovec(type, uiov, nr_segs,
1601 ARRAY_SIZE(iovstack), &iov, &iter);
1602 if (error < 0)
1603 return error;
1604
1605 if (!iov_iter_count(&iter))
1606 error = 0;
1607 else if (type == ITER_SOURCE)
1608 error = vmsplice_to_pipe(fd_file(f), &iter, flags);
1609 else
1610 error = vmsplice_to_user(fd_file(f), &iter, flags);
1611
1612 kfree(iov);
1613 return error;
1614 }
1615
/*
 * splice() system call entry point.
 *
 * A zero-length request succeeds immediately, before the flags or the
 * descriptors are looked at. Unknown flag bits give -EINVAL and an
 * invalid descriptor gives -EBADF; everything else is left to
 * __do_splice().
 */
SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
		int, fd_out, loff_t __user *, off_out,
		size_t, len, unsigned int, flags)
{
	if (unlikely(len == 0))
		return 0;

	if (unlikely((flags & ~SPLICE_F_ALL) != 0))
		return -EINVAL;

	CLASS(fd, in)(fd_in);
	if (fd_empty(in))
		return -EBADF;

	CLASS(fd, out)(fd_out);
	if (fd_empty(out))
		return -EBADF;

	return __do_splice(fd_file(in), off_in, fd_file(out), off_out,
			   len, flags);
}
1637
1638 /*
1639 * Make sure there's data to read. Wait for input if we can, otherwise
1640 * return an appropriate error.
1641 */
ipipe_prep(struct pipe_inode_info * pipe,unsigned int flags)1642 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1643 {
1644 int ret;
1645
1646 /*
1647 * Check the pipe occupancy without the inode lock first. This function
1648 * is speculative anyways, so missing one is ok.
1649 */
1650 if (!pipe_is_empty(pipe))
1651 return 0;
1652
1653 ret = 0;
1654 pipe_lock(pipe);
1655
1656 while (pipe_is_empty(pipe)) {
1657 if (signal_pending(current)) {
1658 ret = -ERESTARTSYS;
1659 break;
1660 }
1661 if (!pipe->writers)
1662 break;
1663 if (flags & SPLICE_F_NONBLOCK) {
1664 ret = -EAGAIN;
1665 break;
1666 }
1667 pipe_wait_readable(pipe);
1668 }
1669
1670 pipe_unlock(pipe);
1671 return ret;
1672 }
1673
1674 /*
1675 * Make sure there's writeable room. Wait for room if we can, otherwise
1676 * return an appropriate error.
1677 */
opipe_prep(struct pipe_inode_info * pipe,unsigned int flags)1678 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1679 {
1680 int ret;
1681
1682 /*
1683 * Check pipe occupancy without the inode lock first. This function
1684 * is speculative anyways, so missing one is ok.
1685 */
1686 if (!pipe_is_full(pipe))
1687 return 0;
1688
1689 ret = 0;
1690 pipe_lock(pipe);
1691
1692 while (pipe_is_full(pipe)) {
1693 if (!pipe->readers) {
1694 send_sig(SIGPIPE, current, 0);
1695 ret = -EPIPE;
1696 break;
1697 }
1698 if (flags & SPLICE_F_NONBLOCK) {
1699 ret = -EAGAIN;
1700 break;
1701 }
1702 if (signal_pending(current)) {
1703 ret = -ERESTARTSYS;
1704 break;
1705 }
1706 pipe_wait_writable(pipe);
1707 }
1708
1709 pipe_unlock(pipe);
1710 return ret;
1711 }
1712
1713 /*
1714 * Splice contents of ipipe to opipe.
1715 */
splice_pipe_to_pipe(struct pipe_inode_info * ipipe,struct pipe_inode_info * opipe,size_t len,unsigned int flags)1716 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1717 struct pipe_inode_info *opipe,
1718 size_t len, unsigned int flags)
1719 {
1720 struct pipe_buffer *ibuf, *obuf;
1721 unsigned int i_head, o_head;
1722 unsigned int i_tail, o_tail;
1723 int ret = 0;
1724 bool input_wakeup = false;
1725
1726
1727 retry:
1728 ret = ipipe_prep(ipipe, flags);
1729 if (ret)
1730 return ret;
1731
1732 ret = opipe_prep(opipe, flags);
1733 if (ret)
1734 return ret;
1735
1736 /*
1737 * Potential ABBA deadlock, work around it by ordering lock
1738 * grabbing by pipe info address. Otherwise two different processes
1739 * could deadlock (one doing tee from A -> B, the other from B -> A).
1740 */
1741 pipe_double_lock(ipipe, opipe);
1742
1743 i_tail = ipipe->tail;
1744 o_head = opipe->head;
1745
1746 do {
1747 size_t o_len;
1748
1749 if (!opipe->readers) {
1750 send_sig(SIGPIPE, current, 0);
1751 if (!ret)
1752 ret = -EPIPE;
1753 break;
1754 }
1755
1756 i_head = ipipe->head;
1757 o_tail = opipe->tail;
1758
1759 if (pipe_empty(i_head, i_tail) && !ipipe->writers)
1760 break;
1761
1762 /*
1763 * Cannot make any progress, because either the input
1764 * pipe is empty or the output pipe is full.
1765 */
1766 if (pipe_empty(i_head, i_tail) ||
1767 pipe_full(o_head, o_tail, opipe->max_usage)) {
1768 /* Already processed some buffers, break */
1769 if (ret)
1770 break;
1771
1772 if (flags & SPLICE_F_NONBLOCK) {
1773 ret = -EAGAIN;
1774 break;
1775 }
1776
1777 /*
1778 * We raced with another reader/writer and haven't
1779 * managed to process any buffers. A zero return
1780 * value means EOF, so retry instead.
1781 */
1782 pipe_unlock(ipipe);
1783 pipe_unlock(opipe);
1784 goto retry;
1785 }
1786
1787 ibuf = pipe_buf(ipipe, i_tail);
1788 obuf = pipe_buf(opipe, o_head);
1789
1790 if (len >= ibuf->len) {
1791 /*
1792 * Simply move the whole buffer from ipipe to opipe
1793 */
1794 *obuf = *ibuf;
1795 ibuf->ops = NULL;
1796 i_tail++;
1797 ipipe->tail = i_tail;
1798 input_wakeup = true;
1799 o_len = obuf->len;
1800 o_head++;
1801 opipe->head = o_head;
1802 } else {
1803 /*
1804 * Get a reference to this pipe buffer,
1805 * so we can copy the contents over.
1806 */
1807 if (!pipe_buf_get(ipipe, ibuf)) {
1808 if (ret == 0)
1809 ret = -EFAULT;
1810 break;
1811 }
1812 *obuf = *ibuf;
1813
1814 /*
1815 * Don't inherit the gift and merge flags, we need to
1816 * prevent multiple steals of this page.
1817 */
1818 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1819 obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1820
1821 obuf->len = len;
1822 ibuf->offset += len;
1823 ibuf->len -= len;
1824 o_len = len;
1825 o_head++;
1826 opipe->head = o_head;
1827 }
1828 ret += o_len;
1829 len -= o_len;
1830 } while (len);
1831
1832 pipe_unlock(ipipe);
1833 pipe_unlock(opipe);
1834
1835 /*
1836 * If we put data in the output pipe, wakeup any potential readers.
1837 */
1838 if (ret > 0)
1839 wakeup_pipe_readers(opipe);
1840
1841 if (input_wakeup)
1842 wakeup_pipe_writers(ipipe);
1843
1844 return ret;
1845 }
1846
1847 /*
1848 * Link contents of ipipe to opipe.
1849 */
link_pipe(struct pipe_inode_info * ipipe,struct pipe_inode_info * opipe,size_t len,unsigned int flags)1850 static ssize_t link_pipe(struct pipe_inode_info *ipipe,
1851 struct pipe_inode_info *opipe,
1852 size_t len, unsigned int flags)
1853 {
1854 struct pipe_buffer *ibuf, *obuf;
1855 unsigned int i_head, o_head;
1856 unsigned int i_tail, o_tail;
1857 ssize_t ret = 0;
1858
1859 /*
1860 * Potential ABBA deadlock, work around it by ordering lock
1861 * grabbing by pipe info address. Otherwise two different processes
1862 * could deadlock (one doing tee from A -> B, the other from B -> A).
1863 */
1864 pipe_double_lock(ipipe, opipe);
1865
1866 i_tail = ipipe->tail;
1867 o_head = opipe->head;
1868
1869 do {
1870 if (!opipe->readers) {
1871 send_sig(SIGPIPE, current, 0);
1872 if (!ret)
1873 ret = -EPIPE;
1874 break;
1875 }
1876
1877 i_head = ipipe->head;
1878 o_tail = opipe->tail;
1879
1880 /*
1881 * If we have iterated all input buffers or run out of
1882 * output room, break.
1883 */
1884 if (pipe_empty(i_head, i_tail) ||
1885 pipe_full(o_head, o_tail, opipe->max_usage))
1886 break;
1887
1888 ibuf = pipe_buf(ipipe, i_tail);
1889 obuf = pipe_buf(opipe, o_head);
1890
1891 /*
1892 * Get a reference to this pipe buffer,
1893 * so we can copy the contents over.
1894 */
1895 if (!pipe_buf_get(ipipe, ibuf)) {
1896 if (ret == 0)
1897 ret = -EFAULT;
1898 break;
1899 }
1900
1901 *obuf = *ibuf;
1902
1903 /*
1904 * Don't inherit the gift and merge flag, we need to prevent
1905 * multiple steals of this page.
1906 */
1907 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1908 obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1909
1910 if (obuf->len > len)
1911 obuf->len = len;
1912 ret += obuf->len;
1913 len -= obuf->len;
1914
1915 o_head++;
1916 opipe->head = o_head;
1917 i_tail++;
1918 } while (len);
1919
1920 pipe_unlock(ipipe);
1921 pipe_unlock(opipe);
1922
1923 /*
1924 * If we put data in the output pipe, wakeup any potential readers.
1925 */
1926 if (ret > 0)
1927 wakeup_pipe_readers(opipe);
1928
1929 return ret;
1930 }
1931
1932 /*
1933 * This is a tee(1) implementation that works on pipes. It doesn't copy
1934 * any data, it simply references the 'in' pages on the 'out' pipe.
1935 * The 'flags' used are the SPLICE_F_* variants, currently the only
1936 * applicable one is SPLICE_F_NONBLOCK.
1937 */
do_tee(struct file * in,struct file * out,size_t len,unsigned int flags)1938 ssize_t do_tee(struct file *in, struct file *out, size_t len,
1939 unsigned int flags)
1940 {
1941 struct pipe_inode_info *ipipe = get_pipe_info(in, true);
1942 struct pipe_inode_info *opipe = get_pipe_info(out, true);
1943 ssize_t ret = -EINVAL;
1944
1945 if (unlikely(!(in->f_mode & FMODE_READ) ||
1946 !(out->f_mode & FMODE_WRITE)))
1947 return -EBADF;
1948
1949 /*
1950 * Duplicate the contents of ipipe to opipe without actually
1951 * copying the data.
1952 */
1953 if (ipipe && opipe && ipipe != opipe) {
1954 if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1955 flags |= SPLICE_F_NONBLOCK;
1956
1957 /*
1958 * Keep going, unless we encounter an error. The ipipe/opipe
1959 * ordering doesn't really matter.
1960 */
1961 ret = ipipe_prep(ipipe, flags);
1962 if (!ret) {
1963 ret = opipe_prep(opipe, flags);
1964 if (!ret)
1965 ret = link_pipe(ipipe, opipe, len, flags);
1966 }
1967 }
1968
1969 if (ret > 0) {
1970 fsnotify_access(in);
1971 fsnotify_modify(out);
1972 }
1973
1974 return ret;
1975 }
1976
/*
 * tee() system call entry point.
 *
 * Unknown flag bits give -EINVAL; that check comes first, so it applies
 * to zero-length requests too. A zero-length request otherwise succeeds
 * without the descriptors being looked at. An invalid descriptor gives
 * -EBADF; everything else is left to do_tee().
 */
SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
{
	if (unlikely((flags & ~SPLICE_F_ALL) != 0))
		return -EINVAL;

	if (unlikely(len == 0))
		return 0;

	CLASS(fd, in)(fdin);
	if (fd_empty(in))
		return -EBADF;

	CLASS(fd, out)(fdout);
	if (fd_empty(out))
		return -EBADF;

	return do_tee(fd_file(in), fd_file(out), len, flags);
}
1995