xref: /linux/fs/splice.c (revision f8cfe02a53e6ca991a0325ec6925510ac5c8690f)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * "splice": joining two ropes together by interweaving their strands.
4   *
5   * This is the "extended pipe" functionality, where a pipe is used as
6   * an arbitrary in-memory buffer. Think of a pipe as a small kernel
7   * buffer that you can use to transfer data from one end to the other.
8   *
9   * The traditional unix read/write is extended with a "splice()" operation
10   * that transfers data buffers to or from a pipe buffer.
11   *
12   * Named by Larry McVoy, original implementation from Linus, extended by
13   * Jens to support splicing to files, network, direct splicing, etc and
14   * fixing lots of bugs.
15   *
16   * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
17   * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
18   * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
19   *
20   */
21  #include <linux/bvec.h>
22  #include <linux/fs.h>
23  #include <linux/file.h>
24  #include <linux/pagemap.h>
25  #include <linux/splice.h>
26  #include <linux/memcontrol.h>
27  #include <linux/mm_inline.h>
28  #include <linux/swap.h>
29  #include <linux/writeback.h>
30  #include <linux/export.h>
31  #include <linux/syscalls.h>
32  #include <linux/uio.h>
33  #include <linux/fsnotify.h>
34  #include <linux/security.h>
35  #include <linux/gfp.h>
36  #include <linux/net.h>
37  #include <linux/socket.h>
38  #include <linux/sched/signal.h>
39  
40  #include "internal.h"
41  
42  /*
43   * Splice doesn't support FMODE_NOWAIT. Since pipes may set this flag to
44   * indicate they support non-blocking reads or writes, we must clear it
45   * here if set to avoid blocking other users of this pipe if splice is
46   * being done on it.
47   */
48  static noinline void noinline pipe_clear_nowait(struct file *file)
49  {
50  	fmode_t fmode = READ_ONCE(file->f_mode);
51  
52  	do {
53  		if (!(fmode & FMODE_NOWAIT))
54  			break;
55  	} while (!try_cmpxchg(&file->f_mode, &fmode, fmode & ~FMODE_NOWAIT));
56  }
57  
58  /*
59   * Attempt to steal a page from a pipe buffer. This should perhaps go into
60   * a vm helper function, it's already simplified quite a bit by the
61   * addition of remove_mapping(). If success is returned, the caller may
62   * attempt to reuse this page for another destination.
63   */
64  static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
65  		struct pipe_buffer *buf)
66  {
67  	struct folio *folio = page_folio(buf->page);
68  	struct address_space *mapping;
69  
70  	folio_lock(folio);
71  
72  	mapping = folio_mapping(folio);
73  	if (mapping) {
74  		WARN_ON(!folio_test_uptodate(folio));
75  
76  		/*
77  		 * At least for ext2 with nobh option, we need to wait on
78  		 * writeback completing on this folio, since we'll remove it
79  		 * from the pagecache.  Otherwise truncate wont wait on the
80  		 * folio, allowing the disk blocks to be reused by someone else
81  		 * before we actually wrote our data to them. fs corruption
82  		 * ensues.
83  		 */
84  		folio_wait_writeback(folio);
85  
86  		if (!filemap_release_folio(folio, GFP_KERNEL))
87  			goto out_unlock;
88  
89  		/*
90  		 * If we succeeded in removing the mapping, set LRU flag
91  		 * and return good.
92  		 */
93  		if (remove_mapping(mapping, folio)) {
94  			buf->flags |= PIPE_BUF_FLAG_LRU;
95  			return true;
96  		}
97  	}
98  
99  	/*
100  	 * Raced with truncate or failed to remove folio from current
101  	 * address space, unlock and return failure.
102  	 */
103  out_unlock:
104  	folio_unlock(folio);
105  	return false;
106  }
107  
108  static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
109  					struct pipe_buffer *buf)
110  {
111  	put_page(buf->page);
112  	buf->flags &= ~PIPE_BUF_FLAG_LRU;
113  }
114  
115  /*
116   * Check whether the contents of buf is OK to access. Since the content
117   * is a page cache page, IO may be in flight.
118   */
119  static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
120  				       struct pipe_buffer *buf)
121  {
122  	struct folio *folio = page_folio(buf->page);
123  	int err;
124  
125  	if (!folio_test_uptodate(folio)) {
126  		folio_lock(folio);
127  
128  		/*
129  		 * Folio got truncated/unhashed. This will cause a 0-byte
130  		 * splice, if this is the first page.
131  		 */
132  		if (!folio->mapping) {
133  			err = -ENODATA;
134  			goto error;
135  		}
136  
137  		/*
138  		 * Uh oh, read-error from disk.
139  		 */
140  		if (!folio_test_uptodate(folio)) {
141  			err = -EIO;
142  			goto error;
143  		}
144  
145  		/* Folio is ok after all, we are done */
146  		folio_unlock(folio);
147  	}
148  
149  	return 0;
150  error:
151  	folio_unlock(folio);
152  	return err;
153  }
154  
155  const struct pipe_buf_operations page_cache_pipe_buf_ops = {
156  	.confirm	= page_cache_pipe_buf_confirm,
157  	.release	= page_cache_pipe_buf_release,
158  	.try_steal	= page_cache_pipe_buf_try_steal,
159  	.get		= generic_pipe_buf_get,
160  };
161  
162  static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
163  		struct pipe_buffer *buf)
164  {
165  	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
166  		return false;
167  
168  	buf->flags |= PIPE_BUF_FLAG_LRU;
169  	return generic_pipe_buf_try_steal(pipe, buf);
170  }
171  
172  static const struct pipe_buf_operations user_page_pipe_buf_ops = {
173  	.release	= page_cache_pipe_buf_release,
174  	.try_steal	= user_page_pipe_buf_try_steal,
175  	.get		= generic_pipe_buf_get,
176  };
177  
178  static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
179  {
180  	smp_mb();
181  	if (waitqueue_active(&pipe->rd_wait))
182  		wake_up_interruptible(&pipe->rd_wait);
183  	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
184  }
185  
186  /**
187   * splice_to_pipe - fill passed data into a pipe
188   * @pipe:	pipe to fill
189   * @spd:	data to fill
190   *
191   * Description:
192   *    @spd contains a map of pages and len/offset tuples, along with
193   *    the struct pipe_buf_operations associated with these pages. This
194   *    function will link that data to the pipe.
195   *
196   */
197  ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
198  		       struct splice_pipe_desc *spd)
199  {
200  	unsigned int spd_pages = spd->nr_pages;
201  	unsigned int tail = pipe->tail;
202  	unsigned int head = pipe->head;
203  	unsigned int mask = pipe->ring_size - 1;
204  	ssize_t ret = 0;
205  	int page_nr = 0;
206  
207  	if (!spd_pages)
208  		return 0;
209  
210  	if (unlikely(!pipe->readers)) {
211  		send_sig(SIGPIPE, current, 0);
212  		ret = -EPIPE;
213  		goto out;
214  	}
215  
216  	while (!pipe_full(head, tail, pipe->max_usage)) {
217  		struct pipe_buffer *buf = &pipe->bufs[head & mask];
218  
219  		buf->page = spd->pages[page_nr];
220  		buf->offset = spd->partial[page_nr].offset;
221  		buf->len = spd->partial[page_nr].len;
222  		buf->private = spd->partial[page_nr].private;
223  		buf->ops = spd->ops;
224  		buf->flags = 0;
225  
226  		head++;
227  		pipe->head = head;
228  		page_nr++;
229  		ret += buf->len;
230  
231  		if (!--spd->nr_pages)
232  			break;
233  	}
234  
235  	if (!ret)
236  		ret = -EAGAIN;
237  
238  out:
239  	while (page_nr < spd_pages)
240  		spd->spd_release(spd, page_nr++);
241  
242  	return ret;
243  }
244  EXPORT_SYMBOL_GPL(splice_to_pipe);
245  
246  ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
247  {
248  	unsigned int head = pipe->head;
249  	unsigned int tail = pipe->tail;
250  	unsigned int mask = pipe->ring_size - 1;
251  	int ret;
252  
253  	if (unlikely(!pipe->readers)) {
254  		send_sig(SIGPIPE, current, 0);
255  		ret = -EPIPE;
256  	} else if (pipe_full(head, tail, pipe->max_usage)) {
257  		ret = -EAGAIN;
258  	} else {
259  		pipe->bufs[head & mask] = *buf;
260  		pipe->head = head + 1;
261  		return buf->len;
262  	}
263  	pipe_buf_release(pipe, buf);
264  	return ret;
265  }
266  EXPORT_SYMBOL(add_to_pipe);
267  
268  /*
269   * Check if we need to grow the arrays holding pages and partial page
270   * descriptions.
271   */
272  int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
273  {
274  	unsigned int max_usage = READ_ONCE(pipe->max_usage);
275  
276  	spd->nr_pages_max = max_usage;
277  	if (max_usage <= PIPE_DEF_BUFFERS)
278  		return 0;
279  
280  	spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
281  	spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
282  				     GFP_KERNEL);
283  
284  	if (spd->pages && spd->partial)
285  		return 0;
286  
287  	kfree(spd->pages);
288  	kfree(spd->partial);
289  	return -ENOMEM;
290  }
291  
292  void splice_shrink_spd(struct splice_pipe_desc *spd)
293  {
294  	if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
295  		return;
296  
297  	kfree(spd->pages);
298  	kfree(spd->partial);
299  }
300  
301  /**
302   * copy_splice_read -  Copy data from a file and splice the copy into a pipe
303   * @in: The file to read from
304   * @ppos: Pointer to the file position to read from
305   * @pipe: The pipe to splice into
306   * @len: The amount to splice
307   * @flags: The SPLICE_F_* flags
308   *
309   * This function allocates a bunch of pages sufficient to hold the requested
310   * amount of data (but limited by the remaining pipe capacity), passes it to
311   * the file's ->read_iter() to read into and then splices the used pages into
312   * the pipe.
313   *
314   * Return: On success, the number of bytes read will be returned and *@ppos
315   * will be updated if appropriate; 0 will be returned if there is no more data
316   * to be read; -EAGAIN will be returned if the pipe had no space, and some
317   * other negative error code will be returned on error.  A short read may occur
318   * if the pipe has insufficient space, we reach the end of the data or we hit a
319   * hole.
320   */
321  ssize_t copy_splice_read(struct file *in, loff_t *ppos,
322  			 struct pipe_inode_info *pipe,
323  			 size_t len, unsigned int flags)
324  {
325  	struct iov_iter to;
326  	struct bio_vec *bv;
327  	struct kiocb kiocb;
328  	struct page **pages;
329  	ssize_t ret;
330  	size_t used, npages, chunk, remain, keep = 0;
331  	int i;
332  
333  	/* Work out how much data we can actually add into the pipe */
334  	used = pipe_occupancy(pipe->head, pipe->tail);
335  	npages = max_t(ssize_t, pipe->max_usage - used, 0);
336  	len = min_t(size_t, len, npages * PAGE_SIZE);
337  	npages = DIV_ROUND_UP(len, PAGE_SIZE);
338  
339  	bv = kzalloc(array_size(npages, sizeof(bv[0])) +
340  		     array_size(npages, sizeof(struct page *)), GFP_KERNEL);
341  	if (!bv)
342  		return -ENOMEM;
343  
344  	pages = (struct page **)(bv + npages);
345  	npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
346  	if (!npages) {
347  		kfree(bv);
348  		return -ENOMEM;
349  	}
350  
351  	remain = len = min_t(size_t, len, npages * PAGE_SIZE);
352  
353  	for (i = 0; i < npages; i++) {
354  		chunk = min_t(size_t, PAGE_SIZE, remain);
355  		bv[i].bv_page = pages[i];
356  		bv[i].bv_offset = 0;
357  		bv[i].bv_len = chunk;
358  		remain -= chunk;
359  	}
360  
361  	/* Do the I/O */
362  	iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
363  	init_sync_kiocb(&kiocb, in);
364  	kiocb.ki_pos = *ppos;
365  	ret = call_read_iter(in, &kiocb, &to);
366  
367  	if (ret > 0) {
368  		keep = DIV_ROUND_UP(ret, PAGE_SIZE);
369  		*ppos = kiocb.ki_pos;
370  	}
371  
372  	/*
373  	 * Callers of ->splice_read() expect -EAGAIN on "can't put anything in
374  	 * there", rather than -EFAULT.
375  	 */
376  	if (ret == -EFAULT)
377  		ret = -EAGAIN;
378  
379  	/* Free any pages that didn't get touched at all. */
380  	if (keep < npages)
381  		release_pages(pages + keep, npages - keep);
382  
383  	/* Push the remaining pages into the pipe. */
384  	remain = ret;
385  	for (i = 0; i < keep; i++) {
386  		struct pipe_buffer *buf = pipe_head_buf(pipe);
387  
388  		chunk = min_t(size_t, remain, PAGE_SIZE);
389  		*buf = (struct pipe_buffer) {
390  			.ops	= &default_pipe_buf_ops,
391  			.page	= bv[i].bv_page,
392  			.offset	= 0,
393  			.len	= chunk,
394  		};
395  		pipe->head++;
396  		remain -= chunk;
397  	}
398  
399  	kfree(bv);
400  	return ret;
401  }
402  EXPORT_SYMBOL(copy_splice_read);
403  
404  const struct pipe_buf_operations default_pipe_buf_ops = {
405  	.release	= generic_pipe_buf_release,
406  	.try_steal	= generic_pipe_buf_try_steal,
407  	.get		= generic_pipe_buf_get,
408  };
409  
410  /* Pipe buffer operations for a socket and similar. */
411  const struct pipe_buf_operations nosteal_pipe_buf_ops = {
412  	.release	= generic_pipe_buf_release,
413  	.get		= generic_pipe_buf_get,
414  };
415  EXPORT_SYMBOL(nosteal_pipe_buf_ops);
416  
417  static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
418  {
419  	smp_mb();
420  	if (waitqueue_active(&pipe->wr_wait))
421  		wake_up_interruptible(&pipe->wr_wait);
422  	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
423  }
424  
425  /**
426   * splice_from_pipe_feed - feed available data from a pipe to a file
427   * @pipe:	pipe to splice from
428   * @sd:		information to @actor
429   * @actor:	handler that splices the data
430   *
431   * Description:
432   *    This function loops over the pipe and calls @actor to do the
433   *    actual moving of a single struct pipe_buffer to the desired
434   *    destination.  It returns when there's no more buffers left in
435   *    the pipe or if the requested number of bytes (@sd->total_len)
436   *    have been copied.  It returns a positive number (one) if the
437   *    pipe needs to be filled with more data, zero if the required
438   *    number of bytes have been copied and -errno on error.
439   *
440   *    This, together with splice_from_pipe_{begin,end,next}, may be
441   *    used to implement the functionality of __splice_from_pipe() when
442   *    locking is required around copying the pipe buffers to the
443   *    destination.
444   */
445  static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
446  			  splice_actor *actor)
447  {
448  	unsigned int head = pipe->head;
449  	unsigned int tail = pipe->tail;
450  	unsigned int mask = pipe->ring_size - 1;
451  	int ret;
452  
453  	while (!pipe_empty(head, tail)) {
454  		struct pipe_buffer *buf = &pipe->bufs[tail & mask];
455  
456  		sd->len = buf->len;
457  		if (sd->len > sd->total_len)
458  			sd->len = sd->total_len;
459  
460  		ret = pipe_buf_confirm(pipe, buf);
461  		if (unlikely(ret)) {
462  			if (ret == -ENODATA)
463  				ret = 0;
464  			return ret;
465  		}
466  
467  		ret = actor(pipe, buf, sd);
468  		if (ret <= 0)
469  			return ret;
470  
471  		buf->offset += ret;
472  		buf->len -= ret;
473  
474  		sd->num_spliced += ret;
475  		sd->len -= ret;
476  		sd->pos += ret;
477  		sd->total_len -= ret;
478  
479  		if (!buf->len) {
480  			pipe_buf_release(pipe, buf);
481  			tail++;
482  			pipe->tail = tail;
483  			if (pipe->files)
484  				sd->need_wakeup = true;
485  		}
486  
487  		if (!sd->total_len)
488  			return 0;
489  	}
490  
491  	return 1;
492  }
493  
494  /* We know we have a pipe buffer, but maybe it's empty? */
495  static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
496  {
497  	unsigned int tail = pipe->tail;
498  	unsigned int mask = pipe->ring_size - 1;
499  	struct pipe_buffer *buf = &pipe->bufs[tail & mask];
500  
501  	if (unlikely(!buf->len)) {
502  		pipe_buf_release(pipe, buf);
503  		pipe->tail = tail+1;
504  		return true;
505  	}
506  
507  	return false;
508  }
509  
510  /**
511   * splice_from_pipe_next - wait for some data to splice from
512   * @pipe:	pipe to splice from
513   * @sd:		information about the splice operation
514   *
515   * Description:
516   *    This function will wait for some data and return a positive
517   *    value (one) if pipe buffers are available.  It will return zero
518   *    or -errno if no more data needs to be spliced.
519   */
520  static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
521  {
522  	/*
523  	 * Check for signal early to make process killable when there are
524  	 * always buffers available
525  	 */
526  	if (signal_pending(current))
527  		return -ERESTARTSYS;
528  
529  repeat:
530  	while (pipe_empty(pipe->head, pipe->tail)) {
531  		if (!pipe->writers)
532  			return 0;
533  
534  		if (sd->num_spliced)
535  			return 0;
536  
537  		if (sd->flags & SPLICE_F_NONBLOCK)
538  			return -EAGAIN;
539  
540  		if (signal_pending(current))
541  			return -ERESTARTSYS;
542  
543  		if (sd->need_wakeup) {
544  			wakeup_pipe_writers(pipe);
545  			sd->need_wakeup = false;
546  		}
547  
548  		pipe_wait_readable(pipe);
549  	}
550  
551  	if (eat_empty_buffer(pipe))
552  		goto repeat;
553  
554  	return 1;
555  }
556  
557  /**
558   * splice_from_pipe_begin - start splicing from pipe
559   * @sd:		information about the splice operation
560   *
561   * Description:
562   *    This function should be called before a loop containing
563   *    splice_from_pipe_next() and splice_from_pipe_feed() to
564   *    initialize the necessary fields of @sd.
565   */
566  static void splice_from_pipe_begin(struct splice_desc *sd)
567  {
568  	sd->num_spliced = 0;
569  	sd->need_wakeup = false;
570  }
571  
572  /**
573   * splice_from_pipe_end - finish splicing from pipe
574   * @pipe:	pipe to splice from
575   * @sd:		information about the splice operation
576   *
577   * Description:
578   *    This function will wake up pipe writers if necessary.  It should
579   *    be called after a loop containing splice_from_pipe_next() and
580   *    splice_from_pipe_feed().
581   */
582  static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
583  {
584  	if (sd->need_wakeup)
585  		wakeup_pipe_writers(pipe);
586  }
587  
588  /**
589   * __splice_from_pipe - splice data from a pipe to given actor
590   * @pipe:	pipe to splice from
591   * @sd:		information to @actor
592   * @actor:	handler that splices the data
593   *
594   * Description:
595   *    This function does little more than loop over the pipe and call
596   *    @actor to do the actual moving of a single struct pipe_buffer to
597   *    the desired destination. See pipe_to_file, pipe_to_sendmsg, or
598   *    pipe_to_user.
599   *
600   */
601  ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
602  			   splice_actor *actor)
603  {
604  	int ret;
605  
606  	splice_from_pipe_begin(sd);
607  	do {
608  		cond_resched();
609  		ret = splice_from_pipe_next(pipe, sd);
610  		if (ret > 0)
611  			ret = splice_from_pipe_feed(pipe, sd, actor);
612  	} while (ret > 0);
613  	splice_from_pipe_end(pipe, sd);
614  
615  	return sd->num_spliced ? sd->num_spliced : ret;
616  }
617  EXPORT_SYMBOL(__splice_from_pipe);
618  
619  /**
620   * splice_from_pipe - splice data from a pipe to a file
621   * @pipe:	pipe to splice from
622   * @out:	file to splice to
623   * @ppos:	position in @out
624   * @len:	how many bytes to splice
625   * @flags:	splice modifier flags
626   * @actor:	handler that splices the data
627   *
628   * Description:
629   *    See __splice_from_pipe. This function locks the pipe inode,
630   *    otherwise it's identical to __splice_from_pipe().
631   *
632   */
633  ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
634  			 loff_t *ppos, size_t len, unsigned int flags,
635  			 splice_actor *actor)
636  {
637  	ssize_t ret;
638  	struct splice_desc sd = {
639  		.total_len = len,
640  		.flags = flags,
641  		.pos = *ppos,
642  		.u.file = out,
643  	};
644  
645  	pipe_lock(pipe);
646  	ret = __splice_from_pipe(pipe, &sd, actor);
647  	pipe_unlock(pipe);
648  
649  	return ret;
650  }
651  
652  /**
653   * iter_file_splice_write - splice data from a pipe to a file
654   * @pipe:	pipe info
655   * @out:	file to write to
656   * @ppos:	position in @out
657   * @len:	number of bytes to splice
658   * @flags:	splice modifier flags
659   *
660   * Description:
661   *    Will either move or copy pages (determined by @flags options) from
662   *    the given pipe inode to the given file.
663   *    This one is ->write_iter-based.
664   *
665   */
666  ssize_t
667  iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
668  			  loff_t *ppos, size_t len, unsigned int flags)
669  {
670  	struct splice_desc sd = {
671  		.total_len = len,
672  		.flags = flags,
673  		.pos = *ppos,
674  		.u.file = out,
675  	};
676  	int nbufs = pipe->max_usage;
677  	struct bio_vec *array;
678  	ssize_t ret;
679  
680  	if (!out->f_op->write_iter)
681  		return -EINVAL;
682  
683  	array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL);
684  	if (unlikely(!array))
685  		return -ENOMEM;
686  
687  	pipe_lock(pipe);
688  
689  	splice_from_pipe_begin(&sd);
690  	while (sd.total_len) {
691  		struct kiocb kiocb;
692  		struct iov_iter from;
693  		unsigned int head, tail, mask;
694  		size_t left;
695  		int n;
696  
697  		ret = splice_from_pipe_next(pipe, &sd);
698  		if (ret <= 0)
699  			break;
700  
701  		if (unlikely(nbufs < pipe->max_usage)) {
702  			kfree(array);
703  			nbufs = pipe->max_usage;
704  			array = kcalloc(nbufs, sizeof(struct bio_vec),
705  					GFP_KERNEL);
706  			if (!array) {
707  				ret = -ENOMEM;
708  				break;
709  			}
710  		}
711  
712  		head = pipe->head;
713  		tail = pipe->tail;
714  		mask = pipe->ring_size - 1;
715  
716  		/* build the vector */
717  		left = sd.total_len;
718  		for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
719  			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
720  			size_t this_len = buf->len;
721  
722  			/* zero-length bvecs are not supported, skip them */
723  			if (!this_len)
724  				continue;
725  			this_len = min(this_len, left);
726  
727  			ret = pipe_buf_confirm(pipe, buf);
728  			if (unlikely(ret)) {
729  				if (ret == -ENODATA)
730  					ret = 0;
731  				goto done;
732  			}
733  
734  			bvec_set_page(&array[n], buf->page, this_len,
735  				      buf->offset);
736  			left -= this_len;
737  			n++;
738  		}
739  
740  		iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
741  		init_sync_kiocb(&kiocb, out);
742  		kiocb.ki_pos = sd.pos;
743  		ret = call_write_iter(out, &kiocb, &from);
744  		sd.pos = kiocb.ki_pos;
745  		if (ret <= 0)
746  			break;
747  
748  		sd.num_spliced += ret;
749  		sd.total_len -= ret;
750  		*ppos = sd.pos;
751  
752  		/* dismiss the fully eaten buffers, adjust the partial one */
753  		tail = pipe->tail;
754  		while (ret) {
755  			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
756  			if (ret >= buf->len) {
757  				ret -= buf->len;
758  				buf->len = 0;
759  				pipe_buf_release(pipe, buf);
760  				tail++;
761  				pipe->tail = tail;
762  				if (pipe->files)
763  					sd.need_wakeup = true;
764  			} else {
765  				buf->offset += ret;
766  				buf->len -= ret;
767  				ret = 0;
768  			}
769  		}
770  	}
771  done:
772  	kfree(array);
773  	splice_from_pipe_end(pipe, &sd);
774  
775  	pipe_unlock(pipe);
776  
777  	if (sd.num_spliced)
778  		ret = sd.num_spliced;
779  
780  	return ret;
781  }
782  
783  EXPORT_SYMBOL(iter_file_splice_write);
784  
#ifdef CONFIG_NET
/**
 * splice_to_socket - splice data from a pipe to a socket
 * @pipe:	pipe to splice from
 * @out:	socket to write to
 * @ppos:	position in @out (not used by this function)
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Send up to @len bytes from the pipe to a network socket.  Pipe
 *    buffers are gathered, at most 16 at a time, into a bio_vec array and
 *    passed to sock_sendmsg() with MSG_SPLICE_PAGES set, so the data is not
 *    copied here.
 *
 *    Returns the number of bytes sent if non-zero, otherwise the last
 *    status: 0, -EAGAIN, -ERESTARTSYS, or an error from pipe_buf_confirm()
 *    or sock_sendmsg().
 *
 */
ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags)
{
	struct socket *sock = sock_from_file(out);
	struct bio_vec vec[16];
	struct msghdr msg = {};
	ssize_t ret = 0;
	size_t sent = 0;
	bool wake_writers = false;

	pipe_lock(pipe);

	while (len > 0) {
		unsigned int head, tail, mask, nvec = 0;
		size_t todo = len;

		/*
		 * Check for signal early to make process killable when there
		 * are always buffers available
		 */
		ret = -ERESTARTSYS;
		if (signal_pending(current))
			break;

		/* Wait for data; each exit below has its status preloaded. */
		while (pipe_empty(pipe->head, pipe->tail)) {
			ret = 0;
			if (!pipe->writers || sent)
				goto out;

			ret = -EAGAIN;
			if (flags & SPLICE_F_NONBLOCK)
				goto out;

			ret = -ERESTARTSYS;
			if (signal_pending(current))
				goto out;

			if (wake_writers) {
				wakeup_pipe_writers(pipe);
				wake_writers = false;
			}

			pipe_wait_readable(pipe);
		}

		head = pipe->head;
		tail = pipe->tail;
		mask = pipe->ring_size - 1;

		/* Gather buffers into the vector, skipping empty ones. */
		for (; !pipe_empty(head, tail); tail++) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t seg;

			if (!buf->len)
				continue;

			seg = min_t(size_t, todo, buf->len);

			ret = pipe_buf_confirm(pipe, buf);
			if (unlikely(ret)) {
				if (ret == -ENODATA)
					ret = 0;
				break;
			}

			bvec_set_page(&vec[nvec++], buf->page, seg, buf->offset);
			todo -= seg;
			/* Leave tail on the last gathered buffer. */
			if (todo == 0 || nvec >= ARRAY_SIZE(vec))
				break;
		}

		if (!nvec)
			break;

		/*
		 * Ask for MSG_MORE if the caller did, or if we still want
		 * data and the pipe has buffers from @tail onwards.
		 */
		msg.msg_flags = MSG_SPLICE_PAGES;
		if ((flags & SPLICE_F_MORE) ||
		    (todo && pipe_occupancy(pipe->head, tail) > 0))
			msg.msg_flags |= MSG_MORE;
		if (out->f_flags & O_NONBLOCK)
			msg.msg_flags |= MSG_DONTWAIT;

		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, vec, nvec,
			      len - todo);
		ret = sock_sendmsg(sock, &msg);
		if (ret <= 0)
			break;

		sent += ret;
		len -= ret;

		/* Consume what was sent, releasing drained buffers. */
		tail = pipe->tail;
		while (ret > 0) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t seg = min_t(size_t, ret, buf->len);

			buf->offset += seg;
			buf->len -= seg;
			ret -= seg;

			if (!buf->len) {
				pipe_buf_release(pipe, buf);
				tail++;
			}
		}

		if (tail != pipe->tail) {
			pipe->tail = tail;
			if (pipe->files)
				wake_writers = true;
		}
	}

out:
	pipe_unlock(pipe);
	if (wake_writers)
		wakeup_pipe_writers(pipe);

	if (sent)
		return sent;
	return ret;
}
#endif
924  
925  static int warn_unsupported(struct file *file, const char *op)
926  {
927  	pr_debug_ratelimited(
928  		"splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
929  		op, file, current->pid, current->comm);
930  	return -EINVAL;
931  }
932  
933  /*
934   * Attempt to initiate a splice from pipe to file.
935   */
936  static ssize_t do_splice_from(struct pipe_inode_info *pipe, struct file *out,
937  			      loff_t *ppos, size_t len, unsigned int flags)
938  {
939  	if (unlikely(!out->f_op->splice_write))
940  		return warn_unsupported(out, "write");
941  	return out->f_op->splice_write(pipe, out, ppos, len, flags);
942  }
943  
944  /*
945   * Indicate to the caller that there was a premature EOF when reading from the
946   * source and the caller didn't indicate they would be sending more data after
947   * this.
948   */
949  static void do_splice_eof(struct splice_desc *sd)
950  {
951  	if (sd->splice_eof)
952  		sd->splice_eof(sd);
953  }
954  
955  /*
956   * Callers already called rw_verify_area() on the entire range.
957   * No need to call it for sub ranges.
958   */
959  static ssize_t do_splice_read(struct file *in, loff_t *ppos,
960  			      struct pipe_inode_info *pipe, size_t len,
961  			      unsigned int flags)
962  {
963  	unsigned int p_space;
964  
965  	if (unlikely(!(in->f_mode & FMODE_READ)))
966  		return -EBADF;
967  	if (!len)
968  		return 0;
969  
970  	/* Don't try to read more the pipe has space for. */
971  	p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
972  	len = min_t(size_t, len, p_space << PAGE_SHIFT);
973  
974  	if (unlikely(len > MAX_RW_COUNT))
975  		len = MAX_RW_COUNT;
976  
977  	if (unlikely(!in->f_op->splice_read))
978  		return warn_unsupported(in, "read");
979  	/*
980  	 * O_DIRECT and DAX don't deal with the pagecache, so we allocate a
981  	 * buffer, copy into it and splice that into the pipe.
982  	 */
983  	if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host))
984  		return copy_splice_read(in, ppos, pipe, len, flags);
985  	return in->f_op->splice_read(in, ppos, pipe, len, flags);
986  }
987  
988  /**
989   * vfs_splice_read - Read data from a file and splice it into a pipe
990   * @in:		File to splice from
991   * @ppos:	Input file offset
992   * @pipe:	Pipe to splice to
993   * @len:	Number of bytes to splice
994   * @flags:	Splice modifier flags (SPLICE_F_*)
995   *
996   * Splice the requested amount of data from the input file to the pipe.  This
997   * is synchronous as the caller must hold the pipe lock across the entire
998   * operation.
999   *
1000   * If successful, it returns the amount of data spliced, 0 if it hit the EOF or
1001   * a hole and a negative error code otherwise.
1002   */
1003  ssize_t vfs_splice_read(struct file *in, loff_t *ppos,
1004  			struct pipe_inode_info *pipe, size_t len,
1005  			unsigned int flags)
1006  {
1007  	ssize_t ret;
1008  
1009  	ret = rw_verify_area(READ, in, ppos, len);
1010  	if (unlikely(ret < 0))
1011  		return ret;
1012  
1013  	return do_splice_read(in, ppos, pipe, len, flags);
1014  }
1015  EXPORT_SYMBOL_GPL(vfs_splice_read);
1016  
1017  /**
1018   * splice_direct_to_actor - splices data directly between two non-pipes
1019   * @in:		file to splice from
1020   * @sd:		actor information on where to splice to
1021   * @actor:	handles the data splicing
1022   *
1023   * Description:
1024   *    This is a special case helper to splice directly between two
1025   *    points, without requiring an explicit pipe. Internally an allocated
1026   *    pipe is cached in the process, and reused during the lifetime of
1027   *    that process.
1028   *
1029   */
1030  ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1031  			       splice_direct_actor *actor)
1032  {
1033  	struct pipe_inode_info *pipe;
1034  	ssize_t ret, bytes;
1035  	size_t len;
1036  	int i, flags, more;
1037  
1038  	/*
1039  	 * We require the input to be seekable, as we don't want to randomly
1040  	 * drop data for eg socket -> socket splicing. Use the piped splicing
1041  	 * for that!
1042  	 */
1043  	if (unlikely(!(in->f_mode & FMODE_LSEEK)))
1044  		return -EINVAL;
1045  
1046  	/*
1047  	 * neither in nor out is a pipe, setup an internal pipe attached to
1048  	 * 'out' and transfer the wanted data from 'in' to 'out' through that
1049  	 */
1050  	pipe = current->splice_pipe;
1051  	if (unlikely(!pipe)) {
1052  		pipe = alloc_pipe_info();
1053  		if (!pipe)
1054  			return -ENOMEM;
1055  
1056  		/*
1057  		 * We don't have an immediate reader, but we'll read the stuff
1058  		 * out of the pipe right after the splice_to_pipe(). So set
1059  		 * PIPE_READERS appropriately.
1060  		 */
1061  		pipe->readers = 1;
1062  
1063  		current->splice_pipe = pipe;
1064  	}
1065  
1066  	/*
1067  	 * Do the splice.
1068  	 */
1069  	bytes = 0;
1070  	len = sd->total_len;
1071  
1072  	/* Don't block on output, we have to drain the direct pipe. */
1073  	flags = sd->flags;
1074  	sd->flags &= ~SPLICE_F_NONBLOCK;
1075  
1076  	/*
1077  	 * We signal MORE until we've read sufficient data to fulfill the
1078  	 * request and we keep signalling it if the caller set it.
1079  	 */
1080  	more = sd->flags & SPLICE_F_MORE;
1081  	sd->flags |= SPLICE_F_MORE;
1082  
1083  	WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));
1084  
1085  	while (len) {
1086  		size_t read_len;
1087  		loff_t pos = sd->pos, prev_pos = pos;
1088  
1089  		ret = do_splice_read(in, &pos, pipe, len, flags);
1090  		if (unlikely(ret <= 0))
1091  			goto read_failure;
1092  
1093  		read_len = ret;
1094  		sd->total_len = read_len;
1095  
1096  		/*
1097  		 * If we now have sufficient data to fulfill the request then
1098  		 * we clear SPLICE_F_MORE if it was not set initially.
1099  		 */
1100  		if (read_len >= len && !more)
1101  			sd->flags &= ~SPLICE_F_MORE;
1102  
1103  		/*
1104  		 * NOTE: nonblocking mode only applies to the input. We
1105  		 * must not do the output in nonblocking mode as then we
1106  		 * could get stuck data in the internal pipe:
1107  		 */
1108  		ret = actor(pipe, sd);
1109  		if (unlikely(ret <= 0)) {
1110  			sd->pos = prev_pos;
1111  			goto out_release;
1112  		}
1113  
1114  		bytes += ret;
1115  		len -= ret;
1116  		sd->pos = pos;
1117  
1118  		if (ret < read_len) {
1119  			sd->pos = prev_pos + ret;
1120  			goto out_release;
1121  		}
1122  	}
1123  
1124  done:
1125  	pipe->tail = pipe->head = 0;
1126  	file_accessed(in);
1127  	return bytes;
1128  
1129  read_failure:
1130  	/*
1131  	 * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that
1132  	 * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a
1133  	 * "->splice_in()" that returned EOF (ie zero) *and* we have sent at
1134  	 * least 1 byte *then* we will also do the ->splice_eof() call.
1135  	 */
1136  	if (ret == 0 && !more && len > 0 && bytes)
1137  		do_splice_eof(sd);
1138  out_release:
1139  	/*
1140  	 * If we did an incomplete transfer we must release
1141  	 * the pipe buffers in question:
1142  	 */
1143  	for (i = 0; i < pipe->ring_size; i++) {
1144  		struct pipe_buffer *buf = &pipe->bufs[i];
1145  
1146  		if (buf->ops)
1147  			pipe_buf_release(pipe, buf);
1148  	}
1149  
1150  	if (!bytes)
1151  		bytes = ret;
1152  
1153  	goto done;
1154  }
1155  EXPORT_SYMBOL(splice_direct_to_actor);
1156  
1157  static int direct_splice_actor(struct pipe_inode_info *pipe,
1158  			       struct splice_desc *sd)
1159  {
1160  	struct file *file = sd->u.file;
1161  	long ret;
1162  
1163  	file_start_write(file);
1164  	ret = do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
1165  	file_end_write(file);
1166  	return ret;
1167  }
1168  
1169  static int splice_file_range_actor(struct pipe_inode_info *pipe,
1170  					struct splice_desc *sd)
1171  {
1172  	struct file *file = sd->u.file;
1173  
1174  	return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
1175  }
1176  
1177  static void direct_file_splice_eof(struct splice_desc *sd)
1178  {
1179  	struct file *file = sd->u.file;
1180  
1181  	if (file->f_op->splice_eof)
1182  		file->f_op->splice_eof(file);
1183  }
1184  
1185  static ssize_t do_splice_direct_actor(struct file *in, loff_t *ppos,
1186  				      struct file *out, loff_t *opos,
1187  				      size_t len, unsigned int flags,
1188  				      splice_direct_actor *actor)
1189  {
1190  	struct splice_desc sd = {
1191  		.len		= len,
1192  		.total_len	= len,
1193  		.flags		= flags,
1194  		.pos		= *ppos,
1195  		.u.file		= out,
1196  		.splice_eof	= direct_file_splice_eof,
1197  		.opos		= opos,
1198  	};
1199  	ssize_t ret;
1200  
1201  	if (unlikely(!(out->f_mode & FMODE_WRITE)))
1202  		return -EBADF;
1203  
1204  	if (unlikely(out->f_flags & O_APPEND))
1205  		return -EINVAL;
1206  
1207  	ret = splice_direct_to_actor(in, &sd, actor);
1208  	if (ret > 0)
1209  		*ppos = sd.pos;
1210  
1211  	return ret;
1212  }
1213  /**
1214   * do_splice_direct - splices data directly between two files
1215   * @in:		file to splice from
1216   * @ppos:	input file offset
1217   * @out:	file to splice to
1218   * @opos:	output file offset
1219   * @len:	number of bytes to splice
1220   * @flags:	splice modifier flags
1221   *
1222   * Description:
1223   *    For use by do_sendfile(). splice can easily emulate sendfile, but
1224   *    doing it in the application would incur an extra system call
1225   *    (splice in + splice out, as compared to just sendfile()). So this helper
1226   *    can splice directly through a process-private pipe.
1227   *
1228   * Callers already called rw_verify_area() on the entire range.
1229   */
1230  ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1231  			 loff_t *opos, size_t len, unsigned int flags)
1232  {
1233  	return do_splice_direct_actor(in, ppos, out, opos, len, flags,
1234  				      direct_splice_actor);
1235  }
1236  EXPORT_SYMBOL(do_splice_direct);
1237  
1238  /**
1239   * splice_file_range - splices data between two files for copy_file_range()
1240   * @in:		file to splice from
1241   * @ppos:	input file offset
1242   * @out:	file to splice to
1243   * @opos:	output file offset
1244   * @len:	number of bytes to splice
1245   *
1246   * Description:
1247   *    For use by ->copy_file_range() methods.
1248   *    Like do_splice_direct(), but vfs_copy_file_range() already holds
1249   *    start_file_write() on @out file.
1250   *
1251   * Callers already called rw_verify_area() on the entire range.
1252   */
1253  ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out,
1254  			  loff_t *opos, size_t len)
1255  {
1256  	lockdep_assert(file_write_started(out));
1257  
1258  	return do_splice_direct_actor(in, ppos, out, opos,
1259  				      min_t(size_t, len, MAX_RW_COUNT),
1260  				      0, splice_file_range_actor);
1261  }
1262  EXPORT_SYMBOL(splice_file_range);
1263  
1264  static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
1265  {
1266  	for (;;) {
1267  		if (unlikely(!pipe->readers)) {
1268  			send_sig(SIGPIPE, current, 0);
1269  			return -EPIPE;
1270  		}
1271  		if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1272  			return 0;
1273  		if (flags & SPLICE_F_NONBLOCK)
1274  			return -EAGAIN;
1275  		if (signal_pending(current))
1276  			return -ERESTARTSYS;
1277  		pipe_wait_writable(pipe);
1278  	}
1279  }
1280  
1281  static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1282  			       struct pipe_inode_info *opipe,
1283  			       size_t len, unsigned int flags);
1284  
1285  ssize_t splice_file_to_pipe(struct file *in,
1286  			    struct pipe_inode_info *opipe,
1287  			    loff_t *offset,
1288  			    size_t len, unsigned int flags)
1289  {
1290  	ssize_t ret;
1291  
1292  	pipe_lock(opipe);
1293  	ret = wait_for_space(opipe, flags);
1294  	if (!ret)
1295  		ret = do_splice_read(in, offset, opipe, len, flags);
1296  	pipe_unlock(opipe);
1297  	if (ret > 0)
1298  		wakeup_pipe_readers(opipe);
1299  	return ret;
1300  }
1301  
1302  /*
1303   * Determine where to splice to/from.
1304   */
1305  ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out,
1306  		  loff_t *off_out, size_t len, unsigned int flags)
1307  {
1308  	struct pipe_inode_info *ipipe;
1309  	struct pipe_inode_info *opipe;
1310  	loff_t offset;
1311  	ssize_t ret;
1312  
1313  	if (unlikely(!(in->f_mode & FMODE_READ) ||
1314  		     !(out->f_mode & FMODE_WRITE)))
1315  		return -EBADF;
1316  
1317  	ipipe = get_pipe_info(in, true);
1318  	opipe = get_pipe_info(out, true);
1319  
1320  	if (ipipe && opipe) {
1321  		if (off_in || off_out)
1322  			return -ESPIPE;
1323  
1324  		/* Splicing to self would be fun, but... */
1325  		if (ipipe == opipe)
1326  			return -EINVAL;
1327  
1328  		if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1329  			flags |= SPLICE_F_NONBLOCK;
1330  
1331  		ret = splice_pipe_to_pipe(ipipe, opipe, len, flags);
1332  	} else if (ipipe) {
1333  		if (off_in)
1334  			return -ESPIPE;
1335  		if (off_out) {
1336  			if (!(out->f_mode & FMODE_PWRITE))
1337  				return -EINVAL;
1338  			offset = *off_out;
1339  		} else {
1340  			offset = out->f_pos;
1341  		}
1342  
1343  		if (unlikely(out->f_flags & O_APPEND))
1344  			return -EINVAL;
1345  
1346  		ret = rw_verify_area(WRITE, out, &offset, len);
1347  		if (unlikely(ret < 0))
1348  			return ret;
1349  
1350  		if (in->f_flags & O_NONBLOCK)
1351  			flags |= SPLICE_F_NONBLOCK;
1352  
1353  		file_start_write(out);
1354  		ret = do_splice_from(ipipe, out, &offset, len, flags);
1355  		file_end_write(out);
1356  
1357  		if (!off_out)
1358  			out->f_pos = offset;
1359  		else
1360  			*off_out = offset;
1361  	} else if (opipe) {
1362  		if (off_out)
1363  			return -ESPIPE;
1364  		if (off_in) {
1365  			if (!(in->f_mode & FMODE_PREAD))
1366  				return -EINVAL;
1367  			offset = *off_in;
1368  		} else {
1369  			offset = in->f_pos;
1370  		}
1371  
1372  		ret = rw_verify_area(READ, in, &offset, len);
1373  		if (unlikely(ret < 0))
1374  			return ret;
1375  
1376  		if (out->f_flags & O_NONBLOCK)
1377  			flags |= SPLICE_F_NONBLOCK;
1378  
1379  		ret = splice_file_to_pipe(in, opipe, &offset, len, flags);
1380  
1381  		if (!off_in)
1382  			in->f_pos = offset;
1383  		else
1384  			*off_in = offset;
1385  	} else {
1386  		ret = -EINVAL;
1387  	}
1388  
1389  	if (ret > 0) {
1390  		/*
1391  		 * Generate modify out before access in:
1392  		 * do_splice_from() may've already sent modify out,
1393  		 * and this ensures the events get merged.
1394  		 */
1395  		fsnotify_modify(out);
1396  		fsnotify_access(in);
1397  	}
1398  
1399  	return ret;
1400  }
1401  
1402  static ssize_t __do_splice(struct file *in, loff_t __user *off_in,
1403  			   struct file *out, loff_t __user *off_out,
1404  			   size_t len, unsigned int flags)
1405  {
1406  	struct pipe_inode_info *ipipe;
1407  	struct pipe_inode_info *opipe;
1408  	loff_t offset, *__off_in = NULL, *__off_out = NULL;
1409  	ssize_t ret;
1410  
1411  	ipipe = get_pipe_info(in, true);
1412  	opipe = get_pipe_info(out, true);
1413  
1414  	if (ipipe) {
1415  		if (off_in)
1416  			return -ESPIPE;
1417  		pipe_clear_nowait(in);
1418  	}
1419  	if (opipe) {
1420  		if (off_out)
1421  			return -ESPIPE;
1422  		pipe_clear_nowait(out);
1423  	}
1424  
1425  	if (off_out) {
1426  		if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1427  			return -EFAULT;
1428  		__off_out = &offset;
1429  	}
1430  	if (off_in) {
1431  		if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1432  			return -EFAULT;
1433  		__off_in = &offset;
1434  	}
1435  
1436  	ret = do_splice(in, __off_in, out, __off_out, len, flags);
1437  	if (ret < 0)
1438  		return ret;
1439  
1440  	if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
1441  		return -EFAULT;
1442  	if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
1443  		return -EFAULT;
1444  
1445  	return ret;
1446  }
1447  
1448  static ssize_t iter_to_pipe(struct iov_iter *from,
1449  			    struct pipe_inode_info *pipe,
1450  			    unsigned int flags)
1451  {
1452  	struct pipe_buffer buf = {
1453  		.ops = &user_page_pipe_buf_ops,
1454  		.flags = flags
1455  	};
1456  	size_t total = 0;
1457  	ssize_t ret = 0;
1458  
1459  	while (iov_iter_count(from)) {
1460  		struct page *pages[16];
1461  		ssize_t left;
1462  		size_t start;
1463  		int i, n;
1464  
1465  		left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
1466  		if (left <= 0) {
1467  			ret = left;
1468  			break;
1469  		}
1470  
1471  		n = DIV_ROUND_UP(left + start, PAGE_SIZE);
1472  		for (i = 0; i < n; i++) {
1473  			int size = min_t(int, left, PAGE_SIZE - start);
1474  
1475  			buf.page = pages[i];
1476  			buf.offset = start;
1477  			buf.len = size;
1478  			ret = add_to_pipe(pipe, &buf);
1479  			if (unlikely(ret < 0)) {
1480  				iov_iter_revert(from, left);
1481  				// this one got dropped by add_to_pipe()
1482  				while (++i < n)
1483  					put_page(pages[i]);
1484  				goto out;
1485  			}
1486  			total += ret;
1487  			left -= size;
1488  			start = 0;
1489  		}
1490  	}
1491  out:
1492  	return total ? total : ret;
1493  }
1494  
1495  static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1496  			struct splice_desc *sd)
1497  {
1498  	int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1499  	return n == sd->len ? n : -EFAULT;
1500  }
1501  
1502  /*
1503   * For lack of a better implementation, implement vmsplice() to userspace
1504   * as a simple copy of the pipes pages to the user iov.
1505   */
1506  static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter,
1507  				unsigned int flags)
1508  {
1509  	struct pipe_inode_info *pipe = get_pipe_info(file, true);
1510  	struct splice_desc sd = {
1511  		.total_len = iov_iter_count(iter),
1512  		.flags = flags,
1513  		.u.data = iter
1514  	};
1515  	ssize_t ret = 0;
1516  
1517  	if (!pipe)
1518  		return -EBADF;
1519  
1520  	pipe_clear_nowait(file);
1521  
1522  	if (sd.total_len) {
1523  		pipe_lock(pipe);
1524  		ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1525  		pipe_unlock(pipe);
1526  	}
1527  
1528  	if (ret > 0)
1529  		fsnotify_access(file);
1530  
1531  	return ret;
1532  }
1533  
1534  /*
1535   * vmsplice splices a user address range into a pipe. It can be thought of
1536   * as splice-from-memory, where the regular splice is splice-from-file (or
1537   * to file). In both cases the output is a pipe, naturally.
1538   */
1539  static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
1540  				unsigned int flags)
1541  {
1542  	struct pipe_inode_info *pipe;
1543  	ssize_t ret = 0;
1544  	unsigned buf_flag = 0;
1545  
1546  	if (flags & SPLICE_F_GIFT)
1547  		buf_flag = PIPE_BUF_FLAG_GIFT;
1548  
1549  	pipe = get_pipe_info(file, true);
1550  	if (!pipe)
1551  		return -EBADF;
1552  
1553  	pipe_clear_nowait(file);
1554  
1555  	pipe_lock(pipe);
1556  	ret = wait_for_space(pipe, flags);
1557  	if (!ret)
1558  		ret = iter_to_pipe(iter, pipe, buf_flag);
1559  	pipe_unlock(pipe);
1560  	if (ret > 0) {
1561  		wakeup_pipe_readers(pipe);
1562  		fsnotify_modify(file);
1563  	}
1564  	return ret;
1565  }
1566  
1567  static int vmsplice_type(struct fd f, int *type)
1568  {
1569  	if (!f.file)
1570  		return -EBADF;
1571  	if (f.file->f_mode & FMODE_WRITE) {
1572  		*type = ITER_SOURCE;
1573  	} else if (f.file->f_mode & FMODE_READ) {
1574  		*type = ITER_DEST;
1575  	} else {
1576  		fdput(f);
1577  		return -EBADF;
1578  	}
1579  	return 0;
1580  }
1581  
1582  /*
1583   * Note that vmsplice only really supports true splicing _from_ user memory
1584   * to a pipe, not the other way around. Splicing from user memory is a simple
1585   * operation that can be supported without any funky alignment restrictions
1586   * or nasty vm tricks. We simply map in the user memory and fill them into
1587   * a pipe. The reverse isn't quite as easy, though. There are two possible
1588   * solutions for that:
1589   *
1590   *	- memcpy() the data internally, at which point we might as well just
1591   *	  do a regular read() on the buffer anyway.
1592   *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
1593   *	  has restriction limitations on both ends of the pipe).
1594   *
1595   * Currently we punt and implement it as a normal copy, see pipe_to_user().
1596   *
1597   */
1598  SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
1599  		unsigned long, nr_segs, unsigned int, flags)
1600  {
1601  	struct iovec iovstack[UIO_FASTIOV];
1602  	struct iovec *iov = iovstack;
1603  	struct iov_iter iter;
1604  	ssize_t error;
1605  	struct fd f;
1606  	int type;
1607  
1608  	if (unlikely(flags & ~SPLICE_F_ALL))
1609  		return -EINVAL;
1610  
1611  	f = fdget(fd);
1612  	error = vmsplice_type(f, &type);
1613  	if (error)
1614  		return error;
1615  
1616  	error = import_iovec(type, uiov, nr_segs,
1617  			     ARRAY_SIZE(iovstack), &iov, &iter);
1618  	if (error < 0)
1619  		goto out_fdput;
1620  
1621  	if (!iov_iter_count(&iter))
1622  		error = 0;
1623  	else if (type == ITER_SOURCE)
1624  		error = vmsplice_to_pipe(f.file, &iter, flags);
1625  	else
1626  		error = vmsplice_to_user(f.file, &iter, flags);
1627  
1628  	kfree(iov);
1629  out_fdput:
1630  	fdput(f);
1631  	return error;
1632  }
1633  
1634  SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1635  		int, fd_out, loff_t __user *, off_out,
1636  		size_t, len, unsigned int, flags)
1637  {
1638  	struct fd in, out;
1639  	ssize_t error;
1640  
1641  	if (unlikely(!len))
1642  		return 0;
1643  
1644  	if (unlikely(flags & ~SPLICE_F_ALL))
1645  		return -EINVAL;
1646  
1647  	error = -EBADF;
1648  	in = fdget(fd_in);
1649  	if (in.file) {
1650  		out = fdget(fd_out);
1651  		if (out.file) {
1652  			error = __do_splice(in.file, off_in, out.file, off_out,
1653  					    len, flags);
1654  			fdput(out);
1655  		}
1656  		fdput(in);
1657  	}
1658  	return error;
1659  }
1660  
1661  /*
1662   * Make sure there's data to read. Wait for input if we can, otherwise
1663   * return an appropriate error.
1664   */
1665  static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1666  {
1667  	int ret;
1668  
1669  	/*
1670  	 * Check the pipe occupancy without the inode lock first. This function
1671  	 * is speculative anyways, so missing one is ok.
1672  	 */
1673  	if (!pipe_empty(pipe->head, pipe->tail))
1674  		return 0;
1675  
1676  	ret = 0;
1677  	pipe_lock(pipe);
1678  
1679  	while (pipe_empty(pipe->head, pipe->tail)) {
1680  		if (signal_pending(current)) {
1681  			ret = -ERESTARTSYS;
1682  			break;
1683  		}
1684  		if (!pipe->writers)
1685  			break;
1686  		if (flags & SPLICE_F_NONBLOCK) {
1687  			ret = -EAGAIN;
1688  			break;
1689  		}
1690  		pipe_wait_readable(pipe);
1691  	}
1692  
1693  	pipe_unlock(pipe);
1694  	return ret;
1695  }
1696  
1697  /*
1698   * Make sure there's writeable room. Wait for room if we can, otherwise
1699   * return an appropriate error.
1700   */
1701  static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1702  {
1703  	int ret;
1704  
1705  	/*
1706  	 * Check pipe occupancy without the inode lock first. This function
1707  	 * is speculative anyways, so missing one is ok.
1708  	 */
1709  	if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1710  		return 0;
1711  
1712  	ret = 0;
1713  	pipe_lock(pipe);
1714  
1715  	while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
1716  		if (!pipe->readers) {
1717  			send_sig(SIGPIPE, current, 0);
1718  			ret = -EPIPE;
1719  			break;
1720  		}
1721  		if (flags & SPLICE_F_NONBLOCK) {
1722  			ret = -EAGAIN;
1723  			break;
1724  		}
1725  		if (signal_pending(current)) {
1726  			ret = -ERESTARTSYS;
1727  			break;
1728  		}
1729  		pipe_wait_writable(pipe);
1730  	}
1731  
1732  	pipe_unlock(pipe);
1733  	return ret;
1734  }
1735  
1736  /*
1737   * Splice contents of ipipe to opipe.
1738   */
1739  static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1740  			       struct pipe_inode_info *opipe,
1741  			       size_t len, unsigned int flags)
1742  {
1743  	struct pipe_buffer *ibuf, *obuf;
1744  	unsigned int i_head, o_head;
1745  	unsigned int i_tail, o_tail;
1746  	unsigned int i_mask, o_mask;
1747  	int ret = 0;
1748  	bool input_wakeup = false;
1749  
1750  
1751  retry:
1752  	ret = ipipe_prep(ipipe, flags);
1753  	if (ret)
1754  		return ret;
1755  
1756  	ret = opipe_prep(opipe, flags);
1757  	if (ret)
1758  		return ret;
1759  
1760  	/*
1761  	 * Potential ABBA deadlock, work around it by ordering lock
1762  	 * grabbing by pipe info address. Otherwise two different processes
1763  	 * could deadlock (one doing tee from A -> B, the other from B -> A).
1764  	 */
1765  	pipe_double_lock(ipipe, opipe);
1766  
1767  	i_tail = ipipe->tail;
1768  	i_mask = ipipe->ring_size - 1;
1769  	o_head = opipe->head;
1770  	o_mask = opipe->ring_size - 1;
1771  
1772  	do {
1773  		size_t o_len;
1774  
1775  		if (!opipe->readers) {
1776  			send_sig(SIGPIPE, current, 0);
1777  			if (!ret)
1778  				ret = -EPIPE;
1779  			break;
1780  		}
1781  
1782  		i_head = ipipe->head;
1783  		o_tail = opipe->tail;
1784  
1785  		if (pipe_empty(i_head, i_tail) && !ipipe->writers)
1786  			break;
1787  
1788  		/*
1789  		 * Cannot make any progress, because either the input
1790  		 * pipe is empty or the output pipe is full.
1791  		 */
1792  		if (pipe_empty(i_head, i_tail) ||
1793  		    pipe_full(o_head, o_tail, opipe->max_usage)) {
1794  			/* Already processed some buffers, break */
1795  			if (ret)
1796  				break;
1797  
1798  			if (flags & SPLICE_F_NONBLOCK) {
1799  				ret = -EAGAIN;
1800  				break;
1801  			}
1802  
1803  			/*
1804  			 * We raced with another reader/writer and haven't
1805  			 * managed to process any buffers.  A zero return
1806  			 * value means EOF, so retry instead.
1807  			 */
1808  			pipe_unlock(ipipe);
1809  			pipe_unlock(opipe);
1810  			goto retry;
1811  		}
1812  
1813  		ibuf = &ipipe->bufs[i_tail & i_mask];
1814  		obuf = &opipe->bufs[o_head & o_mask];
1815  
1816  		if (len >= ibuf->len) {
1817  			/*
1818  			 * Simply move the whole buffer from ipipe to opipe
1819  			 */
1820  			*obuf = *ibuf;
1821  			ibuf->ops = NULL;
1822  			i_tail++;
1823  			ipipe->tail = i_tail;
1824  			input_wakeup = true;
1825  			o_len = obuf->len;
1826  			o_head++;
1827  			opipe->head = o_head;
1828  		} else {
1829  			/*
1830  			 * Get a reference to this pipe buffer,
1831  			 * so we can copy the contents over.
1832  			 */
1833  			if (!pipe_buf_get(ipipe, ibuf)) {
1834  				if (ret == 0)
1835  					ret = -EFAULT;
1836  				break;
1837  			}
1838  			*obuf = *ibuf;
1839  
1840  			/*
1841  			 * Don't inherit the gift and merge flags, we need to
1842  			 * prevent multiple steals of this page.
1843  			 */
1844  			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1845  			obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1846  
1847  			obuf->len = len;
1848  			ibuf->offset += len;
1849  			ibuf->len -= len;
1850  			o_len = len;
1851  			o_head++;
1852  			opipe->head = o_head;
1853  		}
1854  		ret += o_len;
1855  		len -= o_len;
1856  	} while (len);
1857  
1858  	pipe_unlock(ipipe);
1859  	pipe_unlock(opipe);
1860  
1861  	/*
1862  	 * If we put data in the output pipe, wakeup any potential readers.
1863  	 */
1864  	if (ret > 0)
1865  		wakeup_pipe_readers(opipe);
1866  
1867  	if (input_wakeup)
1868  		wakeup_pipe_writers(ipipe);
1869  
1870  	return ret;
1871  }
1872  
1873  /*
1874   * Link contents of ipipe to opipe.
1875   */
1876  static ssize_t link_pipe(struct pipe_inode_info *ipipe,
1877  			 struct pipe_inode_info *opipe,
1878  			 size_t len, unsigned int flags)
1879  {
1880  	struct pipe_buffer *ibuf, *obuf;
1881  	unsigned int i_head, o_head;
1882  	unsigned int i_tail, o_tail;
1883  	unsigned int i_mask, o_mask;
1884  	ssize_t ret = 0;
1885  
1886  	/*
1887  	 * Potential ABBA deadlock, work around it by ordering lock
1888  	 * grabbing by pipe info address. Otherwise two different processes
1889  	 * could deadlock (one doing tee from A -> B, the other from B -> A).
1890  	 */
1891  	pipe_double_lock(ipipe, opipe);
1892  
1893  	i_tail = ipipe->tail;
1894  	i_mask = ipipe->ring_size - 1;
1895  	o_head = opipe->head;
1896  	o_mask = opipe->ring_size - 1;
1897  
1898  	do {
1899  		if (!opipe->readers) {
1900  			send_sig(SIGPIPE, current, 0);
1901  			if (!ret)
1902  				ret = -EPIPE;
1903  			break;
1904  		}
1905  
1906  		i_head = ipipe->head;
1907  		o_tail = opipe->tail;
1908  
1909  		/*
1910  		 * If we have iterated all input buffers or run out of
1911  		 * output room, break.
1912  		 */
1913  		if (pipe_empty(i_head, i_tail) ||
1914  		    pipe_full(o_head, o_tail, opipe->max_usage))
1915  			break;
1916  
1917  		ibuf = &ipipe->bufs[i_tail & i_mask];
1918  		obuf = &opipe->bufs[o_head & o_mask];
1919  
1920  		/*
1921  		 * Get a reference to this pipe buffer,
1922  		 * so we can copy the contents over.
1923  		 */
1924  		if (!pipe_buf_get(ipipe, ibuf)) {
1925  			if (ret == 0)
1926  				ret = -EFAULT;
1927  			break;
1928  		}
1929  
1930  		*obuf = *ibuf;
1931  
1932  		/*
1933  		 * Don't inherit the gift and merge flag, we need to prevent
1934  		 * multiple steals of this page.
1935  		 */
1936  		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1937  		obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1938  
1939  		if (obuf->len > len)
1940  			obuf->len = len;
1941  		ret += obuf->len;
1942  		len -= obuf->len;
1943  
1944  		o_head++;
1945  		opipe->head = o_head;
1946  		i_tail++;
1947  	} while (len);
1948  
1949  	pipe_unlock(ipipe);
1950  	pipe_unlock(opipe);
1951  
1952  	/*
1953  	 * If we put data in the output pipe, wakeup any potential readers.
1954  	 */
1955  	if (ret > 0)
1956  		wakeup_pipe_readers(opipe);
1957  
1958  	return ret;
1959  }
1960  
1961  /*
1962   * This is a tee(1) implementation that works on pipes. It doesn't copy
1963   * any data, it simply references the 'in' pages on the 'out' pipe.
1964   * The 'flags' used are the SPLICE_F_* variants, currently the only
1965   * applicable one is SPLICE_F_NONBLOCK.
1966   */
1967  ssize_t do_tee(struct file *in, struct file *out, size_t len,
1968  	       unsigned int flags)
1969  {
1970  	struct pipe_inode_info *ipipe = get_pipe_info(in, true);
1971  	struct pipe_inode_info *opipe = get_pipe_info(out, true);
1972  	ssize_t ret = -EINVAL;
1973  
1974  	if (unlikely(!(in->f_mode & FMODE_READ) ||
1975  		     !(out->f_mode & FMODE_WRITE)))
1976  		return -EBADF;
1977  
1978  	/*
1979  	 * Duplicate the contents of ipipe to opipe without actually
1980  	 * copying the data.
1981  	 */
1982  	if (ipipe && opipe && ipipe != opipe) {
1983  		if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1984  			flags |= SPLICE_F_NONBLOCK;
1985  
1986  		/*
1987  		 * Keep going, unless we encounter an error. The ipipe/opipe
1988  		 * ordering doesn't really matter.
1989  		 */
1990  		ret = ipipe_prep(ipipe, flags);
1991  		if (!ret) {
1992  			ret = opipe_prep(opipe, flags);
1993  			if (!ret)
1994  				ret = link_pipe(ipipe, opipe, len, flags);
1995  		}
1996  	}
1997  
1998  	if (ret > 0) {
1999  		fsnotify_access(in);
2000  		fsnotify_modify(out);
2001  	}
2002  
2003  	return ret;
2004  }
2005  
2006  SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
2007  {
2008  	struct fd in, out;
2009  	ssize_t error;
2010  
2011  	if (unlikely(flags & ~SPLICE_F_ALL))
2012  		return -EINVAL;
2013  
2014  	if (unlikely(!len))
2015  		return 0;
2016  
2017  	error = -EBADF;
2018  	in = fdget(fdin);
2019  	if (in.file) {
2020  		out = fdget(fdout);
2021  		if (out.file) {
2022  			error = do_tee(in.file, out.file, len, flags);
2023  			fdput(out);
2024  		}
2025   		fdput(in);
2026   	}
2027  
2028  	return error;
2029  }
2030