xref: /linux/fs/splice.c (revision de2fe5e07d58424bc286fff3fd3c1b0bf933cd58)
/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files and to fix the initial implementation
 * bugs.
 *
 * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
 * Copyright (C) 2005 Linus Torvalds <torvalds@osdl.org>
 *
 */
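
/*
 * Editor's usage sketch (not part of the original file): assuming a
 * userspace wrapper splice(int fd_in, int fd_out, size_t len,
 * unsigned int flags) around the sys_splice() defined at the bottom of
 * this file, and already-open descriptors file_fd and sock_fd, a regular
 * file can be pushed to a socket through a pipe roughly like so:
 *
 *	int pfd[2];
 *	ssize_t n, m;
 *
 *	pipe(pfd);
 *	while ((n = splice(file_fd, pfd[1], 65536, 0)) > 0) {
 *		while (n > 0 && (m = splice(pfd[0], sock_fd, n, 0)) > 0)
 *			n -= m;
 *	}
 *
 * The first call ends up in do_splice_to()/generic_file_splice_read()
 * (file -> pipe), the second in do_splice_from() and the target's
 * ->splice_write(), e.g. generic_splice_sendpage() for a socket.
 */
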
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/pipe_fs_i.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/module.h>

/*
 * Passed to the actors
 */
struct splice_desc {
	unsigned int len, total_len;	/* current and remaining length */
	unsigned int flags;		/* splice flags */
	struct file *file;		/* file to read/write */
	loff_t pos;			/* file position */
};
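
/*
 * Editor's illustration (hypothetical numbers, not from the original
 * source): move_from_pipe() below fills in one splice_desc per request
 * and re-uses it for every pipe buffer. If the pipe holds buffers of
 * 4096 and 1024 bytes and the caller asked to write 6000 bytes at
 * out->f_pos == 0, the actor runs twice: first with len = 4096, pos = 0,
 * then with len = 1024, pos = 4096, leaving total_len = 880 when the
 * pipe runs dry.
 */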

static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	WARN_ON(!PageLocked(page));
	WARN_ON(!PageUptodate(page));

	if (!remove_mapping(page_mapping(page), page))
		return 1;

	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);

		spin_lock_irq(&zone->lru_lock);
		BUG_ON(!PageLRU(page));
		__ClearPageLRU(page);
		del_page_from_lru(zone, page);
		spin_unlock_irq(&zone->lru_lock);
	}

	buf->stolen = 1;
	return 0;
}

static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
					struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
	buf->page = NULL;
	buf->stolen = 0;
}

static void *page_cache_pipe_buf_map(struct file *file,
				     struct pipe_inode_info *info,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	lock_page(page);

	if (!PageUptodate(page)) {
		unlock_page(page);
		return ERR_PTR(-EIO);
	}

	if (!page->mapping) {
		unlock_page(page);
		return ERR_PTR(-ENODATA);
	}

	return kmap(buf->page);
}

static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
				      struct pipe_buffer *buf)
{
	if (!buf->stolen)
		unlock_page(buf->page);
	kunmap(buf->page);
}

static struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = page_cache_pipe_buf_map,
	.unmap = page_cache_pipe_buf_unmap,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
};

static ssize_t move_to_pipe(struct inode *inode, struct page **pages,
			    int nr_pages, unsigned long offset,
			    unsigned long len)
{
	struct pipe_inode_info *info;
	int ret, do_wakeup, i;

	ret = 0;
	do_wakeup = 0;
	i = 0;

	mutex_lock(PIPE_MUTEX(*inode));

	info = inode->i_pipe;
	for (;;) {
		int bufs;

		if (!PIPE_READERS(*inode)) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		bufs = info->nrbufs;
		if (bufs < PIPE_BUFFERS) {
			int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1);
			struct pipe_buffer *buf = info->bufs + newbuf;
			struct page *page = pages[i++];
			unsigned long this_len;

			this_len = PAGE_CACHE_SIZE - offset;
			if (this_len > len)
				this_len = len;

			buf->page = page;
			buf->offset = offset;
			buf->len = this_len;
			buf->ops = &page_cache_pipe_buf_ops;
			info->nrbufs = ++bufs;
			do_wakeup = 1;

			ret += this_len;
			len -= this_len;
			offset = 0;
			if (!--nr_pages)
				break;
			if (!len)
				break;
			if (bufs < PIPE_BUFFERS)
				continue;

			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			wake_up_interruptible_sync(PIPE_WAIT(*inode));
			kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO,
				    POLL_IN);
			do_wakeup = 0;
		}

		PIPE_WAITING_WRITERS(*inode)++;
		pipe_wait(inode);
		PIPE_WAITING_WRITERS(*inode)--;
	}

	mutex_unlock(PIPE_MUTEX(*inode));

	if (do_wakeup) {
		wake_up_interruptible(PIPE_WAIT(*inode));
		kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
	}

	while (i < nr_pages)
		page_cache_release(pages[i++]);

	return ret;
}

static int __generic_file_splice_read(struct file *in, struct inode *pipe,
				      size_t len)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int offset, nr_pages;
	struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS];
	struct page *page;
	pgoff_t index, pidx;
	int i, j;

	index = in->f_pos >> PAGE_CACHE_SHIFT;
	offset = in->f_pos & ~PAGE_CACHE_MASK;
	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
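	/*
	 * Worked example (editor's note, assuming 4k pages): f_pos = 5000
	 * and len = 10000 give index = 1, offset = 904 and
	 * nr_pages = (10000 + 904 + 4095) >> 12 = 3, i.e. the request
	 * spans file pages 1..3, starting 904 bytes into page 1.
	 */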

	if (nr_pages > PIPE_BUFFERS)
		nr_pages = PIPE_BUFFERS;

	/*
	 * initiate read-ahead on this page range
	 */
	do_page_cache_readahead(mapping, in, index, nr_pages);

	/*
	 * Get as many pages from the page cache as possible.
	 * Start IO on the page cache entries we create (we
	 * can assume that any pre-existing ones we find have
	 * already had IO started on them).
	 */
	i = find_get_pages(mapping, index, nr_pages, pages);

	/*
	 * common case - we found all pages and they are contiguous,
	 * kick them off
	 */
	if (i && (pages[i - 1]->index == index + i - 1))
		goto splice_them;

	/*
	 * fill shadow[] with pages at the right locations, so we only
	 * have to fill holes
	 */
	memset(shadow, 0, i * sizeof(struct page *));
	for (j = 0, pidx = index; j < i; pidx++, j++)
		shadow[pages[j]->index - pidx] = pages[j];

	/*
	 * now fill in the holes
	 */
	for (i = 0, pidx = index; i < nr_pages; pidx++, i++) {
		int error;

		if (shadow[i])
			continue;

		/*
		 * no page there, look one up / create it
		 */
		page = find_or_create_page(mapping, pidx,
						   mapping_gfp_mask(mapping));
		if (!page)
			break;

		if (PageUptodate(page))
			unlock_page(page);
		else {
			error = mapping->a_ops->readpage(in, page);

			if (unlikely(error)) {
				page_cache_release(page);
				break;
			}
		}
		shadow[i] = page;
	}

	if (!i) {
		for (i = 0; i < nr_pages; i++) {
			 if (shadow[i])
				page_cache_release(shadow[i]);
		}
		return 0;
	}

	memcpy(pages, shadow, i * sizeof(struct page *));

	/*
	 * Now we splice them into the pipe.
	 */
splice_them:
	return move_to_pipe(pipe, pages, i, offset, len);
}

ssize_t generic_file_splice_read(struct file *in, struct inode *pipe,
				 size_t len, unsigned int flags)
{
	ssize_t spliced;
	int ret;

	ret = 0;
	spliced = 0;
	while (len) {
		ret = __generic_file_splice_read(in, pipe, len);

		if (ret <= 0)
			break;

		in->f_pos += ret;
		len -= ret;
		spliced += ret;
	}

	if (spliced)
		return spliced;

	return ret;
}
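
/*
 * Editor's note (illustrative arithmetic): each pass through
 * __generic_file_splice_read() handles at most PIPE_BUFFERS pages, so
 * with the usual PIPE_BUFFERS = 16 and 4k pages a 1 MB request takes at
 * least 16 trips through the loop above, each moving at most 64k into
 * the pipe (fewer/shorter trips if EOF, an error, or an already partly
 * full pipe cuts a pass short).
 */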

/*
 * Send 'len' bytes to socket from 'file' at position 'pos' using sendpage().
 */
static int pipe_to_sendpage(struct pipe_inode_info *info,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->file;
	loff_t pos = sd->pos;
	unsigned int offset;
	ssize_t ret;
	void *ptr;

	/*
	 * Sub-optimal, but we are limited by the pipe ->map. We don't
	 * need a kmap'ed buffer here; we just want to make sure we
	 * have the page pinned if the pipe page originates from the
	 * page cache.
	 */
	ptr = buf->ops->map(file, info, buf);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	offset = pos & ~PAGE_CACHE_MASK;

	ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,
					sd->len < sd->total_len);

	buf->ops->unmap(info, buf);
	if (ret == sd->len)
		return 0;

	return -EIO;
}

/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option than
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to the last case.
 *	- Destination page does not exist; we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * For now we just do the slower thing and always copy pages over; it's
 * easier than migrating pages from the pipe to the target file. For the
 * case of doing file | file splicing, the migrate approach had some LRU
 * nastiness...
 */
static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	struct file *file = sd->file;
	struct address_space *mapping = file->f_mapping;
	unsigned int offset;
	struct page *page;
	pgoff_t index;
	char *src;
	int ret;

	/*
	 * after this, the page will be locked and mapped
	 */
	src = buf->ops->map(file, info, buf);
	if (IS_ERR(src))
		return PTR_ERR(src);

	index = sd->pos >> PAGE_CACHE_SHIFT;
	offset = sd->pos & ~PAGE_CACHE_MASK;

	/*
	 * reuse buf page, if SPLICE_F_MOVE is set
	 */
	if (sd->flags & SPLICE_F_MOVE) {
		if (buf->ops->steal(info, buf))
			goto find_page;

		page = buf->page;
		if (add_to_page_cache_lru(page, mapping, index,
						mapping_gfp_mask(mapping)))
			goto find_page;
	} else {
find_page:
		ret = -ENOMEM;
		page = find_or_create_page(mapping, index,
						mapping_gfp_mask(mapping));
		if (!page)
			goto out;

		/*
		 * If the page is uptodate, it is also locked. If it isn't
		 * uptodate, we can mark it uptodate if we are filling the
		 * full page. Otherwise we need to read it in first...
		 */
		if (!PageUptodate(page)) {
			if (sd->len < PAGE_CACHE_SIZE) {
				ret = mapping->a_ops->readpage(file, page);
				if (unlikely(ret))
					goto out;

				lock_page(page);

				if (!PageUptodate(page)) {
					/*
					 * page got invalidated, repeat
					 */
					if (!page->mapping) {
						unlock_page(page);
						page_cache_release(page);
						goto find_page;
					}
					ret = -EIO;
					goto out;
				}
			} else {
				WARN_ON(!PageLocked(page));
				SetPageUptodate(page);
			}
		}
	}

	ret = mapping->a_ops->prepare_write(file, page, 0, sd->len);
	if (ret)
		goto out;

	if (!buf->stolen) {
		char *dst = kmap_atomic(page, KM_USER0);

		memcpy(dst + offset, src + buf->offset, sd->len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER0);
	}

	ret = mapping->a_ops->commit_write(file, page, 0, sd->len);
	if (ret < 0)
		goto out;

	set_page_dirty(page);
	ret = write_one_page(page, 0);
out:
	if (ret < 0)
		unlock_page(page);
	if (!buf->stolen)
		page_cache_release(page);
	buf->ops->unmap(info, buf);
	return ret;
}

typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
			   struct splice_desc *);

static ssize_t move_from_pipe(struct inode *inode, struct file *out,
			      size_t len, unsigned int flags,
			      splice_actor *actor)
{
	struct pipe_inode_info *info;
	int ret, do_wakeup, err;
	struct splice_desc sd;

	ret = 0;
	do_wakeup = 0;

	sd.total_len = len;
	sd.flags = flags;
	sd.file = out;
	sd.pos = out->f_pos;

	mutex_lock(PIPE_MUTEX(*inode));

	info = inode->i_pipe;
	for (;;) {
		int bufs = info->nrbufs;

		if (bufs) {
			int curbuf = info->curbuf;
			struct pipe_buffer *buf = info->bufs + curbuf;
			struct pipe_buf_operations *ops = buf->ops;

			sd.len = buf->len;
			if (sd.len > sd.total_len)
				sd.len = sd.total_len;

			err = actor(info, buf, &sd);
			if (err) {
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			ret += sd.len;
			buf->offset += sd.len;
			buf->len -= sd.len;
			if (!buf->len) {
				buf->ops = NULL;
				ops->release(info, buf);
				curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1);
				info->curbuf = curbuf;
				info->nrbufs = --bufs;
				do_wakeup = 1;
			}

			sd.pos += sd.len;
			sd.total_len -= sd.len;
			if (!sd.total_len)
				break;
		}

		if (bufs)
			continue;
		if (!PIPE_WRITERS(*inode))
			break;
		if (!PIPE_WAITING_WRITERS(*inode)) {
			if (ret)
				break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			wake_up_interruptible_sync(PIPE_WAIT(*inode));
			kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(inode);
	}

	mutex_unlock(PIPE_MUTEX(*inode));

	if (do_wakeup) {
		wake_up_interruptible(PIPE_WAIT(*inode));
		kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
	}

	mutex_lock(&out->f_mapping->host->i_mutex);
	out->f_pos = sd.pos;
	mutex_unlock(&out->f_mapping->host->i_mutex);
	return ret;

}

ssize_t generic_file_splice_write(struct inode *inode, struct file *out,
				  size_t len, unsigned int flags)
{
	return move_from_pipe(inode, out, len, flags, pipe_to_file);
}

ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
				size_t len, unsigned int flags)
{
	return move_from_pipe(inode, out, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_file_splice_write);
EXPORT_SYMBOL(generic_file_splice_read);

static long do_splice_from(struct inode *pipe, struct file *out, size_t len,
			   unsigned int flags)
{
	loff_t pos;
	int ret;

	if (!out->f_op || !out->f_op->splice_write)
		return -EINVAL;

	if (!(out->f_mode & FMODE_WRITE))
		return -EBADF;

	pos = out->f_pos;
	ret = rw_verify_area(WRITE, out, &pos, len);
	if (unlikely(ret < 0))
		return ret;

	return out->f_op->splice_write(pipe, out, len, flags);
}

static long do_splice_to(struct file *in, struct inode *pipe, size_t len,
			 unsigned int flags)
{
	loff_t pos, isize, left;
	int ret;

	if (!in->f_op || !in->f_op->splice_read)
		return -EINVAL;

	if (!(in->f_mode & FMODE_READ))
		return -EBADF;

	pos = in->f_pos;
	ret = rw_verify_area(READ, in, &pos, len);
	if (unlikely(ret < 0))
		return ret;

	isize = i_size_read(in->f_mapping->host);
	if (unlikely(in->f_pos >= isize))
		return 0;

	left = isize - in->f_pos;
	if (left < len)
		len = left;

	return in->f_op->splice_read(in, pipe, len, flags);
}

static long do_splice(struct file *in, struct file *out, size_t len,
		      unsigned int flags)
{
	struct inode *pipe;

	pipe = in->f_dentry->d_inode;
	if (pipe->i_pipe)
		return do_splice_from(pipe, out, len, flags);

	pipe = out->f_dentry->d_inode;
	if (pipe->i_pipe)
		return do_splice_to(in, pipe, len, flags);

	return -EINVAL;
}
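
/*
 * Editor's note (illustrative, not in the original source): the splice
 * direction above is picked purely by which descriptor is a pipe, e.g.
 * splice(file_fd, pipe_fd, len, 0) takes the second branch
 * (file -> pipe via ->splice_read()), splice(pipe_fd, sock_fd, len, 0)
 * takes the first (pipe -> file/socket via ->splice_write()), and a
 * call where neither side is a pipe fails with -EINVAL.
 */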

asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags)
{
	long error;
	struct file *in, *out;
	int fput_in, fput_out;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fdin, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			out = fget_light(fdout, &fput_out);
			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_splice(in, out, len, flags);
				fput_light(out, fput_out);
			}
		}

		fput_light(in, fput_in);
	}

	return error;
}
664