1 /* 2 * "splice": joining two ropes together by interweaving their strands. 3 * 4 * This is the "extended pipe" functionality, where a pipe is used as 5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel 6 * buffer that you can use to transfer data from one end to the other. 7 * 8 * The traditional unix read/write is extended with a "splice()" operation 9 * that transfers data buffers to or from a pipe buffer. 10 * 11 * Named by Larry McVoy, original implementation from Linus, extended by 12 * Jens to support splicing to files and fixing the initial implementation 13 * bugs. 14 * 15 * Copyright (C) 2005 Jens Axboe <axboe@suse.de> 16 * Copyright (C) 2005 Linus Torvalds <torvalds@osdl.org> 17 * 18 */ 19 #include <linux/fs.h> 20 #include <linux/file.h> 21 #include <linux/pagemap.h> 22 #include <linux/pipe_fs_i.h> 23 #include <linux/mm_inline.h> 24 #include <linux/swap.h> 25 #include <linux/module.h> 26 27 /* 28 * Passed to the actors 29 */ 30 struct splice_desc { 31 unsigned int len, total_len; /* current and remaining length */ 32 unsigned int flags; /* splice flags */ 33 struct file *file; /* file to read/write */ 34 loff_t pos; /* file position */ 35 }; 36 37 static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, 38 struct pipe_buffer *buf) 39 { 40 struct page *page = buf->page; 41 42 WARN_ON(!PageLocked(page)); 43 WARN_ON(!PageUptodate(page)); 44 45 if (!remove_mapping(page_mapping(page), page)) 46 return 1; 47 48 if (PageLRU(page)) { 49 struct zone *zone = page_zone(page); 50 51 spin_lock_irq(&zone->lru_lock); 52 BUG_ON(!PageLRU(page)); 53 __ClearPageLRU(page); 54 del_page_from_lru(zone, page); 55 spin_unlock_irq(&zone->lru_lock); 56 } 57 58 buf->stolen = 1; 59 return 0; 60 } 61 62 static void page_cache_pipe_buf_release(struct pipe_inode_info *info, 63 struct pipe_buffer *buf) 64 { 65 page_cache_release(buf->page); 66 buf->page = NULL; 67 buf->stolen = 0; 68 } 69 70 static void *page_cache_pipe_buf_map(struct file *file, 71 struct pipe_inode_info *info, 72 struct pipe_buffer *buf) 73 { 74 struct page *page = buf->page; 75 76 lock_page(page); 77 78 if (!PageUptodate(page)) { 79 unlock_page(page); 80 return ERR_PTR(-EIO); 81 } 82 83 if (!page->mapping) { 84 unlock_page(page); 85 return ERR_PTR(-ENODATA); 86 } 87 88 return kmap(buf->page); 89 } 90 91 static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, 92 struct pipe_buffer *buf) 93 { 94 if (!buf->stolen) 95 unlock_page(buf->page); 96 kunmap(buf->page); 97 } 98 99 static struct pipe_buf_operations page_cache_pipe_buf_ops = { 100 .can_merge = 0, 101 .map = page_cache_pipe_buf_map, 102 .unmap = page_cache_pipe_buf_unmap, 103 .release = page_cache_pipe_buf_release, 104 .steal = page_cache_pipe_buf_steal, 105 }; 106 107 static ssize_t move_to_pipe(struct inode *inode, struct page **pages, 108 int nr_pages, unsigned long offset, 109 unsigned long len) 110 { 111 struct pipe_inode_info *info; 112 int ret, do_wakeup, i; 113 114 ret = 0; 115 do_wakeup = 0; 116 i = 0; 117 118 mutex_lock(PIPE_MUTEX(*inode)); 119 120 info = inode->i_pipe; 121 for (;;) { 122 int bufs; 123 124 if (!PIPE_READERS(*inode)) { 125 send_sig(SIGPIPE, current, 0); 126 if (!ret) 127 ret = -EPIPE; 128 break; 129 } 130 131 bufs = info->nrbufs; 132 if (bufs < PIPE_BUFFERS) { 133 int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1); 134 struct pipe_buffer *buf = info->bufs + newbuf; 135 struct page *page = pages[i++]; 136 unsigned long this_len; 137 138 this_len = PAGE_CACHE_SIZE - offset; 139 if (this_len > len) 140 this_len = len; 141 142 buf->page = page; 143 buf->offset = offset; 144 buf->len = this_len; 145 buf->ops = &page_cache_pipe_buf_ops; 146 info->nrbufs = ++bufs; 147 do_wakeup = 1; 148 149 ret += this_len; 150 len -= this_len; 151 offset = 0; 152 if (!--nr_pages) 153 break; 154 if (!len) 155 break; 156 if (bufs < PIPE_BUFFERS) 157 continue; 158 159 break; 160 } 161 162 if (signal_pending(current)) { 163 if (!ret) 164 ret = -ERESTARTSYS; 165 break; 166 } 167 168 if (do_wakeup) { 169 wake_up_interruptible_sync(PIPE_WAIT(*inode)); 170 kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, 171 POLL_IN); 172 do_wakeup = 0; 173 } 174 175 PIPE_WAITING_WRITERS(*inode)++; 176 pipe_wait(inode); 177 PIPE_WAITING_WRITERS(*inode)--; 178 } 179 180 mutex_unlock(PIPE_MUTEX(*inode)); 181 182 if (do_wakeup) { 183 wake_up_interruptible(PIPE_WAIT(*inode)); 184 kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); 185 } 186 187 while (i < nr_pages) 188 page_cache_release(pages[i++]); 189 190 return ret; 191 } 192 193 static int __generic_file_splice_read(struct file *in, struct inode *pipe, 194 size_t len) 195 { 196 struct address_space *mapping = in->f_mapping; 197 unsigned int offset, nr_pages; 198 struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS]; 199 struct page *page; 200 pgoff_t index, pidx; 201 int i, j; 202 203 index = in->f_pos >> PAGE_CACHE_SHIFT; 204 offset = in->f_pos & ~PAGE_CACHE_MASK; 205 nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 206 207 if (nr_pages > PIPE_BUFFERS) 208 nr_pages = PIPE_BUFFERS; 209 210 /* 211 * initiate read-ahead on this page range 212 */ 213 do_page_cache_readahead(mapping, in, index, nr_pages); 214 215 /* 216 * Get as many pages from the page cache as possible.. 217 * Start IO on the page cache entries we create (we 218 * can assume that any pre-existing ones we find have 219 * already had IO started on them). 220 */ 221 i = find_get_pages(mapping, index, nr_pages, pages); 222 223 /* 224 * common case - we found all pages and they are contiguous, 225 * kick them off 226 */ 227 if (i && (pages[i - 1]->index == index + i - 1)) 228 goto splice_them; 229 230 /* 231 * fill shadow[] with pages at the right locations, so we only 232 * have to fill holes 233 */ 234 memset(shadow, 0, i * sizeof(struct page *)); 235 for (j = 0, pidx = index; j < i; pidx++, j++) 236 shadow[pages[j]->index - pidx] = pages[j]; 237 238 /* 239 * now fill in the holes 240 */ 241 for (i = 0, pidx = index; i < nr_pages; pidx++, i++) { 242 int error; 243 244 if (shadow[i]) 245 continue; 246 247 /* 248 * no page there, look one up / create it 249 */ 250 page = find_or_create_page(mapping, pidx, 251 mapping_gfp_mask(mapping)); 252 if (!page) 253 break; 254 255 if (PageUptodate(page)) 256 unlock_page(page); 257 else { 258 error = mapping->a_ops->readpage(in, page); 259 260 if (unlikely(error)) { 261 page_cache_release(page); 262 break; 263 } 264 } 265 shadow[i] = page; 266 } 267 268 if (!i) { 269 for (i = 0; i < nr_pages; i++) { 270 if (shadow[i]) 271 page_cache_release(shadow[i]); 272 } 273 return 0; 274 } 275 276 memcpy(pages, shadow, i * sizeof(struct page *)); 277 278 /* 279 * Now we splice them into the pipe.. 280 */ 281 splice_them: 282 return move_to_pipe(pipe, pages, i, offset, len); 283 } 284 285 ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, 286 size_t len, unsigned int flags) 287 { 288 ssize_t spliced; 289 int ret; 290 291 ret = 0; 292 spliced = 0; 293 while (len) { 294 ret = __generic_file_splice_read(in, pipe, len); 295 296 if (ret <= 0) 297 break; 298 299 in->f_pos += ret; 300 len -= ret; 301 spliced += ret; 302 } 303 304 if (spliced) 305 return spliced; 306 307 return ret; 308 } 309 310 /* 311 * Send 'len' bytes to socket from 'file' at position 'pos' using sendpage(). 312 */ 313 static int pipe_to_sendpage(struct pipe_inode_info *info, 314 struct pipe_buffer *buf, struct splice_desc *sd) 315 { 316 struct file *file = sd->file; 317 loff_t pos = sd->pos; 318 unsigned int offset; 319 ssize_t ret; 320 void *ptr; 321 322 /* 323 * sub-optimal, but we are limited by the pipe ->map. we don't 324 * need a kmap'ed buffer here, we just want to make sure we 325 * have the page pinned if the pipe page originates from the 326 * page cache 327 */ 328 ptr = buf->ops->map(file, info, buf); 329 if (IS_ERR(ptr)) 330 return PTR_ERR(ptr); 331 332 offset = pos & ~PAGE_CACHE_MASK; 333 334 ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos, 335 sd->len < sd->total_len); 336 337 buf->ops->unmap(info, buf); 338 if (ret == sd->len) 339 return 0; 340 341 return -EIO; 342 } 343 344 /* 345 * This is a little more tricky than the file -> pipe splicing. There are 346 * basically three cases: 347 * 348 * - Destination page already exists in the address space and there 349 * are users of it. For that case we have no other option that 350 * copying the data. Tough luck. 351 * - Destination page already exists in the address space, but there 352 * are no users of it. Make sure it's uptodate, then drop it. Fall 353 * through to last case. 354 * - Destination page does not exist, we can add the pipe page to 355 * the page cache and avoid the copy. 356 * 357 * For now we just do the slower thing and always copy pages over, it's 358 * easier than migrating pages from the pipe to the target file. For the 359 * case of doing file | file splicing, the migrate approach had some LRU 360 * nastiness... 361 */ 362 static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, 363 struct splice_desc *sd) 364 { 365 struct file *file = sd->file; 366 struct address_space *mapping = file->f_mapping; 367 unsigned int offset; 368 struct page *page; 369 pgoff_t index; 370 char *src; 371 int ret; 372 373 /* 374 * after this, page will be locked and unmapped 375 */ 376 src = buf->ops->map(file, info, buf); 377 if (IS_ERR(src)) 378 return PTR_ERR(src); 379 380 index = sd->pos >> PAGE_CACHE_SHIFT; 381 offset = sd->pos & ~PAGE_CACHE_MASK; 382 383 /* 384 * reuse buf page, if SPLICE_F_MOVE is set 385 */ 386 if (sd->flags & SPLICE_F_MOVE) { 387 if (buf->ops->steal(info, buf)) 388 goto find_page; 389 390 page = buf->page; 391 if (add_to_page_cache_lru(page, mapping, index, 392 mapping_gfp_mask(mapping))) 393 goto find_page; 394 } else { 395 find_page: 396 ret = -ENOMEM; 397 page = find_or_create_page(mapping, index, 398 mapping_gfp_mask(mapping)); 399 if (!page) 400 goto out; 401 402 /* 403 * If the page is uptodate, it is also locked. If it isn't 404 * uptodate, we can mark it uptodate if we are filling the 405 * full page. Otherwise we need to read it in first... 406 */ 407 if (!PageUptodate(page)) { 408 if (sd->len < PAGE_CACHE_SIZE) { 409 ret = mapping->a_ops->readpage(file, page); 410 if (unlikely(ret)) 411 goto out; 412 413 lock_page(page); 414 415 if (!PageUptodate(page)) { 416 /* 417 * page got invalidated, repeat 418 */ 419 if (!page->mapping) { 420 unlock_page(page); 421 page_cache_release(page); 422 goto find_page; 423 } 424 ret = -EIO; 425 goto out; 426 } 427 } else { 428 WARN_ON(!PageLocked(page)); 429 SetPageUptodate(page); 430 } 431 } 432 } 433 434 ret = mapping->a_ops->prepare_write(file, page, 0, sd->len); 435 if (ret) 436 goto out; 437 438 if (!buf->stolen) { 439 char *dst = kmap_atomic(page, KM_USER0); 440 441 memcpy(dst + offset, src + buf->offset, sd->len); 442 flush_dcache_page(page); 443 kunmap_atomic(dst, KM_USER0); 444 } 445 446 ret = mapping->a_ops->commit_write(file, page, 0, sd->len); 447 if (ret < 0) 448 goto out; 449 450 set_page_dirty(page); 451 ret = write_one_page(page, 0); 452 out: 453 if (ret < 0) 454 unlock_page(page); 455 if (!buf->stolen) 456 page_cache_release(page); 457 buf->ops->unmap(info, buf); 458 return ret; 459 } 460 461 typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, 462 struct splice_desc *); 463 464 static ssize_t move_from_pipe(struct inode *inode, struct file *out, 465 size_t len, unsigned int flags, 466 splice_actor *actor) 467 { 468 struct pipe_inode_info *info; 469 int ret, do_wakeup, err; 470 struct splice_desc sd; 471 472 ret = 0; 473 do_wakeup = 0; 474 475 sd.total_len = len; 476 sd.flags = flags; 477 sd.file = out; 478 sd.pos = out->f_pos; 479 480 mutex_lock(PIPE_MUTEX(*inode)); 481 482 info = inode->i_pipe; 483 for (;;) { 484 int bufs = info->nrbufs; 485 486 if (bufs) { 487 int curbuf = info->curbuf; 488 struct pipe_buffer *buf = info->bufs + curbuf; 489 struct pipe_buf_operations *ops = buf->ops; 490 491 sd.len = buf->len; 492 if (sd.len > sd.total_len) 493 sd.len = sd.total_len; 494 495 err = actor(info, buf, &sd); 496 if (err) { 497 if (!ret && err != -ENODATA) 498 ret = err; 499 500 break; 501 } 502 503 ret += sd.len; 504 buf->offset += sd.len; 505 buf->len -= sd.len; 506 if (!buf->len) { 507 buf->ops = NULL; 508 ops->release(info, buf); 509 curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1); 510 info->curbuf = curbuf; 511 info->nrbufs = --bufs; 512 do_wakeup = 1; 513 } 514 515 sd.pos += sd.len; 516 sd.total_len -= sd.len; 517 if (!sd.total_len) 518 break; 519 } 520 521 if (bufs) 522 continue; 523 if (!PIPE_WRITERS(*inode)) 524 break; 525 if (!PIPE_WAITING_WRITERS(*inode)) { 526 if (ret) 527 break; 528 } 529 530 if (signal_pending(current)) { 531 if (!ret) 532 ret = -ERESTARTSYS; 533 break; 534 } 535 536 if (do_wakeup) { 537 wake_up_interruptible_sync(PIPE_WAIT(*inode)); 538 kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT); 539 do_wakeup = 0; 540 } 541 542 pipe_wait(inode); 543 } 544 545 mutex_unlock(PIPE_MUTEX(*inode)); 546 547 if (do_wakeup) { 548 wake_up_interruptible(PIPE_WAIT(*inode)); 549 kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); 550 } 551 552 mutex_lock(&out->f_mapping->host->i_mutex); 553 out->f_pos = sd.pos; 554 mutex_unlock(&out->f_mapping->host->i_mutex); 555 return ret; 556 557 } 558 559 ssize_t generic_file_splice_write(struct inode *inode, struct file *out, 560 size_t len, unsigned int flags) 561 { 562 return move_from_pipe(inode, out, len, flags, pipe_to_file); 563 } 564 565 ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, 566 size_t len, unsigned int flags) 567 { 568 return move_from_pipe(inode, out, len, flags, pipe_to_sendpage); 569 } 570 571 EXPORT_SYMBOL(generic_file_splice_write); 572 EXPORT_SYMBOL(generic_file_splice_read); 573 574 static long do_splice_from(struct inode *pipe, struct file *out, size_t len, 575 unsigned int flags) 576 { 577 loff_t pos; 578 int ret; 579 580 if (!out->f_op || !out->f_op->splice_write) 581 return -EINVAL; 582 583 if (!(out->f_mode & FMODE_WRITE)) 584 return -EBADF; 585 586 pos = out->f_pos; 587 ret = rw_verify_area(WRITE, out, &pos, len); 588 if (unlikely(ret < 0)) 589 return ret; 590 591 return out->f_op->splice_write(pipe, out, len, flags); 592 } 593 594 static long do_splice_to(struct file *in, struct inode *pipe, size_t len, 595 unsigned int flags) 596 { 597 loff_t pos, isize, left; 598 int ret; 599 600 if (!in->f_op || !in->f_op->splice_read) 601 return -EINVAL; 602 603 if (!(in->f_mode & FMODE_READ)) 604 return -EBADF; 605 606 pos = in->f_pos; 607 ret = rw_verify_area(READ, in, &pos, len); 608 if (unlikely(ret < 0)) 609 return ret; 610 611 isize = i_size_read(in->f_mapping->host); 612 if (unlikely(in->f_pos >= isize)) 613 return 0; 614 615 left = isize - in->f_pos; 616 if (left < len) 617 len = left; 618 619 return in->f_op->splice_read(in, pipe, len, flags); 620 } 621 622 static long do_splice(struct file *in, struct file *out, size_t len, 623 unsigned int flags) 624 { 625 struct inode *pipe; 626 627 pipe = in->f_dentry->d_inode; 628 if (pipe->i_pipe) 629 return do_splice_from(pipe, out, len, flags); 630 631 pipe = out->f_dentry->d_inode; 632 if (pipe->i_pipe) 633 return do_splice_to(in, pipe, len, flags); 634 635 return -EINVAL; 636 } 637 638 asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags) 639 { 640 long error; 641 struct file *in, *out; 642 int fput_in, fput_out; 643 644 if (unlikely(!len)) 645 return 0; 646 647 error = -EBADF; 648 in = fget_light(fdin, &fput_in); 649 if (in) { 650 if (in->f_mode & FMODE_READ) { 651 out = fget_light(fdout, &fput_out); 652 if (out) { 653 if (out->f_mode & FMODE_WRITE) 654 error = do_splice(in, out, len, flags); 655 fput_light(out, fput_out); 656 } 657 } 658 659 fput_light(in, fput_in); 660 } 661 662 return error; 663 } 664