// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/read_write.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 */

#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/sched/xacct.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/fs.h>
#include "internal.h"

#include <linux/uaccess.h>
#include <asm/unistd.h>

const struct file_operations generic_ro_fops = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.mmap		= generic_file_readonly_mmap,
	.splice_read	= filemap_splice_read,
};

EXPORT_SYMBOL(generic_ro_fops);

static inline bool unsigned_offsets(struct file *file)
{
	return file->f_mode & FMODE_UNSIGNED_OFFSET;
}

/**
 * vfs_setpos - update the file offset for lseek
 * @file: file structure in question
 * @offset: file offset to seek to
 * @maxsize: maximum file size
 *
 * This is a low-level filesystem helper for updating the file offset to
 * the value specified by @offset if the given offset is valid and it is
 * not equal to the current file offset.
 *
 * Return the specified offset on success and -EINVAL on invalid offset.
 */
loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
{
	if (offset < 0 && !unsigned_offsets(file))
		return -EINVAL;
	if (offset > maxsize)
		return -EINVAL;

	if (offset != file->f_pos) {
		file->f_pos = offset;
		file->f_version = 0;
	}
	return offset;
}
EXPORT_SYMBOL(vfs_setpos);

/**
 * generic_file_llseek_size - generic llseek implementation for regular files
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 * @maxsize: max size of this file in file system
 * @eof: offset used for SEEK_END position
 *
 * This is a variant of generic_file_llseek that allows passing in a custom
 * maximum file size and a custom EOF position, for e.g. hashed directories
 *
 * Synchronization:
 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
 * read/writes behave like SEEK_SET against seeks.
 */
loff_t
generic_file_llseek_size(struct file *file, loff_t offset, int whence,
		loff_t maxsize, loff_t eof)
{
	switch (whence) {
	case SEEK_END:
		offset += eof;
		break;
	case SEEK_CUR:
		/*
		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
		 * position-querying operation. Avoid rewriting the "same"
		 * f_pos value back to the file because a concurrent read(),
		 * write() or lseek() might have altered it
		 */
		if (offset == 0)
			return file->f_pos;
		/*
		 * f_lock protects against read/modify/write race with other
		 * SEEK_CURs. Note that parallel writes and reads behave
		 * like SEEK_SET.
		 */
		spin_lock(&file->f_lock);
		offset = vfs_setpos(file, file->f_pos + offset, maxsize);
		spin_unlock(&file->f_lock);
		return offset;
	case SEEK_DATA:
		/*
		 * In the generic case the entire file is data, so as long as
		 * offset isn't at the end of the file then the offset is data.
		 */
		if ((unsigned long long)offset >= eof)
			return -ENXIO;
		break;
	case SEEK_HOLE:
		/*
		 * There is a virtual hole at the end of the file, so as long as
		 * offset isn't i_size or larger, return i_size.
		 */
		if ((unsigned long long)offset >= eof)
			return -ENXIO;
		offset = eof;
		break;
	}

	return vfs_setpos(file, offset, maxsize);
}
EXPORT_SYMBOL(generic_file_llseek_size);
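
/*
 * Illustrative sketch (not part of the original file): a filesystem that
 * stores directory entries in a hashed structure could implement ->llseek
 * by passing its own limits to generic_file_llseek_size(). The helper and
 * field names below are hypothetical.
 *
 *	static loff_t example_dir_llseek(struct file *file, loff_t offset,
 *					 int whence)
 *	{
 *		struct inode *inode = file_inode(file);
 *
 *		return generic_file_llseek_size(file, offset, whence,
 *						inode->i_sb->s_maxbytes,
 *						example_dir_hash_size(inode));
 *	}
 */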

/**
 * generic_file_llseek - generic llseek implementation for regular files
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 *
 * This is a generic implementation of ->llseek usable for all normal local
 * filesystems. It just updates the file offset to the value specified by
 * @offset and @whence.
 */
loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;

	return generic_file_llseek_size(file, offset, whence,
					inode->i_sb->s_maxbytes,
					i_size_read(inode));
}
EXPORT_SYMBOL(generic_file_llseek);

/**
 * fixed_size_llseek - llseek implementation for fixed-sized devices
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 * @size: size of the file
 *
 */
loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
{
	switch (whence) {
	case SEEK_SET: case SEEK_CUR: case SEEK_END:
		return generic_file_llseek_size(file, offset, whence,
						size, size);
	default:
		return -EINVAL;
	}
}
EXPORT_SYMBOL(fixed_size_llseek);

/**
 * no_seek_end_llseek - llseek implementation for fixed-sized devices
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 *
 */
loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
{
	switch (whence) {
	case SEEK_SET: case SEEK_CUR:
		return generic_file_llseek_size(file, offset, whence,
						OFFSET_MAX, 0);
	default:
		return -EINVAL;
	}
}
EXPORT_SYMBOL(no_seek_end_llseek);

/**
 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 * @size: maximal offset allowed
 *
 */
loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
{
	switch (whence) {
	case SEEK_SET: case SEEK_CUR:
		return generic_file_llseek_size(file, offset, whence,
						size, 0);
	default:
		return -EINVAL;
	}
}
EXPORT_SYMBOL(no_seek_end_llseek_size);

/**
 * noop_llseek - No Operation Performed llseek implementation
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 *
 * This is an implementation of ->llseek usable for the rare special case when
 * userspace expects the seek to succeed but the (device) file is actually not
 * able to perform the seek. In this case you use noop_llseek() instead of
 * falling back to the default implementation of ->llseek.
 */
loff_t noop_llseek(struct file *file, loff_t offset, int whence)
{
	return file->f_pos;
}
EXPORT_SYMBOL(noop_llseek);
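
/*
 * Illustrative sketch (not part of the original file): a character device
 * exposing a fixed 4 KiB register window might implement ->llseek with a
 * small wrapper around fixed_size_llseek(); the names below are hypothetical.
 *
 *	static loff_t example_regs_llseek(struct file *file, loff_t offset,
 *					  int whence)
 *	{
 *		return fixed_size_llseek(file, offset, whence, SZ_4K);
 *	}
 *
 * A device that cannot seek at all, but whose users still expect lseek() to
 * succeed, would point ->llseek at noop_llseek instead.
 */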

loff_t default_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);
	loff_t retval;

	inode_lock(inode);
	switch (whence) {
	case SEEK_END:
		offset += i_size_read(inode);
		break;
	case SEEK_CUR:
		if (offset == 0) {
			retval = file->f_pos;
			goto out;
		}
		offset += file->f_pos;
		break;
	case SEEK_DATA:
		/*
		 * In the generic case the entire file is data, so as
		 * long as offset isn't at the end of the file then the
		 * offset is data.
		 */
		if (offset >= inode->i_size) {
			retval = -ENXIO;
			goto out;
		}
		break;
	case SEEK_HOLE:
		/*
		 * There is a virtual hole at the end of the file, so
		 * as long as offset isn't i_size or larger, return
		 * i_size.
		 */
		if (offset >= inode->i_size) {
			retval = -ENXIO;
			goto out;
		}
		offset = inode->i_size;
		break;
	}
	retval = -EINVAL;
	if (offset >= 0 || unsigned_offsets(file)) {
		if (offset != file->f_pos) {
			file->f_pos = offset;
			file->f_version = 0;
		}
		retval = offset;
	}
out:
	inode_unlock(inode);
	return retval;
}
EXPORT_SYMBOL(default_llseek);

loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
{
	if (!(file->f_mode & FMODE_LSEEK))
		return -ESPIPE;
	return file->f_op->llseek(file, offset, whence);
}
EXPORT_SYMBOL(vfs_llseek);

static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
{
	off_t retval;
	struct fd f = fdget_pos(fd);
	if (!f.file)
		return -EBADF;

	retval = -EINVAL;
	if (whence <= SEEK_MAX) {
		loff_t res = vfs_llseek(f.file, offset, whence);
		retval = res;
		if (res != (loff_t)retval)
			retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
	}
	fdput_pos(f);
	return retval;
}

SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
	return ksys_lseek(fd, offset, whence);
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
	return ksys_lseek(fd, offset, whence);
}
#endif

#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
	defined(__ARCH_WANT_SYS_LLSEEK)
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		unsigned long, offset_low, loff_t __user *, result,
		unsigned int, whence)
{
	int retval;
	struct fd f = fdget_pos(fd);
	loff_t offset;

	if (!f.file)
		return -EBADF;

	retval = -EINVAL;
	if (whence > SEEK_MAX)
		goto out_putf;

	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
			whence);

	retval = (int)offset;
	if (offset >= 0) {
		retval = -EFAULT;
		if (!copy_to_user(result, &offset, sizeof(offset)))
			retval = 0;
	}
out_putf:
	fdput_pos(f);
	return retval;
}
#endif
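
/*
 * Illustrative note (not part of the original file): on 32-bit kernels off_t
 * is 32 bits wide, so the check in ksys_lseek() above catches results that do
 * not survive the narrowing conversion. Roughly:
 *
 *	loff_t res = 0x100000000LL;	// 4 GiB, representable in loff_t
 *	off_t retval = res;		// truncates on 32-bit builds
 *	if (res != (loff_t)retval)
 *		retval = -EOVERFLOW;	// reported instead of a bogus offset
 *
 * Callers that need large offsets on such kernels use the llseek() syscall
 * defined above, which returns the 64-bit result via a user pointer.
 */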

int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
	int mask = read_write == READ ? MAY_READ : MAY_WRITE;
	int ret;

	if (unlikely((ssize_t) count < 0))
		return -EINVAL;

	if (ppos) {
		loff_t pos = *ppos;

		if (unlikely(pos < 0)) {
			if (!unsigned_offsets(file))
				return -EINVAL;
			if (count >= -pos) /* both values are in 0..LLONG_MAX */
				return -EOVERFLOW;
		} else if (unlikely((loff_t) (pos + count) < 0)) {
			if (!unsigned_offsets(file))
				return -EINVAL;
		}
	}

	ret = security_file_permission(file, mask);
	if (ret)
		return ret;

	return fsnotify_file_area_perm(file, mask, ppos, count);
}
EXPORT_SYMBOL(rw_verify_area);

static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = (ppos ? *ppos : 0);
	iov_iter_ubuf(&iter, ITER_DEST, buf, len);

	ret = filp->f_op->read_iter(&kiocb, &iter);
	BUG_ON(ret == -EIOCBQUEUED);
	if (ppos)
		*ppos = kiocb.ki_pos;
	return ret;
}

static int warn_unsupported(struct file *file, const char *op)
{
	pr_warn_ratelimited(
		"kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
		op, file, current->pid, current->comm);
	return -EINVAL;
}

ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
	struct kvec iov = {
		.iov_base = buf,
		.iov_len = min_t(size_t, count, MAX_RW_COUNT),
	};
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
		return -EINVAL;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;
	/*
	 * Also fail if ->read_iter and ->read are both wired up as that
	 * implies very convoluted semantics.
	 */
	if (unlikely(!file->f_op->read_iter || file->f_op->read))
		return warn_unsupported(file, "read");

	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = pos ? *pos : 0;
	iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len);
	ret = file->f_op->read_iter(&kiocb, &iter);
	if (ret > 0) {
		if (pos)
			*pos = kiocb.ki_pos;
		fsnotify_access(file);
		add_rchar(current, ret);
	}
	inc_syscr(current);
	return ret;
}
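
/*
 * Illustrative note (not part of the original file): __kernel_read() above
 * deliberately skips rw_verify_area(), so no LSM or fsnotify permission
 * checks run; kernel_read() below is the checked wrapper that most in-kernel
 * callers should use, e.g. (sketch, "buffer" and "buffer_size" hypothetical):
 *
 *	loff_t pos = 0;
 *	ssize_t n = kernel_read(file, buffer, buffer_size, &pos);
 *
 * where "buffer" is a kernel buffer and "pos" is advanced by the bytes read.
 */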

ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	ret = rw_verify_area(READ, file, pos, count);
	if (ret)
		return ret;
	return __kernel_read(file, buf, count, pos);
}
EXPORT_SYMBOL(kernel_read);

ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;
	if (unlikely(!access_ok(buf, count)))
		return -EFAULT;

	ret = rw_verify_area(READ, file, pos, count);
	if (ret)
		return ret;
	if (count > MAX_RW_COUNT)
		count = MAX_RW_COUNT;

	if (file->f_op->read)
		ret = file->f_op->read(file, buf, count, pos);
	else if (file->f_op->read_iter)
		ret = new_sync_read(file, buf, count, pos);
	else
		ret = -EINVAL;
	if (ret > 0) {
		fsnotify_access(file);
		add_rchar(current, ret);
	}
	inc_syscr(current);
	return ret;
}

static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = (ppos ? *ppos : 0);
	iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len);

	ret = filp->f_op->write_iter(&kiocb, &iter);
	BUG_ON(ret == -EIOCBQUEUED);
	if (ret > 0 && ppos)
		*ppos = kiocb.ki_pos;
	return ret;
}

/* caller is responsible for file_start_write/file_end_write */
ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos)
{
	struct kiocb kiocb;
	ssize_t ret;

	if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	/*
	 * Also fail if ->write_iter and ->write are both wired up as that
	 * implies very convoluted semantics.
	 */
	if (unlikely(!file->f_op->write_iter || file->f_op->write))
		return warn_unsupported(file, "write");

	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = pos ? *pos : 0;
	ret = file->f_op->write_iter(&kiocb, from);
	if (ret > 0) {
		if (pos)
			*pos = kiocb.ki_pos;
		fsnotify_modify(file);
		add_wchar(current, ret);
	}
	inc_syscw(current);
	return ret;
}

/* caller is responsible for file_start_write/file_end_write */
ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
{
	struct kvec iov = {
		.iov_base = (void *)buf,
		.iov_len = min_t(size_t, count, MAX_RW_COUNT),
	};
	struct iov_iter iter;
	iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len);
	return __kernel_write_iter(file, &iter, pos);
}
/*
 * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
 * but autofs is one of the few internal kernel users that actually
 * wants this _and_ can be built as a module. So we need to export
 * this symbol for autofs, even though it really isn't appropriate
 * for any other kernel modules.
 */
EXPORT_SYMBOL_GPL(__kernel_write);
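
/*
 * Illustrative sketch (not part of the original file): a caller that wants to
 * append a kernel buffer to an already-opened struct file would normally use
 * kernel_write() below, which wraps __kernel_write() with rw_verify_area()
 * and file_start_write()/file_end_write():
 *
 *	loff_t pos = i_size_read(file_inode(file));
 *	ssize_t n = kernel_write(file, data, data_len, &pos);
 *
 * "data" and "data_len" are hypothetical; only callers that manage freeze
 * protection themselves (e.g. autofs) should use __kernel_write() directly.
 */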

ssize_t kernel_write(struct file *file, const void *buf, size_t count,
		     loff_t *pos)
{
	ssize_t ret;

	ret = rw_verify_area(WRITE, file, pos, count);
	if (ret)
		return ret;

	file_start_write(file);
	ret = __kernel_write(file, buf, count, pos);
	file_end_write(file);
	return ret;
}
EXPORT_SYMBOL(kernel_write);

ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	if (unlikely(!access_ok(buf, count)))
		return -EFAULT;

	ret = rw_verify_area(WRITE, file, pos, count);
	if (ret)
		return ret;
	if (count > MAX_RW_COUNT)
		count = MAX_RW_COUNT;
	file_start_write(file);
	if (file->f_op->write)
		ret = file->f_op->write(file, buf, count, pos);
	else if (file->f_op->write_iter)
		ret = new_sync_write(file, buf, count, pos);
	else
		ret = -EINVAL;
	if (ret > 0) {
		fsnotify_modify(file);
		add_wchar(current, ret);
	}
	inc_syscw(current);
	file_end_write(file);
	return ret;
}

/* file_ppos returns &file->f_pos or NULL if file is stream */
static inline loff_t *file_ppos(struct file *file)
{
	return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
}

ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
{
	struct fd f = fdget_pos(fd);
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos, *ppos = file_ppos(f.file);
		if (ppos) {
			pos = *ppos;
			ppos = &pos;
		}
		ret = vfs_read(f.file, buf, count, ppos);
		if (ret >= 0 && ppos)
			f.file->f_pos = pos;
		fdput_pos(f);
	}
	return ret;
}

SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	return ksys_read(fd, buf, count);
}

ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
	struct fd f = fdget_pos(fd);
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos, *ppos = file_ppos(f.file);
		if (ppos) {
			pos = *ppos;
			ppos = &pos;
		}
		ret = vfs_write(f.file, buf, count, ppos);
		if (ret >= 0 && ppos)
			f.file->f_pos = pos;
		fdput_pos(f);
	}

	return ret;
}

SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	return ksys_write(fd, buf, count);
}

ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
		     loff_t pos)
{
	struct fd f;
	ssize_t ret = -EBADF;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (f.file) {
		ret = -ESPIPE;
		if (f.file->f_mode & FMODE_PREAD)
			ret = vfs_read(f.file, buf, count, &pos);
		fdput(f);
	}

	return ret;
}

SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
		size_t, count, loff_t, pos)
{
	return ksys_pread64(fd, buf, count, pos);
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64)
COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf,
		       size_t, count, compat_arg_u64_dual(pos))
{
	return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos));
}
#endif
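
/*
 * Illustrative note (not part of the original file): unlike read()/write(),
 * the positional calls take the offset by value and never update f_pos.
 * ksys_pread64() above copies "pos" onto its own stack and hands &pos to
 * vfs_read(), so a sequence such as
 *
 *	pread(fd, buf, len, 4096);
 *	read(fd, buf, len);
 *
 * still reads from the file position that read() would have used anyway.
 */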

ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
		      size_t count, loff_t pos)
{
	struct fd f;
	ssize_t ret = -EBADF;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (f.file) {
		ret = -ESPIPE;
		if (f.file->f_mode & FMODE_PWRITE)
			ret = vfs_write(f.file, buf, count, &pos);
		fdput(f);
	}

	return ret;
}

SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
		size_t, count, loff_t, pos)
{
	return ksys_pwrite64(fd, buf, count, pos);
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64)
COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf,
		       size_t, count, compat_arg_u64_dual(pos))
{
	return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos));
}
#endif

static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
		loff_t *ppos, int type, rwf_t flags)
{
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	ret = kiocb_set_rw_flags(&kiocb, flags);
	if (ret)
		return ret;
	kiocb.ki_pos = (ppos ? *ppos : 0);

	if (type == READ)
		ret = filp->f_op->read_iter(&kiocb, iter);
	else
		ret = filp->f_op->write_iter(&kiocb, iter);
	BUG_ON(ret == -EIOCBQUEUED);
	if (ppos)
		*ppos = kiocb.ki_pos;
	return ret;
}

/* Do it by hand, with file-ops */
static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
		loff_t *ppos, int type, rwf_t flags)
{
	ssize_t ret = 0;

	if (flags & ~RWF_HIPRI)
		return -EOPNOTSUPP;

	while (iov_iter_count(iter)) {
		ssize_t nr;

		if (type == READ) {
			nr = filp->f_op->read(filp, iter_iov_addr(iter),
					      iter_iov_len(iter), ppos);
		} else {
			nr = filp->f_op->write(filp, iter_iov_addr(iter),
					       iter_iov_len(iter), ppos);
		}

		if (nr < 0) {
			if (!ret)
				ret = nr;
			break;
		}
		ret += nr;
		if (nr != iter_iov_len(iter))
			break;
		iov_iter_advance(iter, nr);
	}

	return ret;
}

ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
			   struct iov_iter *iter)
{
	size_t tot_len;
	ssize_t ret = 0;

	if (!file->f_op->read_iter)
		return -EINVAL;
	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;

	tot_len = iov_iter_count(iter);
	if (!tot_len)
		goto out;
	ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
	if (ret < 0)
		return ret;

	ret = file->f_op->read_iter(iocb, iter);
out:
	if (ret >= 0)
		fsnotify_access(file);
	return ret;
}
EXPORT_SYMBOL(vfs_iocb_iter_read);

ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
		      rwf_t flags)
{
	size_t tot_len;
	ssize_t ret = 0;

	if (!file->f_op->read_iter)
		return -EINVAL;
	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;

	tot_len = iov_iter_count(iter);
	if (!tot_len)
		goto out;
	ret = rw_verify_area(READ, file, ppos, tot_len);
	if (ret < 0)
		return ret;

	ret = do_iter_readv_writev(file, iter, ppos, READ, flags);
out:
	if (ret >= 0)
		fsnotify_access(file);
	return ret;
}
EXPORT_SYMBOL(vfs_iter_read);
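
/*
 * Illustrative sketch (not part of the original file): an in-kernel caller
 * such as a stacking filesystem could build an iov_iter over kernel memory
 * and read through vfs_iter_read(); the buffer names are hypothetical.
 *
 *	struct kvec kv = { .iov_base = buf, .iov_len = buf_len };
 *	struct iov_iter iter;
 *	loff_t pos = 0;
 *	ssize_t ret;
 *
 *	iov_iter_kvec(&iter, ITER_DEST, &kv, 1, buf_len);
 *	ret = vfs_iter_read(file, &iter, &pos, 0);
 */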

/*
 * Caller is responsible for calling kiocb_end_write() on completion
 * if async iocb was queued.
 */
ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
			    struct iov_iter *iter)
{
	size_t tot_len;
	ssize_t ret = 0;

	if (!file->f_op->write_iter)
		return -EINVAL;
	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;

	tot_len = iov_iter_count(iter);
	if (!tot_len)
		return 0;
	ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
	if (ret < 0)
		return ret;

	kiocb_start_write(iocb);
	ret = file->f_op->write_iter(iocb, iter);
	if (ret != -EIOCBQUEUED)
		kiocb_end_write(iocb);
	if (ret > 0)
		fsnotify_modify(file);

	return ret;
}
EXPORT_SYMBOL(vfs_iocb_iter_write);

ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
		       rwf_t flags)
{
	size_t tot_len;
	ssize_t ret;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	if (!file->f_op->write_iter)
		return -EINVAL;

	tot_len = iov_iter_count(iter);
	if (!tot_len)
		return 0;

	ret = rw_verify_area(WRITE, file, ppos, tot_len);
	if (ret < 0)
		return ret;

	file_start_write(file);
	ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags);
	if (ret > 0)
		fsnotify_modify(file);
	file_end_write(file);

	return ret;
}
EXPORT_SYMBOL(vfs_iter_write);

static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
			 unsigned long vlen, loff_t *pos, rwf_t flags)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	size_t tot_len;
	ssize_t ret = 0;

	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;

	ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov,
			   &iter);
	if (ret < 0)
		return ret;

	tot_len = iov_iter_count(&iter);
	if (!tot_len)
		goto out;

	ret = rw_verify_area(READ, file, pos, tot_len);
	if (ret < 0)
		goto out;

	if (file->f_op->read_iter)
		ret = do_iter_readv_writev(file, &iter, pos, READ, flags);
	else
		ret = do_loop_readv_writev(file, &iter, pos, READ, flags);
out:
	if (ret >= 0)
		fsnotify_access(file);
	kfree(iov);
	return ret;
}

static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
			  unsigned long vlen, loff_t *pos, rwf_t flags)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	size_t tot_len;
	ssize_t ret = 0;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;

	ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov,
			   &iter);
	if (ret < 0)
		return ret;

	tot_len = iov_iter_count(&iter);
	if (!tot_len)
		goto out;

	ret = rw_verify_area(WRITE, file, pos, tot_len);
	if (ret < 0)
		goto out;

	file_start_write(file);
	if (file->f_op->write_iter)
		ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags);
	else
		ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags);
	if (ret > 0)
		fsnotify_modify(file);
	file_end_write(file);
out:
	kfree(iov);
	return ret;
}

static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
			unsigned long vlen, rwf_t flags)
{
	struct fd f = fdget_pos(fd);
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos, *ppos = file_ppos(f.file);
		if (ppos) {
			pos = *ppos;
			ppos = &pos;
		}
		ret = vfs_readv(f.file, vec, vlen, ppos, flags);
		if (ret >= 0 && ppos)
			f.file->f_pos = pos;
		fdput_pos(f);
	}

	if (ret > 0)
		add_rchar(current, ret);
	inc_syscr(current);
	return ret;
}

static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
			 unsigned long vlen, rwf_t flags)
{
	struct fd f = fdget_pos(fd);
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos, *ppos = file_ppos(f.file);
		if (ppos) {
			pos = *ppos;
			ppos = &pos;
		}
		ret = vfs_writev(f.file, vec, vlen, ppos, flags);
		if (ret >= 0 && ppos)
			f.file->f_pos = pos;
		fdput_pos(f);
	}

	if (ret > 0)
		add_wchar(current, ret);
	inc_syscw(current);
	return ret;
}

static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
{
#define HALF_LONG_BITS (BITS_PER_LONG / 2)
	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
}

static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
			 unsigned long vlen, loff_t pos, rwf_t flags)
{
	struct fd f;
	ssize_t ret = -EBADF;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (f.file) {
		ret = -ESPIPE;
		if (f.file->f_mode & FMODE_PREAD)
			ret = vfs_readv(f.file, vec, vlen, &pos, flags);
		fdput(f);
	}

	if (ret > 0)
		add_rchar(current, ret);
	inc_syscr(current);
	return ret;
}

static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
			  unsigned long vlen, loff_t pos, rwf_t flags)
{
	struct fd f;
	ssize_t ret = -EBADF;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (f.file) {
		ret = -ESPIPE;
		if (f.file->f_mode & FMODE_PWRITE)
			ret = vfs_writev(f.file, vec, vlen, &pos, flags);
		fdput(f);
	}

	if (ret > 0)
		add_wchar(current, ret);
	inc_syscw(current);
	return ret;
}

SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	return do_readv(fd, vec, vlen, 0);
}

SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	return do_writev(fd, vec, vlen, 0);
}

SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);

	return do_preadv(fd, vec, vlen, pos, 0);
}

SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
		rwf_t, flags)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);

	if (pos == -1)
		return do_readv(fd, vec, vlen, flags);

	return do_preadv(fd, vec, vlen, pos, flags);
}

SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);

	return do_pwritev(fd, vec, vlen, pos, 0);
}
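
/*
 * Illustrative note (not part of the original file): preadv2()/pwritev2()
 * treat a position of -1 as "behave like readv()/writev()", i.e. use and
 * update the file's current position, which is why preadv2() above and
 * pwritev2() below check for it before calling do_preadv()/do_pwritev().
 * From userspace this is roughly:
 *
 *	preadv2(fd, iov, iovcnt, -1, RWF_NOWAIT);	// reads at f_pos
 *	preadv2(fd, iov, iovcnt, 4096, RWF_NOWAIT);	// reads at offset 4096
 */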

SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
		rwf_t, flags)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);

	if (pos == -1)
		return do_writev(fd, vec, vlen, flags);

	return do_pwritev(fd, vec, vlen, pos, flags);
}

/*
 * Various compat syscalls. Note that they all pretend to take a native
 * iovec - import_iovec will properly treat those as compat_iovecs based on
 * in_compat_syscall().
 */
#ifdef CONFIG_COMPAT
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos)
{
	return do_preadv(fd, vec, vlen, pos, 0);
}
#endif

COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
		const struct iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
	loff_t pos = ((loff_t)pos_high << 32) | pos_low;

	return do_preadv(fd, vec, vlen, pos, 0);
}

#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	if (pos == -1)
		return do_readv(fd, vec, vlen, flags);
	return do_preadv(fd, vec, vlen, pos, flags);
}
#endif

COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
		const struct iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
		rwf_t, flags)
{
	loff_t pos = ((loff_t)pos_high << 32) | pos_low;

	if (pos == -1)
		return do_readv(fd, vec, vlen, flags);
	return do_preadv(fd, vec, vlen, pos, flags);
}

#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos)
{
	return do_pwritev(fd, vec, vlen, pos, 0);
}
#endif

COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
		const struct iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
	loff_t pos = ((loff_t)pos_high << 32) | pos_low;

	return do_pwritev(fd, vec, vlen, pos, 0);
}

#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	if (pos == -1)
		return do_writev(fd, vec, vlen, flags);
	return do_pwritev(fd, vec, vlen, pos, flags);
}
#endif

COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
		const struct iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
{
	loff_t pos = ((loff_t)pos_high << 32) | pos_low;

	if (pos == -1)
		return do_writev(fd, vec, vlen, flags);
	return do_pwritev(fd, vec, vlen, pos, flags);
}
#endif /* CONFIG_COMPAT */

static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
			   size_t count, loff_t max)
{
	struct fd in, out;
	struct inode *in_inode, *out_inode;
	struct pipe_inode_info *opipe;
	loff_t pos;
	loff_t out_pos;
	ssize_t retval;
	int fl;

	/*
	 * Get input file, and verify that it is ok..
	 */
	retval = -EBADF;
	in = fdget(in_fd);
	if (!in.file)
		goto out;
	if (!(in.file->f_mode & FMODE_READ))
		goto fput_in;
	retval = -ESPIPE;
	if (!ppos) {
		pos = in.file->f_pos;
	} else {
		pos = *ppos;
		if (!(in.file->f_mode & FMODE_PREAD))
			goto fput_in;
	}
	retval = rw_verify_area(READ, in.file, &pos, count);
	if (retval < 0)
		goto fput_in;
	if (count > MAX_RW_COUNT)
		count = MAX_RW_COUNT;

	/*
	 * Get output file, and verify that it is ok..
	 */
	retval = -EBADF;
	out = fdget(out_fd);
	if (!out.file)
		goto fput_in;
	if (!(out.file->f_mode & FMODE_WRITE))
		goto fput_out;
	in_inode = file_inode(in.file);
	out_inode = file_inode(out.file);
	out_pos = out.file->f_pos;

	if (!max)
		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);

	if (unlikely(pos + count > max)) {
		retval = -EOVERFLOW;
		if (pos >= max)
			goto fput_out;
		count = max - pos;
	}

	fl = 0;
#if 0
	/*
	 * We need to debate whether we can enable this or not. The
	 * man page documents EAGAIN return for the output at least,
	 * and the application is arguably buggy if it doesn't expect
	 * EAGAIN on a non-blocking file descriptor.
	 */
	if (in.file->f_flags & O_NONBLOCK)
		fl = SPLICE_F_NONBLOCK;
#endif
	opipe = get_pipe_info(out.file, true);
	if (!opipe) {
		retval = rw_verify_area(WRITE, out.file, &out_pos, count);
		if (retval < 0)
			goto fput_out;
		retval = do_splice_direct(in.file, &pos, out.file, &out_pos,
					  count, fl);
	} else {
		if (out.file->f_flags & O_NONBLOCK)
			fl |= SPLICE_F_NONBLOCK;

		retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl);
	}

	if (retval > 0) {
		add_rchar(current, retval);
		add_wchar(current, retval);
		fsnotify_access(in.file);
		fsnotify_modify(out.file);
		out.file->f_pos = out_pos;
		if (ppos)
			*ppos = pos;
		else
			in.file->f_pos = pos;
	}

	inc_syscr(current);
	inc_syscw(current);
	if (pos > max)
		retval = -EOVERFLOW;

fput_out:
	fdput(out);
fput_in:
	fdput(in);
out:
	return retval;
}

SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
{
	loff_t pos;
	off_t off;
	ssize_t ret;

	if (offset) {
		if (unlikely(get_user(off, offset)))
			return -EFAULT;
		pos = off;
		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
		if (unlikely(put_user(pos, offset)))
			return -EFAULT;
		return ret;
	}

	return do_sendfile(out_fd, in_fd, NULL, count, 0);
}

SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
{
	loff_t pos;
	ssize_t ret;

	if (offset) {
		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
			return -EFAULT;
		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
		if (unlikely(put_user(pos, offset)))
			return -EFAULT;
		return ret;
	}

	return do_sendfile(out_fd, in_fd, NULL, count, 0);
}
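
/*
 * Illustrative note (not part of the original file): when userspace passes an
 * offset pointer, the syscalls above copy it in, let do_sendfile() advance a
 * local position, and copy the result back without touching the input file's
 * f_pos. Roughly, from userspace:
 *
 *	off_t off = 0;
 *	ssize_t n = sendfile(out_fd, in_fd, &off, 65536);
 *	// off is advanced by n; lseek(in_fd, 0, SEEK_CUR) is unchanged
 *
 * With a NULL offset, do_sendfile() reads from and updates in_fd's f_pos.
 */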

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
		compat_off_t __user *, offset, compat_size_t, count)
{
	loff_t pos;
	off_t off;
	ssize_t ret;

	if (offset) {
		if (unlikely(get_user(off, offset)))
			return -EFAULT;
		pos = off;
		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
		if (unlikely(put_user(pos, offset)))
			return -EFAULT;
		return ret;
	}

	return do_sendfile(out_fd, in_fd, NULL, count, 0);
}

COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
		compat_loff_t __user *, offset, compat_size_t, count)
{
	loff_t pos;
	ssize_t ret;

	if (offset) {
		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
			return -EFAULT;
		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
		if (unlikely(put_user(pos, offset)))
			return -EFAULT;
		return ret;
	}

	return do_sendfile(out_fd, in_fd, NULL, count, 0);
}
#endif

/*
 * Performs necessary checks before doing a file copy
 *
 * Can adjust amount of bytes to copy via @req_count argument.
 * Returns appropriate error code that caller should return or
 * zero in case the copy should be allowed.
 */
static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
				    struct file *file_out, loff_t pos_out,
				    size_t *req_count, unsigned int flags)
{
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);
	uint64_t count = *req_count;
	loff_t size_in;
	int ret;

	ret = generic_file_rw_checks(file_in, file_out);
	if (ret)
		return ret;

	/*
	 * We allow some filesystems to handle cross sb copy, but passing
	 * a file of the wrong filesystem type to filesystem driver can result
	 * in an attempt to dereference the wrong type of ->private_data, so
	 * avoid doing that until we really have a good reason.
	 *
	 * nfs and cifs define several different file_system_type structures
	 * and several different sets of file_operations, but they all end up
	 * using the same ->copy_file_range() function pointer.
	 */
	if (flags & COPY_FILE_SPLICE) {
		/* cross sb splice is allowed */
	} else if (file_out->f_op->copy_file_range) {
		if (file_in->f_op->copy_file_range !=
		    file_out->f_op->copy_file_range)
			return -EXDEV;
	} else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) {
		return -EXDEV;
	}

	/* Don't touch certain kinds of inodes */
	if (IS_IMMUTABLE(inode_out))
		return -EPERM;

	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
		return -ETXTBSY;

	/* Ensure offsets don't wrap. */
	if (pos_in + count < pos_in || pos_out + count < pos_out)
		return -EOVERFLOW;

	/* Shorten the copy to EOF */
	size_in = i_size_read(inode_in);
	if (pos_in >= size_in)
		count = 0;
	else
		count = min(count, size_in - (uint64_t)pos_in);

	ret = generic_write_check_limits(file_out, pos_out, &count);
	if (ret)
		return ret;

	/* Don't allow overlapped copying within the same file. */
	if (inode_in == inode_out &&
	    pos_out + count > pos_in &&
	    pos_out < pos_in + count)
		return -EINVAL;

	*req_count = count;
	return 0;
}
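
/*
 * Illustrative note (not part of the original file): the checks above mean a
 * cross-filesystem copy_file_range() without COPY_FILE_SPLICE only succeeds
 * when both files share the same ->copy_file_range() implementation (as the
 * nfs and cifs variants do); otherwise the caller sees -EXDEV and is expected
 * to fall back to its own copy loop, e.g. from userspace (sketch):
 *
 *	ssize_t n = copy_file_range(fd_in, NULL, fd_out, NULL, len, 0);
 *	if (n < 0 && errno == EXDEV)
 *		n = fallback_copy(fd_in, fd_out, len);	// hypothetical helper
 */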

/*
 * copy_file_range() differs from regular file read and write in that it
 * specifically allows returning partial success. When it does so is up to
 * the copy_file_range method.
 */
ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
			    struct file *file_out, loff_t pos_out,
			    size_t len, unsigned int flags)
{
	ssize_t ret;
	bool splice = flags & COPY_FILE_SPLICE;
	bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb;

	if (flags & ~COPY_FILE_SPLICE)
		return -EINVAL;

	ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
				       flags);
	if (unlikely(ret))
		return ret;

	ret = rw_verify_area(READ, file_in, &pos_in, len);
	if (unlikely(ret))
		return ret;

	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
	if (unlikely(ret))
		return ret;

	if (len == 0)
		return 0;

	file_start_write(file_out);

	/*
	 * Cloning is supported by more file systems, so we implement copy on
	 * same sb using clone, but for filesystems where both clone and copy
	 * are supported (e.g. nfs,cifs), we only call the copy method.
	 */
	if (!splice && file_out->f_op->copy_file_range) {
		ret = file_out->f_op->copy_file_range(file_in, pos_in,
						      file_out, pos_out,
						      len, flags);
	} else if (!splice && file_in->f_op->remap_file_range && samesb) {
		ret = file_in->f_op->remap_file_range(file_in, pos_in,
				file_out, pos_out,
				min_t(loff_t, MAX_RW_COUNT, len),
				REMAP_FILE_CAN_SHORTEN);
		/* fallback to splice */
		if (ret <= 0)
			splice = true;
	} else if (samesb) {
		/* Fallback to splice for same sb copy for backward compat */
		splice = true;
	}

	file_end_write(file_out);

	if (!splice)
		goto done;

	/*
	 * We can get here for same sb copy of filesystems that do not implement
	 * ->copy_file_range() in case filesystem does not support clone or in
	 * case filesystem supports clone but rejected the clone request (e.g.
	 * because it was not block aligned).
	 *
	 * In both cases, fall back to kernel copy so we are able to maintain a
	 * consistent story about which filesystems support copy_file_range()
	 * and which filesystems do not, that will allow userspace tools to
	 * make consistent decisions w.r.t using copy_file_range().
	 *
	 * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE
	 * for server-side-copy between any two sb.
	 *
	 * In any case, we call do_splice_direct() and not splice_file_range(),
	 * without file_start_write() held, to avoid possible deadlocks related
	 * to splicing from input file, while file_start_write() is held on
	 * the output file on a different sb.
	 */
	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
			       min_t(size_t, len, MAX_RW_COUNT), 0);
done:
	if (ret > 0) {
		fsnotify_access(file_in);
		add_rchar(current, ret);
		fsnotify_modify(file_out);
		add_wchar(current, ret);
	}

	inc_syscr(current);
	inc_syscw(current);

	return ret;
}
EXPORT_SYMBOL(vfs_copy_file_range);

SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
		int, fd_out, loff_t __user *, off_out,
		size_t, len, unsigned int, flags)
{
	loff_t pos_in;
	loff_t pos_out;
	struct fd f_in;
	struct fd f_out;
	ssize_t ret = -EBADF;

	f_in = fdget(fd_in);
	if (!f_in.file)
		goto out2;

	f_out = fdget(fd_out);
	if (!f_out.file)
		goto out1;

	ret = -EFAULT;
	if (off_in) {
		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
			goto out;
	} else {
		pos_in = f_in.file->f_pos;
	}

	if (off_out) {
		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
			goto out;
	} else {
		pos_out = f_out.file->f_pos;
	}

	ret = -EINVAL;
	if (flags != 0)
		goto out;

	ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
				  flags);
	if (ret > 0) {
		pos_in += ret;
		pos_out += ret;

		if (off_in) {
			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
				ret = -EFAULT;
		} else {
			f_in.file->f_pos = pos_in;
		}

		if (off_out) {
			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
				ret = -EFAULT;
		} else {
			f_out.file->f_pos = pos_out;
		}
	}

out:
	fdput(f_out);
out1:
	fdput(f_in);
out2:
	return ret;
}

/*
 * Don't operate on ranges the page cache doesn't support, and don't exceed the
 * LFS limits. If pos is under the limit it becomes a short access. If it
 * exceeds the limit we return -EFBIG.
 */
int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
{
	struct inode *inode = file->f_mapping->host;
	loff_t max_size = inode->i_sb->s_maxbytes;
	loff_t limit = rlimit(RLIMIT_FSIZE);

	if (limit != RLIM_INFINITY) {
		if (pos >= limit) {
			send_sig(SIGXFSZ, current, 0);
			return -EFBIG;
		}
		*count = min(*count, limit - pos);
	}

	if (!(file->f_flags & O_LARGEFILE))
		max_size = MAX_NON_LFS;

	if (unlikely(pos >= max_size))
		return -EFBIG;

	*count = min(*count, max_size - pos);

	return 0;
}
EXPORT_SYMBOL_GPL(generic_write_check_limits);

/* Like generic_write_checks(), but takes size of write instead of iter. */
int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;

	if (IS_SWAPFILE(inode))
		return -ETXTBSY;

	if (!*count)
		return 0;

	if (iocb->ki_flags & IOCB_APPEND)
		iocb->ki_pos = i_size_read(inode);

	if ((iocb->ki_flags & IOCB_NOWAIT) &&
	    !((iocb->ki_flags & IOCB_DIRECT) ||
	      (file->f_op->fop_flags & FOP_BUFFER_WASYNC)))
		return -EINVAL;

	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
}
EXPORT_SYMBOL(generic_write_checks_count);
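
/*
 * Illustrative sketch (not part of the original file): a filesystem's
 * ->write_iter() typically calls generic_write_checks() below before doing
 * any I/O, truncating the iter to what the limits allow; the names are
 * hypothetical.
 *
 *	static ssize_t example_file_write_iter(struct kiocb *iocb,
 *					       struct iov_iter *from)
 *	{
 *		ssize_t ret = generic_write_checks(iocb, from);
 *
 *		if (ret <= 0)
 *			return ret;	// error, or nothing left to write
 *		return example_do_write(iocb, from);
 *	}
 */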

/*
 * Performs necessary checks before doing a write
 *
 * Can adjust writing position or amount of bytes to write.
 * Returns appropriate error code that caller should return or
 * zero in case that write should be allowed.
 */
ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
	loff_t count = iov_iter_count(from);
	int ret;

	ret = generic_write_checks_count(iocb, &count);
	if (ret)
		return ret;

	iov_iter_truncate(from, count);
	return iov_iter_count(from);
}
EXPORT_SYMBOL(generic_write_checks);

/*
 * Performs common checks before doing a file copy/clone
 * from @file_in to @file_out.
 */
int generic_file_rw_checks(struct file *file_in, struct file *file_out)
{
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);

	/* Don't copy dirs, pipes, sockets... */
	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
		return -EISDIR;
	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
		return -EINVAL;

	if (!(file_in->f_mode & FMODE_READ) ||
	    !(file_out->f_mode & FMODE_WRITE) ||
	    (file_out->f_flags & O_APPEND))
		return -EBADF;

	return 0;
}
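
/*
 * Illustrative sketch (not part of the original file): an in-kernel consumer
 * that wants a file-to-file copy with the same checks as the syscall path can
 * call vfs_copy_file_range() directly; nfsd, for instance, passes
 * COPY_FILE_SPLICE to allow a cross-superblock server-side copy. Roughly:
 *
 *	loff_t in_pos = 0, out_pos = 0;
 *	ssize_t copied = vfs_copy_file_range(src_file, in_pos, dst_file,
 *					     out_pos, count, COPY_FILE_SPLICE);
 *
 * where src_file, dst_file and count are the caller's (hypothetical here).
 */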