// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/read_write.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/sched/xacct.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include "internal.h"

#include <linux/uaccess.h>
#include <asm/unistd.h>

/*
 * File operations table for read-only use: only seek, read_iter, mmap_prepare,
 * splice_read and setlease are wired up (to generic helpers); no write method
 * is set.
 */
const struct file_operations generic_ro_fops = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.mmap_prepare	= generic_file_readonly_mmap_prepare,
	.splice_read	= filemap_splice_read,
	.setlease	= generic_setlease,
};

EXPORT_SYMBOL(generic_ro_fops);

/*
 * Return true if the file's operations carry FOP_UNSIGNED_OFFSET, i.e. a
 * negative loff_t is to be accepted as a valid file position.
 */
static inline bool unsigned_offsets(struct file *file)
{
	return file->f_op->fop_flags & FOP_UNSIGNED_OFFSET;
}

/**
 * vfs_setpos_cookie - update the file offset for lseek and reset cookie
 * @file:	file structure in question
 * @offset:	file offset to seek to
 * @maxsize:	maximum file size
 * @cookie:	cookie to reset (may be NULL, in which case it is ignored)
 *
 * Update the file offset to the value specified by @offset if the given
 * offset is valid and it is not equal to the current file offset and
 * reset the specified cookie to indicate that a seek happened.
 *
 * A negative @offset is only valid for files with unsigned offsets; an
 * @offset above @maxsize is never valid.
 *
 * Return the specified offset on success and -EINVAL on invalid offset.
 */
static loff_t vfs_setpos_cookie(struct file *file, loff_t offset,
				loff_t maxsize, u64 *cookie)
{
	if (offset < 0 && !unsigned_offsets(file))
		return -EINVAL;
	if (offset > maxsize)
		return -EINVAL;

	/* Only touch f_pos (and the cookie) when the position really changes. */
	if (offset != file->f_pos) {
		file->f_pos = offset;
		if (cookie)
			*cookie = 0;
	}
	return offset;
}

/**
 * vfs_setpos - update the file offset for lseek
 * @file:	file structure in question
 * @offset:	file offset to seek to
 * @maxsize:	maximum file size
 *
 * This is a low-level filesystem helper for updating the file offset to
 * the value specified by @offset if the given offset is valid and it is
 * not equal to the current file offset.
 *
 * Return the specified offset on success and -EINVAL on invalid offset.
 */
loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
{
	return vfs_setpos_cookie(file, offset, maxsize, NULL);
}
EXPORT_SYMBOL(vfs_setpos);

/**
 * must_set_pos - check whether f_pos has to be updated
 * @file: file to seek on
 * @offset: offset to use
 * @whence: type of seek operation
 * @eof: end of file
 *
 * Check whether f_pos needs to be updated and update @offset according
 * to @whence.
 *
 * Any @whence not listed in the switch (e.g. SEEK_SET) leaves @offset
 * untouched and reports that f_pos has to be updated. For SEEK_CUR with a
 * non-zero @offset the value is left relative; the caller adds f_pos.
 *
 * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be
 * updated, and negative error code on failure.
 */
static int must_set_pos(struct file *file, loff_t *offset, int whence, loff_t eof)
{
	switch (whence) {
	case SEEK_END:
		*offset += eof;
		break;
	case SEEK_CUR:
		/*
		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
		 * position-querying operation.  Avoid rewriting the "same"
		 * f_pos value back to the file because a concurrent read(),
		 * write() or lseek() might have altered it
		 */
		if (*offset == 0) {
			*offset = file->f_pos;
			return 0;
		}
		break;
	case SEEK_DATA:
		/*
		 * In the generic case the entire file is data, so as long as
		 * offset isn't at the end of the file then the offset is data.
		 */
		if ((unsigned long long)*offset >= eof)
			return -ENXIO;
		break;
	case SEEK_HOLE:
		/*
		 * There is a virtual hole at the end of the file, so as long as
		 * offset isn't i_size or larger, return i_size.
		 */
		if ((unsigned long long)*offset >= eof)
			return -ENXIO;
		*offset = eof;
		break;
	}

	return 1;
}

/**
 * generic_file_llseek_size - generic llseek implementation for regular files
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
 * @whence:	type of seek
 * @maxsize:	max size of this file in file system
 * @eof:	offset used for SEEK_END position
 *
 * This is a variant of generic_file_llseek that allows passing in a custom
 * maximum file size and a custom EOF position, for e.g. hashed directories
 *
 * Synchronization:
 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
 * read/writes behave like SEEK_SET against seeks.
 *
 * Return: the resulting file offset, or a negative error code from
 * must_set_pos() or vfs_setpos().
 */
loff_t
generic_file_llseek_size(struct file *file, loff_t offset, int whence,
		loff_t maxsize, loff_t eof)
{
	int ret;

	ret = must_set_pos(file, &offset, whence, eof);
	if (ret < 0)
		return ret;
	if (ret == 0)
		return offset;

	if (whence == SEEK_CUR) {
		/*
		 * If the file requires locking via f_pos_lock we know
		 * that mutual exclusion for SEEK_CUR on the same file
		 * is guaranteed. If the file isn't locked, we take
		 * f_lock to protect against f_pos races with other
		 * SEEK_CURs.
		 */
		if (file_seek_cur_needs_f_lock(file)) {
			guard(spinlock)(&file->f_lock);
			return vfs_setpos(file, file->f_pos + offset, maxsize);
		}
		return vfs_setpos(file, file->f_pos + offset, maxsize);
	}

	return vfs_setpos(file, offset, maxsize);
}
EXPORT_SYMBOL(generic_file_llseek_size);

/**
 * generic_llseek_cookie - versioned llseek implementation
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
 * @whence:	type of seek
 * @cookie:	cookie to update
 *
 * See generic_file_llseek for a general description and locking assumptions.
 *
 * In contrast to generic_file_llseek, this function also resets a
 * specified cookie to indicate a seek took place.
 *
 * Return: the resulting file offset, or a negative error code. -EINVAL is
 * also returned (with a one-time warning) if @cookie is NULL or the file
 * does not have FMODE_ATOMIC_POS set.
 */
loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence,
			     u64 *cookie)
{
	struct inode *inode = file->f_mapping->host;
	loff_t maxsize = inode->i_sb->s_maxbytes;
	loff_t eof = i_size_read(inode);
	int ret;

	if (WARN_ON_ONCE(!cookie))
		return -EINVAL;

	/*
	 * Require that this is only used for directories that guarantee
	 * synchronization between readdir and seek so that an update to
	 * @cookie is correctly synchronized with concurrent readdir.
	 */
	if (WARN_ON_ONCE(!(file->f_mode & FMODE_ATOMIC_POS)))
		return -EINVAL;

	ret = must_set_pos(file, &offset, whence, eof);
	if (ret < 0)
		return ret;
	if (ret == 0)
		return offset;

	/* No need to hold f_lock because we know that f_pos_lock is held. */
	if (whence == SEEK_CUR)
		return vfs_setpos_cookie(file, file->f_pos + offset, maxsize, cookie);

	return vfs_setpos_cookie(file, offset, maxsize, cookie);
}
EXPORT_SYMBOL(generic_llseek_cookie);

/**
 * generic_file_llseek - generic llseek implementation for regular files
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
 * @whence:	type of seek
 *
 * This is a generic implementation of ->llseek useable for all normal local
 * filesystems.  It just updates the file offset to the value specified by
 * @offset and @whence.
 *
 * The size limit is the superblock's s_maxbytes and the EOF position is the
 * current i_size of the inode behind @file's mapping.
 */
loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;

	return generic_file_llseek_size(file, offset, whence,
					inode->i_sb->s_maxbytes,
					i_size_read(inode));
}
EXPORT_SYMBOL(generic_file_llseek);

/**
 * fixed_size_llseek - llseek implementation for fixed-sized devices
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
 * @whence:	type of seek
 * @size:	size of the file
 *
 * Only SEEK_SET, SEEK_CUR and SEEK_END are accepted; @size is used both as
 * the maximum offset and as the EOF position. Any other @whence yields
 * -EINVAL.
 */
loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
{
	switch (whence) {
	case SEEK_SET: case SEEK_CUR: case SEEK_END:
		return generic_file_llseek_size(file, offset, whence,
						size, size);
	default:
		return -EINVAL;
	}
}
EXPORT_SYMBOL(fixed_size_llseek);

/**
 * no_seek_end_llseek - llseek implementation that rejects SEEK_END
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
 * @whence:	type of seek
 *
 * Only SEEK_SET and SEEK_CUR are accepted, with OFFSET_MAX as the maximum
 * offset. Any other @whence yields -EINVAL.
 */
loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
{
	switch (whence) {
	case SEEK_SET: case SEEK_CUR:
		return generic_file_llseek_size(file, offset, whence,
						OFFSET_MAX, 0);
	default:
		return -EINVAL;
	}
}
EXPORT_SYMBOL(no_seek_end_llseek);

/**
 * no_seek_end_llseek_size - size-limited llseek implementation that rejects SEEK_END
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
 * @whence:	type of seek
 * @size:	maximal offset allowed
 *
 * Only SEEK_SET and SEEK_CUR are accepted. Any other @whence yields -EINVAL.
 */
loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
{
	switch (whence) {
	case SEEK_SET: case SEEK_CUR:
		return generic_file_llseek_size(file, offset, whence,
						size, 0);
	default:
		return -EINVAL;
	}
}
EXPORT_SYMBOL(no_seek_end_llseek_size);

/**
 * noop_llseek - No Operation Performed llseek implementation
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
 * @whence:	type of seek
 *
 * This is an implementation of ->llseek useable for the rare special case when
 * userspace expects the seek to succeed but the (device) file is actually not
 * able to perform the seek. In this case you use noop_llseek() instead of
 * falling back to the default implementation of ->llseek.
 *
 * @offset and @whence are ignored; the current f_pos is returned unchanged.
 */
loff_t noop_llseek(struct file *file, loff_t offset, int whence)
{
	return file->f_pos;
}
EXPORT_SYMBOL(noop_llseek);

/**
 * default_llseek - llseek implementation serialized by the inode lock
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
 * @whence:	type of seek
 *
 * The whole operation runs with the inode lock held (taken killably). The
 * new position is computed from @whence, rejected if negative unless the
 * file uses unsigned offsets, and stored in f_pos. No upper bound is
 * applied to the resulting offset here.
 *
 * Return: the resulting offset; the error from inode_lock_killable() if
 * taking the lock failed; -ENXIO for SEEK_DATA/SEEK_HOLE at or beyond
 * i_size; -EINVAL for an invalid resulting offset.
 */
loff_t default_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);
	loff_t retval;

	retval = inode_lock_killable(inode);
	if (retval)
		return retval;
	switch (whence) {
	case SEEK_END:
		offset += i_size_read(inode);
		break;
	case SEEK_CUR:
		/* lseek(fd, 0, SEEK_CUR) only queries; do not rewrite f_pos. */
		if (offset == 0) {
			retval = file->f_pos;
			goto out;
		}
		offset += file->f_pos;
		break;
	case SEEK_DATA:
		/*
		 * In the generic case the entire file is data, so as
		 * long as offset isn't at the end of the file then the
		 * offset is data.
		 */
		if (offset >= inode->i_size) {
			retval = -ENXIO;
			goto out;
		}
		break;
	case SEEK_HOLE:
		/*
		 * There is a virtual hole at the end of the file, so
		 * as long as offset isn't i_size or larger, return
		 * i_size.
		 */
		if (offset >= inode->i_size) {
			retval = -ENXIO;
			goto out;
		}
		offset = inode->i_size;
		break;
	}
	retval = -EINVAL;
	if (offset >= 0 || unsigned_offsets(file)) {
		if (offset != file->f_pos)
			file->f_pos = offset;
		retval = offset;
	}
out:
	inode_unlock(inode);
	return retval;
}
EXPORT_SYMBOL(default_llseek);

/**
 * vfs_llseek - seek on a file through its ->llseek method
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
 * @whence:	type of seek
 *
 * Return: -ESPIPE if the file does not have FMODE_LSEEK set, otherwise
 * whatever the file's ->llseek method returns.
 */
loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
{
	if (!(file->f_mode & FMODE_LSEEK))
		return -ESPIPE;
	return file->f_op->llseek(file, offset, whence);
}
EXPORT_SYMBOL(vfs_llseek);

/*
 * Common body of the lseek() system calls.
 *
 * Returns -EBADF if @fd does not resolve to a file, -EINVAL if @whence is
 * above SEEK_MAX, -EOVERFLOW if the resulting loff_t position does not
 * survive the round trip through off_t, and otherwise the result of
 * vfs_llseek().
 */
static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
{
	off_t retval;
	CLASS(fd_pos, f)(fd);
	if (fd_empty(f))
		return -EBADF;

	retval = -EINVAL;
	if (whence <= SEEK_MAX) {
		loff_t res = vfs_llseek(fd_file(f), offset, whence);
		retval = res;
		if (res != (loff_t)retval)
			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
	}
	return retval;
}

SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
	return ksys_lseek(fd, offset, whence);
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
	return ksys_lseek(fd, offset, whence);
}
#endif

#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
	defined(__ARCH_WANT_SYS_LLSEEK)
/*
 * llseek(): seek with the offset passed as two halves (@offset_high in the
 * upper 32 bits, @offset_low below) and the resulting position copied to
 * the user pointer @result.
 *
 * Returns 0 on success, -EBADF for a bad @fd, -EINVAL if @whence is above
 * SEEK_MAX, -EFAULT if @result cannot be written, or the negative error
 * from vfs_llseek().
 */
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		unsigned long, offset_low, loff_t __user *, result,
		unsigned int, whence)
{
	int retval;
	CLASS(fd_pos, f)(fd);
	loff_t offset;

	if (fd_empty(f))
		return -EBADF;

	if (whence > SEEK_MAX)
		return -EINVAL;

	offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low,
			whence);

	/* A negative offset is an error code and is passed through as-is. */
	retval = (int)offset;
	if (offset >= 0) {
		retval = -EFAULT;
		if (!copy_to_user(result, &offset, sizeof(offset)))
			retval = 0;
	}
	return retval;
}
#endif

/**
 * rw_verify_area - validate a read or write request before it is issued
 * @read_write:	READ or WRITE; selects the MAY_READ or MAY_WRITE permission mask
 * @file:	file the request is for
 * @ppos:	position of the request, or NULL to skip the position checks
 * @count:	number of bytes requested
 *
 * Rejects a @count that is negative when viewed as ssize_t, then checks
 * that the position range is acceptable (negative positions, and ranges
 * whose end becomes negative, are only allowed for files with unsigned
 * offsets), and finally consults the security and fsnotify permission
 * hooks.
 *
 * Return: a negative errno if a check fails; otherwise the result of
 * fsnotify_file_area_perm() (callers in this file treat 0 as success).
 */
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
	int mask = read_write == READ ? MAY_READ : MAY_WRITE;
	int ret;

	if (unlikely((ssize_t) count < 0))
		return -EINVAL;

	if (ppos) {
		loff_t pos = *ppos;

		if (unlikely(pos < 0)) {
			if (!unsigned_offsets(file))
				return -EINVAL;
			if (count >= -pos) /* both values are in 0..LLONG_MAX */
				return -EOVERFLOW;
		} else if (unlikely((loff_t) (pos + count) < 0)) {
			if (!unsigned_offsets(file))
				return -EINVAL;
		}
	}

	ret = security_file_permission(file, mask);
	if (ret)
		return ret;

	return fsnotify_file_area_perm(file, mask, ppos, count);
}
EXPORT_SYMBOL(rw_verify_area);

/*
 * Synchronous read into a user buffer through ->read_iter.
 *
 * Builds a sync kiocb starting at *@ppos (0 if @ppos is NULL) and a
 * single-buffer destination iterator over @buf/@len. The position is
 * written back through @ppos whenever it is non-NULL, regardless of the
 * return value. -EIOCBQUEUED from the method is treated as a bug.
 */
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = (ppos ? *ppos : 0);
	iov_iter_ubuf(&iter, ITER_DEST, buf, len);

	ret = filp->f_op->read_iter(&kiocb, &iter);
	BUG_ON(ret == -EIOCBQUEUED);
	if (ppos)
		*ppos = kiocb.ki_pos;
	return ret;
}

/*
 * Emit a rate-limited warning that the in-kernel operation @op ("read" or
 * "write" in this file) is not supported for @file, and return -EINVAL.
 */
static int warn_unsupported(struct file *file, const char *op)
{
	pr_warn_ratelimited(
		"kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
		op, file, current->pid, current->comm);
	return -EINVAL;
}

/*
 * Read from @file into the kernel buffer @buf without calling
 * rw_verify_area(); kernel_read() is the variant that performs that check.
 *
 * At most MAX_RW_COUNT bytes are requested. The file must be open for
 * reading (a one-time warning is raised otherwise) and must provide
 * ->read_iter but not ->read. *@pos is updated, and the fsnotify access
 * event and read-byte accounting are done, only when bytes were actually
 * read (ret > 0); the read syscall counter is bumped whenever the method
 * was called.
 */
ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
	struct kvec iov = {
		.iov_base	= buf,
		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
	};
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
		return -EINVAL;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;
	/*
	 * Also fail if ->read_iter and ->read are both wired up as that
	 * implies very convoluted semantics.
	 */
	if (unlikely(!file->f_op->read_iter || file->f_op->read))
		return warn_unsupported(file, "read");

	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = pos ? *pos : 0;
	iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len);
	ret = file->f_op->read_iter(&kiocb, &iter);
	if (ret > 0) {
		if (pos)
			*pos = kiocb.ki_pos;
		fsnotify_access(file);
		add_rchar(current, ret);
	}
	inc_syscr(current);
	return ret;
}

/**
 * kernel_read - read from a file into a kernel buffer
 * @file:	file to read from
 * @buf:	kernel buffer to fill
 * @count:	number of bytes to read
 * @pos:	position to read at; updated on a successful read, may be NULL
 *
 * Runs rw_verify_area() and, if that passes, __kernel_read().
 *
 * Return: number of bytes read, or a negative errno.
 */
ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	ret = rw_verify_area(READ, file, pos, count);
	if (ret)
		return ret;
	return __kernel_read(file, buf, count, pos);
}
EXPORT_SYMBOL(kernel_read);

/**
 * vfs_read - read from a file into a user buffer
 * @file:	file to read from
 * @buf:	user buffer to fill
 * @count:	number of bytes requested; clamped to MAX_RW_COUNT
 * @pos:	position to read at, may be NULL
 *
 * Uses the file's ->read method if present, otherwise ->read_iter.
 *
 * Return: number of bytes read, or a negative errno: -EBADF if the file is
 * not open for reading, -EINVAL if it cannot be read or has no read
 * method, -EFAULT if @buf fails access_ok(), or the error from
 * rw_verify_area() or the read method.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;
	if (unlikely(!access_ok(buf, count)))
		return -EFAULT;

	ret = rw_verify_area(READ, file, pos, count);
	if (ret)
		return ret;
	if (count >  MAX_RW_COUNT)
		count =  MAX_RW_COUNT;

	if (file->f_op->read)
		ret = file->f_op->read(file, buf, count, pos);
	else if (file->f_op->read_iter)
		ret = new_sync_read(file, buf, count, pos);
	else
		ret = -EINVAL;
	if (ret > 0) {
		fsnotify_access(file);
		add_rchar(current, ret);
	}
	inc_syscr(current);
	return ret;
}

/*
 * Synchronous write from a user buffer through ->write_iter.
 *
 * Counterpart of new_sync_read(), except that the position is written back
 * through @ppos only when bytes were actually written (ret > 0).
 */
static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = (ppos ? *ppos : 0);
	iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len);

	ret = filp->f_op->write_iter(&kiocb, &iter);
	BUG_ON(ret == -EIOCBQUEUED);
	if (ret > 0 && ppos)
		*ppos = kiocb.ki_pos;
	return ret;
}

/*
 * Write the contents of the iterator @from to @file through ->write_iter.
 *
 * The file must be open for writing (a one-time warning and -EBADF
 * otherwise) and must provide ->write_iter but not ->write. *@pos is
 * updated, and the fsnotify modify event and write-byte accounting are
 * done, only when ret > 0.
 *
 * caller is responsible for file_start_write/file_end_write
 */
ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos)
{
	struct kiocb kiocb;
	ssize_t ret;

	if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	/*
	 * Also fail if ->write_iter and ->write are both wired up as that
	 * implies very convoluted semantics.
	 */
	if (unlikely(!file->f_op->write_iter || file->f_op->write))
		return warn_unsupported(file, "write");

	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = pos ? *pos : 0;
	ret = file->f_op->write_iter(&kiocb, from);
	if (ret > 0) {
		if (pos)
			*pos = kiocb.ki_pos;
		fsnotify_modify(file);
		add_wchar(current, ret);
	}
	inc_syscw(current);
	return ret;
}

/*
 * Write the kernel buffer @buf (at most MAX_RW_COUNT bytes of it) to @file
 * by wrapping it in a single-segment kvec iterator and handing it to
 * __kernel_write_iter().
 *
 * caller is responsible for file_start_write/file_end_write
 */
ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
{
	struct kvec iov = {
		.iov_base	= (void *)buf,
		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
	};
	struct iov_iter iter;
	iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len);
	return __kernel_write_iter(file, &iter, pos);
}
/*
 * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
 * but autofs is one of the few internal kernel users that actually
 * wants this _and_ can be built as a module. So we need to export
 * this symbol for autofs, even though it really isn't appropriate
 * for any other kernel modules.
 */
EXPORT_SYMBOL_GPL(__kernel_write);

/**
 * kernel_write - write a kernel buffer to a file
 * @file:	file to write to
 * @buf:	kernel buffer holding the data
 * @count:	number of bytes to write
 * @pos:	position to write at; updated on a successful write, may be NULL
 *
 * Runs rw_verify_area() and, if that passes, calls __kernel_write()
 * bracketed by file_start_write()/file_end_write().
 *
 * Return: number of bytes written, or a negative errno.
 */
ssize_t kernel_write(struct file *file, const void *buf, size_t count,
			    loff_t *pos)
{
	ssize_t ret;

	ret = rw_verify_area(WRITE, file, pos, count);
	if (ret)
		return ret;

	file_start_write(file);
	ret =  __kernel_write(file, buf, count, pos);
	file_end_write(file);
	return ret;
}
EXPORT_SYMBOL(kernel_write);

/**
 * vfs_write - write a user buffer to a file
 * @file:	file to write to
 * @buf:	user buffer holding the data
 * @count:	number of bytes requested; clamped to MAX_RW_COUNT
 * @pos:	position to write at, may be NULL
 *
 * Uses the file's ->write method if present, otherwise ->write_iter. The
 * method call, notification and accounting all happen between
 * file_start_write() and file_end_write().
 *
 * Return: number of bytes written, or a negative errno: -EBADF if the file
 * is not open for writing, -EINVAL if it cannot be written or has no write
 * method, -EFAULT if @buf fails access_ok(), or the error from
 * rw_verify_area() or the write method.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	if (unlikely(!access_ok(buf, count)))
		return -EFAULT;

	ret = rw_verify_area(WRITE, file, pos, count);
	if (ret)
		return ret;
	if (count > MAX_RW_COUNT)
		count =  MAX_RW_COUNT;
	file_start_write(file);
	if (file->f_op->write)
		ret = file->f_op->write(file, buf, count, pos);
	else if (file->f_op->write_iter)
		ret = new_sync_write(file, buf, count, pos);
	else
		ret = -EINVAL;
	if (ret > 0) {
		fsnotify_modify(file);
		add_wchar(current, ret);
	}
	inc_syscw(current);
	file_end_write(file);
	return ret;
}

/* file_ppos returns &file->f_pos or NULL if file is stream */
static inline loff_t *file_ppos(struct file *file)
{
	return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
}

/*
 * Common body of read(): read from @fd at the file's current position.
 *
 * For non-stream files the read works on a local copy of f_pos, which is
 * stored back into the file only if vfs_read() did not fail (ret >= 0).
 * Returns -EBADF if @fd does not resolve to a file.
 */
ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
{
	CLASS(fd_pos, f)(fd);
	ssize_t ret = -EBADF;

	if (!fd_empty(f)) {
		loff_t pos, *ppos = file_ppos(fd_file(f));
		if (ppos) {
			pos = *ppos;
			ppos = &pos;
		}
		ret = vfs_read(fd_file(f), buf, count, ppos);
		if (ret >= 0 && ppos)
			fd_file(f)->f_pos = pos;
	}
	return ret;
}

SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	return ksys_read(fd, buf, count);
}

/*
 * Common body of write(): write to @fd at the file's current position.
 *
 * Mirrors ksys_read(): a local copy of f_pos is used and stored back only
 * if vfs_write() did not fail. Returns -EBADF if @fd does not resolve to a
 * file.
 */
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
	CLASS(fd_pos, f)(fd);
	ssize_t ret = -EBADF;

	if (!fd_empty(f)) {
		loff_t pos, *ppos = file_ppos(fd_file(f));
		if (ppos) {
			pos = *ppos;
			ppos = &pos;
		}
		ret = vfs_write(fd_file(f), buf, count, ppos);
		if (ret >= 0 && ppos)
			fd_file(f)->f_pos = pos;
	}

	return ret;
}

SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	return ksys_write(fd, buf, count);
}

/*
 * Common body of pread64(): read from @fd at the explicit position @pos.
 * The file's own f_pos is not touched; @pos is a local copy.
 *
 * Returns -EINVAL for a negative @pos, -EBADF if @fd does not resolve to a
 * file, -ESPIPE if the file lacks FMODE_PREAD, else the vfs_read() result.
 */
ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
		     loff_t pos)
{
	if (pos < 0)
		return -EINVAL;

	CLASS(fd, f)(fd);
	if (fd_empty(f))
		return -EBADF;

	if (fd_file(f)->f_mode & FMODE_PREAD)
		return vfs_read(fd_file(f), buf, count, &pos);

	return -ESPIPE;
}

SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
			size_t, count, loff_t, pos)
{
	return ksys_pread64(fd, buf, count, pos);
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64)
COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf,
		       size_t, count, compat_arg_u64_dual(pos))
{
	return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos));
}
#endif

/*
 * Common body of pwrite64(): write to @fd at the explicit position @pos.
 * The file's own f_pos is not touched; @pos is a local copy.
 *
 * Returns -EINVAL for a negative @pos, -EBADF if @fd does not resolve to a
 * file, -ESPIPE if the file lacks FMODE_PWRITE, else the vfs_write()
 * result.
 */
ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
		      size_t count, loff_t pos)
{
	if (pos < 0)
		return -EINVAL;

	CLASS(fd, f)(fd);
	if (fd_empty(f))
		return -EBADF;

	if (fd_file(f)->f_mode & FMODE_PWRITE)
		return vfs_write(fd_file(f), buf, count, &pos);

	return -ESPIPE;
}

SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
			 size_t, count, loff_t, pos)
{
	return ksys_pwrite64(fd, buf, count, pos);
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64)
COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf,
		       size_t, count, compat_arg_u64_dual(pos))
{
	return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos));
}
#endif

/*
 * Perform one synchronous vectored transfer through ->read_iter (@type ==
 * READ) or ->write_iter (otherwise).
 *
 * The per-call @flags are applied to the kiocb via kiocb_set_rw_flags(); a
 * failure there is returned before the method is called. The position is
 * written back through @ppos whenever it is non-NULL. -EIOCBQUEUED from the
 * method is treated as a bug.
 */
static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
		loff_t *ppos, int type, rwf_t flags)
{
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	ret = kiocb_set_rw_flags(&kiocb, flags, type);
	if (ret)
		return ret;
	kiocb.ki_pos = (ppos ? *ppos : 0);

	if (type == READ)
		ret = filp->f_op->read_iter(&kiocb, iter);
	else
		ret = filp->f_op->write_iter(&kiocb, iter);
	BUG_ON(ret == -EIOCBQUEUED);
	if (ppos)
		*ppos = kiocb.ki_pos;
	return ret;
}

/*
 * Do it by hand, with file-ops
 *
 * Fallback for files without ->read_iter/->write_iter: call ->read or
 * ->write once per iterator segment. Any flag other than RWF_HIPRI yields
 * -EOPNOTSUPP. The loop stops at the first error or short transfer; an
 * error is only returned if nothing had been transferred before it.
 */
static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
		loff_t *ppos, int type, rwf_t flags)
{
	ssize_t ret = 0;

	if (flags & ~RWF_HIPRI)
		return -EOPNOTSUPP;

	while (iov_iter_count(iter)) {
		ssize_t nr;

		if (type == READ) {
			nr = filp->f_op->read(filp, iter_iov_addr(iter),
						iter_iov_len(iter), ppos);
		} else {
			nr = filp->f_op->write(filp, iter_iov_addr(iter),
						iter_iov_len(iter), ppos);
		}

		if (nr < 0) {
			if (!ret)
				ret = nr;
			break;
		}
		ret += nr;
		/* Short transfer: report what was done so far. */
		if (nr != iter_iov_len(iter))
			break;
		iov_iter_advance(iter, nr);
	}

	return ret;
}

/**
 * vfs_iocb_iter_read - read into an iterator using a caller-supplied kiocb
 * @file:	file to read from
 * @iocb:	kiocb describing the request; its ki_pos is the read position
 * @iter:	destination iterator
 *
 * A zero-length request skips verification and the method call but still
 * raises the fsnotify access event and returns 0.
 *
 * Return: the ->read_iter result, or a negative errno: -EINVAL if the file
 * has no ->read_iter or cannot be read, -EBADF if it is not open for
 * reading, or the error from rw_verify_area().
 */
ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
			   struct iov_iter *iter)
{
	size_t tot_len;
	ssize_t ret = 0;

	if (!file->f_op->read_iter)
		return -EINVAL;
	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;

	tot_len = iov_iter_count(iter);
	if (!tot_len)
		goto out;
	ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
	if (ret < 0)
		return ret;

	ret = file->f_op->read_iter(iocb, iter);
out:
	if (ret >= 0)
		fsnotify_access(file);
	return ret;
}
EXPORT_SYMBOL(vfs_iocb_iter_read);

/**
 * vfs_iter_read - synchronously read into an iterator
 * @file:	file to read from
 * @iter:	destination iterator
 * @ppos:	position to read at; updated by the read, may be NULL
 * @flags:	per-call RWF_* flags
 *
 * Same checks as vfs_iocb_iter_read(), but the kiocb is built internally
 * by do_iter_readv_writev().
 *
 * Return: number of bytes read (0 for an empty iterator), or a negative
 * errno.
 */
ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
		      rwf_t flags)
{
	size_t tot_len;
	ssize_t ret = 0;

	if (!file->f_op->read_iter)
		return -EINVAL;
	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;

	tot_len = iov_iter_count(iter);
	if (!tot_len)
		goto out;
	ret = rw_verify_area(READ, file, ppos, tot_len);
	if (ret < 0)
		return ret;

	ret = do_iter_readv_writev(file, iter, ppos, READ, flags);
out:
	if (ret >= 0)
		fsnotify_access(file);
	return ret;
}
EXPORT_SYMBOL(vfs_iter_read);

/**
 * vfs_iocb_iter_write - write from an iterator using a caller-supplied kiocb
 * @file:	file to write to
 * @iocb:	kiocb describing the request; its ki_pos is the write position
 * @iter:	source iterator
 *
 * A zero-length request returns 0 without calling the method. The fsnotify
 * modify event is raised only when ret > 0.
 *
 * Caller is responsible for calling kiocb_end_write() on completion
 * if async iocb was queued.
 *
 * Return: the ->write_iter result (possibly -EIOCBQUEUED), or a negative
 * errno from the mode checks or rw_verify_area().
 */
ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
			    struct iov_iter *iter)
{
	size_t tot_len;
	ssize_t ret = 0;

	if (!file->f_op->write_iter)
		return -EINVAL;
	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;

	tot_len = iov_iter_count(iter);
	if (!tot_len)
		return 0;
	ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
	if (ret < 0)
		return ret;

	kiocb_start_write(iocb);
	ret = file->f_op->write_iter(iocb, iter);
	if (ret != -EIOCBQUEUED)
		kiocb_end_write(iocb);
	if (ret > 0)
		fsnotify_modify(file);

	return ret;
}
EXPORT_SYMBOL(vfs_iocb_iter_write);

/**
 * vfs_iter_write - synchronously write from an iterator
 * @file:	file to write to
 * @iter:	source iterator
 * @ppos:	position to write at; updated by the write, may be NULL
 * @flags:	per-call RWF_* flags
 *
 * The transfer is bracketed by file_start_write()/file_end_write(); the
 * fsnotify modify event is raised only when ret > 0.
 *
 * Return: number of bytes written (0 for an empty iterator), or a negative
 * errno: -EBADF if the file is not open for writing, -EINVAL if it cannot
 * be written or has no ->write_iter, or the error from rw_verify_area() or
 * the transfer.
 */
ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
		       rwf_t flags)
{
	size_t tot_len;
	ssize_t ret;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	if (!file->f_op->write_iter)
		return -EINVAL;

	tot_len = iov_iter_count(iter);
	if (!tot_len)
		return 0;

	ret = rw_verify_area(WRITE, file, ppos, tot_len);
	if (ret < 0)
		return ret;

	file_start_write(file);
	ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags);
	if (ret > 0)
		fsnotify_modify(file);
	file_end_write(file);

	return ret;
}
EXPORT_SYMBOL(vfs_iter_write);

/*
 * Vectored read from @file into the user iovec array @vec/@vlen.
 *
 * The iovec array is imported into an iterator (using the on-stack array
 * for up to UIO_FASTIOV entries), verified, and then transferred through
 * ->read_iter if available, or segment by segment through ->read.
 * NOTE(review): kfree(iov) relies on import_iovec() leaving @iov either
 * NULL or pointing at an allocation it made - confirm against
 * import_iovec().
 */
static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
			 unsigned long vlen, loff_t *pos, rwf_t flags)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	size_t tot_len;
	ssize_t ret = 0;

	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;

	ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov,
			   &iter);
	if (ret < 0)
		return ret;

	tot_len = iov_iter_count(&iter);
	if (!tot_len)
		goto out;

	ret = rw_verify_area(READ, file, pos, tot_len);
	if (ret < 0)
		goto out;

	if (file->f_op->read_iter)
		ret = do_iter_readv_writev(file, &iter, pos, READ, flags);
	else
		ret = do_loop_readv_writev(file, &iter, pos, READ, flags);
out:
	if (ret >= 0)
		fsnotify_access(file);
	kfree(iov);
	return ret;
}

/*
 * Vectored write to @file from the user iovec array @vec/@vlen.
 *
 * Counterpart of vfs_readv(): the transfer runs between file_start_write()
 * and file_end_write(), and the fsnotify modify event is raised only when
 * ret > 0. The verification-failure and zero-length paths skip the
 * start/end write bracket entirely.
 */
static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
			  unsigned long vlen, loff_t *pos, rwf_t flags)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	size_t tot_len;
	ssize_t ret = 0;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;

	ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov,
			   &iter);
	if (ret < 0)
		return ret;

	tot_len = iov_iter_count(&iter);
	if (!tot_len)
		goto out;

	ret = rw_verify_area(WRITE, file, pos, tot_len);
	if (ret < 0)
		goto out;

	file_start_write(file);
	if (file->f_op->write_iter)
		ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags);
	else
		ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags);
	if (ret > 0)
		fsnotify_modify(file);
	file_end_write(file);
out:
	kfree(iov);
	return ret;
}

/*
 * Common body of readv()/preadv2(pos == -1): vectored read at the file's
 * current position. As in ksys_read(), a local copy of f_pos is used and
 * stored back only if the read did not fail. Read accounting is done here
 * rather than in vfs_readv().
 */
static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
			unsigned long vlen, rwf_t flags)
{
	CLASS(fd_pos, f)(fd);
	ssize_t ret = -EBADF;

	if (!fd_empty(f)) {
		loff_t pos, *ppos = file_ppos(fd_file(f));
		if (ppos) {
			pos = *ppos;
			ppos = &pos;
		}
		ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags);
		if (ret >= 0 && ppos)
			fd_file(f)->f_pos = pos;
	}

	if (ret > 0)
		add_rchar(current, ret);
	inc_syscr(current);
	return ret;
}

/*
 * Common body of writev()/pwritev2(pos == -1): vectored write at the
 * file's current position. Mirrors do_readv(), with write accounting.
 */
static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
			 unsigned long vlen, rwf_t flags)
{
	CLASS(fd_pos, f)(fd);
	ssize_t ret = -EBADF;

	if (!fd_empty(f)) {
		loff_t pos, *ppos = file_ppos(fd_file(f));
		if (ppos) {
			pos = *ppos;
			ppos = &pos;
		}
		ret = vfs_writev(fd_file(f), vec, vlen, ppos, flags);
		if (ret >= 0 && ppos)
			fd_file(f)->f_pos = pos;
	}

	if (ret > 0)
		add_wchar(current, ret);
	inc_syscw(current);
	return ret;
}

/*
 * Combine the two unsigned long halves of a position into one loff_t.
 *
 * @high is shifted left by BITS_PER_LONG in two steps of HALF_LONG_BITS so
 * that no single shift count equals the full width of a long; a single
 * shift by the full width would be undefined if loff_t is no wider than
 * long.
 */
static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
{
#define HALF_LONG_BITS (BITS_PER_LONG / 2)
	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
}

/*
 * Common body of the preadv() family: vectored read at the explicit
 * position @pos; the file's own f_pos is not touched.
 *
 * Returns -EINVAL for a negative @pos (before any accounting), -EBADF if
 * @fd does not resolve to a file, -ESPIPE if the file lacks FMODE_PREAD,
 * else the vfs_readv() result.
 */
static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
			 unsigned long vlen, loff_t pos, rwf_t flags)
{
	ssize_t ret = -EBADF;

	if (pos < 0)
		return -EINVAL;

	CLASS(fd, f)(fd);
	if (!fd_empty(f)) {
		ret = -ESPIPE;
		if (fd_file(f)->f_mode & FMODE_PREAD)
			ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags);
	}

	if (ret > 0)
		add_rchar(current, ret);
	inc_syscr(current);
	return ret;
}

/*
 * Common body of the pwritev() family: vectored write at the explicit
 * position @pos; the file's own f_pos is not touched.
 *
 * Returns -EINVAL for a negative @pos (before any accounting), -EBADF if
 * @fd does not resolve to a file, -ESPIPE if the file lacks FMODE_PWRITE,
 * else the vfs_writev() result.
 */
static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
			  unsigned long vlen, loff_t pos, rwf_t flags)
{
	ssize_t ret = -EBADF;

	if (pos < 0)
		return -EINVAL;

	CLASS(fd, f)(fd);
	if (!fd_empty(f)) {
		ret = -ESPIPE;
		if (fd_file(f)->f_mode & FMODE_PWRITE)
			ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags);
	}

	if (ret > 0)
		add_wchar(current, ret);
	inc_syscw(current);
	return ret;
}

SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	return do_readv(fd, vec, vlen, 0);
}

SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	return do_writev(fd, vec, vlen, 0);
}

SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);

	return do_preadv(fd, vec, vlen, pos, 0);
}

/* preadv2(): like preadv() with flags; a position of -1 means "use f_pos". */
SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
		rwf_t, flags)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);

	if (pos == -1)
		return do_readv(fd, vec, vlen, flags);

	return do_preadv(fd, vec, vlen, pos, flags);
}

SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);

	return do_pwritev(fd, vec, vlen, pos, 0);
}

/* pwritev2(): like pwritev() with flags; a position of -1 means "use f_pos". */
SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
		rwf_t, flags)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);

	if (pos == -1)
		return do_writev(fd, vec, vlen, flags);

	return do_pwritev(fd, vec, vlen, pos, flags);
}

/*
 * Various compat syscalls.  Note that they all pretend to take a native
 * iovec - import_iovec will properly treat those as compat_iovecs based on
 * in_compat_syscall().
 *
 * The variants taking pos_low/pos_high assemble the 64-bit position from
 * two 32-bit halves; the *64 variants receive it as a single loff_t. As in
 * the native calls, the v2 variants treat a position of -1 as "use f_pos".
 */
#ifdef CONFIG_COMPAT
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos)
{
	return do_preadv(fd, vec, vlen, pos, 0);
}
#endif

COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
		const struct iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
	loff_t pos = ((loff_t)pos_high << 32) | pos_low;

	return do_preadv(fd, vec, vlen, pos, 0);
}

#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	if (pos == -1)
		return do_readv(fd, vec, vlen, flags);
	return do_preadv(fd, vec, vlen, pos, flags);
}
#endif

COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
		const struct iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
		rwf_t, flags)
{
	loff_t pos = ((loff_t)pos_high << 32) | pos_low;

	if (pos == -1)
		return do_readv(fd, vec, vlen, flags);
	return do_preadv(fd, vec, vlen, pos, flags);
}

#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos)
{
	return do_pwritev(fd, vec, vlen, pos, 0);
}
#endif

COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
		const struct iovec __user *,vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
	loff_t pos = ((loff_t)pos_high << 32) | pos_low;

	return do_pwritev(fd, vec, vlen, pos, 0);
}

#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	if (pos == -1)
		return do_writev(fd, vec, vlen, flags);
	return do_pwritev(fd, vec, vlen, pos, flags);
}
#endif

COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
		const struct iovec __user *,vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
{
	loff_t pos = ((loff_t)pos_high << 32) | pos_low;

	if (pos == -1)
		return do_writev(fd, vec, vlen, flags);
	return do_pwritev(fd, vec, vlen, pos, flags);
}
#endif /* CONFIG_COMPAT */

static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
			   size_t count, loff_t max)
{
	struct inode *in_inode, *out_inode;
	struct pipe_inode_info *opipe;
	loff_t pos;
	loff_t out_pos;
	ssize_t retval;
	int fl;

	/*
	 * Get input file, and verify that it is ok..
	 */
	CLASS(fd, in)(in_fd);
	if (fd_empty(in))
		return -EBADF;
	if (!(fd_file(in)->f_mode & FMODE_READ))
		return -EBADF;
	if (!ppos) {
		pos = fd_file(in)->f_pos;
	} else {
		pos = *ppos;
		if (!(fd_file(in)->f_mode & FMODE_PREAD))
			return -ESPIPE;
	}
	retval = rw_verify_area(READ, fd_file(in), &pos, count);
	if (retval < 0)
		return retval;
	if (count > MAX_RW_COUNT)
		count =  MAX_RW_COUNT;

	/*
	 * Get output file, and verify that it is ok..
	 */
	CLASS(fd, out)(out_fd);
	if (fd_empty(out))
		return -EBADF;
	if (!(fd_file(out)->f_mode & FMODE_WRITE))
		return -EBADF;
	in_inode = file_inode(fd_file(in));
	out_inode = file_inode(fd_file(out));
	out_pos = fd_file(out)->f_pos;

	if (!max)
		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);

	if (unlikely(pos + count > max)) {
		if (pos >= max)
			return -EOVERFLOW;
		count = max - pos;
	}

	fl = 0;
#if 0
	/*
	 * We need to debate whether we can enable this or not.
The 1360 * man page documents EAGAIN return for the output at least, 1361 * and the application is arguably buggy if it doesn't expect 1362 * EAGAIN on a non-blocking file descriptor. 1363 */ 1364 if (fd_file(in)->f_flags & O_NONBLOCK) 1365 fl = SPLICE_F_NONBLOCK; 1366 #endif 1367 opipe = get_pipe_info(fd_file(out), true); 1368 if (!opipe) { 1369 retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count); 1370 if (retval < 0) 1371 return retval; 1372 retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos, 1373 count, fl); 1374 } else { 1375 if (fd_file(out)->f_flags & O_NONBLOCK) 1376 fl |= SPLICE_F_NONBLOCK; 1377 1378 retval = splice_file_to_pipe(fd_file(in), opipe, &pos, count, fl); 1379 } 1380 1381 if (retval > 0) { 1382 add_rchar(current, retval); 1383 add_wchar(current, retval); 1384 fsnotify_access(fd_file(in)); 1385 fsnotify_modify(fd_file(out)); 1386 fd_file(out)->f_pos = out_pos; 1387 if (ppos) 1388 *ppos = pos; 1389 else 1390 fd_file(in)->f_pos = pos; 1391 } 1392 1393 inc_syscr(current); 1394 inc_syscw(current); 1395 if (pos > max) 1396 retval = -EOVERFLOW; 1397 return retval; 1398 } 1399 1400 SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) 1401 { 1402 loff_t pos; 1403 off_t off; 1404 ssize_t ret; 1405 1406 if (offset) { 1407 if (unlikely(get_user(off, offset))) 1408 return -EFAULT; 1409 pos = off; 1410 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); 1411 if (unlikely(put_user(pos, offset))) 1412 return -EFAULT; 1413 return ret; 1414 } 1415 1416 return do_sendfile(out_fd, in_fd, NULL, count, 0); 1417 } 1418 1419 SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) 1420 { 1421 loff_t pos; 1422 ssize_t ret; 1423 1424 if (offset) { 1425 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) 1426 return -EFAULT; 1427 ret = do_sendfile(out_fd, in_fd, &pos, count, 0); 1428 if (unlikely(put_user(pos, offset))) 1429 return -EFAULT; 1430 return ret; 
1431 } 1432 1433 return do_sendfile(out_fd, in_fd, NULL, count, 0); 1434 } 1435 1436 #ifdef CONFIG_COMPAT 1437 COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, 1438 compat_off_t __user *, offset, compat_size_t, count) 1439 { 1440 loff_t pos; 1441 off_t off; 1442 ssize_t ret; 1443 1444 if (offset) { 1445 if (unlikely(get_user(off, offset))) 1446 return -EFAULT; 1447 pos = off; 1448 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); 1449 if (unlikely(put_user(pos, offset))) 1450 return -EFAULT; 1451 return ret; 1452 } 1453 1454 return do_sendfile(out_fd, in_fd, NULL, count, 0); 1455 } 1456 1457 COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, 1458 compat_loff_t __user *, offset, compat_size_t, count) 1459 { 1460 loff_t pos; 1461 ssize_t ret; 1462 1463 if (offset) { 1464 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) 1465 return -EFAULT; 1466 ret = do_sendfile(out_fd, in_fd, &pos, count, 0); 1467 if (unlikely(put_user(pos, offset))) 1468 return -EFAULT; 1469 return ret; 1470 } 1471 1472 return do_sendfile(out_fd, in_fd, NULL, count, 0); 1473 } 1474 #endif 1475 1476 /* 1477 * Performs necessary checks before doing a file copy 1478 * 1479 * Can adjust amount of bytes to copy via @req_count argument. 1480 * Returns appropriate error code that caller should return or 1481 * zero in case the copy should be allowed. 
1482 */ 1483 static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, 1484 struct file *file_out, loff_t pos_out, 1485 size_t *req_count, unsigned int flags) 1486 { 1487 struct inode *inode_in = file_inode(file_in); 1488 struct inode *inode_out = file_inode(file_out); 1489 uint64_t count = *req_count; 1490 loff_t size_in; 1491 int ret; 1492 1493 ret = generic_file_rw_checks(file_in, file_out); 1494 if (ret) 1495 return ret; 1496 1497 /* 1498 * We allow some filesystems to handle cross sb copy, but passing 1499 * a file of the wrong filesystem type to filesystem driver can result 1500 * in an attempt to dereference the wrong type of ->private_data, so 1501 * avoid doing that until we really have a good reason. 1502 * 1503 * nfs and cifs define several different file_system_type structures 1504 * and several different sets of file_operations, but they all end up 1505 * using the same ->copy_file_range() function pointer. 1506 */ 1507 if (flags & COPY_FILE_SPLICE) { 1508 /* cross sb splice is allowed */ 1509 } else if (file_out->f_op->copy_file_range) { 1510 if (file_in->f_op->copy_file_range != 1511 file_out->f_op->copy_file_range) 1512 return -EXDEV; 1513 } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) { 1514 return -EXDEV; 1515 } 1516 1517 /* Don't touch certain kinds of inodes */ 1518 if (IS_IMMUTABLE(inode_out)) 1519 return -EPERM; 1520 1521 if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) 1522 return -ETXTBSY; 1523 1524 /* Ensure offsets don't wrap. */ 1525 if (pos_in + count < pos_in || pos_out + count < pos_out) 1526 return -EOVERFLOW; 1527 1528 /* Shorten the copy to EOF */ 1529 size_in = i_size_read(inode_in); 1530 if (pos_in >= size_in) 1531 count = 0; 1532 else 1533 count = min(count, size_in - (uint64_t)pos_in); 1534 1535 ret = generic_write_check_limits(file_out, pos_out, &count); 1536 if (ret) 1537 return ret; 1538 1539 /* Don't allow overlapped copying within the same file. 
*/ 1540 if (inode_in == inode_out && 1541 pos_out + count > pos_in && 1542 pos_out < pos_in + count) 1543 return -EINVAL; 1544 1545 *req_count = count; 1546 return 0; 1547 } 1548 1549 /* 1550 * copy_file_range() differs from regular file read and write in that it 1551 * specifically allows return partial success. When it does so is up to 1552 * the copy_file_range method. 1553 */ 1554 ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, 1555 struct file *file_out, loff_t pos_out, 1556 size_t len, unsigned int flags) 1557 { 1558 ssize_t ret; 1559 bool splice = flags & COPY_FILE_SPLICE; 1560 bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb; 1561 1562 if (flags & ~COPY_FILE_SPLICE) 1563 return -EINVAL; 1564 1565 ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len, 1566 flags); 1567 if (unlikely(ret)) 1568 return ret; 1569 1570 ret = rw_verify_area(READ, file_in, &pos_in, len); 1571 if (unlikely(ret)) 1572 return ret; 1573 1574 ret = rw_verify_area(WRITE, file_out, &pos_out, len); 1575 if (unlikely(ret)) 1576 return ret; 1577 1578 if (len == 0) 1579 return 0; 1580 1581 /* 1582 * Make sure return value doesn't overflow in 32bit compat mode. Also 1583 * limit the size for all cases except when calling ->copy_file_range(). 1584 */ 1585 if (splice || !file_out->f_op->copy_file_range || in_compat_syscall()) 1586 len = min_t(size_t, MAX_RW_COUNT, len); 1587 1588 file_start_write(file_out); 1589 1590 /* 1591 * Cloning is supported by more file systems, so we implement copy on 1592 * same sb using clone, but for filesystems where both clone and copy 1593 * are supported (e.g. nfs,cifs), we only call the copy method. 
1594 */ 1595 if (!splice && file_out->f_op->copy_file_range) { 1596 ret = file_out->f_op->copy_file_range(file_in, pos_in, 1597 file_out, pos_out, 1598 len, flags); 1599 } else if (!splice && file_in->f_op->remap_file_range && samesb) { 1600 ret = file_in->f_op->remap_file_range(file_in, pos_in, 1601 file_out, pos_out, len, REMAP_FILE_CAN_SHORTEN); 1602 /* fallback to splice */ 1603 if (ret <= 0) 1604 splice = true; 1605 } else if (samesb) { 1606 /* Fallback to splice for same sb copy for backward compat */ 1607 splice = true; 1608 } 1609 1610 file_end_write(file_out); 1611 1612 if (!splice) 1613 goto done; 1614 1615 /* 1616 * We can get here for same sb copy of filesystems that do not implement 1617 * ->copy_file_range() in case filesystem does not support clone or in 1618 * case filesystem supports clone but rejected the clone request (e.g. 1619 * because it was not block aligned). 1620 * 1621 * In both cases, fall back to kernel copy so we are able to maintain a 1622 * consistent story about which filesystems support copy_file_range() 1623 * and which filesystems do not, that will allow userspace tools to 1624 * make consistent desicions w.r.t using copy_file_range(). 1625 * 1626 * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE 1627 * for server-side-copy between any two sb. 1628 * 1629 * In any case, we call do_splice_direct() and not splice_file_range(), 1630 * without file_start_write() held, to avoid possible deadlocks related 1631 * to splicing from input file, while file_start_write() is held on 1632 * the output file on a different sb. 
1633 */ 1634 ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0); 1635 done: 1636 if (ret > 0) { 1637 fsnotify_access(file_in); 1638 add_rchar(current, ret); 1639 fsnotify_modify(file_out); 1640 add_wchar(current, ret); 1641 } 1642 1643 inc_syscr(current); 1644 inc_syscw(current); 1645 1646 return ret; 1647 } 1648 EXPORT_SYMBOL(vfs_copy_file_range); 1649 1650 SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, 1651 int, fd_out, loff_t __user *, off_out, 1652 size_t, len, unsigned int, flags) 1653 { 1654 loff_t pos_in; 1655 loff_t pos_out; 1656 ssize_t ret = -EBADF; 1657 1658 CLASS(fd, f_in)(fd_in); 1659 if (fd_empty(f_in)) 1660 return -EBADF; 1661 1662 CLASS(fd, f_out)(fd_out); 1663 if (fd_empty(f_out)) 1664 return -EBADF; 1665 1666 if (off_in) { 1667 if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) 1668 return -EFAULT; 1669 } else { 1670 pos_in = fd_file(f_in)->f_pos; 1671 } 1672 1673 if (off_out) { 1674 if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) 1675 return -EFAULT; 1676 } else { 1677 pos_out = fd_file(f_out)->f_pos; 1678 } 1679 1680 if (flags != 0) 1681 return -EINVAL; 1682 1683 ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len, 1684 flags); 1685 if (ret > 0) { 1686 pos_in += ret; 1687 pos_out += ret; 1688 1689 if (off_in) { 1690 if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) 1691 ret = -EFAULT; 1692 } else { 1693 fd_file(f_in)->f_pos = pos_in; 1694 } 1695 1696 if (off_out) { 1697 if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) 1698 ret = -EFAULT; 1699 } else { 1700 fd_file(f_out)->f_pos = pos_out; 1701 } 1702 } 1703 return ret; 1704 } 1705 1706 /* 1707 * Don't operate on ranges the page cache doesn't support, and don't exceed the 1708 * LFS limits. If pos is under the limit it becomes a short access. If it 1709 * exceeds the limit we return -EFBIG. 
1710 */ 1711 int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) 1712 { 1713 struct inode *inode = file->f_mapping->host; 1714 loff_t max_size = inode->i_sb->s_maxbytes; 1715 loff_t limit = rlimit(RLIMIT_FSIZE); 1716 1717 if (limit != RLIM_INFINITY) { 1718 if (pos >= limit) { 1719 send_sig(SIGXFSZ, current, 0); 1720 return -EFBIG; 1721 } 1722 *count = min(*count, limit - pos); 1723 } 1724 1725 if (!(file->f_flags & O_LARGEFILE)) 1726 max_size = MAX_NON_LFS; 1727 1728 if (unlikely(pos >= max_size)) 1729 return -EFBIG; 1730 1731 *count = min(*count, max_size - pos); 1732 1733 return 0; 1734 } 1735 EXPORT_SYMBOL_GPL(generic_write_check_limits); 1736 1737 /* Like generic_write_checks(), but takes size of write instead of iter. */ 1738 int generic_write_checks_count(struct kiocb *iocb, loff_t *count) 1739 { 1740 struct file *file = iocb->ki_filp; 1741 struct inode *inode = file->f_mapping->host; 1742 1743 if (IS_SWAPFILE(inode)) 1744 return -ETXTBSY; 1745 1746 if (!*count) 1747 return 0; 1748 1749 if (iocb->ki_flags & IOCB_APPEND) 1750 iocb->ki_pos = i_size_read(inode); 1751 1752 if ((iocb->ki_flags & IOCB_NOWAIT) && 1753 !((iocb->ki_flags & IOCB_DIRECT) || 1754 (file->f_op->fop_flags & FOP_BUFFER_WASYNC))) 1755 return -EINVAL; 1756 1757 return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); 1758 } 1759 EXPORT_SYMBOL(generic_write_checks_count); 1760 1761 /* 1762 * Performs necessary checks before doing a write 1763 * 1764 * Can adjust writing position or amount of bytes to write. 1765 * Returns appropriate error code that caller should return or 1766 * zero in case that write should be allowed. 
1767 */ 1768 ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) 1769 { 1770 loff_t count = iov_iter_count(from); 1771 int ret; 1772 1773 ret = generic_write_checks_count(iocb, &count); 1774 if (ret) 1775 return ret; 1776 1777 iov_iter_truncate(from, count); 1778 return iov_iter_count(from); 1779 } 1780 EXPORT_SYMBOL(generic_write_checks); 1781 1782 /* 1783 * Performs common checks before doing a file copy/clone 1784 * from @file_in to @file_out. 1785 */ 1786 int generic_file_rw_checks(struct file *file_in, struct file *file_out) 1787 { 1788 struct inode *inode_in = file_inode(file_in); 1789 struct inode *inode_out = file_inode(file_out); 1790 1791 /* Don't copy dirs, pipes, sockets... */ 1792 if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) 1793 return -EISDIR; 1794 if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) 1795 return -EINVAL; 1796 1797 if (!(file_in->f_mode & FMODE_READ) || 1798 !(file_out->f_mode & FMODE_WRITE) || 1799 (file_out->f_flags & O_APPEND)) 1800 return -EBADF; 1801 1802 return 0; 1803 } 1804 1805 int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) 1806 { 1807 size_t len = iov_iter_count(iter); 1808 1809 if (!iter_is_ubuf(iter)) 1810 return -EINVAL; 1811 1812 if (!is_power_of_2(len)) 1813 return -EINVAL; 1814 1815 if (!IS_ALIGNED(iocb->ki_pos, len)) 1816 return -EINVAL; 1817 1818 if (!(iocb->ki_flags & IOCB_DIRECT)) 1819 return -EOPNOTSUPP; 1820 1821 return 0; 1822 } 1823 EXPORT_SYMBOL_GPL(generic_atomic_write_valid); 1824