// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/read_write.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 */

#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/sched/xacct.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/fs.h>
#include "internal.h"

#include <linux/uaccess.h>
#include <asm/unistd.h>

const struct file_operations generic_ro_fops = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.mmap		= generic_file_readonly_mmap,
	.splice_read	= filemap_splice_read,
};

EXPORT_SYMBOL(generic_ro_fops);

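/*
 * Illustrative sketch (not part of this file): a simple read-only
 * filesystem can either point its inodes at generic_ro_fops directly or
 * build its own file_operations from the same generic helpers.  The
 * "examplefs" names below are made up for the example.
 *
 *	static const struct file_operations examplefs_file_ops = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= generic_file_read_iter,
 *		.mmap		= generic_file_readonly_mmap,
 *		.splice_read	= filemap_splice_read,
 *	};
 *
 *	// or, in examplefs_fill_inode():
 *	//	inode->i_fop = &generic_ro_fops;
 */
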
static inline bool unsigned_offsets(struct file *file)
{
	return file->f_op->fop_flags & FOP_UNSIGNED_OFFSET;
}

/**
 * vfs_setpos_cookie - update the file offset for lseek and reset cookie
 * @file: file structure in question
 * @offset: file offset to seek to
 * @maxsize: maximum file size
 * @cookie: cookie to reset
 *
 * Update the file offset to the value specified by @offset if the given
 * offset is valid and it is not equal to the current file offset and
 * reset the specified cookie to indicate that a seek happened.
 *
 * Return the specified offset on success and -EINVAL on invalid offset.
 */
static loff_t vfs_setpos_cookie(struct file *file, loff_t offset,
				loff_t maxsize, u64 *cookie)
{
	if (offset < 0 && !unsigned_offsets(file))
		return -EINVAL;
	if (offset > maxsize)
		return -EINVAL;

	if (offset != file->f_pos) {
		file->f_pos = offset;
		if (cookie)
			*cookie = 0;
	}
	return offset;
}

/**
 * vfs_setpos - update the file offset for lseek
 * @file: file structure in question
 * @offset: file offset to seek to
 * @maxsize: maximum file size
 *
 * This is a low-level filesystem helper for updating the file offset to
 * the value specified by @offset if the given offset is valid and it is
 * not equal to the current file offset.
 *
 * Return the specified offset on success and -EINVAL on invalid offset.
 */
loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
{
	return vfs_setpos_cookie(file, offset, maxsize, NULL);
}
EXPORT_SYMBOL(vfs_setpos);

/**
 * must_set_pos - check whether f_pos has to be updated
 * @file: file to seek on
 * @offset: offset to use
 * @whence: type of seek operation
 * @eof: end of file
 *
 * Check whether f_pos needs to be updated and update @offset according
 * to @whence.
 *
 * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be
 * updated, and negative error code on failure.
 */
static int must_set_pos(struct file *file, loff_t *offset, int whence, loff_t eof)
{
	switch (whence) {
	case SEEK_END:
		*offset += eof;
		break;
	case SEEK_CUR:
		/*
		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
		 * position-querying operation.  Avoid rewriting the "same"
		 * f_pos value back to the file because a concurrent read(),
		 * write() or lseek() might have altered it
		 */
		if (*offset == 0) {
			*offset = file->f_pos;
			return 0;
		}
		break;
	case SEEK_DATA:
		/*
		 * In the generic case the entire file is data, so as long as
		 * offset isn't at the end of the file then the offset is data.
		 */
		if ((unsigned long long)*offset >= eof)
			return -ENXIO;
		break;
	case SEEK_HOLE:
		/*
		 * There is a virtual hole at the end of the file, so as long as
		 * offset isn't i_size or larger, return i_size.
		 */
		if ((unsigned long long)*offset >= eof)
			return -ENXIO;
		*offset = eof;
		break;
	}

	return 1;
}

/**
 * generic_file_llseek_size - generic llseek implementation for regular files
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 * @maxsize: max size of this file in file system
 * @eof: offset used for SEEK_END position
 *
 * This is a variant of generic_file_llseek that allows passing in a custom
 * maximum file size and a custom EOF position, for e.g. hashed directories
 *
 * Synchronization:
 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
 * read/writes behave like SEEK_SET against seeks.
 */
loff_t
generic_file_llseek_size(struct file *file, loff_t offset, int whence,
		loff_t maxsize, loff_t eof)
{
	int ret;

	ret = must_set_pos(file, &offset, whence, eof);
	if (ret < 0)
		return ret;
	if (ret == 0)
		return offset;

	if (whence == SEEK_CUR) {
		/*
		 * If the file requires locking via f_pos_lock we know
		 * that mutual exclusion for SEEK_CUR on the same file
		 * is guaranteed. If the file isn't locked, we take
		 * f_lock to protect against f_pos races with other
		 * SEEK_CURs.
		 */
		if (file_seek_cur_needs_f_lock(file)) {
			guard(spinlock)(&file->f_lock);
			return vfs_setpos(file, file->f_pos + offset, maxsize);
		}
		return vfs_setpos(file, file->f_pos + offset, maxsize);
	}

	return vfs_setpos(file, offset, maxsize);
}
EXPORT_SYMBOL(generic_file_llseek_size);

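/*
 * Illustrative sketch (not part of this file): a filesystem whose directory
 * "size" is not meaningful for seeking, for example one that hands out hashed
 * directory offsets, can wrap this helper with its own bounds.  The
 * "examplefs" name and the 31-bit limit below are made up for the example.
 *
 *	static loff_t examplefs_dir_llseek(struct file *file, loff_t offset,
 *					   int whence)
 *	{
 *		// 32-bit hash cookies: clamp both maxsize and EOF
 *		return generic_file_llseek_size(file, offset, whence,
 *						0x7fffffff, 0x7fffffff);
 *	}
 */
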
/**
 * generic_llseek_cookie - versioned llseek implementation
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 * @cookie: cookie to update
 *
 * See generic_file_llseek for a general description and locking assumptions.
 *
 * In contrast to generic_file_llseek, this function also resets a
 * specified cookie to indicate a seek took place.
 */
loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence,
			     u64 *cookie)
{
	struct inode *inode = file->f_mapping->host;
	loff_t maxsize = inode->i_sb->s_maxbytes;
	loff_t eof = i_size_read(inode);
	int ret;

	if (WARN_ON_ONCE(!cookie))
		return -EINVAL;

	/*
	 * Require that this is only used for directories that guarantee
	 * synchronization between readdir and seek so that an update to
	 * @cookie is correctly synchronized with concurrent readdir.
	 */
	if (WARN_ON_ONCE(!(file->f_mode & FMODE_ATOMIC_POS)))
		return -EINVAL;

	ret = must_set_pos(file, &offset, whence, eof);
	if (ret < 0)
		return ret;
	if (ret == 0)
		return offset;

	/* No need to hold f_lock because we know that f_pos_lock is held. */
	if (whence == SEEK_CUR)
		return vfs_setpos_cookie(file, file->f_pos + offset, maxsize, cookie);

	return vfs_setpos_cookie(file, offset, maxsize, cookie);
}
EXPORT_SYMBOL(generic_llseek_cookie);

/**
 * generic_file_llseek - generic llseek implementation for regular files
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 *
 * This is a generic implementation of ->llseek usable for all normal local
 * filesystems.  It just updates the file offset to the value specified by
 * @offset and @whence.
 */
loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;

	return generic_file_llseek_size(file, offset, whence,
					inode->i_sb->s_maxbytes,
					i_size_read(inode));
}
EXPORT_SYMBOL(generic_file_llseek);

/**
 * fixed_size_llseek - llseek implementation for fixed-sized devices
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 * @size: size of the file
 *
 */
loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
{
	switch (whence) {
	case SEEK_SET: case SEEK_CUR: case SEEK_END:
		return generic_file_llseek_size(file, offset, whence,
						size, size);
	default:
		return -EINVAL;
	}
}
EXPORT_SYMBOL(fixed_size_llseek);

/**
 * no_seek_end_llseek - llseek implementation for fixed-sized devices
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 *
 */
loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
{
	switch (whence) {
	case SEEK_SET: case SEEK_CUR:
		return generic_file_llseek_size(file, offset, whence,
						OFFSET_MAX, 0);
	default:
		return -EINVAL;
	}
}
EXPORT_SYMBOL(no_seek_end_llseek);

/**
 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 * @size: maximal offset allowed
 *
 */
loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
{
	switch (whence) {
	case SEEK_SET: case SEEK_CUR:
		return generic_file_llseek_size(file, offset, whence,
						size, 0);
	default:
		return -EINVAL;
	}
}
EXPORT_SYMBOL(no_seek_end_llseek_size);

/**
 * noop_llseek - No Operation Performed llseek implementation
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 *
 * This is an implementation of ->llseek usable for the rare special case when
 * userspace expects the seek to succeed but the (device) file is actually not
 * able to perform the seek. In this case you use noop_llseek() instead of
 * falling back to the default implementation of ->llseek.
 */
loff_t noop_llseek(struct file *file, loff_t offset, int whence)
{
	return file->f_pos;
}
EXPORT_SYMBOL(noop_llseek);

loff_t default_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);
	loff_t retval;

	retval = inode_lock_killable(inode);
	if (retval)
		return retval;
	switch (whence) {
	case SEEK_END:
		offset += i_size_read(inode);
		break;
	case SEEK_CUR:
		if (offset == 0) {
			retval = file->f_pos;
			goto out;
		}
		offset += file->f_pos;
		break;
	case SEEK_DATA:
		/*
		 * In the generic case the entire file is data, so as
		 * long as offset isn't at the end of the file then the
		 * offset is data.
		 */
		if (offset >= inode->i_size) {
			retval = -ENXIO;
			goto out;
		}
		break;
	case SEEK_HOLE:
		/*
		 * There is a virtual hole at the end of the file, so
		 * as long as offset isn't i_size or larger, return
		 * i_size.
		 */
		if (offset >= inode->i_size) {
			retval = -ENXIO;
			goto out;
		}
		offset = inode->i_size;
		break;
	}
	retval = -EINVAL;
	if (offset >= 0 || unsigned_offsets(file)) {
		if (offset != file->f_pos)
			file->f_pos = offset;
		retval = offset;
	}
out:
	inode_unlock(inode);
	return retval;
}
EXPORT_SYMBOL(default_llseek);

loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
{
	if (!(file->f_mode & FMODE_LSEEK))
		return -ESPIPE;
	return file->f_op->llseek(file, offset, whence);
}
EXPORT_SYMBOL(vfs_llseek);

static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
{
	off_t retval;
	CLASS(fd_pos, f)(fd);
	if (fd_empty(f))
		return -EBADF;

	retval = -EINVAL;
	if (whence <= SEEK_MAX) {
		loff_t res = vfs_llseek(fd_file(f), offset, whence);
		retval = res;
		if (res != (loff_t)retval)
			retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
	}
	return retval;
}

SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
	return ksys_lseek(fd, offset, whence);
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
	return ksys_lseek(fd, offset, whence);
}
#endif

#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
	defined(__ARCH_WANT_SYS_LLSEEK)
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		unsigned long, offset_low, loff_t __user *, result,
		unsigned int, whence)
{
	int retval;
	CLASS(fd_pos, f)(fd);
	loff_t offset;

	if (fd_empty(f))
		return -EBADF;

	if (whence > SEEK_MAX)
		return -EINVAL;

	offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low,
			whence);

	retval = (int)offset;
	if (offset >= 0) {
		retval = -EFAULT;
		if (!copy_to_user(result, &offset, sizeof(offset)))
			retval = 0;
	}
	return retval;
}
#endif

int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
	int mask = read_write == READ ? MAY_READ : MAY_WRITE;
	int ret;

	if (unlikely((ssize_t) count < 0))
		return -EINVAL;

	if (ppos) {
		loff_t pos = *ppos;

		if (unlikely(pos < 0)) {
			if (!unsigned_offsets(file))
				return -EINVAL;
			if (count >= -pos) /* both values are in 0..LLONG_MAX */
				return -EOVERFLOW;
		} else if (unlikely((loff_t) (pos + count) < 0)) {
			if (!unsigned_offsets(file))
				return -EINVAL;
		}
	}

	ret = security_file_permission(file, mask);
	if (ret)
		return ret;

	return fsnotify_file_area_perm(file, mask, ppos, count);
}
EXPORT_SYMBOL(rw_verify_area);

static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = (ppos ? *ppos : 0);
	iov_iter_ubuf(&iter, ITER_DEST, buf, len);

	ret = filp->f_op->read_iter(&kiocb, &iter);
	BUG_ON(ret == -EIOCBQUEUED);
	if (ppos)
		*ppos = kiocb.ki_pos;
	return ret;
}

static int warn_unsupported(struct file *file, const char *op)
{
	pr_warn_ratelimited(
		"kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
		op, file, current->pid, current->comm);
	return -EINVAL;
}

ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
	struct kvec iov = {
		.iov_base	= buf,
		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
	};
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
		return -EINVAL;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;
	/*
	 * Also fail if ->read_iter and ->read are both wired up as that
	 * implies very convoluted semantics.
	 */
	if (unlikely(!file->f_op->read_iter || file->f_op->read))
		return warn_unsupported(file, "read");

	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = pos ? *pos : 0;
	iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len);
	ret = file->f_op->read_iter(&kiocb, &iter);
	if (ret > 0) {
		if (pos)
			*pos = kiocb.ki_pos;
		fsnotify_access(file);
		add_rchar(current, ret);
	}
	inc_syscr(current);
	return ret;
}

ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	ret = rw_verify_area(READ, file, pos, count);
	if (ret)
		return ret;
	return __kernel_read(file, buf, count, pos);
}
EXPORT_SYMBOL(kernel_read);

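/*
 * Illustrative sketch (not part of this file): a typical in-kernel caller
 * opens a file with filp_open() and pulls data into a kernel buffer with
 * kernel_read(), tracking the position explicitly.  The path and names
 * below are made up for the example and error handling is abbreviated.
 *
 *	struct file *filp = filp_open("/etc/example.conf", O_RDONLY, 0);
 *	char buf[128];
 *	loff_t pos = 0;
 *	ssize_t n;
 *
 *	if (!IS_ERR(filp)) {
 *		n = kernel_read(filp, buf, sizeof(buf), &pos);
 *		// on success, n bytes are in buf and pos has advanced by n
 *		filp_close(filp, NULL);
 *	}
 */
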
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;
	if (unlikely(!access_ok(buf, count)))
		return -EFAULT;

	ret = rw_verify_area(READ, file, pos, count);
	if (ret)
		return ret;
	if (count > MAX_RW_COUNT)
		count = MAX_RW_COUNT;

	if (file->f_op->read)
		ret = file->f_op->read(file, buf, count, pos);
	else if (file->f_op->read_iter)
		ret = new_sync_read(file, buf, count, pos);
	else
		ret = -EINVAL;
	if (ret > 0) {
		fsnotify_access(file);
		add_rchar(current, ret);
	}
	inc_syscr(current);
	return ret;
}

static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = (ppos ? *ppos : 0);
	iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len);

	ret = filp->f_op->write_iter(&kiocb, &iter);
	BUG_ON(ret == -EIOCBQUEUED);
	if (ret > 0 && ppos)
		*ppos = kiocb.ki_pos;
	return ret;
}

/* caller is responsible for file_start_write/file_end_write */
ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos)
{
	struct kiocb kiocb;
	ssize_t ret;

	if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	/*
	 * Also fail if ->write_iter and ->write are both wired up as that
	 * implies very convoluted semantics.
	 */
	if (unlikely(!file->f_op->write_iter || file->f_op->write))
		return warn_unsupported(file, "write");

	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = pos ? *pos : 0;
	ret = file->f_op->write_iter(&kiocb, from);
	if (ret > 0) {
		if (pos)
			*pos = kiocb.ki_pos;
		fsnotify_modify(file);
		add_wchar(current, ret);
	}
	inc_syscw(current);
	return ret;
}

/* caller is responsible for file_start_write/file_end_write */
ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
{
	struct kvec iov = {
		.iov_base	= (void *)buf,
		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
	};
	struct iov_iter iter;
	iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len);
	return __kernel_write_iter(file, &iter, pos);
}
/*
 * This "EXPORT_SYMBOL_GPL()" is more of an "EXPORT_SYMBOL_DONTUSE()",
 * but autofs is one of the few internal kernel users that actually
 * wants this _and_ can be built as a module. So we need to export
 * this symbol for autofs, even though it really isn't appropriate
 * for any other kernel modules.
 */
EXPORT_SYMBOL_GPL(__kernel_write);

ssize_t kernel_write(struct file *file, const void *buf, size_t count,
		     loff_t *pos)
{
	ssize_t ret;

	ret = rw_verify_area(WRITE, file, pos, count);
	if (ret)
		return ret;

	file_start_write(file);
	ret = __kernel_write(file, buf, count, pos);
	file_end_write(file);
	return ret;
}
EXPORT_SYMBOL(kernel_write);

ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	if (unlikely(!access_ok(buf, count)))
		return -EFAULT;

	ret = rw_verify_area(WRITE, file, pos, count);
	if (ret)
		return ret;
	if (count > MAX_RW_COUNT)
		count = MAX_RW_COUNT;
	file_start_write(file);
	if (file->f_op->write)
		ret = file->f_op->write(file, buf, count, pos);
	else if (file->f_op->write_iter)
		ret = new_sync_write(file, buf, count, pos);
	else
		ret = -EINVAL;
	if (ret > 0) {
		fsnotify_modify(file);
		add_wchar(current, ret);
	}
	inc_syscw(current);
	file_end_write(file);
	return ret;
}

/* file_ppos returns &file->f_pos or NULL if file is stream */
static inline loff_t *file_ppos(struct file *file)
{
	return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
}

ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
{
	CLASS(fd_pos, f)(fd);
	ssize_t ret = -EBADF;

	if (!fd_empty(f)) {
		loff_t pos, *ppos = file_ppos(fd_file(f));
		if (ppos) {
			pos = *ppos;
			ppos = &pos;
		}
		ret = vfs_read(fd_file(f), buf, count, ppos);
		if (ret >= 0 && ppos)
			fd_file(f)->f_pos = pos;
	}
	return ret;
}

SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	return ksys_read(fd, buf, count);
}

ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
	CLASS(fd_pos, f)(fd);
	ssize_t ret = -EBADF;

	if (!fd_empty(f)) {
		loff_t pos, *ppos = file_ppos(fd_file(f));
		if (ppos) {
			pos = *ppos;
			ppos = &pos;
		}
		ret = vfs_write(fd_file(f), buf, count, ppos);
		if (ret >= 0 && ppos)
			fd_file(f)->f_pos = pos;
	}

	return ret;
}

SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	return ksys_write(fd, buf, count);
}

ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
		     loff_t pos)
{
	if (pos < 0)
		return -EINVAL;

	CLASS(fd, f)(fd);
	if (fd_empty(f))
		return -EBADF;

	if (fd_file(f)->f_mode & FMODE_PREAD)
		return vfs_read(fd_file(f), buf, count, &pos);

	return -ESPIPE;
}

SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
		size_t, count, loff_t, pos)
{
	return ksys_pread64(fd, buf, count, pos);
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64)
COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf,
		       size_t, count, compat_arg_u64_dual(pos))
{
	return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos));
}
#endif

ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
		      size_t count, loff_t pos)
{
	if (pos < 0)
		return -EINVAL;

	CLASS(fd, f)(fd);
	if (fd_empty(f))
		return -EBADF;

	if (fd_file(f)->f_mode & FMODE_PWRITE)
		return vfs_write(fd_file(f), buf, count, &pos);

	return -ESPIPE;
}

SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
		size_t, count, loff_t, pos)
{
	return ksys_pwrite64(fd, buf, count, pos);
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64)
COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf,
		       size_t, count, compat_arg_u64_dual(pos))
{
	return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos));
}
#endif

static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
				    loff_t *ppos, int type, rwf_t flags)
{
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	ret = kiocb_set_rw_flags(&kiocb, flags, type);
	if (ret)
		return ret;
	kiocb.ki_pos = (ppos ? *ppos : 0);

	if (type == READ)
		ret = filp->f_op->read_iter(&kiocb, iter);
	else
		ret = filp->f_op->write_iter(&kiocb, iter);
	BUG_ON(ret == -EIOCBQUEUED);
	if (ppos)
		*ppos = kiocb.ki_pos;
	return ret;
}

/* Do it by hand, with file-ops */
static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
				    loff_t *ppos, int type, rwf_t flags)
{
	ssize_t ret = 0;

	if (flags & ~RWF_HIPRI)
		return -EOPNOTSUPP;

	while (iov_iter_count(iter)) {
		ssize_t nr;

		if (type == READ) {
			nr = filp->f_op->read(filp, iter_iov_addr(iter),
					      iter_iov_len(iter), ppos);
		} else {
			nr = filp->f_op->write(filp, iter_iov_addr(iter),
					       iter_iov_len(iter), ppos);
		}

		if (nr < 0) {
			if (!ret)
				ret = nr;
			break;
		}
		ret += nr;
		if (nr != iter_iov_len(iter))
			break;
		iov_iter_advance(iter, nr);
	}

	return ret;
}

ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
			   struct iov_iter *iter)
{
	size_t tot_len;
	ssize_t ret = 0;

	if (!file->f_op->read_iter)
		return -EINVAL;
	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;

	tot_len = iov_iter_count(iter);
	if (!tot_len)
		goto out;
	ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
	if (ret < 0)
		return ret;

	ret = file->f_op->read_iter(iocb, iter);
out:
	if (ret >= 0)
		fsnotify_access(file);
	return ret;
}
EXPORT_SYMBOL(vfs_iocb_iter_read);

ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
		      rwf_t flags)
{
	size_t tot_len;
	ssize_t ret = 0;

	if (!file->f_op->read_iter)
		return -EINVAL;
	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;

	tot_len = iov_iter_count(iter);
	if (!tot_len)
		goto out;
	ret = rw_verify_area(READ, file, ppos, tot_len);
	if (ret < 0)
		return ret;

	ret = do_iter_readv_writev(file, iter, ppos, READ, flags);
out:
	if (ret >= 0)
		fsnotify_access(file);
	return ret;
}
EXPORT_SYMBOL(vfs_iter_read);

/*
 * Caller is responsible for calling kiocb_end_write() on completion
 * if async iocb was queued.
 */
ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
			    struct iov_iter *iter)
{
	size_t tot_len;
	ssize_t ret = 0;

	if (!file->f_op->write_iter)
		return -EINVAL;
	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;

	tot_len = iov_iter_count(iter);
	if (!tot_len)
		return 0;
	ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
	if (ret < 0)
		return ret;

	kiocb_start_write(iocb);
	ret = file->f_op->write_iter(iocb, iter);
	if (ret != -EIOCBQUEUED)
		kiocb_end_write(iocb);
	if (ret > 0)
		fsnotify_modify(file);

	return ret;
}
EXPORT_SYMBOL(vfs_iocb_iter_write);

ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
		       rwf_t flags)
{
	size_t tot_len;
	ssize_t ret;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	if (!file->f_op->write_iter)
		return -EINVAL;

	tot_len = iov_iter_count(iter);
	if (!tot_len)
		return 0;

	ret = rw_verify_area(WRITE, file, ppos, tot_len);
	if (ret < 0)
		return ret;

	file_start_write(file);
	ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags);
	if (ret > 0)
		fsnotify_modify(file);
	file_end_write(file);

	return ret;
}
EXPORT_SYMBOL(vfs_iter_write);

static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
			 unsigned long vlen, loff_t *pos, rwf_t flags)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	size_t tot_len;
	ssize_t ret = 0;

	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;

	ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov,
			   &iter);
	if (ret < 0)
		return ret;

	tot_len = iov_iter_count(&iter);
	if (!tot_len)
		goto out;

	ret = rw_verify_area(READ, file, pos, tot_len);
	if (ret < 0)
		goto out;

	if (file->f_op->read_iter)
		ret = do_iter_readv_writev(file, &iter, pos, READ, flags);
	else
		ret = do_loop_readv_writev(file, &iter, pos, READ, flags);
out:
	if (ret >= 0)
		fsnotify_access(file);
	kfree(iov);
	return ret;
}

static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
			  unsigned long vlen, loff_t *pos, rwf_t flags)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	size_t tot_len;
	ssize_t ret = 0;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;

	ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov,
			   &iter);
	if (ret < 0)
		return ret;

	tot_len = iov_iter_count(&iter);
	if (!tot_len)
		goto out;

	ret = rw_verify_area(WRITE, file, pos, tot_len);
	if (ret < 0)
		goto out;

	file_start_write(file);
	if (file->f_op->write_iter)
		ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags);
	else
		ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags);
	if (ret > 0)
		fsnotify_modify(file);
	file_end_write(file);
out:
	kfree(iov);
	return ret;
}

static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
			unsigned long vlen, rwf_t flags)
{
	CLASS(fd_pos, f)(fd);
	ssize_t ret = -EBADF;

	if (!fd_empty(f)) {
		loff_t pos, *ppos = file_ppos(fd_file(f));
		if (ppos) {
			pos = *ppos;
			ppos = &pos;
		}
		ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags);
		if (ret >= 0 && ppos)
			fd_file(f)->f_pos = pos;
	}

	if (ret > 0)
		add_rchar(current, ret);
	inc_syscr(current);
	return ret;
}

static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
			 unsigned long vlen, rwf_t flags)
{
	CLASS(fd_pos, f)(fd);
	ssize_t ret = -EBADF;

	if (!fd_empty(f)) {
		loff_t pos, *ppos = file_ppos(fd_file(f));
		if (ppos) {
			pos = *ppos;
			ppos = &pos;
		}
		ret = vfs_writev(fd_file(f), vec, vlen, ppos, flags);
		if (ret >= 0 && ppos)
			fd_file(f)->f_pos = pos;
	}

	if (ret > 0)
		add_wchar(current, ret);
	inc_syscw(current);
	return ret;
}

static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
{
#define HALF_LONG_BITS (BITS_PER_LONG / 2)
	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
}

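/*
 * Illustrative note (not part of the original file): preadv()/pwritev()
 * receive the 64-bit file position as two unsigned longs.  On a 32-bit
 * kernel HALF_LONG_BITS is 16, so for example
 * pos_from_hilo(0x1, 0x80000000) = ((loff_t)0x1 << 16 << 16) | 0x80000000
 * = 0x180000000.  On a 64-bit kernel the low word already carries the
 * whole offset, and the double shift (rather than a single shift by
 * BITS_PER_LONG) keeps the expression well-defined, since shifting by the
 * full width of the type would be undefined behaviour.
 */
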
static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
			 unsigned long vlen, loff_t pos, rwf_t flags)
{
	ssize_t ret = -EBADF;

	if (pos < 0)
		return -EINVAL;

	CLASS(fd, f)(fd);
	if (!fd_empty(f)) {
		ret = -ESPIPE;
		if (fd_file(f)->f_mode & FMODE_PREAD)
			ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags);
	}

	if (ret > 0)
		add_rchar(current, ret);
	inc_syscr(current);
	return ret;
}

static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
			  unsigned long vlen, loff_t pos, rwf_t flags)
{
	ssize_t ret = -EBADF;

	if (pos < 0)
		return -EINVAL;

	CLASS(fd, f)(fd);
	if (!fd_empty(f)) {
		ret = -ESPIPE;
		if (fd_file(f)->f_mode & FMODE_PWRITE)
			ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags);
	}

	if (ret > 0)
		add_wchar(current, ret);
	inc_syscw(current);
	return ret;
}

SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	return do_readv(fd, vec, vlen, 0);
}

SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	return do_writev(fd, vec, vlen, 0);
}

SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);

	return do_preadv(fd, vec, vlen, pos, 0);
}

SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
		rwf_t, flags)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);

	if (pos == -1)
		return do_readv(fd, vec, vlen, flags);

	return do_preadv(fd, vec, vlen, pos, flags);
}

SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);

	return do_pwritev(fd, vec, vlen, pos, 0);
}

SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
		rwf_t, flags)
{
	loff_t pos = pos_from_hilo(pos_h, pos_l);

	if (pos == -1)
		return do_writev(fd, vec, vlen, flags);

	return do_pwritev(fd, vec, vlen, pos, flags);
}

/*
 * Various compat syscalls. Note that they all pretend to take a native
 * iovec - import_iovec will properly treat those as compat_iovecs based on
 * in_compat_syscall().
 */
#ifdef CONFIG_COMPAT
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos)
{
	return do_preadv(fd, vec, vlen, pos, 0);
}
#endif

COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
		const struct iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
	loff_t pos = ((loff_t)pos_high << 32) | pos_low;

	return do_preadv(fd, vec, vlen, pos, 0);
}

#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	if (pos == -1)
		return do_readv(fd, vec, vlen, flags);
	return do_preadv(fd, vec, vlen, pos, flags);
}
#endif

COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
		const struct iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
		rwf_t, flags)
{
	loff_t pos = ((loff_t)pos_high << 32) | pos_low;

	if (pos == -1)
		return do_readv(fd, vec, vlen, flags);
	return do_preadv(fd, vec, vlen, pos, flags);
}

#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos)
{
	return do_pwritev(fd, vec, vlen, pos, 0);
}
#endif

COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
		const struct iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
	loff_t pos = ((loff_t)pos_high << 32) | pos_low;

	return do_pwritev(fd, vec, vlen, pos, 0);
}

#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	if (pos == -1)
		return do_writev(fd, vec, vlen, flags);
	return do_pwritev(fd, vec, vlen, pos, flags);
}
#endif

COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
		const struct iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
{
	loff_t pos = ((loff_t)pos_high << 32) | pos_low;

	if (pos == -1)
		return do_writev(fd, vec, vlen, flags);
	return do_pwritev(fd, vec, vlen, pos, flags);
}
#endif /* CONFIG_COMPAT */

static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
			   size_t count, loff_t max)
{
	struct inode *in_inode, *out_inode;
	struct pipe_inode_info *opipe;
	loff_t pos;
	loff_t out_pos;
	ssize_t retval;
	int fl;

	/*
	 * Get input file, and verify that it is ok..
	 */
	CLASS(fd, in)(in_fd);
	if (fd_empty(in))
		return -EBADF;
	if (!(fd_file(in)->f_mode & FMODE_READ))
		return -EBADF;
	if (!ppos) {
		pos = fd_file(in)->f_pos;
	} else {
		pos = *ppos;
		if (!(fd_file(in)->f_mode & FMODE_PREAD))
			return -ESPIPE;
	}
	retval = rw_verify_area(READ, fd_file(in), &pos, count);
	if (retval < 0)
		return retval;
	if (count > MAX_RW_COUNT)
		count = MAX_RW_COUNT;

	/*
	 * Get output file, and verify that it is ok..
	 */
	CLASS(fd, out)(out_fd);
	if (fd_empty(out))
		return -EBADF;
	if (!(fd_file(out)->f_mode & FMODE_WRITE))
		return -EBADF;
	in_inode = file_inode(fd_file(in));
	out_inode = file_inode(fd_file(out));
	out_pos = fd_file(out)->f_pos;

	if (!max)
		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);

	if (unlikely(pos + count > max)) {
		if (pos >= max)
			return -EOVERFLOW;
		count = max - pos;
	}

	fl = 0;
#if 0
	/*
	 * We need to debate whether we can enable this or not. The
	 * man page documents EAGAIN return for the output at least,
	 * and the application is arguably buggy if it doesn't expect
	 * EAGAIN on a non-blocking file descriptor.
	 */
	if (fd_file(in)->f_flags & O_NONBLOCK)
		fl = SPLICE_F_NONBLOCK;
#endif
	opipe = get_pipe_info(fd_file(out), true);
	if (!opipe) {
		retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count);
		if (retval < 0)
			return retval;
		retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos,
					  count, fl);
	} else {
		if (fd_file(out)->f_flags & O_NONBLOCK)
			fl |= SPLICE_F_NONBLOCK;

		retval = splice_file_to_pipe(fd_file(in), opipe, &pos, count, fl);
	}

	if (retval > 0) {
		add_rchar(current, retval);
		add_wchar(current, retval);
		fsnotify_access(fd_file(in));
		fsnotify_modify(fd_file(out));
		fd_file(out)->f_pos = out_pos;
		if (ppos)
			*ppos = pos;
		else
			fd_file(in)->f_pos = pos;
	}

	inc_syscr(current);
	inc_syscw(current);
	if (pos > max)
		retval = -EOVERFLOW;
	return retval;
}

SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
{
	loff_t pos;
	off_t off;
	ssize_t ret;

	if (offset) {
		if (unlikely(get_user(off, offset)))
			return -EFAULT;
		pos = off;
		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
		if (unlikely(put_user(pos, offset)))
			return -EFAULT;
		return ret;
	}

	return do_sendfile(out_fd, in_fd, NULL, count, 0);
}

SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
{
	loff_t pos;
	ssize_t ret;

	if (offset) {
		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
			return -EFAULT;
		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
		if (unlikely(put_user(pos, offset)))
			return -EFAULT;
		return ret;
	}

	return do_sendfile(out_fd, in_fd, NULL, count, 0);
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
		compat_off_t __user *, offset, compat_size_t, count)
{
	loff_t pos;
	off_t off;
	ssize_t ret;

	if (offset) {
		if (unlikely(get_user(off, offset)))
			return -EFAULT;
		pos = off;
		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
		if (unlikely(put_user(pos, offset)))
			return -EFAULT;
		return ret;
	}

	return do_sendfile(out_fd, in_fd, NULL, count, 0);
}

COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
		compat_loff_t __user *, offset, compat_size_t, count)
{
	loff_t pos;
	ssize_t ret;

	if (offset) {
		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
			return -EFAULT;
		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
		if (unlikely(put_user(pos, offset)))
			return -EFAULT;
		return ret;
	}

	return do_sendfile(out_fd, in_fd, NULL, count, 0);
}
#endif

/*
 * Performs necessary checks before doing a file copy
 *
 * Can adjust amount of bytes to copy via @req_count argument.
 * Returns appropriate error code that caller should return or
 * zero in case the copy should be allowed.
 */
static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
				    struct file *file_out, loff_t pos_out,
				    size_t *req_count, unsigned int flags)
{
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);
	uint64_t count = *req_count;
	loff_t size_in;
	int ret;

	ret = generic_file_rw_checks(file_in, file_out);
	if (ret)
		return ret;

	/*
	 * We allow some filesystems to handle cross sb copy, but passing
	 * a file of the wrong filesystem type to filesystem driver can result
	 * in an attempt to dereference the wrong type of ->private_data, so
	 * avoid doing that until we really have a good reason.
	 *
	 * nfs and cifs define several different file_system_type structures
	 * and several different sets of file_operations, but they all end up
	 * using the same ->copy_file_range() function pointer.
	 */
	if (flags & COPY_FILE_SPLICE) {
		/* cross sb splice is allowed */
	} else if (file_out->f_op->copy_file_range) {
		if (file_in->f_op->copy_file_range !=
		    file_out->f_op->copy_file_range)
			return -EXDEV;
	} else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) {
		return -EXDEV;
	}

	/* Don't touch certain kinds of inodes */
	if (IS_IMMUTABLE(inode_out))
		return -EPERM;

	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
		return -ETXTBSY;

	/* Ensure offsets don't wrap. */
	if (pos_in + count < pos_in || pos_out + count < pos_out)
		return -EOVERFLOW;

	/* Shorten the copy to EOF */
	size_in = i_size_read(inode_in);
	if (pos_in >= size_in)
		count = 0;
	else
		count = min(count, size_in - (uint64_t)pos_in);

	ret = generic_write_check_limits(file_out, pos_out, &count);
	if (ret)
		return ret;

	/* Don't allow overlapped copying within the same file. */
	if (inode_in == inode_out &&
	    pos_out + count > pos_in &&
	    pos_out < pos_in + count)
		return -EINVAL;

	*req_count = count;
	return 0;
}

/*
 * copy_file_range() differs from regular file read and write in that it
 * specifically allows returning partial success. When it does so is up to
 * the copy_file_range method.
 */
ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
			    struct file *file_out, loff_t pos_out,
			    size_t len, unsigned int flags)
{
	ssize_t ret;
	bool splice = flags & COPY_FILE_SPLICE;
	bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb;

	if (flags & ~COPY_FILE_SPLICE)
		return -EINVAL;

	ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
				       flags);
	if (unlikely(ret))
		return ret;

	ret = rw_verify_area(READ, file_in, &pos_in, len);
	if (unlikely(ret))
		return ret;

	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
	if (unlikely(ret))
		return ret;

	if (len == 0)
		return 0;

	file_start_write(file_out);

	/*
	 * Cloning is supported by more file systems, so we implement copy on
	 * same sb using clone, but for filesystems where both clone and copy
	 * are supported (e.g. nfs, cifs), we only call the copy method.
	 */
	if (!splice && file_out->f_op->copy_file_range) {
		ret = file_out->f_op->copy_file_range(file_in, pos_in,
						      file_out, pos_out,
						      len, flags);
	} else if (!splice && file_in->f_op->remap_file_range && samesb) {
		ret = file_in->f_op->remap_file_range(file_in, pos_in,
				file_out, pos_out,
				min_t(loff_t, MAX_RW_COUNT, len),
				REMAP_FILE_CAN_SHORTEN);
		/* fallback to splice */
		if (ret <= 0)
			splice = true;
	} else if (samesb) {
		/* Fallback to splice for same sb copy for backward compat */
		splice = true;
	}

	file_end_write(file_out);

	if (!splice)
		goto done;

	/*
	 * We can get here for same sb copy of filesystems that do not implement
	 * ->copy_file_range() in case filesystem does not support clone or in
	 * case filesystem supports clone but rejected the clone request (e.g.
	 * because it was not block aligned).
	 *
	 * In both cases, fall back to kernel copy so we are able to maintain a
	 * consistent story about which filesystems support copy_file_range()
	 * and which filesystems do not, that will allow userspace tools to
	 * make consistent decisions w.r.t using copy_file_range().
	 *
	 * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE
	 * for server-side-copy between any two sb.
	 *
	 * In any case, we call do_splice_direct() and not splice_file_range(),
	 * without file_start_write() held, to avoid possible deadlocks related
	 * to splicing from input file, while file_start_write() is held on
	 * the output file on a different sb.
	 */
	ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
			       min_t(size_t, len, MAX_RW_COUNT), 0);
done:
	if (ret > 0) {
		fsnotify_access(file_in);
		add_rchar(current, ret);
		fsnotify_modify(file_out);
		add_wchar(current, ret);
	}

	inc_syscr(current);
	inc_syscw(current);

	return ret;
}
EXPORT_SYMBOL(vfs_copy_file_range);

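/*
 * Illustrative sketch (not part of this file): an in-kernel caller such as
 * a network file server can drive a server-side copy through this helper.
 * The variable names below are made up for the example; a real caller must
 * also hold proper references on both files and do its own permission work.
 *
 *	loff_t src_pos = 0, dst_pos = 0;
 *	ssize_t copied;
 *
 *	copied = vfs_copy_file_range(src_file, src_pos, dst_file, dst_pos,
 *				     count, COPY_FILE_SPLICE);
 *	// copied < count is not an error: the copy may be partial and the
 *	// caller is expected to loop or report the short copy.
 */
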
SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
		int, fd_out, loff_t __user *, off_out,
		size_t, len, unsigned int, flags)
{
	loff_t pos_in;
	loff_t pos_out;
	ssize_t ret = -EBADF;

	CLASS(fd, f_in)(fd_in);
	if (fd_empty(f_in))
		return -EBADF;

	CLASS(fd, f_out)(fd_out);
	if (fd_empty(f_out))
		return -EBADF;

	if (off_in) {
		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
			return -EFAULT;
	} else {
		pos_in = fd_file(f_in)->f_pos;
	}

	if (off_out) {
		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
			return -EFAULT;
	} else {
		pos_out = fd_file(f_out)->f_pos;
	}

	if (flags != 0)
		return -EINVAL;

	ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len,
				  flags);
	if (ret > 0) {
		pos_in += ret;
		pos_out += ret;

		if (off_in) {
			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
				ret = -EFAULT;
		} else {
			fd_file(f_in)->f_pos = pos_in;
		}

		if (off_out) {
			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
				ret = -EFAULT;
		} else {
			fd_file(f_out)->f_pos = pos_out;
		}
	}
	return ret;
}

/*
 * Don't operate on ranges the page cache doesn't support, and don't exceed the
 * LFS limits. If pos is under the limit it becomes a short access. If it
 * exceeds the limit we return -EFBIG.
 */
int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
{
	struct inode *inode = file->f_mapping->host;
	loff_t max_size = inode->i_sb->s_maxbytes;
	loff_t limit = rlimit(RLIMIT_FSIZE);

	if (limit != RLIM_INFINITY) {
		if (pos >= limit) {
			send_sig(SIGXFSZ, current, 0);
			return -EFBIG;
		}
		*count = min(*count, limit - pos);
	}

	if (!(file->f_flags & O_LARGEFILE))
		max_size = MAX_NON_LFS;

	if (unlikely(pos >= max_size))
		return -EFBIG;

	*count = min(*count, max_size - pos);

	return 0;
}
EXPORT_SYMBOL_GPL(generic_write_check_limits);

/* Like generic_write_checks(), but takes size of write instead of iter. */
int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;

	if (IS_SWAPFILE(inode))
		return -ETXTBSY;

	if (!*count)
		return 0;

	if (iocb->ki_flags & IOCB_APPEND)
		iocb->ki_pos = i_size_read(inode);

	if ((iocb->ki_flags & IOCB_NOWAIT) &&
	    !((iocb->ki_flags & IOCB_DIRECT) ||
	      (file->f_op->fop_flags & FOP_BUFFER_WASYNC)))
		return -EINVAL;

	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
}
EXPORT_SYMBOL(generic_write_checks_count);

/*
 * Performs necessary checks before doing a write
 *
 * Can adjust writing position or amount of bytes to write.
 * Returns appropriate error code that caller should return or
 * zero in case that write should be allowed.
 */
ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
	loff_t count = iov_iter_count(from);
	int ret;

	ret = generic_write_checks_count(iocb, &count);
	if (ret)
		return ret;

	iov_iter_truncate(from, count);
	return iov_iter_count(from);
}
EXPORT_SYMBOL(generic_write_checks);

/*
 * Performs common checks before doing a file copy/clone
 * from @file_in to @file_out.
 */
int generic_file_rw_checks(struct file *file_in, struct file *file_out)
{
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);

	/* Don't copy dirs, pipes, sockets... */
	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
		return -EISDIR;
	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
		return -EINVAL;

	if (!(file_in->f_mode & FMODE_READ) ||
	    !(file_out->f_mode & FMODE_WRITE) ||
	    (file_out->f_flags & O_APPEND))
		return -EBADF;

	return 0;
}

int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter)
{
	size_t len = iov_iter_count(iter);

	if (!iter_is_ubuf(iter))
		return -EINVAL;

	if (!is_power_of_2(len))
		return -EINVAL;

	if (!IS_ALIGNED(iocb->ki_pos, len))
		return -EINVAL;

	if (!(iocb->ki_flags & IOCB_DIRECT))
		return -EOPNOTSUPP;

	return 0;
}
EXPORT_SYMBOL_GPL(generic_atomic_write_valid);
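
/*
 * Illustrative note (not part of the original file): for an atomic write
 * request to pass generic_atomic_write_valid() it must come from a single
 * user buffer, be a power-of-two length, start at a position aligned to
 * that length, and use direct I/O.  For example, with iocb->ki_pos == 8192
 * a 4096-byte O_DIRECT write is accepted, while a 3000-byte write (not a
 * power of two) or a 4096-byte write at pos 2048 (misaligned) is rejected
 * with -EINVAL, and a buffered write of any size gets -EOPNOTSUPP.
 */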