/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 */


#ifdef CONFIG_COMPAT
#include <linux/compat.h>
#endif
#include <linux/fs.h>
#include <linux/migrate.h>
#include <sys/file.h>
#include <sys/dmu_objset.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_project.h>
#if defined(HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS) || \
    defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO)
#include <linux/pagemap.h>
#endif
#include <linux/fadvise.h>
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
#include <linux/writeback.h>
#endif

/*
 * When using fallocate(2) to preallocate space, inflate the requested
 * capacity check by 10% to account for the required metadata blocks.
 */
static unsigned int zfs_fallocate_reserve_percent = 110;

static int
zpl_open(struct inode *ip, struct file *filp)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	error = generic_file_open(ip, filp);
	if (error)
		return (error);

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_release(struct inode *ip, struct file *filp)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	if (ITOZ(ip)->z_atime_dirty)
		zfs_mark_inode_dirty(ip);

	crhold(cr);
	error = -zfs_close(ip, filp->f_flags, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_iterate(struct file *filp, struct dir_context *ctx)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_readdir(file_inode(filp), ctx, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = filp->f_mapping->host;
	znode_t *zp = ITOZ(inode);
	zfsvfs_t *zfsvfs = ITOZSB(inode);
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	/*
	 * The variables z_sync_writes_cnt and z_async_writes_cnt work in
	 * tandem so that sync writes can detect if there are any non-sync
	 * writes going on and vice-versa.
	 * The "vice-versa" part to this logic is located in zfs_putpage()
	 * where non-sync writes check if there are any ongoing sync writes.
	 * If any sync and non-sync writes overlap, we do a commit to complete
	 * the non-sync writes since the latter can potentially take several
	 * seconds to complete and thus block sync writes in the upcoming call
	 * to filemap_write_and_wait_range().
	 */
	atomic_inc_32(&zp->z_sync_writes_cnt);
	/*
	 * If the following check does not detect an overlapping non-sync write
	 * (say because it's just about to start), then it is guaranteed that
	 * the non-sync write will detect this sync write. This is because we
	 * always increment z_sync_writes_cnt / z_async_writes_cnt before doing
	 * the check on z_async_writes_cnt / z_sync_writes_cnt here and in
	 * zfs_putpage() respectively.
	 */
	if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
		if ((error = zpl_enter(zfsvfs, FTAG)) != 0) {
			atomic_dec_32(&zp->z_sync_writes_cnt);
			return (error);
		}
		zil_commit(zfsvfs->z_log, zp->z_id);
		zpl_exit(zfsvfs, FTAG);
	}

	error = filemap_write_and_wait_range(inode->i_mapping, start, end);

	/*
	 * The sync write is not complete yet but we decrement
	 * z_sync_writes_cnt since zfs_fsync() increments and decrements
	 * it internally. If a non-sync write starts just after the decrement
	 * operation but before we call zfs_fsync(), it may not detect this
	 * overlapping sync write but it does not matter since we have already
	 * gone past filemap_write_and_wait_range() and we won't block due to
	 * the non-sync write.
	 */
	atomic_dec_32(&zp->z_sync_writes_cnt);

	if (error)
		return (error);

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_fsync(zp, datasync, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static inline int
zfs_io_flags(struct kiocb *kiocb)
{
	int flags = 0;

#if defined(IOCB_DSYNC)
	if (kiocb->ki_flags & IOCB_DSYNC)
		flags |= O_DSYNC;
#endif
#if defined(IOCB_SYNC)
	if (kiocb->ki_flags & IOCB_SYNC)
		flags |= O_SYNC;
#endif
#if defined(IOCB_APPEND)
	if (kiocb->ki_flags & IOCB_APPEND)
		flags |= O_APPEND;
#endif
#if defined(IOCB_DIRECT)
	if (kiocb->ki_flags & IOCB_DIRECT)
		flags |= O_DIRECT;
#endif
	return (flags);
}
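/*
 * Illustrative note (editor's addition, not from the original source): the
 * IOCB_* flags translated above are set on the kiocb by the VFS before the
 * read/write iterators are called.  For example, a file opened with O_DSYNC,
 * or a write issued via pwritev2(fd, iov, iovcnt, off, RWF_DSYNC), reaches
 * zpl_iter_write() with IOCB_DSYNC set, which zfs_io_flags() maps back to
 * O_DSYNC for the common zfs_write() path.
 */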
/*
 * If relatime is enabled, call file_accessed() if zfs_relatime_need_update()
 * is true.  This is needed since datasets with an inherited "relatime"
 * property aren't necessarily mounted with the MNT_RELATIME flag (e.g. after
 * `zfs set relatime=...`), which is what the relatime test in the VFS,
 * relatime_need_update(), is based on.
 */
static inline void
zpl_file_accessed(struct file *filp)
{
	struct inode *ip = filp->f_mapping->host;

	if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) {
		if (zfs_relatime_need_update(ip))
			file_accessed(filp);
	} else {
		file_accessed(filp);
	}
}

/*
 * When HAVE_VFS_IOV_ITER is defined the iov_iter structure supports
 * iovecs, kvecs, bvecs and pipes, plus all the required interfaces to
 * manipulate the iov_iter are available.  In that case the full iov_iter
 * can be attached to the uio and correctly handled in the lower layers.
 * Otherwise, for older kernels extract the iovec and pass it instead.
 */
static void
zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to,
    loff_t pos, ssize_t count, size_t skip)
{
#if defined(HAVE_VFS_IOV_ITER)
	zfs_uio_iov_iter_init(uio, to, pos, count, skip);
#else
	zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos,
	    zfs_uio_iov_iter_type(to) & ITER_KVEC ?
	    UIO_SYSSPACE : UIO_USERSPACE,
	    count, skip);
#endif
}

static ssize_t
zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
{
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	struct file *filp = kiocb->ki_filp;
	ssize_t count = iov_iter_count(to);
	zfs_uio_t uio;

	zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0);

	crhold(cr);
	cookie = spl_fstrans_mark();

	ssize_t ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
	    filp->f_flags | zfs_io_flags(kiocb), cr);

	spl_fstrans_unmark(cookie);
	crfree(cr);

	if (ret < 0)
		return (ret);

	ssize_t read = count - uio.uio_resid;
	kiocb->ki_pos += read;

	zpl_file_accessed(filp);

	return (read);
}

static inline ssize_t
zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from,
    size_t *countp)
{
	ssize_t ret = generic_write_checks(kiocb, from);
	if (ret <= 0)
		return (ret);

	*countp = ret;

	return (0);
}

static ssize_t
zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
{
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	struct file *filp = kiocb->ki_filp;
	struct inode *ip = filp->f_mapping->host;
	zfs_uio_t uio;
	size_t count = 0;
	ssize_t ret;

	ret = zpl_generic_write_checks(kiocb, from, &count);
	if (ret)
		return (ret);

	zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset);

	crhold(cr);
	cookie = spl_fstrans_mark();

	ret = -zfs_write(ITOZ(ip), &uio,
	    filp->f_flags | zfs_io_flags(kiocb), cr);

	spl_fstrans_unmark(cookie);
	crfree(cr);

	if (ret < 0)
		return (ret);

	ssize_t wrote = count - uio.uio_resid;
	kiocb->ki_pos += wrote;

	return (wrote);
}

static ssize_t
zpl_direct_IO_impl(void)
{
	/*
	 * All O_DIRECT requests should be handled by
	 * zpl_{iter/aio}_{write/read}().  There is no way kernel generic code
	 * should call the direct_IO address_space_operations function.  We set
	 * this code path to be fatal if it is executed.
	 */
	PANIC(0);
	return (0);
}

#if defined(HAVE_VFS_DIRECT_IO_ITER)
static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
{
	return (zpl_direct_IO_impl());
}
#elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET)
static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
{
	return (zpl_direct_IO_impl());
}
#else
#error "Unknown Direct I/O interface"
#endif

static loff_t
zpl_llseek(struct file *filp, loff_t offset, int whence)
{
#if defined(SEEK_HOLE) && defined(SEEK_DATA)
	fstrans_cookie_t cookie;

	if (whence == SEEK_DATA || whence == SEEK_HOLE) {
		struct inode *ip = filp->f_mapping->host;
		loff_t maxbytes = ip->i_sb->s_maxbytes;
		loff_t error;

		spl_inode_lock_shared(ip);
		cookie = spl_fstrans_mark();
		error = -zfs_holey(ITOZ(ip), whence, &offset);
		spl_fstrans_unmark(cookie);
		if (error == 0)
			error = lseek_execute(filp, ip, offset, maxbytes);
		spl_inode_unlock_shared(ip);

		return (error);
	}
#endif /* SEEK_HOLE && SEEK_DATA */

	return (generic_file_llseek(filp, offset, whence));
}

/*
 * It's worth taking a moment to describe how mmap is implemented
 * for zfs because it differs considerably from other Linux filesystems.
 * However, this issue is handled the same way under OpenSolaris.
 *
 * The issue is that by design zfs bypasses the Linux page cache and
 * leaves all caching up to the ARC.  This has been shown to work
 * well for the common read(2)/write(2) case.  However, mmap(2)
 * is a problem because it relies on being tightly integrated with the
 * page cache.  To handle this we cache mmap'ed files twice, once in
 * the ARC and a second time in the page cache.  The code is careful
 * to keep both copies synchronized.
 *
 * When a file with an mmap'ed region is written to using write(2)
 * both the data in the ARC and existing pages in the page cache
 * are updated.  For a read(2) data will be read first from the page
 * cache then from the ARC if needed.  Neither a write(2) nor a read(2)
 * will ever result in new pages being added to the page cache.
 *
 * New pages are added to the page cache only via .readpage() which
 * is called when the vfs needs to read a page off disk to back the
 * virtual memory region.  These pages may be modified without
 * notifying the ARC and will be written out periodically via
 * .writepage().  This will occur due to either a sync or the usual
 * page aging behavior.  Note that because a read(2) of a mmap'ed file
 * will always check the page cache first, correct data will still be
 * returned even when the ARC is out of date.
 *
 * While this implementation ensures correct behavior it does have
 * some drawbacks.  The most obvious of which is that it increases
 * the required memory footprint when accessing mmap'ed files.  It
 * also adds additional complexity to the code keeping both caches
 * synchronized.
 *
 * Longer term it may be possible to cleanly resolve this wart by
 * mapping page cache pages directly on to the ARC buffers.  The
 * Linux address space operations are flexible enough to allow
 * selection of which pages back a particular index.  The trick
 * would be working out the details of which subsystem is in
 * charge, the ARC, the page cache, or both.
 * It may also prove helpful to move the ARC buffers to scatter-gather
 * lists rather than a vmalloc'ed region.
 */
static int
zpl_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct inode *ip = filp->f_mapping->host;
	int error;
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
	    (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
	spl_fstrans_unmark(cookie);

	if (error)
		return (error);

	error = generic_file_mmap(filp, vma);
	if (error)
		return (error);

	return (error);
}

/*
 * Populate a page with data for the Linux page cache.  This function is
 * only used to support mmap(2).  There will be an identical copy of the
 * data in the ARC which is kept up to date via .write() and .writepage().
 */
static inline int
zpl_readpage_common(struct page *pp)
{
	fstrans_cookie_t cookie;

	ASSERT(PageLocked(pp));

	cookie = spl_fstrans_mark();
	int error = -zfs_getpage(pp->mapping->host, pp);
	spl_fstrans_unmark(cookie);

	unlock_page(pp);

	return (error);
}

#ifdef HAVE_VFS_READ_FOLIO
static int
zpl_read_folio(struct file *filp, struct folio *folio)
{
	return (zpl_readpage_common(&folio->page));
}
#else
static int
zpl_readpage(struct file *filp, struct page *pp)
{
	return (zpl_readpage_common(pp));
}
#endif

static int
zpl_readpage_filler(void *data, struct page *pp)
{
	return (zpl_readpage_common(pp));
}

/*
 * Populate a set of pages with data for the Linux page cache.  This
 * function will only be called for read ahead and never for demand
 * paging.  For simplicity, the code relies on read_cache_pages() to
 * correctly lock each page for IO and call zpl_readpage().
 */
#ifdef HAVE_VFS_READPAGES
static int
zpl_readpages(struct file *filp, struct address_space *mapping,
    struct list_head *pages, unsigned nr_pages)
{
	return (read_cache_pages(mapping, pages, zpl_readpage_filler, NULL));
}
#else
static void
zpl_readahead(struct readahead_control *ractl)
{
	struct page *page;

	while ((page = readahead_page(ractl)) != NULL) {
		int ret;

		ret = zpl_readpage_filler(NULL, page);
		put_page(page);
		if (ret)
			break;
	}
}
#endif

static int
zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
{
	boolean_t *for_sync = data;
	fstrans_cookie_t cookie;
	int ret;

	ASSERT(PageLocked(pp));
	ASSERT(!PageWriteback(pp));

	cookie = spl_fstrans_mark();
	ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
	spl_fstrans_unmark(cookie);

	return (ret);
}

#ifdef HAVE_WRITEPAGE_T_FOLIO
static int
zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
{
	return (zpl_putpage(&pp->page, wbc, data));
}
#endif

static inline int
zpl_write_cache_pages(struct address_space *mapping,
    struct writeback_control *wbc, void *data)
{
	int result;

#ifdef HAVE_WRITEPAGE_T_FOLIO
	result = write_cache_pages(mapping, wbc, zpl_putfolio, data);
#else
	result = write_cache_pages(mapping, wbc, zpl_putpage, data);
#endif
	return (result);
}

static int
zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	znode_t *zp = ITOZ(mapping->host);
	zfsvfs_t *zfsvfs = ITOZSB(mapping->host);
	enum writeback_sync_modes sync_mode;
	int result;

	if ((result = zpl_enter(zfsvfs, FTAG)) != 0)
		return (result);
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		wbc->sync_mode = WB_SYNC_ALL;
	zpl_exit(zfsvfs, FTAG);
	sync_mode = wbc->sync_mode;

	/*
	 * We don't want to run write_cache_pages() in SYNC mode here, because
	 * that would make putpage() wait for a single page to be committed to
	 * disk every single time, resulting in atrocious performance.  Instead
	 * we run it once in non-SYNC mode so that the ZIL gets all the data,
	 * and then we commit it all in one go.
	 */
	boolean_t for_sync = (sync_mode == WB_SYNC_ALL);
	wbc->sync_mode = WB_SYNC_NONE;
	result = zpl_write_cache_pages(mapping, wbc, &for_sync);
	if (sync_mode != wbc->sync_mode) {
		if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
			return (result);
		if (zfsvfs->z_log != NULL)
			zil_commit(zfsvfs->z_log, zp->z_id);
		zpl_exit(zfsvfs, FTAG);

		/*
		 * We need to call write_cache_pages() again (we can't just
		 * return after the commit) because the previous call in
		 * non-SYNC mode does not guarantee that we got all the dirty
		 * pages (see the implementation of write_cache_pages() for
		 * details).  That being said, this is a no-op in most cases.
		 */
		wbc->sync_mode = sync_mode;
		result = zpl_write_cache_pages(mapping, wbc, &for_sync);
	}
	return (result);
}

/*
 * Write out dirty pages to the ARC; this function is only required to
 * support mmap(2).  Mapped pages may be dirtied by memory operations
 * which never call .write().  These dirty pages are kept in sync with
 * the ARC buffers via this hook.
 */
static int
zpl_writepage(struct page *pp, struct writeback_control *wbc)
{
	if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
		wbc->sync_mode = WB_SYNC_ALL;

	boolean_t for_sync = (wbc->sync_mode == WB_SYNC_ALL);

	return (zpl_putpage(pp, wbc, &for_sync));
}

/*
 * The flag combination which matches the behavior of zfs_space() is
 * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE.  The FALLOC_FL_PUNCH_HOLE
 * flag was introduced in the 2.6.38 kernel.
 *
 * The original mode=0 (allocate space) behavior can be reasonably emulated
 * by checking if enough space exists and creating a sparse file, as real
 * persistent space reservation is not possible due to COW, snapshots, etc.
 */
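/*
 * Illustrative userspace usage (editor's sketch, not part of the original
 * file): punching a hole must preserve the file size, while the default
 * mode only verifies available space and, if needed, grows a sparse file:
 *
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, len);
 *	fallocate(fd, 0, off, len);
 */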
static long
zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
{
	cred_t *cr = CRED();
	loff_t olen;
	fstrans_cookie_t cookie;
	int error = 0;

	int test_mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE;

	if ((mode & ~(FALLOC_FL_KEEP_SIZE | test_mode)) != 0)
		return (-EOPNOTSUPP);

	if (offset < 0 || len <= 0)
		return (-EINVAL);

	spl_inode_lock(ip);
	olen = i_size_read(ip);

	crhold(cr);
	cookie = spl_fstrans_mark();
	if (mode & (test_mode)) {
		flock64_t bf;

		if (mode & FALLOC_FL_KEEP_SIZE) {
			if (offset > olen)
				goto out_unmark;

			if (offset + len > olen)
				len = olen - offset;
		}
		bf.l_type = F_WRLCK;
		bf.l_whence = SEEK_SET;
		bf.l_start = offset;
		bf.l_len = len;
		bf.l_pid = 0;

		error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr);
	} else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) {
		unsigned int percent = zfs_fallocate_reserve_percent;
		struct kstatfs statfs;

		/* Legacy mode, disable fallocate compatibility. */
		if (percent == 0) {
			error = -EOPNOTSUPP;
			goto out_unmark;
		}

		/*
		 * Use zfs_statvfs() instead of dmu_objset_space() since it
		 * also checks project quota limits, which are relevant here.
		 */
		error = zfs_statvfs(ip, &statfs);
		if (error)
			goto out_unmark;

		/*
		 * Shrink available space a bit to account for overhead/races.
		 * We know the product previously fit into availbytes from
		 * dmu_objset_space(), so the smaller product will also fit.
		 */
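		/*
		 * Worked example (editor's addition): with the default
		 * zfs_fallocate_reserve_percent = 110 and an assumed
		 * f_bsize of 4096, each available block is counted as
		 * 4096 * 100 / 110 = 3723 bytes, so the request is only
		 * allowed when roughly 110% of len appears to be free.
		 */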
		if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) {
			error = -ENOSPC;
			goto out_unmark;
		}
		if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen)
			error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE);
	}
out_unmark:
	spl_fstrans_unmark(cookie);
	spl_inode_unlock(ip);

	crfree(cr);

	return (error);
}

static long
zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len)
{
	return zpl_fallocate_common(file_inode(filp),
	    mode, offset, len);
}

static int
zpl_ioctl_getversion(struct file *filp, void __user *arg)
{
	uint32_t generation = file_inode(filp)->i_generation;

	return (copy_to_user(arg, &generation, sizeof (generation)));
}

static int
zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
{
	struct inode *ip = file_inode(filp);
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	objset_t *os = zfsvfs->z_os;
	int error = 0;

	if (S_ISFIFO(ip->i_mode))
		return (-ESPIPE);

	if (offset < 0 || len < 0)
		return (-EINVAL);

	if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	switch (advice) {
	case POSIX_FADV_SEQUENTIAL:
	case POSIX_FADV_WILLNEED:
#ifdef HAVE_GENERIC_FADVISE
		if (zn_has_cached_data(zp, offset, offset + len - 1))
			error = generic_fadvise(filp, offset, len, advice);
#endif
		/*
		 * Pass on the caller's size directly, but note that
		 * dmu_prefetch_max will effectively cap it.  If there
		 * really is a larger sequential access pattern, perhaps
		 * dmu_zfetch will detect it.
		 */
		if (len == 0)
			len = i_size_read(ip) - offset;

		dmu_prefetch(os, zp->z_id, 0, offset, len,
		    ZIO_PRIORITY_ASYNC_READ);
		break;
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_RANDOM:
	case POSIX_FADV_DONTNEED:
	case POSIX_FADV_NOREUSE:
		/* ignored for now */
		break;
	default:
		error = -EINVAL;
		break;
	}

	zfs_exit(zfsvfs, FTAG);

	return (error);
}

#define	ZFS_FL_USER_VISIBLE	(FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
#define	ZFS_FL_USER_MODIFIABLE	(FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)

static uint32_t
__zpl_ioctl_getflags(struct inode *ip)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	uint32_t ioctl_flags = 0;

	if (zfs_flags & ZFS_IMMUTABLE)
		ioctl_flags |= FS_IMMUTABLE_FL;

	if (zfs_flags & ZFS_APPENDONLY)
		ioctl_flags |= FS_APPEND_FL;

	if (zfs_flags & ZFS_NODUMP)
		ioctl_flags |= FS_NODUMP_FL;

	if (zfs_flags & ZFS_PROJINHERIT)
		ioctl_flags |= ZFS_PROJINHERIT_FL;

	return (ioctl_flags & ZFS_FL_USER_VISIBLE);
}

/*
 * Map zfs file z_pflags (xvattr_t) to Linux file attributes.  Only file
 * attributes common to both Linux and Solaris are mapped.
 */
static int
zpl_ioctl_getflags(struct file *filp, void __user *arg)
{
	uint32_t flags;
	int err;

	flags = __zpl_ioctl_getflags(file_inode(filp));
	err = copy_to_user(arg, &flags, sizeof (flags));

	return (err);
}

/*
 * fchange() is a helper macro to detect if we have been asked to change a
 * flag.  This is ugly, but the requirement that we do this is a consequence
 * of how the Linux file attribute interface was designed.  Another
 * consequence is that concurrent modification of files suffers from a TOCTOU
 * race.  Neither are things we can fix without modifying the kernel-userland
 * interface, which is outside of our jurisdiction.
 */

#define	fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1)))
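/*
 * Illustrative example (editor's addition): fchange(req, cur, FS_APPEND_FL,
 * ZFS_APPENDONLY) is non-zero exactly when the FS_APPEND_FL bit in the
 * requested flags differs from the ZFS_APPENDONLY bit in the current
 * z_pflags, i.e. when the caller is actually asking for a change.
 */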
static int
__zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	xoptattr_t *xoap;

	if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL |
	    ZFS_PROJINHERIT_FL))
		return (-EOPNOTSUPP);

	if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE)
		return (-EACCES);

	if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) ||
	    fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) &&
	    !capable(CAP_LINUX_IMMUTABLE))
		return (-EPERM);

	if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
		return (-EACCES);

	xva_init(xva);
	xoap = xva_getxoptattr(xva);

#define	FLAG_CHANGE(iflag, zflag, xflag, xfield) do {			\
	if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) ||	\
	    ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) {	\
		XVA_SET_REQ(xva, (xflag));				\
		(xfield) = ((ioctl_flags & (iflag)) != 0);		\
	}								\
} while (0)

	FLAG_CHANGE(FS_IMMUTABLE_FL, ZFS_IMMUTABLE, XAT_IMMUTABLE,
	    xoap->xoa_immutable);
	FLAG_CHANGE(FS_APPEND_FL, ZFS_APPENDONLY, XAT_APPENDONLY,
	    xoap->xoa_appendonly);
	FLAG_CHANGE(FS_NODUMP_FL, ZFS_NODUMP, XAT_NODUMP,
	    xoap->xoa_nodump);
	FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT,
	    xoap->xoa_projinherit);

#undef	FLAG_CHANGE

	return (0);
}

static int
zpl_ioctl_setflags(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	uint32_t flags;
	cred_t *cr = CRED();
	xvattr_t xva;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&flags, arg, sizeof (flags)))
		return (-EFAULT);

	err = __zpl_ioctl_setflags(ip, flags, &xva);
	if (err)
		return (err);

	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}

static int
zpl_ioctl_getxattr(struct file *filp, void __user *arg)
{
	zfsxattr_t fsx = { 0 };
	struct inode *ip = file_inode(filp);
	int err;

	fsx.fsx_xflags = __zpl_ioctl_getflags(ip);
	fsx.fsx_projid = ITOZ(ip)->z_projid;
	err = copy_to_user(arg, &fsx, sizeof (fsx));

	return (err);
}

static int
zpl_ioctl_setxattr(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	zfsxattr_t fsx;
	cred_t *cr = CRED();
	xvattr_t xva;
	xoptattr_t *xoap;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&fsx, arg, sizeof (fsx)))
		return (-EFAULT);

	if (!zpl_is_valid_projid(fsx.fsx_projid))
		return (-EINVAL);

	err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva);
	if (err)
		return (err);

	xoap = xva_getxoptattr(&xva);
	XVA_SET_REQ(&xva, XAT_PROJID);
	xoap->xoa_projid = fsx.fsx_projid;

	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}
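/*
 * Editor's note (assumption, not from the original file): the fsx_projid
 * and ZFS_PROJINHERIT_FL handling above is what the `zfs project` command
 * drives from userspace via the ZFS_IOC_FSGETXATTR/FSSETXATTR ioctls.
 */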
/*
 * Expose Additional File Level Attributes of ZFS.
 */
static int
zpl_ioctl_getdosflags(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	uint64_t dosflags = ITOZ(ip)->z_pflags;
	dosflags &= ZFS_DOS_FL_USER_VISIBLE;
	int err = copy_to_user(arg, &dosflags, sizeof (dosflags));

	return (err);
}

static int
__zpl_ioctl_setdosflags(struct inode *ip, uint64_t ioctl_flags, xvattr_t *xva)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	xoptattr_t *xoap;

	if (ioctl_flags & (~ZFS_DOS_FL_USER_VISIBLE))
		return (-EOPNOTSUPP);

	if ((fchange(ioctl_flags, zfs_flags, ZFS_IMMUTABLE, ZFS_IMMUTABLE) ||
	    fchange(ioctl_flags, zfs_flags, ZFS_APPENDONLY, ZFS_APPENDONLY)) &&
	    !capable(CAP_LINUX_IMMUTABLE))
		return (-EPERM);

	if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
		return (-EACCES);

	xva_init(xva);
	xoap = xva_getxoptattr(xva);

#define	FLAG_CHANGE(iflag, xflag, xfield) do {				\
	if (((ioctl_flags & (iflag)) && !(zfs_flags & (iflag))) ||	\
	    ((zfs_flags & (iflag)) && !(ioctl_flags & (iflag)))) {	\
		XVA_SET_REQ(xva, (xflag));				\
		(xfield) = ((ioctl_flags & (iflag)) != 0);		\
	}								\
} while (0)

	FLAG_CHANGE(ZFS_IMMUTABLE, XAT_IMMUTABLE, xoap->xoa_immutable);
	FLAG_CHANGE(ZFS_APPENDONLY, XAT_APPENDONLY, xoap->xoa_appendonly);
	FLAG_CHANGE(ZFS_NODUMP, XAT_NODUMP, xoap->xoa_nodump);
	FLAG_CHANGE(ZFS_READONLY, XAT_READONLY, xoap->xoa_readonly);
	FLAG_CHANGE(ZFS_HIDDEN, XAT_HIDDEN, xoap->xoa_hidden);
	FLAG_CHANGE(ZFS_SYSTEM, XAT_SYSTEM, xoap->xoa_system);
	FLAG_CHANGE(ZFS_ARCHIVE, XAT_ARCHIVE, xoap->xoa_archive);
	FLAG_CHANGE(ZFS_NOUNLINK, XAT_NOUNLINK, xoap->xoa_nounlink);
	FLAG_CHANGE(ZFS_REPARSE, XAT_REPARSE, xoap->xoa_reparse);
	FLAG_CHANGE(ZFS_OFFLINE, XAT_OFFLINE, xoap->xoa_offline);
	FLAG_CHANGE(ZFS_SPARSE, XAT_SPARSE, xoap->xoa_sparse);

#undef	FLAG_CHANGE

	return (0);
}

/*
 * Set Additional File Level Attributes of ZFS.
 */
static int
zpl_ioctl_setdosflags(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	uint64_t dosflags;
	cred_t *cr = CRED();
	xvattr_t xva;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&dosflags, arg, sizeof (dosflags)))
		return (-EFAULT);

	err = __zpl_ioctl_setdosflags(ip, dosflags, &xva);
	if (err)
		return (err);

	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}

static long
zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case FS_IOC_GETVERSION:
		return (zpl_ioctl_getversion(filp, (void *)arg));
	case FS_IOC_GETFLAGS:
		return (zpl_ioctl_getflags(filp, (void *)arg));
	case FS_IOC_SETFLAGS:
		return (zpl_ioctl_setflags(filp, (void *)arg));
	case ZFS_IOC_FSGETXATTR:
		return (zpl_ioctl_getxattr(filp, (void *)arg));
	case ZFS_IOC_FSSETXATTR:
		return (zpl_ioctl_setxattr(filp, (void *)arg));
	case ZFS_IOC_GETDOSFLAGS:
		return (zpl_ioctl_getdosflags(filp, (void *)arg));
	case ZFS_IOC_SETDOSFLAGS:
		return (zpl_ioctl_setdosflags(filp, (void *)arg));
	case ZFS_IOC_COMPAT_FICLONE:
		return (zpl_ioctl_ficlone(filp, (void *)arg));
	case ZFS_IOC_COMPAT_FICLONERANGE:
		return (zpl_ioctl_ficlonerange(filp, (void *)arg));
	case ZFS_IOC_COMPAT_FIDEDUPERANGE:
		return (zpl_ioctl_fideduperange(filp, (void *)arg));
	default:
		return (-ENOTTY);
	}
}

#ifdef CONFIG_COMPAT
static long
zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case FS_IOC32_GETVERSION:
		cmd = FS_IOC_GETVERSION;
		break;
	case FS_IOC32_GETFLAGS:
		cmd = FS_IOC_GETFLAGS;
		break;
	case FS_IOC32_SETFLAGS:
		cmd = FS_IOC_SETFLAGS;
		break;
	default:
		return (-ENOTTY);
	}
	return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)));
}
#endif /* CONFIG_COMPAT */

const struct address_space_operations zpl_address_space_operations = {
#ifdef HAVE_VFS_READPAGES
	.readpages = zpl_readpages,
#else
	.readahead = zpl_readahead,
#endif
#ifdef HAVE_VFS_READ_FOLIO
	.read_folio = zpl_read_folio,
#else
	.readpage = zpl_readpage,
#endif
	.writepage = zpl_writepage,
	.writepages = zpl_writepages,
	.direct_IO = zpl_direct_IO,
#ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS
	.set_page_dirty = __set_page_dirty_nobuffers,
#endif
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
	.dirty_folio = filemap_dirty_folio,
#endif
#ifdef HAVE_VFS_MIGRATE_FOLIO
	.migrate_folio = migrate_folio,
#else
	.migratepage = migrate_page,
#endif
};

const struct file_operations zpl_file_operations = {
	.open = zpl_open,
	.release = zpl_release,
	.llseek = zpl_llseek,
	.read_iter = zpl_iter_read,
	.write_iter = zpl_iter_write,
#ifdef HAVE_VFS_IOV_ITER
#ifdef HAVE_COPY_SPLICE_READ
	.splice_read = copy_splice_read,
#else
	.splice_read = generic_file_splice_read,
#endif
	.splice_write = iter_file_splice_write,
#endif
	.mmap = zpl_mmap,
	.fsync = zpl_fsync,
	.fallocate = zpl_fallocate,
	.copy_file_range = zpl_copy_file_range,
#ifdef HAVE_VFS_CLONE_FILE_RANGE
	.clone_file_range = zpl_clone_file_range,
#endif
#ifdef HAVE_VFS_REMAP_FILE_RANGE
	.remap_file_range = zpl_remap_file_range,
#endif
#ifdef HAVE_VFS_DEDUPE_FILE_RANGE
	.dedupe_file_range = zpl_dedupe_file_range,
#endif
	.fadvise = zpl_fadvise,
	.unlocked_ioctl = zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = zpl_compat_ioctl,
#endif
};

const struct file_operations zpl_dir_file_operations = {
	.llseek = generic_file_llseek,
	.read = generic_read_dir,
	.iterate_shared = zpl_iterate,
	.fsync = zpl_fsync,
	.unlocked_ioctl = zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = zpl_compat_ioctl,
#endif
};

module_param(zfs_fallocate_reserve_percent, uint, 0644);
MODULE_PARM_DESC(zfs_fallocate_reserve_percent,
	"Percentage of length to use for the available capacity check");
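/*
 * Editor's note (illustrative, assuming the standard module sysfs layout):
 * the tunable above can be adjusted at runtime, e.g.
 *
 *	echo 0 > /sys/module/zfs/parameters/zfs_fallocate_reserve_percent
 *
 * which disables the mode=0 fallocate(2) emulation and makes it return
 * EOPNOTSUPP, as handled in zpl_fallocate_common().
 */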