// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2025, Klara, Inc.
 */

#ifdef CONFIG_COMPAT
#include <linux/compat.h>
#endif
#include <linux/fs.h>
#include <linux/migrate.h>
#include <sys/file.h>
#include <sys/dmu_objset.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_project.h>
#include <linux/pagemap_compat.h>
#include <linux/fadvise.h>
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
#include <linux/writeback.h>
#endif

/*
 * When using fallocate(2) to preallocate space, inflate the requested
 * capacity check by 10% to account for the required metadata blocks.
 */
static unsigned int zfs_fallocate_reserve_percent = 110;

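/*
 * Illustrative arithmetic only (not used by the code): with the default of
 * 110%, a request to preallocate 1 GiB passes the capacity check only if
 * roughly 1.1 GiB appears available, leaving ~10% of headroom for the
 * metadata blocks the allocation will require.
 */
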
static int
zpl_open(struct inode *ip, struct file *filp)
{
        cred_t *cr = CRED();
        int error;
        fstrans_cookie_t cookie;

        error = generic_file_open(ip, filp);
        if (error)
                return (error);

        crhold(cr);
        cookie = spl_fstrans_mark();
        error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
        spl_fstrans_unmark(cookie);
        crfree(cr);
        ASSERT3S(error, <=, 0);

        return (error);
}

static int
zpl_release(struct inode *ip, struct file *filp)
{
        cred_t *cr = CRED();
        int error;
        fstrans_cookie_t cookie;

        cookie = spl_fstrans_mark();
        if (ITOZ(ip)->z_atime_dirty)
                zfs_mark_inode_dirty(ip);

        crhold(cr);
        error = -zfs_close(ip, filp->f_flags, cr);
        spl_fstrans_unmark(cookie);
        crfree(cr);
        ASSERT3S(error, <=, 0);

        return (error);
}

static int
zpl_iterate(struct file *filp, struct dir_context *ctx)
{
        cred_t *cr = CRED();
        int error;
        fstrans_cookie_t cookie;

        crhold(cr);
        cookie = spl_fstrans_mark();
        error = -zfs_readdir(file_inode(filp), ctx, cr);
        spl_fstrans_unmark(cookie);
        crfree(cr);
        ASSERT3S(error, <=, 0);

        return (error);
}

static inline int
zpl_write_cache_pages(struct address_space *mapping,
    struct writeback_control *wbc, void *data);

static int
zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
        struct inode *inode = filp->f_mapping->host;
        znode_t *zp = ITOZ(inode);
        cred_t *cr = CRED();
        int error;
        fstrans_cookie_t cookie;

        /*
         * Force dirty pages in the range out to the DMU and the log, ready
         * for zil_commit() to write down.
         *
         * We call write_cache_pages() directly to ensure that zpl_putpage()
         * is called with the flags we need. We need WB_SYNC_NONE to avoid a
         * call to zil_commit() (since we're doing this as a kind of
         * pre-sync); but we do need for_sync so that the pages remain in
         * writeback until they're on disk, and so that we get an error if
         * the DMU write fails.
         */
        if (filemap_range_has_page(inode->i_mapping, start, end)) {
                int for_sync = 1;
                struct writeback_control wbc = {
                        .sync_mode = WB_SYNC_NONE,
                        .nr_to_write = LONG_MAX,
                        .range_start = start,
                        .range_end = end,
                };
                error =
                    zpl_write_cache_pages(inode->i_mapping, &wbc, &for_sync);
                if (error != 0) {
                        /*
                         * Unclear what state things are in. zfs_putpage()
                         * will ensure the pages remain dirty if they haven't
                         * been written down to the DMU, but because there may
                         * be nothing logged, we can't assume that zfs_sync()
                         * -> zil_commit() will give us a useful error. It's
                         * safest if we just error out here.
                         */
                        return (error);
                }
        }

        crhold(cr);
        cookie = spl_fstrans_mark();
        error = -zfs_fsync(zp, datasync, cr);
        spl_fstrans_unmark(cookie);
        crfree(cr);
        ASSERT3S(error, <=, 0);

        return (error);
}

static inline int
zfs_io_flags(struct kiocb *kiocb)
{
        int flags = 0;

#if defined(IOCB_DSYNC)
        if (kiocb->ki_flags & IOCB_DSYNC)
                flags |= O_DSYNC;
#endif
#if defined(IOCB_SYNC)
        if (kiocb->ki_flags & IOCB_SYNC)
                flags |= O_SYNC;
#endif
#if defined(IOCB_APPEND)
        if (kiocb->ki_flags & IOCB_APPEND)
                flags |= O_APPEND;
#endif
#if defined(IOCB_DIRECT)
        if (kiocb->ki_flags & IOCB_DIRECT)
                flags |= O_DIRECT;
#endif
        return (flags);
}

/*
 * If relatime is enabled, call file_accessed() if zfs_relatime_need_update()
 * is true. This is needed since datasets with an inherited "relatime"
 * property aren't necessarily mounted with the MNT_RELATIME flag (e.g. after
 * `zfs set relatime=...`), which is what the VFS relatime test in
 * relatime_need_update() is based on.
 */
static inline void
zpl_file_accessed(struct file *filp)
{
        struct inode *ip = filp->f_mapping->host;

        if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) {
                if (zfs_relatime_need_update(ip))
                        file_accessed(filp);
        } else {
                file_accessed(filp);
        }
}

static ssize_t
zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
{
        cred_t *cr = CRED();
        fstrans_cookie_t cookie;
        struct file *filp = kiocb->ki_filp;
        ssize_t count = iov_iter_count(to);
        zfs_uio_t uio;

        zfs_uio_iov_iter_init(&uio, to, kiocb->ki_pos, count);

        crhold(cr);
        cookie = spl_fstrans_mark();

        ssize_t ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
            filp->f_flags | zfs_io_flags(kiocb), cr);

        spl_fstrans_unmark(cookie);
        crfree(cr);

        if (ret < 0)
                return (ret);

        ssize_t read = count - uio.uio_resid;
        kiocb->ki_pos += read;

        zpl_file_accessed(filp);

        return (read);
}

static inline ssize_t
zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from,
    size_t *countp)
{
        ssize_t ret = generic_write_checks(kiocb, from);
        if (ret <= 0)
                return (ret);

        *countp = ret;

        return (0);
}

static ssize_t
zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
{
        cred_t *cr = CRED();
        fstrans_cookie_t cookie;
        struct file *filp = kiocb->ki_filp;
        struct inode *ip = filp->f_mapping->host;
        zfs_uio_t uio;
        size_t count = 0;
        ssize_t ret;

        ret = zpl_generic_write_checks(kiocb, from, &count);
        if (ret)
                return (ret);

        zfs_uio_iov_iter_init(&uio, from, kiocb->ki_pos, count);

        crhold(cr);
        cookie = spl_fstrans_mark();

        ret = -zfs_write(ITOZ(ip), &uio,
            filp->f_flags | zfs_io_flags(kiocb), cr);

        spl_fstrans_unmark(cookie);
        crfree(cr);

        if (ret < 0)
                return (ret);

        ssize_t wrote = count - uio.uio_resid;
        kiocb->ki_pos += wrote;

        return (wrote);
}

static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
{
        /*
         * All O_DIRECT requests should be handled by
         * zpl_iter_read()/zpl_iter_write(). Generic kernel code should never
         * call the direct_IO address_space_operations function, so make this
         * code path fatal if it is ever executed.
         */
        PANIC(0);
        return (0);
}

static loff_t
zpl_llseek(struct file *filp, loff_t offset, int whence)
{
#if defined(SEEK_HOLE) && defined(SEEK_DATA)
        fstrans_cookie_t cookie;

        if (whence == SEEK_DATA || whence == SEEK_HOLE) {
                struct inode *ip = filp->f_mapping->host;
                loff_t maxbytes = ip->i_sb->s_maxbytes;
                loff_t error;

                spl_inode_lock_shared(ip);
                cookie = spl_fstrans_mark();
                error = -zfs_holey(ITOZ(ip), whence, &offset);
                spl_fstrans_unmark(cookie);
                if (error == 0)
                        error = lseek_execute(filp, ip, offset, maxbytes);
                spl_inode_unlock_shared(ip);

                return (error);
        }
#endif /* SEEK_HOLE && SEEK_DATA */

        return (generic_file_llseek(filp, offset, whence));
}

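/*
 * Illustrative only (not part of this file): a userspace sketch of walking
 * a file's data regions via SEEK_DATA/SEEK_HOLE, the interface serviced by
 * zpl_llseek() above. lseek() fails with ENXIO once no data remains.
 *
 *	off_t data = 0, hole;
 *	while ((data = lseek(fd, data, SEEK_DATA)) >= 0) {
 *		hole = lseek(fd, data, SEEK_HOLE);
 *		printf("data: [%jd, %jd)\n", (intmax_t)data, (intmax_t)hole);
 *		data = hole;
 *	}
 */
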
/*
 * It's worth taking a moment to describe how mmap is implemented
 * for zfs because it differs considerably from other Linux filesystems.
 * However, this issue is handled the same way under OpenSolaris.
 *
 * The issue is that by design zfs bypasses the Linux page cache and
 * leaves all caching up to the ARC. This has been shown to work
 * well for the common read(2)/write(2) case. However, mmap(2)
 * is a problem because it relies on being tightly integrated with the
 * page cache. To handle this we cache mmap'ed files twice, once in
 * the ARC and a second time in the page cache. The code is careful
 * to keep both copies synchronized.
 *
 * When a file with an mmap'ed region is written to using write(2)
 * both the data in the ARC and existing pages in the page cache
 * are updated. For a read(2) data will be read first from the page
 * cache then the ARC if needed. Neither a write(2) nor a read(2)
 * will ever result in new pages being added to the page cache.
 *
 * New pages are added to the page cache only via .readpage() which
 * is called when the vfs needs to read a page off disk to back the
 * virtual memory region. These pages may be modified without
 * notifying the ARC and will be written out periodically via
 * .writepage(). This will occur due to either a sync or the usual
 * page aging behavior. Note that because a read(2) of a mmap'ed file
 * will always check the page cache first, correct data will still be
 * returned even when the ARC is out of date.
 *
 * While this implementation ensures correct behavior it does have
 * some drawbacks. The most obvious of which is that it increases
 * the required memory footprint when accessing mmap'ed files. It
 * also adds additional complexity to the code, keeping both caches
 * synchronized.
 *
 * Longer term it may be possible to cleanly resolve this wart by
 * mapping page cache pages directly onto the ARC buffers. The
 * Linux address space operations are flexible enough to allow
 * selection of which pages back a particular index. The trick
 * would be working out the details of which subsystem is in
 * charge, the ARC, the page cache, or both. It may also prove
 * helpful to move the ARC buffers to scatter-gather lists
 * rather than a vmalloc'ed region.
 */
static int
zpl_mmap(struct file *filp, struct vm_area_struct *vma)
{
        struct inode *ip = filp->f_mapping->host;
        int error;
        fstrans_cookie_t cookie;

        cookie = spl_fstrans_mark();
        error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
            (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
        spl_fstrans_unmark(cookie);

        if (error)
                return (error);

        error = generic_file_mmap(filp, vma);
        if (error)
                return (error);

        return (error);
}

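/*
 * Illustrative only (not part of this file): the coherence described above
 * means mixing write(2) with a shared mapping behaves as expected from
 * userspace; the write updates the ARC and any cached page, so the mapping
 * observes it.
 *
 *	char *map = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
 *	pwrite(fd, "x", 1, 0);
 *	assert(map[0] == 'x');
 */
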
/*
 * Populate a page with data for the Linux page cache. This function is
 * only used to support mmap(2). There will be an identical copy of the
 * data in the ARC which is kept up to date via .write() and .writepage().
 */
static inline int
zpl_readpage_common(struct page *pp)
{
        fstrans_cookie_t cookie;

        ASSERT(PageLocked(pp));

        cookie = spl_fstrans_mark();
        int error = -zfs_getpage(pp->mapping->host, pp);
        spl_fstrans_unmark(cookie);

        unlock_page(pp);

        return (error);
}

#ifdef HAVE_VFS_READ_FOLIO
static int
zpl_read_folio(struct file *filp, struct folio *folio)
{
        return (zpl_readpage_common(&folio->page));
}
#else
static int
zpl_readpage(struct file *filp, struct page *pp)
{
        return (zpl_readpage_common(pp));
}
#endif

static int
zpl_readpage_filler(void *data, struct page *pp)
{
        return (zpl_readpage_common(pp));
}

/*
 * Populate a set of pages with data for the Linux page cache. This
 * function will only be called for read ahead and never for demand
 * paging. For simplicity, the code relies on read_cache_pages() to
 * correctly lock each page for IO and call zpl_readpage().
 */
#ifdef HAVE_VFS_READPAGES
static int
zpl_readpages(struct file *filp, struct address_space *mapping,
    struct list_head *pages, unsigned nr_pages)
{
        return (read_cache_pages(mapping, pages, zpl_readpage_filler, NULL));
}
#else
static void
zpl_readahead(struct readahead_control *ractl)
{
        struct page *page;

        while ((page = readahead_page(ractl)) != NULL) {
                int ret;

                ret = zpl_readpage_filler(NULL, page);
                put_page(page);
                if (ret)
                        break;
        }
}
#endif

static int
zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
{
        boolean_t *for_sync = data;
        fstrans_cookie_t cookie;
        int ret;

        ASSERT(PageLocked(pp));
        ASSERT(!PageWriteback(pp));

        cookie = spl_fstrans_mark();
        ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
        spl_fstrans_unmark(cookie);

        return (ret);
}

#ifdef HAVE_WRITEPAGE_T_FOLIO
static int
zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
{
        return (zpl_putpage(&pp->page, wbc, data));
}
#endif

static inline int
zpl_write_cache_pages(struct address_space *mapping,
    struct writeback_control *wbc, void *data)
{
        int result;

#ifdef HAVE_WRITEPAGE_T_FOLIO
        result = write_cache_pages(mapping, wbc, zpl_putfolio, data);
#else
        result = write_cache_pages(mapping, wbc, zpl_putpage, data);
#endif
        return (result);
}

static int
zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
        znode_t *zp = ITOZ(mapping->host);
        zfsvfs_t *zfsvfs = ITOZSB(mapping->host);
        enum writeback_sync_modes sync_mode;
        int result;

        if ((result = zpl_enter(zfsvfs, FTAG)) != 0)
                return (result);
        if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
                wbc->sync_mode = WB_SYNC_ALL;
        zpl_exit(zfsvfs, FTAG);
        sync_mode = wbc->sync_mode;

        /*
         * We don't want to run write_cache_pages() in SYNC mode here, because
         * that would make putpage() wait for a single page to be committed to
         * disk every single time, resulting in atrocious performance. Instead
         * we run it once in non-SYNC mode so that the ZIL gets all the data,
         * and then we commit it all in one go.
         */
        boolean_t for_sync = (sync_mode == WB_SYNC_ALL);
        wbc->sync_mode = WB_SYNC_NONE;
        result = zpl_write_cache_pages(mapping, wbc, &for_sync);
        if (sync_mode != wbc->sync_mode) {
                if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
                        return (result);

                if (zfsvfs->z_log != NULL) {
                        /*
                         * We don't want to block here if the pool suspends,
                         * because this is not a syncing op by itself, but
                         * might be part of one that the caller will
                         * coordinate.
                         */
                        result = -zil_commit_flags(zfsvfs->z_log, zp->z_id,
                            ZIL_COMMIT_NOW);
                }

                zpl_exit(zfsvfs, FTAG);

                /*
                 * If zil_commit_flags() failed, it's unclear what state
                 * things are currently in. putpage() has written back out
                 * what it can to the DMU, but it may not be on disk. We have
                 * little choice but to escape.
                 */
                if (result != 0)
                        return (result);

                /*
                 * We need to call write_cache_pages() again (we can't just
                 * return after the commit) because the previous call in
                 * non-SYNC mode does not guarantee that we got all the dirty
                 * pages (see the implementation of write_cache_pages() for
                 * details). That being said, this is a no-op in most cases.
                 */
                wbc->sync_mode = sync_mode;
                result = zpl_write_cache_pages(mapping, wbc, &for_sync);
        }
        return (result);
}

#ifdef HAVE_VFS_WRITEPAGE
/*
 * Write out dirty pages to the ARC. This function is only required to
 * support mmap(2). Mapped pages may be dirtied by memory operations
 * which never call .write(). These dirty pages are kept in sync with
 * the ARC buffers via this hook.
 */
static int
zpl_writepage(struct page *pp, struct writeback_control *wbc)
{
        if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
                wbc->sync_mode = WB_SYNC_ALL;

        boolean_t for_sync = (wbc->sync_mode == WB_SYNC_ALL);

        return (zpl_putpage(pp, wbc, &for_sync));
}
#endif

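/*
 * Illustrative only (not part of this file): the two fallocate(2) shapes
 * accepted by zpl_fallocate_common() below, as issued from userspace.
 *
 *	// Punch a hole, keeping the file size unchanged.
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, len);
 *
 *	// mode=0 "preallocate": emulated by a capacity check plus a sparse
 *	// extension of the file; no persistent reservation is made.
 *	fallocate(fd, 0, off, len);
 */
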
/*
 * The flag combination which matches the behavior of zfs_space() is
 * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE
 * flag was introduced in the 2.6.38 kernel.
 *
 * The original mode=0 (allocate space) behavior can be reasonably emulated
 * by checking if enough space exists and creating a sparse file, as real
 * persistent space reservation is not possible due to COW, snapshots, etc.
 */
static long
zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
{
        cred_t *cr = CRED();
        loff_t olen;
        fstrans_cookie_t cookie;
        int error = 0;

        int test_mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE;

        if ((mode & ~(FALLOC_FL_KEEP_SIZE | test_mode)) != 0)
                return (-EOPNOTSUPP);

        if (offset < 0 || len <= 0)
                return (-EINVAL);

        spl_inode_lock(ip);
        olen = i_size_read(ip);

        crhold(cr);
        cookie = spl_fstrans_mark();
        if (mode & (test_mode)) {
                flock64_t bf;

                if (mode & FALLOC_FL_KEEP_SIZE) {
                        if (offset > olen)
                                goto out_unmark;

                        if (offset + len > olen)
                                len = olen - offset;
                }
                bf.l_type = F_WRLCK;
                bf.l_whence = SEEK_SET;
                bf.l_start = offset;
                bf.l_len = len;
                bf.l_pid = 0;

                error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr);
        } else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) {
                unsigned int percent = zfs_fallocate_reserve_percent;
                struct kstatfs statfs;

                /* Legacy mode, disable fallocate compatibility. */
                if (percent == 0) {
                        error = -EOPNOTSUPP;
                        goto out_unmark;
                }

                /*
                 * Use zfs_statvfs() instead of dmu_objset_space() since it
                 * also checks project quota limits, which are relevant here.
                 */
                error = zfs_statvfs(ip, &statfs);
                if (error)
                        goto out_unmark;

                /*
                 * Shrink available space a bit to account for overhead/races.
                 * We know the product previously fit into availbytes from
                 * dmu_objset_space(), so the smaller product will also fit.
                 */
                if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) {
                        error = -ENOSPC;
                        goto out_unmark;
                }
                if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen)
                        error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE);
        }
out_unmark:
        spl_fstrans_unmark(cookie);
        spl_inode_unlock(ip);

        crfree(cr);

        return (error);
}

static long
zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len)
{
        return (zpl_fallocate_common(file_inode(filp),
            mode, offset, len));
}

static int
zpl_ioctl_getversion(struct file *filp, void __user *arg)
{
        uint32_t generation = file_inode(filp)->i_generation;

        return (copy_to_user(arg, &generation, sizeof (generation)));
}

static int
zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
{
        struct inode *ip = file_inode(filp);
        znode_t *zp = ITOZ(ip);
        zfsvfs_t *zfsvfs = ITOZSB(ip);
        objset_t *os = zfsvfs->z_os;
        int error = 0;

        if (S_ISFIFO(ip->i_mode))
                return (-ESPIPE);

        if (offset < 0 || len < 0)
                return (-EINVAL);

        if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
                return (error);

        switch (advice) {
        case POSIX_FADV_SEQUENTIAL:
        case POSIX_FADV_WILLNEED:
#ifdef HAVE_GENERIC_FADVISE
                if (zn_has_cached_data(zp, offset, offset + len - 1))
                        error = generic_fadvise(filp, offset, len, advice);
#endif
                /*
                 * Pass on the caller's size directly, but note that
                 * dmu_prefetch_max will effectively cap it. If there
                 * really is a larger sequential access pattern, perhaps
                 * dmu_zfetch will detect it.
                 */
                if (len == 0)
                        len = i_size_read(ip) - offset;

                dmu_prefetch(os, zp->z_id, 0, offset, len,
                    ZIO_PRIORITY_ASYNC_READ);
                break;
        case POSIX_FADV_NORMAL:
        case POSIX_FADV_RANDOM:
        case POSIX_FADV_DONTNEED:
        case POSIX_FADV_NOREUSE:
                /* ignored for now */
                break;
        default:
                error = -EINVAL;
                break;
        }

        zfs_exit(zfsvfs, FTAG);

        return (error);
}

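/*
 * Illustrative only (not part of this file): advice issued from userspace
 * with posix_fadvise(2); WILLNEED and SEQUENTIAL reach the dmu_prefetch()
 * call above.
 *
 *	posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
 *	posix_fadvise(fd, off, len, POSIX_FADV_WILLNEED);
 */
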
#define	ZFS_FL_USER_VISIBLE	(FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
#define	ZFS_FL_USER_MODIFIABLE	(FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)

static uint32_t
__zpl_ioctl_getflags(struct inode *ip)
{
        uint64_t zfs_flags = ITOZ(ip)->z_pflags;
        uint32_t ioctl_flags = 0;

        if (zfs_flags & ZFS_IMMUTABLE)
                ioctl_flags |= FS_IMMUTABLE_FL;

        if (zfs_flags & ZFS_APPENDONLY)
                ioctl_flags |= FS_APPEND_FL;

        if (zfs_flags & ZFS_NODUMP)
                ioctl_flags |= FS_NODUMP_FL;

        if (zfs_flags & ZFS_PROJINHERIT)
                ioctl_flags |= ZFS_PROJINHERIT_FL;

        return (ioctl_flags & ZFS_FL_USER_VISIBLE);
}

/*
 * Map zfs file z_pflags (xvattr_t) to Linux file attributes. Only file
 * attributes common to both Linux and Solaris are mapped.
 */
static int
zpl_ioctl_getflags(struct file *filp, void __user *arg)
{
        uint32_t flags;
        int err;

        flags = __zpl_ioctl_getflags(file_inode(filp));
        err = copy_to_user(arg, &flags, sizeof (flags));

        return (err);
}

/*
 * fchange() is a helper macro to detect if we have been asked to change a
 * flag. This is ugly, but the requirement that we do this is a consequence of
 * how the Linux file attribute interface was designed. Another consequence is
 * that concurrent modification of files suffers from a TOCTOU race. Neither
 * are things we can fix without modifying the kernel-userland interface, which
 * is outside of our jurisdiction.
 */

#define	fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1)))

static int
__zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
{
        uint64_t zfs_flags = ITOZ(ip)->z_pflags;
        xoptattr_t *xoap;

        if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL |
            ZFS_PROJINHERIT_FL))
                return (-EOPNOTSUPP);

        if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE)
                return (-EACCES);

        if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) ||
            fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) &&
            !capable(CAP_LINUX_IMMUTABLE))
                return (-EPERM);

        if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
                return (-EACCES);

        xva_init(xva);
        xoap = xva_getxoptattr(xva);

#define	FLAG_CHANGE(iflag, zflag, xflag, xfield)	do {	\
	if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) ||	\
	    ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) {	\
		XVA_SET_REQ(xva, (xflag));	\
		(xfield) = ((ioctl_flags & (iflag)) != 0);	\
	}	\
} while (0)

        FLAG_CHANGE(FS_IMMUTABLE_FL, ZFS_IMMUTABLE, XAT_IMMUTABLE,
            xoap->xoa_immutable);
        FLAG_CHANGE(FS_APPEND_FL, ZFS_APPENDONLY, XAT_APPENDONLY,
            xoap->xoa_appendonly);
        FLAG_CHANGE(FS_NODUMP_FL, ZFS_NODUMP, XAT_NODUMP,
            xoap->xoa_nodump);
        FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT,
            xoap->xoa_projinherit);

#undef	FLAG_CHANGE

        return (0);
}

static int
zpl_ioctl_setflags(struct file *filp, void __user *arg)
{
        struct inode *ip = file_inode(filp);
        uint32_t flags;
        cred_t *cr = CRED();
        xvattr_t xva;
        int err;
        fstrans_cookie_t cookie;

        if (copy_from_user(&flags, arg, sizeof (flags)))
                return (-EFAULT);

        err = __zpl_ioctl_setflags(ip, flags, &xva);
        if (err)
                return (err);

        crhold(cr);
        cookie = spl_fstrans_mark();
        err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
        spl_fstrans_unmark(cookie);
        crfree(cr);

        return (err);
}

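/*
 * Illustrative only (not part of this file): toggling the append-only
 * attribute from userspace through the handlers above; this is the same
 * path chattr(1) uses for +a/-a.
 *
 *	unsigned int fl;
 *	ioctl(fd, FS_IOC_GETFLAGS, &fl);
 *	fl |= FS_APPEND_FL;
 *	ioctl(fd, FS_IOC_SETFLAGS, &fl);
 */
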
static int
zpl_ioctl_getxattr(struct file *filp, void __user *arg)
{
        zfsxattr_t fsx = { 0 };
        struct inode *ip = file_inode(filp);
        int err;

        fsx.fsx_xflags = __zpl_ioctl_getflags(ip);
        fsx.fsx_projid = ITOZ(ip)->z_projid;
        err = copy_to_user(arg, &fsx, sizeof (fsx));

        return (err);
}

static int
zpl_ioctl_setxattr(struct file *filp, void __user *arg)
{
        struct inode *ip = file_inode(filp);
        zfsxattr_t fsx;
        cred_t *cr = CRED();
        xvattr_t xva;
        xoptattr_t *xoap;
        int err;
        fstrans_cookie_t cookie;

        if (copy_from_user(&fsx, arg, sizeof (fsx)))
                return (-EFAULT);

        if (!zpl_is_valid_projid(fsx.fsx_projid))
                return (-EINVAL);

        err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva);
        if (err)
                return (err);

        xoap = xva_getxoptattr(&xva);
        XVA_SET_REQ(&xva, XAT_PROJID);
        xoap->xoa_projid = fsx.fsx_projid;

        crhold(cr);
        cookie = spl_fstrans_mark();
        err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
        spl_fstrans_unmark(cookie);
        crfree(cr);

        return (err);
}

/*
 * Expose additional file-level attributes of ZFS.
 */
static int
zpl_ioctl_getdosflags(struct file *filp, void __user *arg)
{
        struct inode *ip = file_inode(filp);
        uint64_t dosflags = ITOZ(ip)->z_pflags;
        dosflags &= ZFS_DOS_FL_USER_VISIBLE;
        int err = copy_to_user(arg, &dosflags, sizeof (dosflags));

        return (err);
}

static int
__zpl_ioctl_setdosflags(struct inode *ip, uint64_t ioctl_flags, xvattr_t *xva)
{
        uint64_t zfs_flags = ITOZ(ip)->z_pflags;
        xoptattr_t *xoap;

        if (ioctl_flags & (~ZFS_DOS_FL_USER_VISIBLE))
                return (-EOPNOTSUPP);

        if ((fchange(ioctl_flags, zfs_flags, ZFS_IMMUTABLE, ZFS_IMMUTABLE) ||
            fchange(ioctl_flags, zfs_flags, ZFS_APPENDONLY, ZFS_APPENDONLY)) &&
            !capable(CAP_LINUX_IMMUTABLE))
                return (-EPERM);

        if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
                return (-EACCES);

        xva_init(xva);
        xoap = xva_getxoptattr(xva);

#define	FLAG_CHANGE(iflag, xflag, xfield)	do {	\
	if (((ioctl_flags & (iflag)) && !(zfs_flags & (iflag))) ||	\
	    ((zfs_flags & (iflag)) && !(ioctl_flags & (iflag)))) {	\
		XVA_SET_REQ(xva, (xflag));	\
		(xfield) = ((ioctl_flags & (iflag)) != 0);	\
	}	\
} while (0)

        FLAG_CHANGE(ZFS_IMMUTABLE, XAT_IMMUTABLE, xoap->xoa_immutable);
        FLAG_CHANGE(ZFS_APPENDONLY, XAT_APPENDONLY, xoap->xoa_appendonly);
        FLAG_CHANGE(ZFS_NODUMP, XAT_NODUMP, xoap->xoa_nodump);
        FLAG_CHANGE(ZFS_READONLY, XAT_READONLY, xoap->xoa_readonly);
        FLAG_CHANGE(ZFS_HIDDEN, XAT_HIDDEN, xoap->xoa_hidden);
        FLAG_CHANGE(ZFS_SYSTEM, XAT_SYSTEM, xoap->xoa_system);
        FLAG_CHANGE(ZFS_ARCHIVE, XAT_ARCHIVE, xoap->xoa_archive);
        FLAG_CHANGE(ZFS_NOUNLINK, XAT_NOUNLINK, xoap->xoa_nounlink);
        FLAG_CHANGE(ZFS_REPARSE, XAT_REPARSE, xoap->xoa_reparse);
        FLAG_CHANGE(ZFS_OFFLINE, XAT_OFFLINE, xoap->xoa_offline);
        FLAG_CHANGE(ZFS_SPARSE, XAT_SPARSE, xoap->xoa_sparse);

#undef	FLAG_CHANGE

        return (0);
}

/*
 * Set additional file-level attributes of ZFS.
 */
static int
zpl_ioctl_setdosflags(struct file *filp, void __user *arg)
{
        struct inode *ip = file_inode(filp);
        uint64_t dosflags;
        cred_t *cr = CRED();
        xvattr_t xva;
        int err;
        fstrans_cookie_t cookie;

        if (copy_from_user(&dosflags, arg, sizeof (dosflags)))
                return (-EFAULT);

        err = __zpl_ioctl_setdosflags(ip, dosflags, &xva);
        if (err)
                return (err);

        crhold(cr);
        cookie = spl_fstrans_mark();
        err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
        spl_fstrans_unmark(cookie);
        crfree(cr);

        return (err);
}

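/*
 * Illustrative only (not part of this file): reading and updating the DOS
 * attribute bits through the two ioctls above; the ZFS_* flag values come
 * from the ZFS headers.
 *
 *	uint64_t dosflags;
 *	ioctl(fd, ZFS_IOC_GETDOSFLAGS, &dosflags);
 *	dosflags |= ZFS_HIDDEN;
 *	ioctl(fd, ZFS_IOC_SETDOSFLAGS, &dosflags);
 */
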
static int
zpl_ioctl_rewrite(struct file *filp, void __user *arg)
{
        struct inode *ip = file_inode(filp);
        zfs_rewrite_args_t args;
        fstrans_cookie_t cookie;
        int err;

        if (copy_from_user(&args, arg, sizeof (args)))
                return (-EFAULT);

        if (unlikely(!(filp->f_mode & FMODE_WRITE)))
                return (-EBADF);

        cookie = spl_fstrans_mark();
        err = -zfs_rewrite(ITOZ(ip), args.off, args.len, args.flags, args.arg);
        spl_fstrans_unmark(cookie);

        return (err);
}

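/*
 * Illustrative only (not part of this file): invoking ZFS_IOC_REWRITE from
 * userspace to rewrite a byte range in place with the dataset's current
 * property settings. The exact semantics of each field follow
 * zfs_rewrite(); the zeroed flags/arg values are an assumption here.
 *
 *	zfs_rewrite_args_t args = {
 *		.off = 0,
 *		.len = length,
 *		.flags = 0,
 *		.arg = 0,
 *	};
 *	ioctl(fd, ZFS_IOC_REWRITE, &args);
 */
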
static long
zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
        switch (cmd) {
        case FS_IOC_GETVERSION:
                return (zpl_ioctl_getversion(filp, (void *)arg));
        case FS_IOC_GETFLAGS:
                return (zpl_ioctl_getflags(filp, (void *)arg));
        case FS_IOC_SETFLAGS:
                return (zpl_ioctl_setflags(filp, (void *)arg));
        case ZFS_IOC_FSGETXATTR:
                return (zpl_ioctl_getxattr(filp, (void *)arg));
        case ZFS_IOC_FSSETXATTR:
                return (zpl_ioctl_setxattr(filp, (void *)arg));
        case ZFS_IOC_GETDOSFLAGS:
                return (zpl_ioctl_getdosflags(filp, (void *)arg));
        case ZFS_IOC_SETDOSFLAGS:
                return (zpl_ioctl_setdosflags(filp, (void *)arg));
        case ZFS_IOC_REWRITE:
                return (zpl_ioctl_rewrite(filp, (void *)arg));
        default:
                return (-ENOTTY);
        }
}

#ifdef CONFIG_COMPAT
static long
zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
        switch (cmd) {
        case FS_IOC32_GETVERSION:
                cmd = FS_IOC_GETVERSION;
                break;
        case FS_IOC32_GETFLAGS:
                cmd = FS_IOC_GETFLAGS;
                break;
        case FS_IOC32_SETFLAGS:
                cmd = FS_IOC_SETFLAGS;
                break;
        default:
                return (-ENOTTY);
        }
        return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)));
}
#endif /* CONFIG_COMPAT */

const struct address_space_operations zpl_address_space_operations = {
#ifdef HAVE_VFS_READPAGES
        .readpages = zpl_readpages,
#else
        .readahead = zpl_readahead,
#endif
#ifdef HAVE_VFS_READ_FOLIO
        .read_folio = zpl_read_folio,
#else
        .readpage = zpl_readpage,
#endif
#ifdef HAVE_VFS_WRITEPAGE
        .writepage = zpl_writepage,
#endif
        .writepages = zpl_writepages,
        .direct_IO = zpl_direct_IO,
#ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS
        .set_page_dirty = __set_page_dirty_nobuffers,
#endif
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
        .dirty_folio = filemap_dirty_folio,
#endif
#ifdef HAVE_VFS_MIGRATE_FOLIO
        .migrate_folio = migrate_folio,
#elif defined(HAVE_VFS_MIGRATEPAGE)
        .migratepage = migrate_page,
#endif
};

const struct file_operations zpl_file_operations = {
        .open = zpl_open,
        .release = zpl_release,
        .llseek = zpl_llseek,
        .read_iter = zpl_iter_read,
        .write_iter = zpl_iter_write,
#ifdef HAVE_COPY_SPLICE_READ
        .splice_read = copy_splice_read,
#else
        .splice_read = generic_file_splice_read,
#endif
        .splice_write = iter_file_splice_write,
        .mmap = zpl_mmap,
        .fsync = zpl_fsync,
        .fallocate = zpl_fallocate,
        .copy_file_range = zpl_copy_file_range,
#ifdef HAVE_VFS_CLONE_FILE_RANGE
        .clone_file_range = zpl_clone_file_range,
#endif
#ifdef HAVE_VFS_REMAP_FILE_RANGE
        .remap_file_range = zpl_remap_file_range,
#endif
#ifdef HAVE_VFS_DEDUPE_FILE_RANGE
        .dedupe_file_range = zpl_dedupe_file_range,
#endif
        .fadvise = zpl_fadvise,
        .unlocked_ioctl = zpl_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = zpl_compat_ioctl,
#endif
};

const struct file_operations zpl_dir_file_operations = {
        .llseek = generic_file_llseek,
        .read = generic_read_dir,
        .iterate_shared = zpl_iterate,
        .fsync = zpl_fsync,
        .unlocked_ioctl = zpl_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl = zpl_compat_ioctl,
#endif
};

module_param(zfs_fallocate_reserve_percent, uint, 0644);
MODULE_PARM_DESC(zfs_fallocate_reserve_percent,
	"Percentage of length to use for the available capacity check");
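
/*
 * Illustrative only (not part of this file): the reserve percentage is a
 * runtime-tunable module parameter; with 0644 permissions it can be read
 * and written through sysfs.
 *
 *	# cat /sys/module/zfs/parameters/zfs_fallocate_reserve_percent
 *	110
 *	# echo 0 > /sys/module/zfs/parameters/zfs_fallocate_reserve_percent
 *
 * Setting it to 0 disables the mode=0 preallocation emulation entirely;
 * zpl_fallocate_common() then fails with EOPNOTSUPP.
 */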