// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2023, Datto Inc. All rights reserved.
 * Copyright (c) 2025, Klara, Inc.
 * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
 */

#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_ctldir.h>
#include <sys/zpl.h>
#include <linux/iversion.h>
#include <linux/version.h>
#include <linux/vfs_compat.h>

/*
 * What to do when the last reference to an inode is released. If 0, the kernel
 * will cache it on the superblock. If 1, the inode will be freed immediately.
 * See zpl_drop_inode().
 */
int zfs_delete_inode = 0;

/*
 * What to do when the last reference to a dentry is released. If 0, the kernel
 * will cache it until the entry (file) is destroyed. If 1, the dentry will be
 * marked for cleanup, at which time its inode reference will be released. See
 * zpl_dentry_delete().
 */
int zfs_delete_dentry = 0;

static struct inode *
zpl_inode_alloc(struct super_block *sb)
{
	struct inode *ip;

	VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
	inode_set_iversion(ip, 1);

	return (ip);
}

#ifdef HAVE_SOPS_FREE_INODE
static void
zpl_inode_free(struct inode *ip)
{
	ASSERT0(atomic_read(&ip->i_count));
	zfs_inode_free(ip);
}
#endif

static void
zpl_inode_destroy(struct inode *ip)
{
	ASSERT0(atomic_read(&ip->i_count));
	zfs_inode_destroy(ip);
}

/*
 * Called from __mark_inode_dirty() to reflect that something in the
 * inode has changed. We use it to ensure the znode system attributes
 * are always strictly up to date with respect to the inode.
 */
static void
zpl_dirty_inode(struct inode *ip, int flags)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	zfs_dirty_inode(ip, flags);
	spl_fstrans_unmark(cookie);
}

/*
 * ->drop_inode() is called when the last reference to an inode is released.
 * Its return value indicates if the inode should be destroyed immediately, or
 * cached on the superblock structure.
 *
 * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns
 * "destroy immediately" if the inode is unhashed and has no links (roughly: no
 * longer exists on disk). On datasets with millions of rarely-accessed files,
 * this can cause a large amount of memory to be "pinned" by cached inodes,
 * which in turn pin their associated dnodes and dbufs, until the kernel starts
 * reporting memory pressure and requests that OpenZFS release some memory (see
 * zfs_prune()).
 *
 * When set to 1, we call generic_delete_inode(), which always returns "destroy
 * immediately", resulting in inodes being destroyed immediately, releasing
 * their associated dnodes and dbufs to the dbuf cache and the ARC, where they
 * can be evicted as normal.
 *
 * Note that the "last reference" doesn't always mean the last _userspace_
 * reference; the dentry cache also holds a reference, so "busy" inodes will
 * still be kept alive that way (subject to dcache tuning).
 */
static int
zpl_drop_inode(struct inode *ip)
{
	if (zfs_delete_inode)
		return (generic_delete_inode(ip));
	return (generic_drop_inode(ip));
}

/*
 * The ->evict_inode() callback must minimally truncate the inode pages,
 * and call clear_inode(). For 2.6.35 and later kernels this will
 * simply update the inode state, with the sync occurring before the
 * truncate in evict(). For earlier kernels clear_inode() maps to
 * end_writeback() which is responsible for completing all outstanding
 * write back. In either case, once this is done it is safe to clean up
 * any remaining inode specific data via zfs_inactive().
 */
static void
zpl_evict_inode(struct inode *ip)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	truncate_setsize(ip, 0);
	clear_inode(ip);
	zfs_inactive(ip);
	spl_fstrans_unmark(cookie);
}

static void
zpl_put_super(struct super_block *sb)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_umount(sb);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);
}

/*
 * zfs_sync() is the underlying implementation for the sync(2) and syncfs(2)
 * syscalls, via sb->s_op->sync_fs().
 *
 * Before kernel 5.17 (torvalds/linux@5679897eb104), syncfs() ->
 * sync_filesystem() would ignore the return from sync_fs(), instead only
 * considering the error from syncing the underlying block device (sb->s_dev).
 * Since OpenZFS doesn't _have_ an underlying block device, there's no way for
 * us to report a sync error directly.
 *
 * However, in 5.8 (torvalds/linux@735e4ae5ba28) the superblock gained an extra
 * error store `s_wb_err`, to carry errors seen on page writeback since the
 * last call to syncfs(). If sync_filesystem() does not return an error, any
 * existing writeback error on the superblock will be used instead (and cleared
 * either way). We don't use this (page writeback is a different thing for us),
 * so for 5.8-5.16 we can use it instead to get syncfs() to return the error.
 *
 * Before 5.8, we have no other good options - no matter what happens, the
 * userspace program will be told the call has succeeded, and so we must make
 * it so. Therefore, when we are asked to wait for sync to complete (wait ==
 * 1), if zfs_sync() has returned an error we have no choice but to block,
 * regardless of the reason.
 *
 * The 5.17 change was backported to the 5.10, 5.15 and 5.16 series, and likely
 * to some vendor kernels. Meanwhile, s_wb_err is still in use in 6.15 (the
 * mainline Linux series at time of writing), and has likely been backported to
 * vendor kernels before 5.8. We don't really want to use a workaround when we
 * don't have to, but we can't really detect whether or not sync_filesystem()
 * will return our errors (without a difficult runtime test anyway). So, we use
 * a static version check: any kernel reporting its version as 5.17+ will use a
 * direct error return; otherwise, we'll use s_wb_err if it was detected at
 * configure time (5.8-5.16 + vendor backports), and if it's unavailable, we
 * will block to ensure the correct semantics.
 *
 * See https://github.com/openzfs/zfs/issues/17416 for further discussion.
 */
static int
zpl_sync_fs(struct super_block *sb, int wait)
{
	fstrans_cookie_t cookie;
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_sync(sb, wait, cr);

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)
#ifdef HAVE_SUPER_BLOCK_S_WB_ERR
	if (error && wait)
		errseq_set(&sb->s_wb_err, error);
#else
	if (error && wait) {
		zfsvfs_t *zfsvfs = sb->s_fs_info;
		ASSERT3P(zfsvfs, !=, NULL);
		if (zfs_enter(zfsvfs, FTAG) == 0) {
			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
			zfs_exit(zfsvfs, FTAG);
			error = 0;
		}
	}
#endif
#endif /* < 5.17.0 */

	spl_fstrans_unmark(cookie);
	crfree(cr);

	ASSERT3S(error, <=, 0);
	return (error);
}
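
/*
 * From userspace, the effect of the error plumbing described above is simply
 * whether syncfs(2) reports a failure. A minimal sketch of a caller, not part
 * of this file's build; the file path is a hypothetical mounted dataset:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int fd = open("/tank/fs/file", O_RDONLY);
 *		if (fd < 0) {
 *			perror("open");
 *			return (1);
 *		}
 *		// syncfs() flushes the filesystem containing fd; whether a
 *		// sync error is visible here depends on the kernel paths
 *		// described in the comment above zpl_sync_fs().
 *		int ret = syncfs(fd);
 *		if (ret < 0)
 *			perror("syncfs");
 *		close(fd);
 *		return (ret < 0 ? 1 : 0);
 *	}
 */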

static int
zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_statvfs(dentry->d_inode, statp);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	/*
	 * If required by a 32-bit system call, dynamically scale the
	 * block size up to 16MiB and decrease the block counts. This
	 * allows for a maximum size of 64EiB to be reported. The file
	 * counts must be artificially capped at 2^32-1.
	 */
	if (unlikely(zpl_is_32bit_api())) {
		while (statp->f_blocks > UINT32_MAX &&
		    statp->f_bsize < SPA_MAXBLOCKSIZE) {
			statp->f_frsize <<= 1;
			statp->f_bsize <<= 1;

			statp->f_blocks >>= 1;
			statp->f_bfree >>= 1;
			statp->f_bavail >>= 1;
		}

		uint64_t usedobjs = statp->f_files - statp->f_ffree;
		statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
		statp->f_files = statp->f_ffree + usedobjs;
	}

	return (error);
}
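
/*
 * The scaling loop above trades block-size granularity for range: each pass
 * doubles the block size and halves the block counts, so the reported
 * capacity (block count times block size) is unchanged while the counts
 * shrink into 32-bit range. A standalone sketch of the same arithmetic with
 * made-up numbers, not part of this file's build:
 *
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		// Example only: 100 TiB of 4 KiB blocks is ~2.7e10 blocks,
 *		// which does not fit in a 32-bit count.
 *		uint64_t bsize = 4096;
 *		uint64_t blocks = (100ULL << 40) / bsize;
 *
 *		while (blocks > UINT32_MAX && bsize < (16ULL << 20)) {
 *			bsize <<= 1;
 *			blocks >>= 1;
 *		}
 *		// Here bsize is 32 KiB and blocks fits in 32 bits, but
 *		// blocks * bsize is still 100 TiB.
 *		printf("bsize=%llu blocks=%llu\n",
 *		    (unsigned long long)bsize, (unsigned long long)blocks);
 *		return (0);
 *	}
 */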

static int
zpl_remount_fs(struct super_block *sb, int *flags, char *data)
{
	zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_remount(sb, flags, &zm);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	int error;
	if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
		return (error);

	char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	dmu_objset_name(zfsvfs->z_os, fsname);

	for (int i = 0; fsname[i] != 0; i++) {
		/*
		 * Spaces in the dataset name must be converted to their
		 * octal escape sequence for getmntent(3) to correctly
		 * parse the fsname portion of /proc/self/mounts.
		 */
		if (fsname[i] == ' ') {
			seq_puts(seq, "\\040");
		} else {
			seq_putc(seq, fsname[i]);
		}
	}

	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);

	zpl_exit(zfsvfs, FTAG);

	return (0);
}

static int
zpl_show_devname(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
}
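
/*
 * The "\040" escape emitted above is the standard mount-table encoding for a
 * space, which getmntent(3) decodes again, so a dataset named "tank/my files"
 * round-trips cleanly through /proc/self/mounts. A minimal sketch of a
 * reader, not part of this file's build:
 *
 *	#include <mntent.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	int
 *	main(void)
 *	{
 *		FILE *f = setmntent("/proc/self/mounts", "r");
 *		if (f == NULL) {
 *			perror("setmntent");
 *			return (1);
 *		}
 *		struct mntent *m;
 *		while ((m = getmntent(f)) != NULL) {
 *			// mnt_fsname has the escapes undone, e.g.
 *			// "tank/my files" rather than "tank/my\040files".
 *			if (strcmp(m->mnt_type, "zfs") == 0)
 *				printf("%s on %s\n", m->mnt_fsname,
 *				    m->mnt_dir);
 *		}
 *		endmntent(f);
 *		return (0);
 *	}
 */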

static int
__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	seq_printf(seq, ",%s",
	    zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");

#ifdef CONFIG_FS_POSIX_ACL
	switch (zfsvfs->z_acl_type) {
	case ZFS_ACLTYPE_POSIX:
		seq_puts(seq, ",posixacl");
		break;
	default:
		seq_puts(seq, ",noacl");
		break;
	}
#endif /* CONFIG_FS_POSIX_ACL */

	switch (zfsvfs->z_case) {
	case ZFS_CASE_SENSITIVE:
		seq_puts(seq, ",casesensitive");
		break;
	case ZFS_CASE_INSENSITIVE:
		seq_puts(seq, ",caseinsensitive");
		break;
	default:
		seq_puts(seq, ",casemixed");
		break;
	}

	return (0);
}

static int
zpl_show_options(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_options(seq, root->d_sb->s_fs_info));
}

static int
zpl_fill_super(struct super_block *sb, void *data, int silent)
{
	zfs_mnt_t *zm = (zfs_mnt_t *)data;
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_domount(sb, zm, silent);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_test_super(struct super_block *s, void *data)
{
	zfsvfs_t *zfsvfs = s->s_fs_info;
	objset_t *os = data;
	/*
	 * If the os doesn't match the z_os in the super_block, assume it is
	 * not a match. Matching would imply a multimount of a dataset. It is
	 * possible that during a multimount, there is a simultaneous operation
	 * that changes the z_os, e.g., rollback, where the match will be
	 * missed, but in that case the user will get an EBUSY.
	 */
	return (zfsvfs != NULL && os == zfsvfs->z_os);
}

static struct super_block *
zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
{
	struct super_block *s;
	objset_t *os;
	boolean_t issnap = B_FALSE;
	int err;

	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
	if (err)
		return (ERR_PTR(-err));

	/*
	 * The dsl pool lock must be released prior to calling sget().
	 * It is possible sget() may block on the lock in grab_super()
	 * while deactivate_super() holds that same lock and waits for
	 * a txg sync. If the dsl_pool lock is held over sget()
	 * this can prevent the pool sync and cause a deadlock.
	 */
	dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
	dsl_pool_rele(dmu_objset_pool(os), FTAG);

	s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);

	/*
	 * Recheck with the lock held to prevent mounting the wrong dataset
	 * since z_os can be stale when the teardown lock is held.
	 *
	 * We can't do this in zpl_test_super() since it runs under a spinlock
	 * and the s_umount lock is not held there, so it would race with
	 * zfs_umount() and zfsvfs could be freed.
	 */
	if (!IS_ERR(s) && s->s_fs_info != NULL) {
		zfsvfs_t *zfsvfs = s->s_fs_info;
		if (zpl_enter(zfsvfs, FTAG) == 0) {
			if (os != zfsvfs->z_os)
				err = -SET_ERROR(EBUSY);
			issnap = zfsvfs->z_issnap;
			zpl_exit(zfsvfs, FTAG);
		} else {
			err = -SET_ERROR(EBUSY);
		}
	}
	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
	dsl_dataset_rele(dmu_objset_ds(os), FTAG);

	if (IS_ERR(s))
		return (ERR_CAST(s));

	if (err) {
		deactivate_locked_super(s);
		return (ERR_PTR(err));
	}

	if (s->s_root == NULL) {
		err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
		if (err) {
			deactivate_locked_super(s);
			return (ERR_PTR(err));
		}
		s->s_flags |= SB_ACTIVE;
	} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
		/*
		 * Skip the read-only check for snapshots, since a snapshot is
		 * always read-only regardless of whether the ro flag was
		 * passed to mount.
		 */
		deactivate_locked_super(s);
		return (ERR_PTR(-EBUSY));
	}

	return (s);
}

static struct dentry *
zpl_mount(struct file_system_type *fs_type, int flags,
    const char *osname, void *data)
{
	zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };

	struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
	if (IS_ERR(sb))
		return (ERR_CAST(sb));

	return (dget(sb->s_root));
}
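
/*
 * zpl_mount() is reached via the VFS when something mounts a dataset with
 * filesystem type "zfs" (ZFS_DRIVER); normally that is mount.zfs/libzfs, but
 * a plain legacy-style mount(2) call works too. A minimal sketch, not part of
 * this file's build; the dataset and mountpoint names are made up. As
 * described above, it will see EBUSY if, for example, it asks for read-only
 * while the dataset is already mounted read-write:
 *
 *	#include <stdio.h>
 *	#include <sys/mount.h>
 *
 *	int
 *	main(void)
 *	{
 *		if (mount("tank/fs", "/mnt", "zfs", MS_RDONLY, NULL) < 0) {
 *			perror("mount");
 *			return (1);
 *		}
 *		return (0);
 *	}
 */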

static void
zpl_kill_sb(struct super_block *sb)
{
	zfs_preumount(sb);
	kill_anon_super(sb);
}

void
zpl_prune_sb(uint64_t nr_to_scan, void *arg)
{
	struct super_block *sb = (struct super_block *)arg;
	int objects = 0;

	/*
	 * Ensure the superblock is not in the process of being torn down.
	 */
#ifdef HAVE_SB_DYING
	if (down_read_trylock(&sb->s_umount)) {
		if (!(sb->s_flags & SB_DYING) && sb->s_root &&
		    (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#else
	if (down_read_trylock(&sb->s_umount)) {
		if (!hlist_unhashed(&sb->s_instances) &&
		    sb->s_root && (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#endif
}

const struct super_operations zpl_super_operations = {
	.alloc_inode = zpl_inode_alloc,
#ifdef HAVE_SOPS_FREE_INODE
	.free_inode = zpl_inode_free,
#endif
	.destroy_inode = zpl_inode_destroy,
	.dirty_inode = zpl_dirty_inode,
	.write_inode = NULL,
	.drop_inode = zpl_drop_inode,
	.evict_inode = zpl_evict_inode,
	.put_super = zpl_put_super,
	.sync_fs = zpl_sync_fs,
	.statfs = zpl_statfs,
	.remount_fs = zpl_remount_fs,
	.show_devname = zpl_show_devname,
	.show_options = zpl_show_options,
	.show_stats = NULL,
};

/*
 * ->d_delete() is called when the last reference to a dentry is released. Its
 * return value indicates if the dentry should be destroyed immediately, or
 * retained in the dentry cache.
 *
 * By default (zfs_delete_dentry=0) the kernel will always cache unused
 * entries. Each dentry holds an inode reference, so cached dentries can hold
 * the final inode reference indefinitely, leading to the inode and its related
 * data being pinned (see zpl_drop_inode()).
 *
 * When set to 1, we signal that the dentry should be destroyed immediately and
 * never cached. This reduces memory usage, at the cost of higher overhead when
 * looking up a file, as the inode and its underlying data (dnode/dbuf) need to
 * be reloaded and reinflated.
 *
 * Note that userspace does not have direct control over dentry references and
 * reclaim; rather, this is part of the kernel's caching and reclaim subsystems
 * (e.g. vm.vfs_cache_pressure).
 */
static int
zpl_dentry_delete(const struct dentry *dentry)
{
	return (zfs_delete_dentry ? 1 : 0);
}

const struct dentry_operations zpl_dentry_operations = {
	.d_delete = zpl_dentry_delete,
};

struct file_system_type zpl_fs_type = {
	.owner = THIS_MODULE,
	.name = ZFS_DRIVER,
#if defined(HAVE_IDMAP_MNT_API)
	.fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
#else
	.fs_flags = FS_USERNS_MOUNT,
#endif
	.mount = zpl_mount,
	.kill_sb = zpl_kill_sb,
};

ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW,
	"Delete inodes as soon as the last reference is released.");

ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW,
	"Delete dentries from dentry cache as soon as the last reference is "
	"released.");
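
/*
 * The declarations above export zfs_delete_inode and zfs_delete_dentry as
 * ordinary module parameters, so they can normally be inspected and changed
 * at runtime. A minimal userspace sketch, not part of this file's build; the
 * /sys path assumes the module is loaded as "zfs" and exposes its parameters
 * in the usual place:
 *
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		// Ask the kernel to free inodes as soon as their last
 *		// reference is released, instead of caching them.
 *		FILE *f = fopen(
 *		    "/sys/module/zfs/parameters/zfs_delete_inode", "w");
 *		if (f == NULL) {
 *			perror("fopen");
 *			return (1);
 *		}
 *		fputs("1\n", f);
 *		return (fclose(f) == 0 ? 0 : 1);
 *	}
 */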