// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2023, Datto Inc. All rights reserved.
 * Copyright (c) 2025, Klara, Inc.
 */

#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_ctldir.h>
#include <sys/zpl.h>
#include <linux/iversion.h>
#include <linux/version.h>

/*
 * What to do when the last reference to an inode is released. If 0, the kernel
 * will cache it on the superblock. If 1, the inode will be freed immediately.
 * See zpl_drop_inode().
 */
int zfs_delete_inode = 0;

/*
 * What to do when the last reference to a dentry is released. If 0, the kernel
 * will cache it until the entry (file) is destroyed. If 1, the dentry will be
 * marked for cleanup, at which time its inode reference will be released. See
 * zpl_dentry_delete().
 */
int zfs_delete_dentry = 0;

static struct inode *
zpl_inode_alloc(struct super_block *sb)
{
	struct inode *ip;

	VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
	inode_set_iversion(ip, 1);

	return (ip);
}

#ifdef HAVE_SOPS_FREE_INODE
static void
zpl_inode_free(struct inode *ip)
{
	ASSERT0(atomic_read(&ip->i_count));
	zfs_inode_free(ip);
}
#endif

static void
zpl_inode_destroy(struct inode *ip)
{
	ASSERT0(atomic_read(&ip->i_count));
	zfs_inode_destroy(ip);
}

/*
 * Called from __mark_inode_dirty() to reflect that something in the
 * inode has changed. We use it to ensure the znode system attributes
 * are always strictly up to date with respect to the inode.
 */
static void
zpl_dirty_inode(struct inode *ip, int flags)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	zfs_dirty_inode(ip, flags);
	spl_fstrans_unmark(cookie);
}

/*
 * ->drop_inode() is called when the last reference to an inode is released.
 * Its return value indicates if the inode should be destroyed immediately, or
 * cached on the superblock structure.
 *
 * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns
 * "destroy immediately" if the inode is unhashed and has no links (roughly: no
 * longer exists on disk). On datasets with millions of rarely-accessed files,
 * this can cause a large amount of memory to be "pinned" by cached inodes,
 * which in turn pin their associated dnodes and dbufs, until the kernel starts
 * reporting memory pressure and requests OpenZFS release some memory (see
 * zfs_prune()).
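 *
 * (zfs_delete_inode is exported as a module parameter; assuming the usual
 * OpenZFS module parameter layout it should be tunable at runtime, e.g.
 * "echo 1 > /sys/module/zfs/parameters/zfs_delete_inode".)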
 *
 * When set to 1, we call generic_delete_inode(), which always returns "destroy
 * immediately", resulting in inodes being destroyed immediately, releasing
 * their associated dnodes and dbufs back to the dbuf cache and the ARC, where
 * they can be evicted as normal.
 *
 * Note that the "last reference" doesn't always mean the last _userspace_
 * reference; the dentry cache also holds a reference, so "busy" inodes will
 * still be kept alive that way (subject to dcache tuning).
 */
static int
zpl_drop_inode(struct inode *ip)
{
	if (zfs_delete_inode)
		return (generic_delete_inode(ip));
	return (generic_drop_inode(ip));
}

/*
 * The ->evict_inode() callback must minimally truncate the inode pages,
 * and call clear_inode(). For 2.6.35 and later kernels this will
 * simply update the inode state, with the sync occurring before the
 * truncate in evict(). For earlier kernels clear_inode() maps to
 * end_writeback() which is responsible for completing all outstanding
 * write back. In either case, once this is done it is safe to cleanup
 * any remaining inode specific data via zfs_inactive().
 */
static void
zpl_evict_inode(struct inode *ip)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	truncate_setsize(ip, 0);
	clear_inode(ip);
	zfs_inactive(ip);
	spl_fstrans_unmark(cookie);
}

static void
zpl_put_super(struct super_block *sb)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_umount(sb);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);
}

/*
 * zfs_sync() is the underlying implementation for the sync(2) and syncfs(2)
 * syscalls, via sb->s_op->sync_fs().
 *
 * Before kernel 5.17 (torvalds/linux@5679897eb104), syncfs() ->
 * sync_filesystem() would ignore the return from sync_fs(), instead only
 * considering the error from syncing the underlying block device (sb->s_dev).
 * Since OpenZFS doesn't _have_ an underlying block device, there's no way for
 * us to report a sync error directly.
 *
 * However, in 5.8 (torvalds/linux@735e4ae5ba28) the superblock gained an extra
 * error store `s_wb_err`, to carry errors seen on page writeback since the
 * last call to syncfs(). If sync_filesystem() does not return an error, any
 * existing writeback error on the superblock will be used instead (and cleared
 * either way). We don't use this (page writeback is a different thing for us),
 * so for 5.8-5.17 we can use that instead to get syncfs() to return the error.
 *
 * Before 5.8, we have no other good options - no matter what happens, the
 * userspace program will be told the call has succeeded, and so we must make
 * it so. Therefore, when we are asked to wait for sync to complete (wait ==
 * 1), if zfs_sync() has returned an error we have no choice but to block,
 * regardless of the reason.
 *
 * The 5.17 change was backported to the 5.10, 5.15 and 5.16 series, and likely
 * to some vendor kernels. Meanwhile, s_wb_err is still in use in 6.15 (the
 * mainline Linux series at time of writing), and has likely been backported to
 * vendor kernels before 5.8. We don't really want to use a workaround when we
 * don't have to, but we can't really detect whether or not sync_filesystem()
 * will return our errors (without a difficult runtime test anyway).
 * So, we use a static version check: any kernel reporting its version as 5.17
 * or later will use a direct error return; otherwise, we use s_wb_err if it
 * was detected at configure time (5.8-5.16 + vendor backports), and if that is
 * unavailable, we block to ensure the correct semantics.
 *
 * See https://github.com/openzfs/zfs/issues/17416 for further discussion.
 */
static int
zpl_sync_fs(struct super_block *sb, int wait)
{
	fstrans_cookie_t cookie;
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_sync(sb, wait, cr);

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)
#ifdef HAVE_SUPER_BLOCK_S_WB_ERR
	if (error && wait)
		errseq_set(&sb->s_wb_err, error);
#else
	if (error && wait) {
		zfsvfs_t *zfsvfs = sb->s_fs_info;
		ASSERT3P(zfsvfs, !=, NULL);
		if (zfs_enter(zfsvfs, FTAG) == 0) {
			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
			zfs_exit(zfsvfs, FTAG);
			error = 0;
		}
	}
#endif
#endif /* < 5.17.0 */

	spl_fstrans_unmark(cookie);
	crfree(cr);

	ASSERT3S(error, <=, 0);
	return (error);
}

static int
zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_statvfs(dentry->d_inode, statp);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	/*
	 * If required by a 32-bit system call, dynamically scale the
	 * block size up to 16MiB and decrease the block counts. This
	 * allows for a maximum size of 64EiB to be reported. The file
	 * counts must be artificially capped at 2^32-1.
	 */
	if (unlikely(zpl_is_32bit_api())) {
		while (statp->f_blocks > UINT32_MAX &&
		    statp->f_bsize < SPA_MAXBLOCKSIZE) {
			statp->f_frsize <<= 1;
			statp->f_bsize <<= 1;

			statp->f_blocks >>= 1;
			statp->f_bfree >>= 1;
			statp->f_bavail >>= 1;
		}

		uint64_t usedobjs = statp->f_files - statp->f_ffree;
		statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
		statp->f_files = statp->f_ffree + usedobjs;
	}

	return (error);
}

static int
zpl_remount_fs(struct super_block *sb, int *flags, char *data)
{
	zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_remount(sb, flags, &zm);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	int error;
	if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
		return (error);

	char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	dmu_objset_name(zfsvfs->z_os, fsname);

	for (int i = 0; fsname[i] != 0; i++) {
		/*
		 * Spaces in the dataset name must be converted to their
		 * octal escape sequence for getmntent(3) to correctly
		 * parse the fsname portion of /proc/self/mounts.
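		 *
		 * For example, a hypothetical dataset named "tank/my data"
		 * would be emitted here as "tank/my\040data", which
		 * getmntent(3) decodes back to a literal space.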
		 */
		if (fsname[i] == ' ') {
			seq_puts(seq, "\\040");
		} else {
			seq_putc(seq, fsname[i]);
		}
	}

	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);

	zpl_exit(zfsvfs, FTAG);

	return (0);
}

static int
zpl_show_devname(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
}

static int
__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	seq_printf(seq, ",%s",
	    zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");

#ifdef CONFIG_FS_POSIX_ACL
	switch (zfsvfs->z_acl_type) {
	case ZFS_ACLTYPE_POSIX:
		seq_puts(seq, ",posixacl");
		break;
	default:
		seq_puts(seq, ",noacl");
		break;
	}
#endif /* CONFIG_FS_POSIX_ACL */

	switch (zfsvfs->z_case) {
	case ZFS_CASE_SENSITIVE:
		seq_puts(seq, ",casesensitive");
		break;
	case ZFS_CASE_INSENSITIVE:
		seq_puts(seq, ",caseinsensitive");
		break;
	default:
		seq_puts(seq, ",casemixed");
		break;
	}

	return (0);
}

static int
zpl_show_options(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_options(seq, root->d_sb->s_fs_info));
}

static int
zpl_fill_super(struct super_block *sb, void *data, int silent)
{
	zfs_mnt_t *zm = (zfs_mnt_t *)data;
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_domount(sb, zm, silent);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_test_super(struct super_block *s, void *data)
{
	zfsvfs_t *zfsvfs = s->s_fs_info;
	objset_t *os = data;
	/*
	 * If the os doesn't match the z_os in the super_block, assume it is
	 * not a match. Matching would imply a multimount of a dataset. It is
	 * possible that during a multimount, there is a simultaneous operation
	 * that changes the z_os, e.g., rollback, where the match will be
	 * missed, but in that case the user will get an EBUSY.
	 */
	return (zfsvfs != NULL && os == zfsvfs->z_os);
}

static struct super_block *
zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
{
	struct super_block *s;
	objset_t *os;
	boolean_t issnap = B_FALSE;
	int err;

	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
	if (err)
		return (ERR_PTR(-err));

	/*
	 * The dsl pool lock must be released prior to calling sget().
	 * It is possible sget() may block on the lock in grab_super()
	 * while deactivate_super() holds that same lock and waits for
	 * a txg sync. If the dsl_pool lock is held over sget()
	 * this can prevent the pool sync and cause a deadlock.
	 */
	dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
	dsl_pool_rele(dmu_objset_pool(os), FTAG);

	s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);

	/*
	 * Recheck with the lock held to prevent mounting the wrong dataset
	 * since z_os can be stale when the teardown lock is held.
	 *
	 * We can't do this in zpl_test_super since it is called under a
	 * spinlock and the s_umount lock is not held there, so it would race
	 * with zfs_umount and zfsvfs could be freed.
	 */
	if (!IS_ERR(s) && s->s_fs_info != NULL) {
		zfsvfs_t *zfsvfs = s->s_fs_info;
		if (zpl_enter(zfsvfs, FTAG) == 0) {
			if (os != zfsvfs->z_os)
				err = -SET_ERROR(EBUSY);
			issnap = zfsvfs->z_issnap;
			zpl_exit(zfsvfs, FTAG);
		} else {
			err = -SET_ERROR(EBUSY);
		}
	}
	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
	dsl_dataset_rele(dmu_objset_ds(os), FTAG);

	if (IS_ERR(s))
		return (ERR_CAST(s));

	if (err) {
		deactivate_locked_super(s);
		return (ERR_PTR(err));
	}

	if (s->s_root == NULL) {
		err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
		if (err) {
			deactivate_locked_super(s);
			return (ERR_PTR(err));
		}
		s->s_flags |= SB_ACTIVE;
	} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
		/*
		 * Skip the read-only check for snapshots, since a snapshot is
		 * always mounted read-only regardless of whether the ro flag
		 * was passed to mount.
		 */
		deactivate_locked_super(s);
		return (ERR_PTR(-EBUSY));
	}

	return (s);
}

static struct dentry *
zpl_mount(struct file_system_type *fs_type, int flags,
    const char *osname, void *data)
{
	zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };

	struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
	if (IS_ERR(sb))
		return (ERR_CAST(sb));

	return (dget(sb->s_root));
}

static void
zpl_kill_sb(struct super_block *sb)
{
	zfs_preumount(sb);
	kill_anon_super(sb);
}

void
zpl_prune_sb(uint64_t nr_to_scan, void *arg)
{
	struct super_block *sb = (struct super_block *)arg;
	int objects = 0;

	/*
	 * Ensure the superblock is not in the process of being torn down.
	 */
#ifdef HAVE_SB_DYING
	if (down_read_trylock(&sb->s_umount)) {
		if (!(sb->s_flags & SB_DYING) && sb->s_root &&
		    (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#else
	if (down_read_trylock(&sb->s_umount)) {
		if (!hlist_unhashed(&sb->s_instances) &&
		    sb->s_root && (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#endif
}

const struct super_operations zpl_super_operations = {
	.alloc_inode		= zpl_inode_alloc,
#ifdef HAVE_SOPS_FREE_INODE
	.free_inode		= zpl_inode_free,
#endif
	.destroy_inode		= zpl_inode_destroy,
	.dirty_inode		= zpl_dirty_inode,
	.write_inode		= NULL,
	.drop_inode		= zpl_drop_inode,
	.evict_inode		= zpl_evict_inode,
	.put_super		= zpl_put_super,
	.sync_fs		= zpl_sync_fs,
	.statfs			= zpl_statfs,
	.remount_fs		= zpl_remount_fs,
	.show_devname		= zpl_show_devname,
	.show_options		= zpl_show_options,
	.show_stats		= NULL,
};

/*
 * ->d_delete() is called when the last reference to a dentry is released. Its
 * return value indicates if the dentry should be destroyed immediately, or
 * retained in the dentry cache.
 *
 * By default (zfs_delete_dentry=0) the kernel will always cache unused
 * entries. Each dentry holds an inode reference, so cached dentries can hold
 * the final inode reference indefinitely, leading to the inode and its related
 * data being pinned (see zpl_drop_inode()).
 *
 * When set to 1, we signal that the dentry should be destroyed immediately and
 * never cached. This reduces memory usage, at the cost of higher overheads to
 * look up a file, as the inode and its underlying data (dnode/dbuf) need to be
 * reloaded and reinflated.
 *
 * Note that userspace does not have direct control over dentry references and
 * reclaim; rather, this is part of the kernel's caching and reclaim subsystems
 * (e.g. vm.vfs_cache_pressure).
 */
static int
zpl_dentry_delete(const struct dentry *dentry)
{
	return (zfs_delete_dentry ? 1 : 0);
}

const struct dentry_operations zpl_dentry_operations = {
	.d_delete	= zpl_dentry_delete,
};

struct file_system_type zpl_fs_type = {
	.owner			= THIS_MODULE,
	.name			= ZFS_DRIVER,
#if defined(HAVE_IDMAP_MNT_API)
	.fs_flags		= FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
#else
	.fs_flags		= FS_USERNS_MOUNT,
#endif
	.mount			= zpl_mount,
	.kill_sb		= zpl_kill_sb,
};

ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW,
	"Delete inodes as soon as the last reference is released.");

ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW,
	"Delete dentries from dentry cache as soon as the last reference is "
	"released.");