// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2023, Datto Inc. All rights reserved.
 * Copyright (c) 2025, Klara, Inc.
 * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
 */


#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_ctldir.h>
#include <sys/zpl.h>
#include <linux/iversion.h>
#include <linux/version.h>
#include <linux/vfs_compat.h>

/*
 * What to do when the last reference to an inode is released. If 0, the kernel
 * will cache it on the superblock. If 1, the inode will be freed immediately.
 * See zpl_drop_inode().
 */
int zfs_delete_inode = 0;

/*
 * What to do when the last reference to a dentry is released. If 0, the kernel
 * will cache it until the entry (file) is destroyed. If 1, the dentry will be
 * marked for cleanup, at which time its inode reference will be released. See
 * zpl_dentry_delete().
 */
int zfs_delete_dentry = 0;

static struct inode *
zpl_inode_alloc(struct super_block *sb)
{
	struct inode *ip;

	VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
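	/* Seed i_version, the inode change counter used by eg NFSv4. */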
	inode_set_iversion(ip, 1);

	return (ip);
}

#ifdef HAVE_SOPS_FREE_INODE
static void
zpl_inode_free(struct inode *ip)
{
	ASSERT0(atomic_read(&ip->i_count));
	zfs_inode_free(ip);
}
#endif

static void
zpl_inode_destroy(struct inode *ip)
{
	ASSERT0(atomic_read(&ip->i_count));
	zfs_inode_destroy(ip);
}

/*
 * Called from __mark_inode_dirty() to reflect that something in the
 * inode has changed.  We use it to ensure the znode system attributes
 * are always strictly up to date with respect to the inode.
 */
static void
zpl_dirty_inode(struct inode *ip, int flags)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	zfs_dirty_inode(ip, flags);
	spl_fstrans_unmark(cookie);
}

/*
 * ->drop_inode() is called when the last reference to an inode is released.
 * Its return value indicates if the inode should be destroyed immediately, or
 * cached on the superblock structure.
 *
 * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns
 * "destroy immediately" if the inode is unhashed and has no links (roughly: no
 * longer exists on disk). On datasets with millions of rarely-accessed files,
 * this can cause a large amount of memory to be "pinned" by cached inodes,
 * which in turn pin their associated dnodes and dbufs, until the kernel starts
 * reporting memory pressure and requests that OpenZFS release some memory (see
 * zfs_prune()).
 *
 * When set to 1, we call generic_delete_inode(), which always returns "destroy
 * immediately", resulting in inodes being destroyed immediately, releasing
 * their associated dnodes and dbufs to the dbuf cache and the ARC to be
 * evicted as normal.
 *
 * Note that the "last reference" doesn't always mean the last _userspace_
 * reference; the dentry cache also holds a reference, so "busy" inodes will
 * still be kept alive that way (subject to dcache tuning).
 */
static int
zpl_drop_inode(struct inode *ip)
{
	if (zfs_delete_inode)
		return (generic_delete_inode(ip));
	return (generic_drop_inode(ip));
}

/*
 * The ->evict_inode() callback must minimally truncate the inode pages,
 * and call clear_inode().  For 2.6.35 and later kernels this will
 * simply update the inode state, with the sync occurring before the
 * truncate in evict().  For earlier kernels clear_inode() maps to
 * end_writeback() which is responsible for completing all outstanding
 * write back.  In either case, once this is done it is safe to clean up
 * any remaining inode specific data via zfs_inactive().
 */
static void
zpl_evict_inode(struct inode *ip)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	truncate_setsize(ip, 0);
	clear_inode(ip);
	zfs_inactive(ip);
	spl_fstrans_unmark(cookie);
}

static void
zpl_put_super(struct super_block *sb)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_umount(sb);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);
}

/*
 * zfs_sync() is the underlying implementation for the sync(2) and syncfs(2)
 * syscalls, via sb->s_op->sync_fs().
 *
 * Before kernel 5.17 (torvalds/linux@5679897eb104), syncfs() ->
 * sync_filesystem() would ignore the return from sync_fs(), instead only
 * considering the error from syncing the underlying block device (sb->s_dev).
 * Since OpenZFS doesn't _have_ an underlying block device, there's no way for
 * us to report a sync error directly.
 *
 * However, in 5.8 (torvalds/linux@735e4ae5ba28) the superblock gained an extra
 * error store `s_wb_err`, to carry errors seen on page writeback since the
 * last call to syncfs(). If sync_filesystem() does not return an error, any
 * existing writeback error on the superblock will be used instead (and cleared
 * either way). We don't use this (page writeback is a different thing for us),
 * so for 5.8-5.17 we can use that instead to get syncfs() to return the error.
 *
 * Before 5.8, we have no other good options - no matter what happens, the
 * userspace program will be told the call has succeeded, and so we must make
 * it so. Therefore, when we are asked to wait for sync to complete (wait ==
 * 1), if zfs_sync() has returned an error we have no choice but to block,
 * regardless of the reason.
 *
 * The 5.17 change was backported to the 5.10, 5.15 and 5.16 series, and likely
 * to some vendor kernels. Meanwhile, s_wb_err is still in use in 6.15 (the
 * mainline Linux series at time of writing), and has likely been backported to
 * vendor kernels before 5.8. We don't really want to use a workaround when we
 * don't have to, but we can't really detect whether or not sync_filesystem()
 * will return our errors (without a difficult runtime test anyway). So, we use
 * a static version check: any kernel reporting its version as 5.17+ will use a
 * direct error return; otherwise, we'll use s_wb_err if it was detected at
 * configure time (5.8-5.16 + vendor backports), and if it's unavailable, we
 * will block to ensure the correct semantics.
 *
 * See https://github.com/openzfs/zfs/issues/17416 for further discussion.
 */
static int
zpl_sync_fs(struct super_block *sb, int wait)
{
	fstrans_cookie_t cookie;
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_sync(sb, wait, cr);

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)
#ifdef HAVE_SUPER_BLOCK_S_WB_ERR
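	/*
	 * 5.8-5.16 (and backports): stash the error in the superblock's
	 * writeback error store, where sync_filesystem() will find it and
	 * return it from syncfs().
	 */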
	if (error && wait)
		errseq_set(&sb->s_wb_err, error);
#else
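	/*
	 * Pre-5.8: the error would be swallowed, so the only way to honour
	 * the wait semantics is to block until the pool syncs, then report
	 * success.
	 */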
	if (error && wait) {
		zfsvfs_t *zfsvfs = sb->s_fs_info;
		ASSERT3P(zfsvfs, !=, NULL);
		if (zfs_enter(zfsvfs, FTAG) == 0) {
			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
			zfs_exit(zfsvfs, FTAG);
			error = 0;
		}
	}
#endif
#endif /* < 5.17.0 */

	spl_fstrans_unmark(cookie);
	crfree(cr);

	ASSERT3S(error, <=, 0);
	return (error);
}

static int
zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_statvfs(dentry->d_inode, statp);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	/*
	 * If required by a 32-bit system call, dynamically scale the
	 * block size up to 16MiB and decrease the block counts.  This
	 * allows for a maximum size of 64EiB to be reported.  The file
	 * counts must be artificially capped at 2^32-1.
	 */
	if (unlikely(zpl_is_32bit_api())) {
		while (statp->f_blocks > UINT32_MAX &&
		    statp->f_bsize < SPA_MAXBLOCKSIZE) {
			statp->f_frsize <<= 1;
			statp->f_bsize <<= 1;

			statp->f_blocks >>= 1;
			statp->f_bfree >>= 1;
			statp->f_bavail >>= 1;
		}

		uint64_t usedobjs = statp->f_files - statp->f_ffree;
		statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
		statp->f_files = statp->f_ffree + usedobjs;
	}

	return (error);
}

static int
zpl_remount_fs(struct super_block *sb, int *flags, char *data)
{
	zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_remount(sb, flags, &zm);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	int error;
	if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
		return (error);

	char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	dmu_objset_name(zfsvfs->z_os, fsname);

	for (int i = 0; fsname[i] != 0; i++) {
		/*
		 * Spaces in the dataset name must be converted to their
		 * octal escape sequence for getmntent(3) to correctly
		 * parse the fsname portion of /proc/self/mounts.
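		 * For example, a dataset named "tank/a b" must appear
		 * as "tank/a\040b".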
		 */
		if (fsname[i] == ' ') {
			seq_puts(seq, "\\040");
		} else {
			seq_putc(seq, fsname[i]);
		}
	}

	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);

	zpl_exit(zfsvfs, FTAG);

	return (0);
}

static int
zpl_show_devname(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
}

static int
__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	seq_printf(seq, ",%s",
	    zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");

#ifdef CONFIG_FS_POSIX_ACL
	switch (zfsvfs->z_acl_type) {
	case ZFS_ACLTYPE_POSIX:
		seq_puts(seq, ",posixacl");
		break;
	default:
		seq_puts(seq, ",noacl");
		break;
	}
#endif /* CONFIG_FS_POSIX_ACL */

	switch (zfsvfs->z_case) {
	case ZFS_CASE_SENSITIVE:
		seq_puts(seq, ",casesensitive");
		break;
	case ZFS_CASE_INSENSITIVE:
		seq_puts(seq, ",caseinsensitive");
		break;
	default:
		seq_puts(seq, ",casemixed");
		break;
	}

	return (0);
}

static int
zpl_show_options(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_options(seq, root->d_sb->s_fs_info));
}

static int
zpl_fill_super(struct super_block *sb, void *data, int silent)
{
	zfs_mnt_t *zm = (zfs_mnt_t *)data;
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_domount(sb, zm, silent);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_test_super(struct super_block *s, void *data)
{
	zfsvfs_t *zfsvfs = s->s_fs_info;
	objset_t *os = data;
	/*
	 * If the os doesn't match the z_os in the super_block, assume it is
	 * not a match. Matching would imply a multimount of a dataset. It is
	 * possible that during a multimount, there is a simultaneous operation
	 * that changes the z_os, e.g., rollback, where the match will be
	 * missed, but in that case the user will get an EBUSY.
	 */
	return (zfsvfs != NULL && os == zfsvfs->z_os);
}

static struct super_block *
zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
{
	struct super_block *s;
	objset_t *os;
	boolean_t issnap = B_FALSE;
	int err;

	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
	if (err)
		return (ERR_PTR(-err));

	/*
	 * The dsl pool lock must be released prior to calling sget().
	 * It is possible sget() may block on the lock in grab_super()
	 * while deactivate_super() holds that same lock and waits for
	 * a txg sync.  If the dsl_pool lock is held over sget()
	 * this can prevent the pool sync and cause a deadlock.
	 */
	dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
	dsl_pool_rele(dmu_objset_pool(os), FTAG);

	s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);

	/*
	 * Recheck with the lock held to prevent mounting the wrong dataset
	 * since z_os can be stale when the teardown lock is held.
	 *
	 * We can't do this in zpl_test_super since it runs under a spinlock,
	 * and the s_umount lock is not held there, so it would race with
	 * zfs_umount and zfsvfs could be freed.
	 */
	if (!IS_ERR(s) && s->s_fs_info != NULL) {
		zfsvfs_t *zfsvfs = s->s_fs_info;
		if (zpl_enter(zfsvfs, FTAG) == 0) {
			if (os != zfsvfs->z_os)
				err = -SET_ERROR(EBUSY);
			issnap = zfsvfs->z_issnap;
			zpl_exit(zfsvfs, FTAG);
		} else {
			err = -SET_ERROR(EBUSY);
		}
	}
	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
	dsl_dataset_rele(dmu_objset_ds(os), FTAG);

	if (IS_ERR(s))
		return (ERR_CAST(s));

	if (err) {
		deactivate_locked_super(s);
		return (ERR_PTR(err));
	}

	if (s->s_root == NULL) {
		err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
		if (err) {
			deactivate_locked_super(s);
			return (ERR_PTR(err));
		}
		s->s_flags |= SB_ACTIVE;
	} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
		/*
		 * Skip the read-only check for snapshots, since a snapshot is
		 * always read-only regardless of whether the ro flag was
		 * passed to mount.
		 */
		deactivate_locked_super(s);
		return (ERR_PTR(-EBUSY));
	}

	return (s);
}

static struct dentry *
zpl_mount(struct file_system_type *fs_type, int flags,
    const char *osname, void *data)
{
	zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };

	struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
	if (IS_ERR(sb))
		return (ERR_CAST(sb));

	return (dget(sb->s_root));
}

static void
zpl_kill_sb(struct super_block *sb)
{
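	/*
	 * Let ZFS do any pre-unmount cleanup (eg tearing down the .zfs
	 * control directory) before the generic superblock teardown.
	 */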
	zfs_preumount(sb);
	kill_anon_super(sb);
}

void
zpl_prune_sb(uint64_t nr_to_scan, void *arg)
{
	struct super_block *sb = (struct super_block *)arg;
	int objects = 0;

	/*
	 * Ensure the superblock is not in the process of being torn down.
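	 * Newer kernels flag a dying superblock with SB_DYING; on older
	 * kernels we infer teardown from sb->s_instances instead.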
	 */
#ifdef HAVE_SB_DYING
	if (down_read_trylock(&sb->s_umount)) {
		if (!(sb->s_flags & SB_DYING) && sb->s_root &&
		    (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#else
	if (down_read_trylock(&sb->s_umount)) {
		if (!hlist_unhashed(&sb->s_instances) &&
		    sb->s_root && (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#endif
}

const struct super_operations zpl_super_operations = {
	.alloc_inode		= zpl_inode_alloc,
#ifdef HAVE_SOPS_FREE_INODE
	.free_inode		= zpl_inode_free,
#endif
	.destroy_inode		= zpl_inode_destroy,
	.dirty_inode		= zpl_dirty_inode,
	.write_inode		= NULL,
	.drop_inode		= zpl_drop_inode,
	.evict_inode		= zpl_evict_inode,
	.put_super		= zpl_put_super,
	.sync_fs		= zpl_sync_fs,
	.statfs			= zpl_statfs,
	.remount_fs		= zpl_remount_fs,
	.show_devname		= zpl_show_devname,
	.show_options		= zpl_show_options,
	.show_stats		= NULL,
};

/*
 * ->d_delete() is called when the last reference to a dentry is released. Its
 * return value indicates if the dentry should be destroyed immediately, or
 * retained in the dentry cache.
 *
 * By default (zfs_delete_dentry=0) the kernel will always cache unused
 * entries.  Each dentry holds an inode reference, so cached dentries can hold
 * the final inode reference indefinitely, leading to the inode and its related
 * data being pinned (see zpl_drop_inode()).
 *
 * When set to 1, we signal that the dentry should be destroyed immediately and
 * never cached. This reduces memory usage, at the cost of higher overheads to
 * look up a file, as the inode and its underlying data (dnode/dbuf) need to be
 * reloaded and reinflated.
 *
 * Note that userspace does not have direct control over dentry references and
 * reclaim; rather, this is part of the kernel's caching and reclaim subsystems
 * (eg vm.vfs_cache_pressure).
 */
static int
zpl_dentry_delete(const struct dentry *dentry)
{
	return (zfs_delete_dentry ? 1 : 0);
}

const struct dentry_operations zpl_dentry_operations = {
	.d_delete = zpl_dentry_delete,
};

struct file_system_type zpl_fs_type = {
	.owner			= THIS_MODULE,
	.name			= ZFS_DRIVER,
#if defined(HAVE_IDMAP_MNT_API)
	.fs_flags		= FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
#else
	.fs_flags		= FS_USERNS_MOUNT,
#endif
	.mount			= zpl_mount,
	.kill_sb		= zpl_kill_sb,
};

ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW,
	"Delete inodes as soon as the last reference is released.");

ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW,
	"Delete dentries from dentry cache as soon as the last reference is "
	"released.");
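
/*
 * Both tunables are standard module parameters, settable at load time or
 * at runtime (values below are illustrative only), eg:
 *
 *   modprobe zfs zfs_delete_inode=1
 *   echo 1 > /sys/module/zfs/parameters/zfs_delete_dentry
 */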
573