// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2023, Datto Inc. All rights reserved.
 * Copyright (c) 2025, Klara, Inc.
 */


#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_ctldir.h>
#include <sys/zpl.h>
#include <linux/iversion.h>
#include <linux/version.h>

/*
 * What to do when the last reference to an inode is released. If 0, the kernel
 * will cache it on the superblock. If 1, the inode will be freed immediately.
 * See zpl_drop_inode().
 */
int zfs_delete_inode = 0;

/*
 * What to do when the last reference to a dentry is released. If 0, the kernel
 * will cache it until the entry (file) is destroyed. If 1, the dentry will be
 * marked for cleanup, at which time its inode reference will be released. See
 * zpl_dentry_delete().
 */
int zfs_delete_dentry = 0;
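
/*
 * Both tunables are exposed as read/write module parameters (see the
 * ZFS_MODULE_PARAM() declarations at the bottom of this file), so they can
 * be inspected and changed at runtime, e.g.:
 *
 *	echo 1 > /sys/module/zfs/parameters/zfs_delete_inode
 */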

static struct inode *
zpl_inode_alloc(struct super_block *sb)
{
	struct inode *ip;

	VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
	inode_set_iversion(ip, 1);

	return (ip);
}

#ifdef HAVE_SOPS_FREE_INODE
static void
zpl_inode_free(struct inode *ip)
{
	ASSERT0(atomic_read(&ip->i_count));
	zfs_inode_free(ip);
}
#endif

static void
zpl_inode_destroy(struct inode *ip)
{
	ASSERT0(atomic_read(&ip->i_count));
	zfs_inode_destroy(ip);
}

/*
 * Called from __mark_inode_dirty() to reflect that something in the
 * inode has changed.  We use it to ensure the znode system attributes
 * are always strictly up to date with respect to the inode.
 */
static void
zpl_dirty_inode(struct inode *ip, int flags)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	zfs_dirty_inode(ip, flags);
	spl_fstrans_unmark(cookie);
}

/*
 * ->drop_inode() is called when the last reference to an inode is released.
 * Its return value indicates if the inode should be destroyed immediately, or
 * cached on the superblock structure.
 *
 * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns
 * "destroy immediately" if the inode is unhashed and has no links (roughly: no
 * longer exists on disk). On datasets with millions of rarely-accessed files,
 * this can cause a large amount of memory to be "pinned" by cached inodes,
 * which in turn pin their associated dnodes and dbufs, until the kernel starts
 * reporting memory pressure and requests OpenZFS release some memory (see
 * zfs_prune()).
 *
 * When set to 1, we call generic_delete_inode(), which always returns
 * "destroy immediately". Inodes are then destroyed as soon as the last
 * reference is released, returning their associated dnodes and dbufs to the
 * dbuf cache and the ARC, to be evicted as normal.
 *
 * Note that the "last reference" doesn't always mean the last _userspace_
 * reference; the dentry cache also holds a reference, so "busy" inodes will
 * still be kept alive that way (subject to dcache tuning).
 */
static int
zpl_drop_inode(struct inode *ip)
{
	if (zfs_delete_inode)
		return (generic_delete_inode(ip));
	return (generic_drop_inode(ip));
}

/*
 * The ->evict_inode() callback must minimally truncate the inode pages,
 * and call clear_inode().  For 2.6.35 and later kernels this will
 * simply update the inode state, with the sync occurring before the
 * truncate in evict().  For earlier kernels clear_inode() maps to
 * end_writeback() which is responsible for completing all outstanding
 * write back.  In either case, once this is done it is safe to cleanup
 * any remaining inode specific data via zfs_inactive().
 */
static void
zpl_evict_inode(struct inode *ip)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	truncate_setsize(ip, 0);
	clear_inode(ip);
	zfs_inactive(ip);
	spl_fstrans_unmark(cookie);
}

static void
zpl_put_super(struct super_block *sb)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_umount(sb);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);
}

/*
 * zfs_sync() is the underlying implementation for the sync(2) and syncfs(2)
 * syscalls, via sb->s_op->sync_fs().
 *
 * Before kernel 5.17 (torvalds/linux@5679897eb104), syncfs() ->
 * sync_filesystem() would ignore the return from sync_fs(), instead only
 * considering the error from syncing the underlying block device (sb->s_dev).
 * Since OpenZFS doesn't _have_ an underlying block device, there's no way for
 * us to report a sync error directly.
 *
 * However, in 5.8 (torvalds/linux@735e4ae5ba28) the superblock gained an extra
 * error store `s_wb_err`, to carry errors seen on page writeback since the
 * last call to syncfs(). If sync_filesystem() does not return an error, any
 * existing writeback error on the superblock will be used instead (and cleared
 * either way). We don't use this (page writeback is a different thing for us),
 * so for 5.8-5.17 we can use it instead to get syncfs() to return the error.
 *
 * Before 5.8, we have no other good options - no matter what happens, the
 * userspace program will be told the call has succeeded, and so we must make
 * it so. Therefore, when we are asked to wait for sync to complete (wait ==
 * 1), if zfs_sync() has returned an error we have no choice but to block,
 * regardless of the reason.
 *
 * The 5.17 change was backported to the 5.10, 5.15 and 5.16 series, and likely
 * to some vendor kernels. Meanwhile, s_wb_err is still in use in 6.15 (the
 * mainline Linux series at time of writing), and has likely been backported to
 * vendor kernels before 5.8. We don't really want to use a workaround when we
 * don't have to, but we can't really detect whether or not sync_filesystem()
 * will return our errors (without a difficult runtime test anyway). So, we use
 * a static version check: any kernel reporting its version as 5.17+ will use a
 * direct error return; otherwise, we'll use s_wb_err if it was detected at
 * configure time (5.8-5.16 + vendor backports), and if it's unavailable, we
 * will block to ensure the correct semantics.
 *
 * See https://github.com/openzfs/zfs/issues/17416 for further discussion.
 */
static int
zpl_sync_fs(struct super_block *sb, int wait)
{
	fstrans_cookie_t cookie;
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_sync(sb, wait, cr);

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)
#ifdef HAVE_SUPER_BLOCK_S_WB_ERR
	if (error && wait)
		errseq_set(&sb->s_wb_err, error);
#else
	if (error && wait) {
		zfsvfs_t *zfsvfs = sb->s_fs_info;
		ASSERT3P(zfsvfs, !=, NULL);
		if (zfs_enter(zfsvfs, FTAG) == 0) {
			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
			zfs_exit(zfsvfs, FTAG);
			error = 0;
		}
	}
#endif
#endif /* < 5.17.0 */

	spl_fstrans_unmark(cookie);
	crfree(cr);

	ASSERT3S(error, <=, 0);
	return (error);
}

static int
zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_statvfs(dentry->d_inode, statp);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	/*
	 * If required by a 32-bit system call, dynamically scale the
	 * block size up to 16MiB and decrease the block counts.  This
	 * allows for a maximum size of 64PiB (2^32 blocks of 16MiB each)
	 * to be reported.  The file counts must be artificially capped
	 * at 2^32-1.
	 */
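	/*
	 * (Illustrative example: a 4PiB pool reported with a 4KiB f_bsize
	 * needs 2^40 blocks; ten doublings of the block size gives a 4MiB
	 * f_bsize and 2^30 blocks, which fits in 32 bits.)
	 */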
	if (unlikely(zpl_is_32bit_api())) {
		while (statp->f_blocks > UINT32_MAX &&
		    statp->f_bsize < SPA_MAXBLOCKSIZE) {
			statp->f_frsize <<= 1;
			statp->f_bsize <<= 1;

			statp->f_blocks >>= 1;
			statp->f_bfree >>= 1;
			statp->f_bavail >>= 1;
		}

		uint64_t usedobjs = statp->f_files - statp->f_ffree;
		statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
		statp->f_files = statp->f_ffree + usedobjs;
	}

	return (error);
}

static int
zpl_remount_fs(struct super_block *sb, int *flags, char *data)
{
	zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_remount(sb, flags, &zm);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	int error;
	if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
		return (error);

	char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	dmu_objset_name(zfsvfs->z_os, fsname);

	for (int i = 0; fsname[i] != 0; i++) {
		/*
		 * Spaces in the dataset name must be converted to their
		 * octal escape sequence for getmntent(3) to correctly
		 * parse the fsname portion of /proc/self/mounts.
		 */
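		/*
		 * (Illustrative example: a dataset named "tank/my data"
		 * is emitted here as "tank/my\040data".)
		 */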
		if (fsname[i] == ' ') {
			seq_puts(seq, "\\040");
		} else {
			seq_putc(seq, fsname[i]);
		}
	}

	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);

	zpl_exit(zfsvfs, FTAG);

	return (0);
}

static int
zpl_show_devname(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
}

static int
__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	seq_printf(seq, ",%s",
	    zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");

#ifdef CONFIG_FS_POSIX_ACL
	switch (zfsvfs->z_acl_type) {
	case ZFS_ACLTYPE_POSIX:
		seq_puts(seq, ",posixacl");
		break;
	default:
		seq_puts(seq, ",noacl");
		break;
	}
#endif /* CONFIG_FS_POSIX_ACL */

	switch (zfsvfs->z_case) {
	case ZFS_CASE_SENSITIVE:
		seq_puts(seq, ",casesensitive");
		break;
	case ZFS_CASE_INSENSITIVE:
		seq_puts(seq, ",caseinsensitive");
		break;
	default:
		seq_puts(seq, ",casemixed");
		break;
	}

	return (0);
}

static int
zpl_show_options(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_options(seq, root->d_sb->s_fs_info));
}

static int
zpl_fill_super(struct super_block *sb, void *data, int silent)
{
	zfs_mnt_t *zm = (zfs_mnt_t *)data;
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_domount(sb, zm, silent);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_test_super(struct super_block *s, void *data)
{
	zfsvfs_t *zfsvfs = s->s_fs_info;
	objset_t *os = data;
	/*
	 * If the os doesn't match the z_os in the super_block, assume it is
	 * not a match. Matching would imply a multimount of a dataset. It is
	 * possible that during a multimount, there is a simultaneous operation
	 * that changes the z_os, e.g., rollback, where the match will be
	 * missed, but in that case the user will get an EBUSY.
	 */
	return (zfsvfs != NULL && os == zfsvfs->z_os);
}

static struct super_block *
zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
{
	struct super_block *s;
	objset_t *os;
	boolean_t issnap = B_FALSE;
	int err;

	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
	if (err)
		return (ERR_PTR(-err));

	/*
	 * The dsl pool lock must be released prior to calling sget().
	 * It is possible sget() may block on the lock in grab_super()
	 * while deactivate_super() holds that same lock and waits for
	 * a txg sync.  If the dsl_pool lock is held over sget()
	 * this can prevent the pool sync and cause a deadlock.
	 */
	dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
	dsl_pool_rele(dmu_objset_pool(os), FTAG);

	s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);

	/*
	 * Recheck with the lock held to prevent mounting the wrong dataset
	 * since z_os can be stale when the teardown lock is held.
	 *
	 * We can't do this in zpl_test_super since it's called under a
	 * spinlock, and the s_umount lock is not held there, so it would
	 * race with zfs_umount and zfsvfs could be freed.
	 */
	if (!IS_ERR(s) && s->s_fs_info != NULL) {
		zfsvfs_t *zfsvfs = s->s_fs_info;
		if (zpl_enter(zfsvfs, FTAG) == 0) {
			if (os != zfsvfs->z_os)
				err = -SET_ERROR(EBUSY);
			issnap = zfsvfs->z_issnap;
			zpl_exit(zfsvfs, FTAG);
		} else {
			err = -SET_ERROR(EBUSY);
		}
	}
	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
	dsl_dataset_rele(dmu_objset_ds(os), FTAG);

	if (IS_ERR(s))
		return (ERR_CAST(s));

	if (err) {
		deactivate_locked_super(s);
		return (ERR_PTR(err));
	}

	if (s->s_root == NULL) {
		err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
		if (err) {
			deactivate_locked_super(s);
			return (ERR_PTR(err));
		}
		s->s_flags |= SB_ACTIVE;
	} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
		/*
		 * Skip the read-only check for snapshots, since a snapshot
		 * is always read-only regardless of whether the ro flag was
		 * passed to mount or not.
		 */
		deactivate_locked_super(s);
		return (ERR_PTR(-EBUSY));
	}

	return (s);
}

static struct dentry *
zpl_mount(struct file_system_type *fs_type, int flags,
    const char *osname, void *data)
{
	zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };

	struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
	if (IS_ERR(sb))
		return (ERR_CAST(sb));

	return (dget(sb->s_root));
}

static void
zpl_kill_sb(struct super_block *sb)
{
	zfs_preumount(sb);
	kill_anon_super(sb);
}

void
zpl_prune_sb(uint64_t nr_to_scan, void *arg)
{
	struct super_block *sb = (struct super_block *)arg;
	int objects = 0;

	/*
	 * Ensure the superblock is not in the process of being torn down.
	 */
#ifdef HAVE_SB_DYING
	if (down_read_trylock(&sb->s_umount)) {
		if (!(sb->s_flags & SB_DYING) && sb->s_root &&
		    (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#else
	if (down_read_trylock(&sb->s_umount)) {
		if (!hlist_unhashed(&sb->s_instances) &&
		    sb->s_root && (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#endif
}

const struct super_operations zpl_super_operations = {
	.alloc_inode		= zpl_inode_alloc,
#ifdef HAVE_SOPS_FREE_INODE
	.free_inode		= zpl_inode_free,
#endif
	.destroy_inode		= zpl_inode_destroy,
	.dirty_inode		= zpl_dirty_inode,
	.write_inode		= NULL,
	.drop_inode		= zpl_drop_inode,
	.evict_inode		= zpl_evict_inode,
	.put_super		= zpl_put_super,
	.sync_fs		= zpl_sync_fs,
	.statfs			= zpl_statfs,
	.remount_fs		= zpl_remount_fs,
	.show_devname		= zpl_show_devname,
	.show_options		= zpl_show_options,
	.show_stats		= NULL,
};

/*
 * ->d_delete() is called when the last reference to a dentry is released. Its
 * return value indicates if the dentry should be destroyed immediately, or
 * retained in the dentry cache.
 *
 * By default (zfs_delete_dentry=0) the kernel will always cache unused
 * entries.  Each dentry holds an inode reference, so cached dentries can hold
 * the final inode reference indefinitely, leading to the inode and its related
 * data being pinned (see zpl_drop_inode()).
 *
 * When set to 1, we signal that the dentry should be destroyed immediately and
 * never cached. This reduces memory usage, at the cost of higher overhead to
 * look up a file, as the inode and its underlying data (dnode/dbuf) need to be
 * reloaded and reinflated.
 *
 * Note that userspace does not have direct control over dentry references and
 * reclaim; rather, this is part of the kernel's caching and reclaim subsystems
 * (e.g. vm.vfs_cache_pressure).
 */
static int
zpl_dentry_delete(const struct dentry *dentry)
{
	return (zfs_delete_dentry ? 1 : 0);
}

const struct dentry_operations zpl_dentry_operations = {
	.d_delete = zpl_dentry_delete,
};

struct file_system_type zpl_fs_type = {
	.owner			= THIS_MODULE,
	.name			= ZFS_DRIVER,
#if defined(HAVE_IDMAP_MNT_API)
	.fs_flags		= FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
#else
	.fs_flags		= FS_USERNS_MOUNT,
#endif
	.mount			= zpl_mount,
	.kill_sb		= zpl_kill_sb,
};

ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW,
	"Delete inodes as soon as the last reference is released.");

ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW,
	"Delete dentries from dentry cache as soon as the last reference is "
	"released.");