// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2023, Datto Inc. All rights reserved.
 * Copyright (c) 2025, Klara, Inc.
 */


#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_ctldir.h>
#include <sys/zpl.h>
#include <linux/iversion.h>
#include <linux/version.h>

/*
 * What to do when the last reference to an inode is released. If 0, the kernel
 * will cache it on the superblock. If 1, the inode will be freed immediately.
 * See zpl_drop_inode().
 */
int zfs_delete_inode = 0;
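
/*
 * This is exposed as a module parameter (see the ZFS_MODULE_PARAM
 * declarations at the end of this file) and can be toggled at runtime,
 * e.g.:
 *	echo 1 > /sys/module/zfs/parameters/zfs_delete_inode
 */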

/*
 * What to do when the last reference to a dentry is released. If 0, the kernel
 * will cache it until the entry (file) is destroyed. If 1, the dentry will be
 * marked for cleanup, at which time its inode reference will be released. See
 * zpl_dentry_delete().
 */
int zfs_delete_dentry = 0;
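
/*
 * Allocate a new inode, backed by a znode, and seed the inode version
 * counter used by the kernel's i_version machinery.
 */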
static struct inode *
zpl_inode_alloc(struct super_block *sb)
{
	struct inode *ip;

	VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
	inode_set_iversion(ip, 1);

	return (ip);
}
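
/*
 * On kernels providing ->free_inode(), the inode's memory is released
 * here (from an RCU callback); on older kernels ->destroy_inode() below
 * performs the full teardown instead.
 */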
#ifdef HAVE_SOPS_FREE_INODE
static void
zpl_inode_free(struct inode *ip)
{
	ASSERT0(atomic_read(&ip->i_count));
	zfs_inode_free(ip);
}
#endif

static void
zpl_inode_destroy(struct inode *ip)
{
	ASSERT0(atomic_read(&ip->i_count));
	zfs_inode_destroy(ip);
}

/*
 * Called from __mark_inode_dirty() to reflect that something in the
 * inode has changed. We use it to ensure the znode system attributes
 * are always strictly up to date with respect to the inode.
 */
static void
zpl_dirty_inode(struct inode *ip, int flags)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	zfs_dirty_inode(ip, flags);
	spl_fstrans_unmark(cookie);
}

/*
 * ->drop_inode() is called when the last reference to an inode is released.
 * Its return value indicates if the inode should be destroyed immediately, or
 * cached on the superblock structure.
 *
 * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns
 * "destroy immediately" if the inode is unhashed and has no links (roughly: no
 * longer exists on disk). On datasets with millions of rarely-accessed files,
 * this can cause a large amount of memory to be "pinned" by cached inodes,
 * which in turn pin their associated dnodes and dbufs, until the kernel starts
 * reporting memory pressure and requests that OpenZFS release some memory (see
 * zfs_prune()).
 *
 * When set to 1, we call generic_delete_inode(), which always returns "destroy
 * immediately", so inodes are destroyed as soon as their last reference is
 * released, allowing their associated dnodes and dbufs to be returned to the
 * dbuf cache and the ARC and evicted as normal.
 *
 * Note that the "last reference" doesn't always mean the last _userspace_
 * reference; the dentry cache also holds a reference, so "busy" inodes will
 * still be kept alive that way (subject to dcache tuning).
 */
static int
zpl_drop_inode(struct inode *ip)
{
	if (zfs_delete_inode)
		return (generic_delete_inode(ip));
	return (generic_drop_inode(ip));
}

/*
 * The ->evict_inode() callback must minimally truncate the inode pages,
 * and call clear_inode(). For 2.6.35 and later kernels this will
 * simply update the inode state, with the sync occurring before the
 * truncate in evict(). For earlier kernels clear_inode() maps to
 * end_writeback() which is responsible for completing all outstanding
 * write back. In either case, once this is done it is safe to clean up
 * any remaining inode specific data via zfs_inactive().
 */
static void
zpl_evict_inode(struct inode *ip)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	truncate_setsize(ip, 0);
	clear_inode(ip);
	zfs_inactive(ip);
	spl_fstrans_unmark(cookie);
}
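
/*
 * Called by the VFS when the superblock is released at unmount; tears
 * the mount down via zfs_umount().
 */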
static void
zpl_put_super(struct super_block *sb)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_umount(sb);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);
}

/*
 * zfs_sync() is the underlying implementation for the sync(2) and syncfs(2)
 * syscalls, via sb->s_op->sync_fs().
 *
 * Before kernel 5.17 (torvalds/linux@5679897eb104), syncfs() ->
 * sync_filesystem() would ignore the return from sync_fs(), instead only
 * considering the error from syncing the underlying block device (sb->s_dev).
 * Since OpenZFS doesn't _have_ an underlying block device, there's no way for
 * us to report a sync error directly.
 *
 * However, in 5.8 (torvalds/linux@735e4ae5ba28) the superblock gained an extra
 * error store `s_wb_err`, to carry errors seen on page writeback since the
 * last call to syncfs(). If sync_filesystem() does not return an error, any
 * existing writeback error on the superblock will be used instead (and cleared
 * either way). We don't otherwise use s_wb_err (page writeback is a different
 * thing for us), so on 5.8-5.17 we can set it to get syncfs() to return the
 * error.
 *
 * Before 5.8, we have no other good options - no matter what happens, the
 * userspace program will be told the call has succeeded, and so we must make
 * it so. Therefore, when we are asked to wait for sync to complete (wait ==
 * 1), if zfs_sync() has returned an error we have no choice but to block,
 * regardless of the reason.
 *
 * The 5.17 change was backported to the 5.10, 5.15 and 5.16 series, and likely
 * to some vendor kernels. Meanwhile, s_wb_err is still in use in 6.15 (the
 * mainline Linux series at time of writing), and has likely been backported to
 * vendor kernels before 5.8. We don't really want to use a workaround when we
 * don't have to, but we can't really detect whether or not sync_filesystem()
 * will return our errors (without a difficult runtime test anyway). So, we use
 * a static version check: any kernel reporting its version as 5.17+ will use a
 * direct error return; otherwise, we'll use s_wb_err if it was detected at
 * configure time (5.8-5.16 + vendor backports), and if it's unavailable, we
 * will block to ensure the correct semantics.
 *
 * See https://github.com/openzfs/zfs/issues/17416 for further discussion.
 */
static int
zpl_sync_fs(struct super_block *sb, int wait)
{
	fstrans_cookie_t cookie;
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_sync(sb, wait, cr);

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)
#ifdef HAVE_SUPER_BLOCK_S_WB_ERR
	if (error && wait)
		errseq_set(&sb->s_wb_err, error);
#else
	if (error && wait) {
		zfsvfs_t *zfsvfs = sb->s_fs_info;
		ASSERT3P(zfsvfs, !=, NULL);
		if (zfs_enter(zfsvfs, FTAG) == 0) {
			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
			zfs_exit(zfsvfs, FTAG);
			error = 0;
		}
	}
#endif
#endif /* < 5.17.0 */

	spl_fstrans_unmark(cookie);
	crfree(cr);

	ASSERT3S(error, <=, 0);
	return (error);
}

static int
zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_statvfs(dentry->d_inode, statp);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	/*
	 * If required by a 32-bit system call, dynamically scale the
	 * block size up to 16MiB and decrease the block counts. This
	 * allows for a maximum size of 64PiB to be reported. The file
	 * counts must be artificially capped at 2^32-1.
	 */
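	/*
	 * For example, a 1 PiB filesystem reporting a 512-byte f_bsize
	 * would have f_blocks = 2^41, which overflows 32 bits; after ten
	 * iterations f_bsize is 512 KiB and f_blocks is 2^31, which fits.
	 */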
	if (unlikely(zpl_is_32bit_api())) {
		while (statp->f_blocks > UINT32_MAX &&
		    statp->f_bsize < SPA_MAXBLOCKSIZE) {
			statp->f_frsize <<= 1;
			statp->f_bsize <<= 1;

			statp->f_blocks >>= 1;
			statp->f_bfree >>= 1;
			statp->f_bavail >>= 1;
		}

		uint64_t usedobjs = statp->f_files - statp->f_ffree;
		statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
		statp->f_files = statp->f_ffree + usedobjs;
	}

	return (error);
}
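
/*
 * Handle "mount -o remount": pass the updated option string down to
 * zfs_remount() to be applied to the existing superblock.
 */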
static int
zpl_remount_fs(struct super_block *sb, int *flags, char *data)
{
	zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_remount(sb, flags, &zm);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	int error;
	if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
		return (error);

	char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	dmu_objset_name(zfsvfs->z_os, fsname);

	for (int i = 0; fsname[i] != 0; i++) {
		/*
		 * Spaces in the dataset name must be converted to their
		 * octal escape sequence for getmntent(3) to correctly
		 * parse the fsname portion of /proc/self/mounts.
		 */
		if (fsname[i] == ' ') {
			seq_puts(seq, "\\040");
		} else {
			seq_putc(seq, fsname[i]);
		}
	}

	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);

	zpl_exit(zfsvfs, FTAG);

	return (0);
}
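
/*
 * For example, a dataset named "tank/my files" is emitted as
 * "tank/my\040files" in /proc/self/mounts.
 */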

static int
zpl_show_devname(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
}

static int
__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	seq_printf(seq, ",%s",
	    zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");

#ifdef CONFIG_FS_POSIX_ACL
	switch (zfsvfs->z_acl_type) {
	case ZFS_ACLTYPE_POSIX:
		seq_puts(seq, ",posixacl");
		break;
	default:
		seq_puts(seq, ",noacl");
		break;
	}
#endif /* CONFIG_FS_POSIX_ACL */

	switch (zfsvfs->z_case) {
	case ZFS_CASE_SENSITIVE:
		seq_puts(seq, ",casesensitive");
		break;
	case ZFS_CASE_INSENSITIVE:
		seq_puts(seq, ",caseinsensitive");
		break;
	default:
		seq_puts(seq, ",casemixed");
		break;
	}

	return (0);
}
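
/*
 * For a default dataset on a kernel with POSIX ACL support, the options
 * emitted above would read ",xattr,noacl,casesensitive".
 */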
static int
zpl_show_options(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_options(seq, root->d_sb->s_fs_info));
}

static int
zpl_fill_super(struct super_block *sb, void *data, int silent)
{
	zfs_mnt_t *zm = (zfs_mnt_t *)data;
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_domount(sb, zm, silent);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_test_super(struct super_block *s, void *data)
{
	zfsvfs_t *zfsvfs = s->s_fs_info;
	objset_t *os = data;
	/*
	 * If the os doesn't match the z_os in the super_block, assume it is
	 * not a match. Matching would imply a multimount of a dataset. It is
	 * possible that during a multimount, there is a simultaneous operation
	 * that changes the z_os, e.g., rollback, where the match will be
	 * missed, but in that case the user will get an EBUSY.
	 */
	return (zfsvfs != NULL && os == zfsvfs->z_os);
}
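
/*
 * Core mount path: hold the objset named in zm->mnt_osname, then find or
 * create a matching superblock via sget(), filling a fresh superblock
 * with zpl_fill_super() or returning EBUSY for a conflicting mount.
 */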
static struct super_block *
zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
{
	struct super_block *s;
	objset_t *os;
	boolean_t issnap = B_FALSE;
	int err;

	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
	if (err)
		return (ERR_PTR(-err));

	/*
	 * The dsl pool lock must be released prior to calling sget().
	 * It is possible sget() may block on the lock in grab_super()
	 * while deactivate_super() holds that same lock and waits for
	 * a txg sync. If the dsl_pool lock is held over sget()
	 * this can prevent the pool sync and cause a deadlock.
	 */
	dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
	dsl_pool_rele(dmu_objset_pool(os), FTAG);

	s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);

	/*
	 * Recheck with the lock held to prevent mounting the wrong dataset
	 * since z_os can be stale when the teardown lock is held.
	 *
	 * We can't do this in zpl_test_super, since that runs under a
	 * spinlock without holding the s_umount lock, so it would race with
	 * zfs_umount and zfsvfs could be freed.
	 */
	if (!IS_ERR(s) && s->s_fs_info != NULL) {
		zfsvfs_t *zfsvfs = s->s_fs_info;
		if (zpl_enter(zfsvfs, FTAG) == 0) {
			if (os != zfsvfs->z_os)
				err = -SET_ERROR(EBUSY);
			issnap = zfsvfs->z_issnap;
			zpl_exit(zfsvfs, FTAG);
		} else {
			err = -SET_ERROR(EBUSY);
		}
	}
	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
	dsl_dataset_rele(dmu_objset_ds(os), FTAG);

	if (IS_ERR(s))
		return (ERR_CAST(s));

	if (err) {
		deactivate_locked_super(s);
		return (ERR_PTR(err));
	}

	if (s->s_root == NULL) {
		err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
		if (err) {
			deactivate_locked_super(s);
			return (ERR_PTR(err));
		}
		s->s_flags |= SB_ACTIVE;
	} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
		/*
		 * Skip the read-only check for snapshots, since a snapshot
		 * is always read-only regardless of whether the ro flag was
		 * passed to mount.
		 */
		deactivate_locked_super(s);
		return (ERR_PTR(-EBUSY));
	}

	return (s);
}

static struct dentry *
zpl_mount(struct file_system_type *fs_type, int flags,
    const char *osname, void *data)
{
	zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };

	struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
	if (IS_ERR(sb))
		return (ERR_CAST(sb));

	return (dget(sb->s_root));
}
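
/*
 * Called by the VFS to tear down the superblock. zfs_preumount() handles
 * anything that must happen before the generic teardown, such as shutting
 * down the .zfs control directory.
 */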
static void
zpl_kill_sb(struct super_block *sb)
{
	zfs_preumount(sb);
	kill_anon_super(sb);
}
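
/*
 * Invoked under memory pressure to release cached dentries and inodes
 * held by this superblock; nr_to_scan suggests how many objects to try
 * to free (see zfs_prune()).
 */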
void
zpl_prune_sb(uint64_t nr_to_scan, void *arg)
{
	struct super_block *sb = (struct super_block *)arg;
	int objects = 0;

	/*
	 * Ensure the superblock is not in the process of being torn down.
	 */
#ifdef HAVE_SB_DYING
	if (down_read_trylock(&sb->s_umount)) {
		if (!(sb->s_flags & SB_DYING) && sb->s_root &&
		    (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#else
	if (down_read_trylock(&sb->s_umount)) {
		if (!hlist_unhashed(&sb->s_instances) &&
		    sb->s_root && (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#endif
}

const struct super_operations zpl_super_operations = {
	.alloc_inode		= zpl_inode_alloc,
#ifdef HAVE_SOPS_FREE_INODE
	.free_inode		= zpl_inode_free,
#endif
	.destroy_inode		= zpl_inode_destroy,
	.dirty_inode		= zpl_dirty_inode,
	.write_inode		= NULL,
	.drop_inode		= zpl_drop_inode,
	.evict_inode		= zpl_evict_inode,
	.put_super		= zpl_put_super,
	.sync_fs		= zpl_sync_fs,
	.statfs			= zpl_statfs,
	.remount_fs		= zpl_remount_fs,
	.show_devname		= zpl_show_devname,
	.show_options		= zpl_show_options,
	.show_stats		= NULL,
};

/*
 * ->d_delete() is called when the last reference to a dentry is released. Its
 * return value indicates if the dentry should be destroyed immediately, or
 * retained in the dentry cache.
 *
 * By default (zfs_delete_dentry=0) the kernel will always cache unused
 * entries. Each dentry holds an inode reference, so cached dentries can hold
 * the final inode reference indefinitely, leading to the inode and its related
 * data being pinned (see zpl_drop_inode()).
 *
 * When set to 1, we signal that the dentry should be destroyed immediately and
 * never cached. This reduces memory usage, at the cost of higher overhead to
 * look up a file, as the inode and its underlying data (dnode/dbuf) need to be
 * reloaded and reinflated.
 *
 * Note that userspace does not have direct control over dentry references and
 * reclaim; rather, this is part of the kernel's caching and reclaim subsystems
 * (e.g. vm.vfs_cache_pressure).
 */
static int
zpl_dentry_delete(const struct dentry *dentry)
{
	return (zfs_delete_dentry ? 1 : 0);
}

const struct dentry_operations zpl_dentry_operations = {
	.d_delete	= zpl_dentry_delete,
};

struct file_system_type zpl_fs_type = {
	.owner			= THIS_MODULE,
	.name			= ZFS_DRIVER,
#if defined(HAVE_IDMAP_MNT_API)
	.fs_flags		= FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
#else
	.fs_flags		= FS_USERNS_MOUNT,
#endif
	.mount			= zpl_mount,
	.kill_sb		= zpl_kill_sb,
};

ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW,
	"Delete inodes as soon as the last reference is released.");

ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW,
	"Delete dentries from dentry cache as soon as the last reference is "
	"released.");