161145dc2SMartin Matuska // SPDX-License-Identifier: CDDL-1.0
2eda14cbcSMatt Macy /*
3eda14cbcSMatt Macy * CDDL HEADER START
4eda14cbcSMatt Macy *
5eda14cbcSMatt Macy * The contents of this file are subject to the terms of the
6eda14cbcSMatt Macy * Common Development and Distribution License (the "License").
7eda14cbcSMatt Macy * You may not use this file except in compliance with the License.
8eda14cbcSMatt Macy *
9eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0.
11eda14cbcSMatt Macy * See the License for the specific language governing permissions
12eda14cbcSMatt Macy * and limitations under the License.
13eda14cbcSMatt Macy *
14eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each
15eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the
17eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying
18eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner]
19eda14cbcSMatt Macy *
20eda14cbcSMatt Macy * CDDL HEADER END
21eda14cbcSMatt Macy */
22eda14cbcSMatt Macy /*
23eda14cbcSMatt Macy * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
242a58b312SMartin Matuska * Copyright (c) 2023, Datto Inc. All rights reserved.
25113e6074SMartin Matuska * Copyright (c) 2025, Klara, Inc.
26*e6e941e6SMartin Matuska * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
27eda14cbcSMatt Macy */
28eda14cbcSMatt Macy
29eda14cbcSMatt Macy
30eda14cbcSMatt Macy #include <sys/zfs_znode.h>
31eda14cbcSMatt Macy #include <sys/zfs_vfsops.h>
32eda14cbcSMatt Macy #include <sys/zfs_vnops.h>
33eda14cbcSMatt Macy #include <sys/zfs_ctldir.h>
34eda14cbcSMatt Macy #include <sys/zpl.h>
357a7741afSMartin Matuska #include <linux/iversion.h>
363a896071SMartin Matuska #include <linux/version.h>
37*e6e941e6SMartin Matuska #include <linux/vfs_compat.h>
38eda14cbcSMatt Macy
/*
 * Tunable: policy for an inode whose last reference has just been released.
 *   0 - the kernel keeps the inode cached on the superblock (default)
 *   1 - the inode is freed immediately
 * Consulted by zpl_drop_inode().
 */
int zfs_delete_inode = 0;

/*
 * Tunable: policy for a dentry whose last reference has just been released.
 *   0 - the kernel keeps the dentry cached until the entry (file) is
 *       destroyed (default)
 *   1 - the dentry is marked for cleanup, at which time its inode reference
 *       is released as well
 * Consulted by zpl_dentry_delete().
 */
int zfs_delete_dentry = 0;
53eda14cbcSMatt Macy
54eda14cbcSMatt Macy static struct inode *
zpl_inode_alloc(struct super_block * sb)55eda14cbcSMatt Macy zpl_inode_alloc(struct super_block *sb)
56eda14cbcSMatt Macy {
57eda14cbcSMatt Macy struct inode *ip;
58eda14cbcSMatt Macy
59eda14cbcSMatt Macy VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
60eda14cbcSMatt Macy inode_set_iversion(ip, 1);
61eda14cbcSMatt Macy
62eda14cbcSMatt Macy return (ip);
63eda14cbcSMatt Macy }
64eda14cbcSMatt Macy
#ifdef HAVE_SOPS_FREE_INODE
/*
 * ->free_inode() callback, only built when the kernel provides that hook.
 * The inode must no longer be referenced (i_count == 0) when we get here;
 * the actual release is delegated to zfs_inode_free().
 */
static void
zpl_inode_free(struct inode *ip)
{
	/* Nobody may still hold a reference at this point. */
	ASSERT0(atomic_read(&ip->i_count));

	zfs_inode_free(ip);
}
#endif /* HAVE_SOPS_FREE_INODE */
73df58e8b1SMartin Matuska
74eda14cbcSMatt Macy static void
zpl_inode_destroy(struct inode * ip)75eda14cbcSMatt Macy zpl_inode_destroy(struct inode *ip)
76eda14cbcSMatt Macy {
77d0abb9a6SMartin Matuska ASSERT0(atomic_read(&ip->i_count));
78eda14cbcSMatt Macy zfs_inode_destroy(ip);
79eda14cbcSMatt Macy }
80eda14cbcSMatt Macy
81eda14cbcSMatt Macy /*
82eda14cbcSMatt Macy * Called from __mark_inode_dirty() to reflect that something in the
83eda14cbcSMatt Macy * inode has changed. We use it to ensure the znode system attributes
84eda14cbcSMatt Macy * are always strictly update to date with respect to the inode.
85eda14cbcSMatt Macy */
86eda14cbcSMatt Macy static void
zpl_dirty_inode(struct inode * ip,int flags)87eda14cbcSMatt Macy zpl_dirty_inode(struct inode *ip, int flags)
88eda14cbcSMatt Macy {
89eda14cbcSMatt Macy fstrans_cookie_t cookie;
90eda14cbcSMatt Macy
91eda14cbcSMatt Macy cookie = spl_fstrans_mark();
92eda14cbcSMatt Macy zfs_dirty_inode(ip, flags);
93eda14cbcSMatt Macy spl_fstrans_unmark(cookie);
94eda14cbcSMatt Macy }
95eda14cbcSMatt Macy
96eda14cbcSMatt Macy /*
97113e6074SMartin Matuska * ->drop_inode() is called when the last reference to an inode is released.
98113e6074SMartin Matuska * Its return value indicates if the inode should be destroyed immediately, or
99113e6074SMartin Matuska * cached on the superblock structure.
100eda14cbcSMatt Macy *
101113e6074SMartin Matuska * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns
102113e6074SMartin Matuska * "destroy immediately" if the inode is unhashed and has no links (roughly: no
103113e6074SMartin Matuska * longer exists on disk). On datasets with millions of rarely-accessed files,
104113e6074SMartin Matuska * this can cause a large amount of memory to be "pinned" by cached inodes,
105113e6074SMartin Matuska * which in turn pin their associated dnodes and dbufs, until the kernel starts
106113e6074SMartin Matuska * reporting memory pressure and requests OpenZFS release some memory (see
107113e6074SMartin Matuska * zfs_prune()).
108113e6074SMartin Matuska *
109*e6e941e6SMartin Matuska * When set to 1, we call generic_delete_inode(), which always returns "destroy
110113e6074SMartin Matuska * immediately", resulting in inodes being destroyed immediately, releasing
111113e6074SMartin Matuska * their associated dnodes and dbufs to the dbuf cached and the ARC to be
112113e6074SMartin Matuska * evicted as normal.
113113e6074SMartin Matuska *
114113e6074SMartin Matuska * Note that the "last reference" doesn't always mean the last _userspace_
115113e6074SMartin Matuska * reference; the dentry cache also holds a reference, so "busy" inodes will
116113e6074SMartin Matuska * still be kept alive that way (subject to dcache tuning).
117113e6074SMartin Matuska */
118113e6074SMartin Matuska static int
zpl_drop_inode(struct inode * ip)119113e6074SMartin Matuska zpl_drop_inode(struct inode *ip)
120113e6074SMartin Matuska {
121113e6074SMartin Matuska if (zfs_delete_inode)
122113e6074SMartin Matuska return (generic_delete_inode(ip));
123113e6074SMartin Matuska return (generic_drop_inode(ip));
124113e6074SMartin Matuska }
125113e6074SMartin Matuska
126113e6074SMartin Matuska /*
127eda14cbcSMatt Macy * The ->evict_inode() callback must minimally truncate the inode pages,
128eda14cbcSMatt Macy * and call clear_inode(). For 2.6.35 and later kernels this will
129eda14cbcSMatt Macy * simply update the inode state, with the sync occurring before the
130eda14cbcSMatt Macy * truncate in evict(). For earlier kernels clear_inode() maps to
131eda14cbcSMatt Macy * end_writeback() which is responsible for completing all outstanding
132eda14cbcSMatt Macy * write back. In either case, once this is done it is safe to cleanup
133eda14cbcSMatt Macy * any remaining inode specific data via zfs_inactive().
134eda14cbcSMatt Macy * remaining filesystem specific data.
135eda14cbcSMatt Macy */
136eda14cbcSMatt Macy static void
zpl_evict_inode(struct inode * ip)137eda14cbcSMatt Macy zpl_evict_inode(struct inode *ip)
138eda14cbcSMatt Macy {
139eda14cbcSMatt Macy fstrans_cookie_t cookie;
140eda14cbcSMatt Macy
141eda14cbcSMatt Macy cookie = spl_fstrans_mark();
142eda14cbcSMatt Macy truncate_setsize(ip, 0);
143eda14cbcSMatt Macy clear_inode(ip);
144eda14cbcSMatt Macy zfs_inactive(ip);
145eda14cbcSMatt Macy spl_fstrans_unmark(cookie);
146eda14cbcSMatt Macy }
147eda14cbcSMatt Macy
148eda14cbcSMatt Macy static void
zpl_put_super(struct super_block * sb)149eda14cbcSMatt Macy zpl_put_super(struct super_block *sb)
150eda14cbcSMatt Macy {
151eda14cbcSMatt Macy fstrans_cookie_t cookie;
152eda14cbcSMatt Macy int error;
153eda14cbcSMatt Macy
154eda14cbcSMatt Macy cookie = spl_fstrans_mark();
155eda14cbcSMatt Macy error = -zfs_umount(sb);
156eda14cbcSMatt Macy spl_fstrans_unmark(cookie);
157eda14cbcSMatt Macy ASSERT3S(error, <=, 0);
158eda14cbcSMatt Macy }
159eda14cbcSMatt Macy
1603a896071SMartin Matuska /*
1613a896071SMartin Matuska * zfs_sync() is the underlying implementation for the sync(2) and syncfs(2)
1623a896071SMartin Matuska * syscalls, via sb->s_op->sync_fs().
1633a896071SMartin Matuska *
1643a896071SMartin Matuska * Before kernel 5.17 (torvalds/linux@5679897eb104), syncfs() ->
1653a896071SMartin Matuska * sync_filesystem() would ignore the return from sync_fs(), instead only
1663a896071SMartin Matuska * considing the error from syncing the underlying block device (sb->s_dev).
1673a896071SMartin Matuska * Since OpenZFS doesn't _have_ an underlying block device, there's no way for
1683a896071SMartin Matuska * us to report a sync directly.
1693a896071SMartin Matuska *
1703a896071SMartin Matuska * However, in 5.8 (torvalds/linux@735e4ae5ba28) the superblock gained an extra
1713a896071SMartin Matuska * error store `s_wb_err`, to carry errors seen on page writeback since the
1723a896071SMartin Matuska * last call to syncfs(). If sync_filesystem() does not return an error, any
1733a896071SMartin Matuska * existing writeback error on the superblock will be used instead (and cleared
1743a896071SMartin Matuska * either way). We don't use this (page writeback is a different thing for us),
1753a896071SMartin Matuska * so for 5.8-5.17 we can use that instead to get syncfs() to return the error.
1763a896071SMartin Matuska *
1773a896071SMartin Matuska * Before 5.8, we have no other good options - no matter what happens, the
1783a896071SMartin Matuska * userspace program will be told the call has succeeded, and so we must make
1793a896071SMartin Matuska * it so, Therefore, when we are asked to wait for sync to complete (wait ==
1803a896071SMartin Matuska * 1), if zfs_sync() has returned an error we have no choice but to block,
1813a896071SMartin Matuska * regardless of the reason.
1823a896071SMartin Matuska *
1833a896071SMartin Matuska * The 5.17 change was backported to the 5.10, 5.15 and 5.16 series, and likely
1843a896071SMartin Matuska * to some vendor kernels. Meanwhile, s_wb_err is still in use in 6.15 (the
1853a896071SMartin Matuska * mainline Linux series at time of writing), and has likely been backported to
1863a896071SMartin Matuska * vendor kernels before 5.8. We don't really want to use a workaround when we
1873a896071SMartin Matuska * don't have to, but we can't really detect whether or not sync_filesystem()
1883a896071SMartin Matuska * will return our errors (without a difficult runtime test anyway). So, we use
1893a896071SMartin Matuska * a static version check: any kernel reporting its version as 5.17+ will use a
1903a896071SMartin Matuska * direct error return, otherwise, we'll either use s_wb_err if it was detected
1913a896071SMartin Matuska * at configure (5.8-5.16 + vendor backports). If it's unavailable, we will
1923a896071SMartin Matuska * block to ensure the correct semantics.
1933a896071SMartin Matuska *
1943a896071SMartin Matuska * See https://github.com/openzfs/zfs/issues/17416 for further discussion.
1953a896071SMartin Matuska */
196eda14cbcSMatt Macy static int
zpl_sync_fs(struct super_block * sb,int wait)197eda14cbcSMatt Macy zpl_sync_fs(struct super_block *sb, int wait)
198eda14cbcSMatt Macy {
199eda14cbcSMatt Macy fstrans_cookie_t cookie;
200eda14cbcSMatt Macy cred_t *cr = CRED();
201eda14cbcSMatt Macy int error;
202eda14cbcSMatt Macy
203eda14cbcSMatt Macy crhold(cr);
204eda14cbcSMatt Macy cookie = spl_fstrans_mark();
205eda14cbcSMatt Macy error = -zfs_sync(sb, wait, cr);
2063a896071SMartin Matuska
2073a896071SMartin Matuska #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)
2083a896071SMartin Matuska #ifdef HAVE_SUPER_BLOCK_S_WB_ERR
2093a896071SMartin Matuska if (error && wait)
2103a896071SMartin Matuska errseq_set(&sb->s_wb_err, error);
2113a896071SMartin Matuska #else
2123a896071SMartin Matuska if (error && wait) {
2133a896071SMartin Matuska zfsvfs_t *zfsvfs = sb->s_fs_info;
2143a896071SMartin Matuska ASSERT3P(zfsvfs, !=, NULL);
2153a896071SMartin Matuska if (zfs_enter(zfsvfs, FTAG) == 0) {
2163a896071SMartin Matuska txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
2173a896071SMartin Matuska zfs_exit(zfsvfs, FTAG);
2183a896071SMartin Matuska error = 0;
2193a896071SMartin Matuska }
2203a896071SMartin Matuska }
2213a896071SMartin Matuska #endif
2223a896071SMartin Matuska #endif /* < 5.17.0 */
2233a896071SMartin Matuska
224eda14cbcSMatt Macy spl_fstrans_unmark(cookie);
225eda14cbcSMatt Macy crfree(cr);
226eda14cbcSMatt Macy
2273a896071SMartin Matuska ASSERT3S(error, <=, 0);
228eda14cbcSMatt Macy return (error);
229eda14cbcSMatt Macy }
230eda14cbcSMatt Macy
231eda14cbcSMatt Macy static int
zpl_statfs(struct dentry * dentry,struct kstatfs * statp)232eda14cbcSMatt Macy zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
233eda14cbcSMatt Macy {
234eda14cbcSMatt Macy fstrans_cookie_t cookie;
235eda14cbcSMatt Macy int error;
236eda14cbcSMatt Macy
237eda14cbcSMatt Macy cookie = spl_fstrans_mark();
238eda14cbcSMatt Macy error = -zfs_statvfs(dentry->d_inode, statp);
239eda14cbcSMatt Macy spl_fstrans_unmark(cookie);
240eda14cbcSMatt Macy ASSERT3S(error, <=, 0);
241eda14cbcSMatt Macy
242eda14cbcSMatt Macy /*
243eda14cbcSMatt Macy * If required by a 32-bit system call, dynamically scale the
244eda14cbcSMatt Macy * block size up to 16MiB and decrease the block counts. This
245eda14cbcSMatt Macy * allows for a maximum size of 64EiB to be reported. The file
246eda14cbcSMatt Macy * counts must be artificially capped at 2^32-1.
247eda14cbcSMatt Macy */
248eda14cbcSMatt Macy if (unlikely(zpl_is_32bit_api())) {
249eda14cbcSMatt Macy while (statp->f_blocks > UINT32_MAX &&
250eda14cbcSMatt Macy statp->f_bsize < SPA_MAXBLOCKSIZE) {
251eda14cbcSMatt Macy statp->f_frsize <<= 1;
252eda14cbcSMatt Macy statp->f_bsize <<= 1;
253eda14cbcSMatt Macy
254eda14cbcSMatt Macy statp->f_blocks >>= 1;
255eda14cbcSMatt Macy statp->f_bfree >>= 1;
256eda14cbcSMatt Macy statp->f_bavail >>= 1;
257eda14cbcSMatt Macy }
258eda14cbcSMatt Macy
259eda14cbcSMatt Macy uint64_t usedobjs = statp->f_files - statp->f_ffree;
260eda14cbcSMatt Macy statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
261eda14cbcSMatt Macy statp->f_files = statp->f_ffree + usedobjs;
262eda14cbcSMatt Macy }
263eda14cbcSMatt Macy
264eda14cbcSMatt Macy return (error);
265eda14cbcSMatt Macy }
266eda14cbcSMatt Macy
267eda14cbcSMatt Macy static int
zpl_remount_fs(struct super_block * sb,int * flags,char * data)268eda14cbcSMatt Macy zpl_remount_fs(struct super_block *sb, int *flags, char *data)
269eda14cbcSMatt Macy {
270eda14cbcSMatt Macy zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
271eda14cbcSMatt Macy fstrans_cookie_t cookie;
272eda14cbcSMatt Macy int error;
273eda14cbcSMatt Macy
274eda14cbcSMatt Macy cookie = spl_fstrans_mark();
275eda14cbcSMatt Macy error = -zfs_remount(sb, flags, &zm);
276eda14cbcSMatt Macy spl_fstrans_unmark(cookie);
277eda14cbcSMatt Macy ASSERT3S(error, <=, 0);
278eda14cbcSMatt Macy
279eda14cbcSMatt Macy return (error);
280eda14cbcSMatt Macy }
281eda14cbcSMatt Macy
282eda14cbcSMatt Macy static int
__zpl_show_devname(struct seq_file * seq,zfsvfs_t * zfsvfs)283eac7052fSMatt Macy __zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
284eac7052fSMatt Macy {
285c7046f76SMartin Matuska int error;
286c7046f76SMartin Matuska if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
287c7046f76SMartin Matuska return (error);
288eac7052fSMatt Macy
2897877fdebSMatt Macy char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
290eac7052fSMatt Macy dmu_objset_name(zfsvfs->z_os, fsname);
2917877fdebSMatt Macy
2927877fdebSMatt Macy for (int i = 0; fsname[i] != 0; i++) {
2937877fdebSMatt Macy /*
2947877fdebSMatt Macy * Spaces in the dataset name must be converted to their
2957877fdebSMatt Macy * octal escape sequence for getmntent(3) to correctly
2967877fdebSMatt Macy * parse then fsname portion of /proc/self/mounts.
2977877fdebSMatt Macy */
2987877fdebSMatt Macy if (fsname[i] == ' ') {
2997877fdebSMatt Macy seq_puts(seq, "\\040");
3007877fdebSMatt Macy } else {
3017877fdebSMatt Macy seq_putc(seq, fsname[i]);
3027877fdebSMatt Macy }
3037877fdebSMatt Macy }
3047877fdebSMatt Macy
305eac7052fSMatt Macy kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);
3067877fdebSMatt Macy
307c7046f76SMartin Matuska zpl_exit(zfsvfs, FTAG);
308eac7052fSMatt Macy
309eac7052fSMatt Macy return (0);
310eac7052fSMatt Macy }
311eac7052fSMatt Macy
312eac7052fSMatt Macy static int
zpl_show_devname(struct seq_file * seq,struct dentry * root)313eac7052fSMatt Macy zpl_show_devname(struct seq_file *seq, struct dentry *root)
314eac7052fSMatt Macy {
315eac7052fSMatt Macy return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
316eac7052fSMatt Macy }
317eac7052fSMatt Macy
318eac7052fSMatt Macy static int
__zpl_show_options(struct seq_file * seq,zfsvfs_t * zfsvfs)319eda14cbcSMatt Macy __zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
320eda14cbcSMatt Macy {
321eda14cbcSMatt Macy seq_printf(seq, ",%s",
322eda14cbcSMatt Macy zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");
323eda14cbcSMatt Macy
324eda14cbcSMatt Macy #ifdef CONFIG_FS_POSIX_ACL
325eda14cbcSMatt Macy switch (zfsvfs->z_acl_type) {
3262c48331dSMatt Macy case ZFS_ACLTYPE_POSIX:
327eda14cbcSMatt Macy seq_puts(seq, ",posixacl");
328eda14cbcSMatt Macy break;
329eda14cbcSMatt Macy default:
330eda14cbcSMatt Macy seq_puts(seq, ",noacl");
331eda14cbcSMatt Macy break;
332eda14cbcSMatt Macy }
333eda14cbcSMatt Macy #endif /* CONFIG_FS_POSIX_ACL */
334eda14cbcSMatt Macy
335271171e0SMartin Matuska switch (zfsvfs->z_case) {
336271171e0SMartin Matuska case ZFS_CASE_SENSITIVE:
337271171e0SMartin Matuska seq_puts(seq, ",casesensitive");
338271171e0SMartin Matuska break;
339271171e0SMartin Matuska case ZFS_CASE_INSENSITIVE:
340271171e0SMartin Matuska seq_puts(seq, ",caseinsensitive");
341271171e0SMartin Matuska break;
342271171e0SMartin Matuska default:
343271171e0SMartin Matuska seq_puts(seq, ",casemixed");
344271171e0SMartin Matuska break;
345271171e0SMartin Matuska }
346271171e0SMartin Matuska
347eda14cbcSMatt Macy return (0);
348eda14cbcSMatt Macy }
349eda14cbcSMatt Macy
350eda14cbcSMatt Macy static int
zpl_show_options(struct seq_file * seq,struct dentry * root)351eda14cbcSMatt Macy zpl_show_options(struct seq_file *seq, struct dentry *root)
352eda14cbcSMatt Macy {
353eda14cbcSMatt Macy return (__zpl_show_options(seq, root->d_sb->s_fs_info));
354eda14cbcSMatt Macy }
355eda14cbcSMatt Macy
356eda14cbcSMatt Macy static int
zpl_fill_super(struct super_block * sb,void * data,int silent)357eda14cbcSMatt Macy zpl_fill_super(struct super_block *sb, void *data, int silent)
358eda14cbcSMatt Macy {
359eda14cbcSMatt Macy zfs_mnt_t *zm = (zfs_mnt_t *)data;
360eda14cbcSMatt Macy fstrans_cookie_t cookie;
361eda14cbcSMatt Macy int error;
362eda14cbcSMatt Macy
363eda14cbcSMatt Macy cookie = spl_fstrans_mark();
364eda14cbcSMatt Macy error = -zfs_domount(sb, zm, silent);
365eda14cbcSMatt Macy spl_fstrans_unmark(cookie);
366eda14cbcSMatt Macy ASSERT3S(error, <=, 0);
367eda14cbcSMatt Macy
368eda14cbcSMatt Macy return (error);
369eda14cbcSMatt Macy }
370eda14cbcSMatt Macy
371eda14cbcSMatt Macy static int
zpl_test_super(struct super_block * s,void * data)372eda14cbcSMatt Macy zpl_test_super(struct super_block *s, void *data)
373eda14cbcSMatt Macy {
374eda14cbcSMatt Macy zfsvfs_t *zfsvfs = s->s_fs_info;
375eda14cbcSMatt Macy objset_t *os = data;
3762a58b312SMartin Matuska /*
3772a58b312SMartin Matuska * If the os doesn't match the z_os in the super_block, assume it is
3782a58b312SMartin Matuska * not a match. Matching would imply a multimount of a dataset. It is
3792a58b312SMartin Matuska * possible that during a multimount, there is a simultaneous operation
3802a58b312SMartin Matuska * that changes the z_os, e.g., rollback, where the match will be
3812a58b312SMartin Matuska * missed, but in that case the user will get an EBUSY.
3822a58b312SMartin Matuska */
383315ee00fSMartin Matuska return (zfsvfs != NULL && os == zfsvfs->z_os);
384eda14cbcSMatt Macy }
385eda14cbcSMatt Macy
386eda14cbcSMatt Macy static struct super_block *
zpl_mount_impl(struct file_system_type * fs_type,int flags,zfs_mnt_t * zm)387eda14cbcSMatt Macy zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
388eda14cbcSMatt Macy {
389eda14cbcSMatt Macy struct super_block *s;
390eda14cbcSMatt Macy objset_t *os;
391e2df9bb4SMartin Matuska boolean_t issnap = B_FALSE;
392eda14cbcSMatt Macy int err;
393eda14cbcSMatt Macy
394eda14cbcSMatt Macy err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
395eda14cbcSMatt Macy if (err)
396eda14cbcSMatt Macy return (ERR_PTR(-err));
397eda14cbcSMatt Macy
398eda14cbcSMatt Macy /*
399eda14cbcSMatt Macy * The dsl pool lock must be released prior to calling sget().
400eda14cbcSMatt Macy * It is possible sget() may block on the lock in grab_super()
401eda14cbcSMatt Macy * while deactivate_super() holds that same lock and waits for
402eda14cbcSMatt Macy * a txg sync. If the dsl_pool lock is held over sget()
403eda14cbcSMatt Macy * this can prevent the pool sync and cause a deadlock.
404eda14cbcSMatt Macy */
4052c48331dSMatt Macy dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
406eda14cbcSMatt Macy dsl_pool_rele(dmu_objset_pool(os), FTAG);
4072c48331dSMatt Macy
408eda14cbcSMatt Macy s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);
4092c48331dSMatt Macy
410315ee00fSMartin Matuska /*
411315ee00fSMartin Matuska * Recheck with the lock held to prevent mounting the wrong dataset
412315ee00fSMartin Matuska * since z_os can be stale when the teardown lock is held.
413315ee00fSMartin Matuska *
414315ee00fSMartin Matuska * We can't do this in zpl_test_super in since it's under spinlock and
415315ee00fSMartin Matuska * also s_umount lock is not held there so it would race with
416315ee00fSMartin Matuska * zfs_umount and zfsvfs can be freed.
417315ee00fSMartin Matuska */
418315ee00fSMartin Matuska if (!IS_ERR(s) && s->s_fs_info != NULL) {
419315ee00fSMartin Matuska zfsvfs_t *zfsvfs = s->s_fs_info;
420315ee00fSMartin Matuska if (zpl_enter(zfsvfs, FTAG) == 0) {
421315ee00fSMartin Matuska if (os != zfsvfs->z_os)
422315ee00fSMartin Matuska err = -SET_ERROR(EBUSY);
423e2df9bb4SMartin Matuska issnap = zfsvfs->z_issnap;
424315ee00fSMartin Matuska zpl_exit(zfsvfs, FTAG);
425315ee00fSMartin Matuska } else {
426315ee00fSMartin Matuska err = -SET_ERROR(EBUSY);
427315ee00fSMartin Matuska }
428315ee00fSMartin Matuska }
4292c48331dSMatt Macy dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
430eda14cbcSMatt Macy dsl_dataset_rele(dmu_objset_ds(os), FTAG);
431eda14cbcSMatt Macy
432eda14cbcSMatt Macy if (IS_ERR(s))
433eda14cbcSMatt Macy return (ERR_CAST(s));
434eda14cbcSMatt Macy
435315ee00fSMartin Matuska if (err) {
436315ee00fSMartin Matuska deactivate_locked_super(s);
437315ee00fSMartin Matuska return (ERR_PTR(err));
438315ee00fSMartin Matuska }
439315ee00fSMartin Matuska
440eda14cbcSMatt Macy if (s->s_root == NULL) {
441eda14cbcSMatt Macy err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
442eda14cbcSMatt Macy if (err) {
443eda14cbcSMatt Macy deactivate_locked_super(s);
444eda14cbcSMatt Macy return (ERR_PTR(err));
445eda14cbcSMatt Macy }
446eda14cbcSMatt Macy s->s_flags |= SB_ACTIVE;
447e2df9bb4SMartin Matuska } else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
448e2df9bb4SMartin Matuska /*
449e2df9bb4SMartin Matuska * Skip ro check for snap since snap is always ro regardless
450e2df9bb4SMartin Matuska * ro flag is passed by mount or not.
451e2df9bb4SMartin Matuska */
452eda14cbcSMatt Macy deactivate_locked_super(s);
453eda14cbcSMatt Macy return (ERR_PTR(-EBUSY));
454eda14cbcSMatt Macy }
455eda14cbcSMatt Macy
456eda14cbcSMatt Macy return (s);
457eda14cbcSMatt Macy }
458eda14cbcSMatt Macy
459eda14cbcSMatt Macy static struct dentry *
zpl_mount(struct file_system_type * fs_type,int flags,const char * osname,void * data)460eda14cbcSMatt Macy zpl_mount(struct file_system_type *fs_type, int flags,
461eda14cbcSMatt Macy const char *osname, void *data)
462eda14cbcSMatt Macy {
463eda14cbcSMatt Macy zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };
464eda14cbcSMatt Macy
465eda14cbcSMatt Macy struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
466eda14cbcSMatt Macy if (IS_ERR(sb))
467eda14cbcSMatt Macy return (ERR_CAST(sb));
468eda14cbcSMatt Macy
469eda14cbcSMatt Macy return (dget(sb->s_root));
470eda14cbcSMatt Macy }
471eda14cbcSMatt Macy
/*
 * file_system_type ->kill_sb() callback: runs zfs_preumount() on the
 * superblock first, then hands it to kill_anon_super().
 */
static void
zpl_kill_sb(struct super_block *sb)
{
	zfs_preumount(sb);

	kill_anon_super(sb);
}
478eda14cbcSMatt Macy
479eda14cbcSMatt Macy void
zpl_prune_sb(uint64_t nr_to_scan,void * arg)480f8b1db88SMartin Matuska zpl_prune_sb(uint64_t nr_to_scan, void *arg)
481eda14cbcSMatt Macy {
482eda14cbcSMatt Macy struct super_block *sb = (struct super_block *)arg;
483eda14cbcSMatt Macy int objects = 0;
484eda14cbcSMatt Macy
485718519f4SMartin Matuska /*
48661145dc2SMartin Matuska * Ensure the superblock is not in the process of being torn down.
487718519f4SMartin Matuska */
48861145dc2SMartin Matuska #ifdef HAVE_SB_DYING
48961145dc2SMartin Matuska if (down_read_trylock(&sb->s_umount)) {
49061145dc2SMartin Matuska if (!(sb->s_flags & SB_DYING) && sb->s_root &&
49161145dc2SMartin Matuska (sb->s_flags & SB_BORN)) {
49261145dc2SMartin Matuska (void) zfs_prune(sb, nr_to_scan, &objects);
493718519f4SMartin Matuska }
49461145dc2SMartin Matuska up_read(&sb->s_umount);
49561145dc2SMartin Matuska }
49661145dc2SMartin Matuska #else
49761145dc2SMartin Matuska if (down_read_trylock(&sb->s_umount)) {
49861145dc2SMartin Matuska if (!hlist_unhashed(&sb->s_instances) &&
49961145dc2SMartin Matuska sb->s_root && (sb->s_flags & SB_BORN)) {
50061145dc2SMartin Matuska (void) zfs_prune(sb, nr_to_scan, &objects);
50161145dc2SMartin Matuska }
50261145dc2SMartin Matuska up_read(&sb->s_umount);
50361145dc2SMartin Matuska }
50461145dc2SMartin Matuska #endif
505eda14cbcSMatt Macy }
506eda14cbcSMatt Macy
507eda14cbcSMatt Macy const struct super_operations zpl_super_operations = {
508eda14cbcSMatt Macy .alloc_inode = zpl_inode_alloc,
509df58e8b1SMartin Matuska #ifdef HAVE_SOPS_FREE_INODE
510df58e8b1SMartin Matuska .free_inode = zpl_inode_free,
511df58e8b1SMartin Matuska #endif
512eda14cbcSMatt Macy .destroy_inode = zpl_inode_destroy,
513eda14cbcSMatt Macy .dirty_inode = zpl_dirty_inode,
514eda14cbcSMatt Macy .write_inode = NULL,
515113e6074SMartin Matuska .drop_inode = zpl_drop_inode,
516eda14cbcSMatt Macy .evict_inode = zpl_evict_inode,
517eda14cbcSMatt Macy .put_super = zpl_put_super,
518eda14cbcSMatt Macy .sync_fs = zpl_sync_fs,
519eda14cbcSMatt Macy .statfs = zpl_statfs,
520eda14cbcSMatt Macy .remount_fs = zpl_remount_fs,
521eac7052fSMatt Macy .show_devname = zpl_show_devname,
522eda14cbcSMatt Macy .show_options = zpl_show_options,
523eda14cbcSMatt Macy .show_stats = NULL,
524eda14cbcSMatt Macy };
525eda14cbcSMatt Macy
526113e6074SMartin Matuska /*
527113e6074SMartin Matuska * ->d_delete() is called when the last reference to a dentry is released. Its
528113e6074SMartin Matuska * return value indicates if the dentry should be destroyed immediately, or
529113e6074SMartin Matuska * retained in the dentry cache.
530113e6074SMartin Matuska *
531113e6074SMartin Matuska * By default (zfs_delete_dentry=0) the kernel will always cache unused
532113e6074SMartin Matuska * entries. Each dentry holds an inode reference, so cached dentries can hold
533113e6074SMartin Matuska * the final inode reference indefinitely, leading to the inode and its related
534113e6074SMartin Matuska * data being pinned (see zpl_drop_inode()).
535113e6074SMartin Matuska *
536113e6074SMartin Matuska * When set to 1, we signal that the dentry should be destroyed immediately and
537113e6074SMartin Matuska * never cached. This reduces memory usage, at the cost of higher overheads to
538113e6074SMartin Matuska * lookup a file, as the inode and its underlying data (dnode/dbuf) need to be
539113e6074SMartin Matuska * reloaded and reinflated.
540113e6074SMartin Matuska *
541113e6074SMartin Matuska * Note that userspace does not have direct control over dentry references and
542113e6074SMartin Matuska * reclaim; rather, this is part of the kernel's caching and reclaim subsystems
543113e6074SMartin Matuska * (eg vm.vfs_cache_pressure).
544113e6074SMartin Matuska */
545113e6074SMartin Matuska static int
zpl_dentry_delete(const struct dentry * dentry)546113e6074SMartin Matuska zpl_dentry_delete(const struct dentry *dentry)
547113e6074SMartin Matuska {
548113e6074SMartin Matuska return (zfs_delete_dentry ? 1 : 0);
549113e6074SMartin Matuska }
550113e6074SMartin Matuska
551113e6074SMartin Matuska const struct dentry_operations zpl_dentry_operations = {
552113e6074SMartin Matuska .d_delete = zpl_dentry_delete,
553113e6074SMartin Matuska };
554113e6074SMartin Matuska
555eda14cbcSMatt Macy struct file_system_type zpl_fs_type = {
556eda14cbcSMatt Macy .owner = THIS_MODULE,
557eda14cbcSMatt Macy .name = ZFS_DRIVER,
558dbd5678dSMartin Matuska #if defined(HAVE_IDMAP_MNT_API)
559dbd5678dSMartin Matuska .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
560dbd5678dSMartin Matuska #else
5611f1e2261SMartin Matuska .fs_flags = FS_USERNS_MOUNT,
562dbd5678dSMartin Matuska #endif
563eda14cbcSMatt Macy .mount = zpl_mount,
564eda14cbcSMatt Macy .kill_sb = zpl_kill_sb,
565eda14cbcSMatt Macy };
566113e6074SMartin Matuska
567113e6074SMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW,
568113e6074SMartin Matuska "Delete inodes as soon as the last reference is released.");
569113e6074SMartin Matuska
570113e6074SMartin Matuska ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW,
571113e6074SMartin Matuska "Delete dentries from dentry cache as soon as the last reference is "
572113e6074SMartin Matuska "released.");
573