xref: /freebsd/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
23  * Copyright (c) 2023, Datto Inc. All rights reserved.
24  */
25 
26 
27 #include <sys/zfs_znode.h>
28 #include <sys/zfs_vfsops.h>
29 #include <sys/zfs_vnops.h>
30 #include <sys/zfs_ctldir.h>
31 #include <sys/zpl.h>
32 #include <linux/iversion.h>
33 
34 
35 static struct inode *
36 zpl_inode_alloc(struct super_block *sb)
37 {
38 	struct inode *ip;
39 
40 	VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
41 	inode_set_iversion(ip, 1);
42 
43 	return (ip);
44 }
45 
46 static void
47 zpl_inode_destroy(struct inode *ip)
48 {
49 	ASSERT(atomic_read(&ip->i_count) == 0);
50 	zfs_inode_destroy(ip);
51 }
52 
53 /*
54  * Called from __mark_inode_dirty() to reflect that something in the
55  * inode has changed.  We use it to ensure the znode system attributes
56  * are always strictly update to date with respect to the inode.
57  */
58 static void
59 zpl_dirty_inode(struct inode *ip, int flags)
60 {
61 	fstrans_cookie_t cookie;
62 
63 	cookie = spl_fstrans_mark();
64 	zfs_dirty_inode(ip, flags);
65 	spl_fstrans_unmark(cookie);
66 }
67 
68 /*
69  * When ->drop_inode() is called its return value indicates if the
70  * inode should be evicted from the inode cache.  If the inode is
71  * unhashed and has no links the default policy is to evict it
72  * immediately.
73  *
74  * The ->evict_inode() callback must minimally truncate the inode pages,
75  * and call clear_inode().  For 2.6.35 and later kernels this will
76  * simply update the inode state, with the sync occurring before the
77  * truncate in evict().  For earlier kernels clear_inode() maps to
78  * end_writeback() which is responsible for completing all outstanding
79  * write back.  In either case, once this is done it is safe to cleanup
80  * any remaining inode specific data via zfs_inactive().
81  * remaining filesystem specific data.
82  */
83 static void
84 zpl_evict_inode(struct inode *ip)
85 {
86 	fstrans_cookie_t cookie;
87 
88 	cookie = spl_fstrans_mark();
89 	truncate_setsize(ip, 0);
90 	clear_inode(ip);
91 	zfs_inactive(ip);
92 	spl_fstrans_unmark(cookie);
93 }
94 
95 static void
96 zpl_put_super(struct super_block *sb)
97 {
98 	fstrans_cookie_t cookie;
99 	int error;
100 
101 	cookie = spl_fstrans_mark();
102 	error = -zfs_umount(sb);
103 	spl_fstrans_unmark(cookie);
104 	ASSERT3S(error, <=, 0);
105 }
106 
107 static int
108 zpl_sync_fs(struct super_block *sb, int wait)
109 {
110 	fstrans_cookie_t cookie;
111 	cred_t *cr = CRED();
112 	int error;
113 
114 	crhold(cr);
115 	cookie = spl_fstrans_mark();
116 	error = -zfs_sync(sb, wait, cr);
117 	spl_fstrans_unmark(cookie);
118 	crfree(cr);
119 	ASSERT3S(error, <=, 0);
120 
121 	return (error);
122 }
123 
124 static int
125 zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
126 {
127 	fstrans_cookie_t cookie;
128 	int error;
129 
130 	cookie = spl_fstrans_mark();
131 	error = -zfs_statvfs(dentry->d_inode, statp);
132 	spl_fstrans_unmark(cookie);
133 	ASSERT3S(error, <=, 0);
134 
135 	/*
136 	 * If required by a 32-bit system call, dynamically scale the
137 	 * block size up to 16MiB and decrease the block counts.  This
138 	 * allows for a maximum size of 64EiB to be reported.  The file
139 	 * counts must be artificially capped at 2^32-1.
140 	 */
141 	if (unlikely(zpl_is_32bit_api())) {
142 		while (statp->f_blocks > UINT32_MAX &&
143 		    statp->f_bsize < SPA_MAXBLOCKSIZE) {
144 			statp->f_frsize <<= 1;
145 			statp->f_bsize <<= 1;
146 
147 			statp->f_blocks >>= 1;
148 			statp->f_bfree >>= 1;
149 			statp->f_bavail >>= 1;
150 		}
151 
152 		uint64_t usedobjs = statp->f_files - statp->f_ffree;
153 		statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
154 		statp->f_files = statp->f_ffree + usedobjs;
155 	}
156 
157 	return (error);
158 }
159 
160 static int
161 zpl_remount_fs(struct super_block *sb, int *flags, char *data)
162 {
163 	zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
164 	fstrans_cookie_t cookie;
165 	int error;
166 
167 	cookie = spl_fstrans_mark();
168 	error = -zfs_remount(sb, flags, &zm);
169 	spl_fstrans_unmark(cookie);
170 	ASSERT3S(error, <=, 0);
171 
172 	return (error);
173 }
174 
175 static int
176 __zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
177 {
178 	int error;
179 	if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
180 		return (error);
181 
182 	char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
183 	dmu_objset_name(zfsvfs->z_os, fsname);
184 
185 	for (int i = 0; fsname[i] != 0; i++) {
186 		/*
187 		 * Spaces in the dataset name must be converted to their
188 		 * octal escape sequence for getmntent(3) to correctly
189 		 * parse then fsname portion of /proc/self/mounts.
190 		 */
191 		if (fsname[i] == ' ') {
192 			seq_puts(seq, "\\040");
193 		} else {
194 			seq_putc(seq, fsname[i]);
195 		}
196 	}
197 
198 	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);
199 
200 	zpl_exit(zfsvfs, FTAG);
201 
202 	return (0);
203 }
204 
205 static int
206 zpl_show_devname(struct seq_file *seq, struct dentry *root)
207 {
208 	return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
209 }
210 
211 static int
212 __zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
213 {
214 	seq_printf(seq, ",%s",
215 	    zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");
216 
217 #ifdef CONFIG_FS_POSIX_ACL
218 	switch (zfsvfs->z_acl_type) {
219 	case ZFS_ACLTYPE_POSIX:
220 		seq_puts(seq, ",posixacl");
221 		break;
222 	default:
223 		seq_puts(seq, ",noacl");
224 		break;
225 	}
226 #endif /* CONFIG_FS_POSIX_ACL */
227 
228 	switch (zfsvfs->z_case) {
229 	case ZFS_CASE_SENSITIVE:
230 		seq_puts(seq, ",casesensitive");
231 		break;
232 	case ZFS_CASE_INSENSITIVE:
233 		seq_puts(seq, ",caseinsensitive");
234 		break;
235 	default:
236 		seq_puts(seq, ",casemixed");
237 		break;
238 	}
239 
240 	return (0);
241 }
242 
243 static int
244 zpl_show_options(struct seq_file *seq, struct dentry *root)
245 {
246 	return (__zpl_show_options(seq, root->d_sb->s_fs_info));
247 }
248 
249 static int
250 zpl_fill_super(struct super_block *sb, void *data, int silent)
251 {
252 	zfs_mnt_t *zm = (zfs_mnt_t *)data;
253 	fstrans_cookie_t cookie;
254 	int error;
255 
256 	cookie = spl_fstrans_mark();
257 	error = -zfs_domount(sb, zm, silent);
258 	spl_fstrans_unmark(cookie);
259 	ASSERT3S(error, <=, 0);
260 
261 	return (error);
262 }
263 
264 static int
265 zpl_test_super(struct super_block *s, void *data)
266 {
267 	zfsvfs_t *zfsvfs = s->s_fs_info;
268 	objset_t *os = data;
269 	/*
270 	 * If the os doesn't match the z_os in the super_block, assume it is
271 	 * not a match. Matching would imply a multimount of a dataset. It is
272 	 * possible that during a multimount, there is a simultaneous operation
273 	 * that changes the z_os, e.g., rollback, where the match will be
274 	 * missed, but in that case the user will get an EBUSY.
275 	 */
276 	return (zfsvfs != NULL && os == zfsvfs->z_os);
277 }
278 
279 static struct super_block *
280 zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
281 {
282 	struct super_block *s;
283 	objset_t *os;
284 	boolean_t issnap = B_FALSE;
285 	int err;
286 
287 	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
288 	if (err)
289 		return (ERR_PTR(-err));
290 
291 	/*
292 	 * The dsl pool lock must be released prior to calling sget().
293 	 * It is possible sget() may block on the lock in grab_super()
294 	 * while deactivate_super() holds that same lock and waits for
295 	 * a txg sync.  If the dsl_pool lock is held over sget()
296 	 * this can prevent the pool sync and cause a deadlock.
297 	 */
298 	dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
299 	dsl_pool_rele(dmu_objset_pool(os), FTAG);
300 
301 	s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);
302 
303 	/*
304 	 * Recheck with the lock held to prevent mounting the wrong dataset
305 	 * since z_os can be stale when the teardown lock is held.
306 	 *
307 	 * We can't do this in zpl_test_super in since it's under spinlock and
308 	 * also s_umount lock is not held there so it would race with
309 	 * zfs_umount and zfsvfs can be freed.
310 	 */
311 	if (!IS_ERR(s) && s->s_fs_info != NULL) {
312 		zfsvfs_t *zfsvfs = s->s_fs_info;
313 		if (zpl_enter(zfsvfs, FTAG) == 0) {
314 			if (os != zfsvfs->z_os)
315 				err = -SET_ERROR(EBUSY);
316 			issnap = zfsvfs->z_issnap;
317 			zpl_exit(zfsvfs, FTAG);
318 		} else {
319 			err = -SET_ERROR(EBUSY);
320 		}
321 	}
322 	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
323 	dsl_dataset_rele(dmu_objset_ds(os), FTAG);
324 
325 	if (IS_ERR(s))
326 		return (ERR_CAST(s));
327 
328 	if (err) {
329 		deactivate_locked_super(s);
330 		return (ERR_PTR(err));
331 	}
332 
333 	if (s->s_root == NULL) {
334 		err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
335 		if (err) {
336 			deactivate_locked_super(s);
337 			return (ERR_PTR(err));
338 		}
339 		s->s_flags |= SB_ACTIVE;
340 	} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
341 		/*
342 		 * Skip ro check for snap since snap is always ro regardless
343 		 * ro flag is passed by mount or not.
344 		 */
345 		deactivate_locked_super(s);
346 		return (ERR_PTR(-EBUSY));
347 	}
348 
349 	return (s);
350 }
351 
352 static struct dentry *
353 zpl_mount(struct file_system_type *fs_type, int flags,
354     const char *osname, void *data)
355 {
356 	zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };
357 
358 	struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
359 	if (IS_ERR(sb))
360 		return (ERR_CAST(sb));
361 
362 	return (dget(sb->s_root));
363 }
364 
/*
 * ->kill_sb() hook of zpl_fs_type.  zfs_preumount() runs first, then
 * the super block is torn down with kill_anon_super().
 */
static void
zpl_kill_sb(struct super_block *sb)
{
	zfs_preumount(sb);
	kill_anon_super(sb);
}
371 
372 void
373 zpl_prune_sb(uint64_t nr_to_scan, void *arg)
374 {
375 	struct super_block *sb = (struct super_block *)arg;
376 	int objects = 0;
377 
378 	/*
379 	 * deactivate_locked_super calls shrinker_free and only then
380 	 * sops->kill_sb cb, resulting in UAF on umount when trying to reach
381 	 * for the shrinker functions in zpl_prune_sb of in-umount dataset.
382 	 * Increment if s_active is not zero, but don't prune if it is -
383 	 * umount could be underway.
384 	 */
385 	if (atomic_inc_not_zero(&sb->s_active)) {
386 		(void) -zfs_prune(sb, nr_to_scan, &objects);
387 		atomic_dec(&sb->s_active);
388 	}
389 
390 }
391 
392 const struct super_operations zpl_super_operations = {
393 	.alloc_inode		= zpl_inode_alloc,
394 	.destroy_inode		= zpl_inode_destroy,
395 	.dirty_inode		= zpl_dirty_inode,
396 	.write_inode		= NULL,
397 	.evict_inode		= zpl_evict_inode,
398 	.put_super		= zpl_put_super,
399 	.sync_fs		= zpl_sync_fs,
400 	.statfs			= zpl_statfs,
401 	.remount_fs		= zpl_remount_fs,
402 	.show_devname		= zpl_show_devname,
403 	.show_options		= zpl_show_options,
404 	.show_stats		= NULL,
405 };
406 
407 struct file_system_type zpl_fs_type = {
408 	.owner			= THIS_MODULE,
409 	.name			= ZFS_DRIVER,
410 #if defined(HAVE_IDMAP_MNT_API)
411 	.fs_flags		= FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
412 #else
413 	.fs_flags		= FS_USERNS_MOUNT,
414 #endif
415 	.mount			= zpl_mount,
416 	.kill_sb		= zpl_kill_sb,
417 };
418