xref: /freebsd/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c (revision e7be843b4a162e68651d3911f0357ed464915629)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
24  * Copyright (c) 2023, Datto Inc. All rights reserved.
25  */
26 
27 
28 #include <sys/zfs_znode.h>
29 #include <sys/zfs_vfsops.h>
30 #include <sys/zfs_vnops.h>
31 #include <sys/zfs_ctldir.h>
32 #include <sys/zpl.h>
33 #include <linux/iversion.h>
34 #include <linux/version.h>
35 
36 
/*
 * ->alloc_inode() callback: allocate a new inode for superblock 'sb'.
 * zfs_inode_alloc() failure is treated as fatal via VERIFY (no error
 * path exists here), and the inode's version counter is seeded to 1.
 */
static struct inode *
zpl_inode_alloc(struct super_block *sb)
{
	struct inode *ip;

	VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
	inode_set_iversion(ip, 1);

	return (ip);
}
47 
#ifdef HAVE_SOPS_FREE_INODE
/*
 * ->free_inode() callback, on kernels that provide it.  Performs the
 * final release of the inode; the reference count must already be zero.
 */
static void
zpl_inode_free(struct inode *ip)
{
	ASSERT(atomic_read(&ip->i_count) == 0);
	zfs_inode_free(ip);
}
#endif
56 
/*
 * ->destroy_inode() callback: tear down the ZFS-private state attached
 * to an inode with no remaining references.
 */
static void
zpl_inode_destroy(struct inode *ip)
{
	ASSERT(atomic_read(&ip->i_count) == 0);
	zfs_inode_destroy(ip);
}
63 
64 /*
65  * Called from __mark_inode_dirty() to reflect that something in the
66  * inode has changed.  We use it to ensure the znode system attributes
67  * are always strictly update to date with respect to the inode.
68  */
69 static void
70 zpl_dirty_inode(struct inode *ip, int flags)
71 {
72 	fstrans_cookie_t cookie;
73 
74 	cookie = spl_fstrans_mark();
75 	zfs_dirty_inode(ip, flags);
76 	spl_fstrans_unmark(cookie);
77 }
78 
/*
 * When ->drop_inode() is called its return value indicates if the
 * inode should be evicted from the inode cache.  If the inode is
 * unhashed and has no links the default policy is to evict it
 * immediately.
 *
 * The ->evict_inode() callback must minimally truncate the inode pages,
 * and call clear_inode().  For 2.6.35 and later kernels this will
 * simply update the inode state, with the sync occurring before the
 * truncate in evict().  For earlier kernels clear_inode() maps to
 * end_writeback() which is responsible for completing all outstanding
 * write back.  In either case, once this is done it is safe to cleanup
 * any remaining filesystem specific data via zfs_inactive().
 */
static void
zpl_evict_inode(struct inode *ip)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	/* Order matters: drop pages, mark the inode clear, then let ZFS
	 * release its per-inode state. */
	truncate_setsize(ip, 0);
	clear_inode(ip);
	zfs_inactive(ip);
	spl_fstrans_unmark(cookie);
}
105 
106 static void
107 zpl_put_super(struct super_block *sb)
108 {
109 	fstrans_cookie_t cookie;
110 	int error;
111 
112 	cookie = spl_fstrans_mark();
113 	error = -zfs_umount(sb);
114 	spl_fstrans_unmark(cookie);
115 	ASSERT3S(error, <=, 0);
116 }
117 
/*
 * zfs_sync() is the underlying implementation for the sync(2) and syncfs(2)
 * syscalls, via sb->s_op->sync_fs().
 *
 * Before kernel 5.17 (torvalds/linux@5679897eb104), syncfs() ->
 * sync_filesystem() would ignore the return from sync_fs(), instead only
 * considering the error from syncing the underlying block device (sb->s_dev).
 * Since OpenZFS doesn't _have_ an underlying block device, there's no way for
 * us to report a sync directly.
 *
 * However, in 5.8 (torvalds/linux@735e4ae5ba28) the superblock gained an extra
 * error store `s_wb_err`, to carry errors seen on page writeback since the
 * last call to syncfs(). If sync_filesystem() does not return an error, any
 * existing writeback error on the superblock will be used instead (and cleared
 * either way). We don't use this (page writeback is a different thing for us),
 * so for 5.8-5.17 we can use that instead to get syncfs() to return the error.
 *
 * Before 5.8, we have no other good options - no matter what happens, the
 * userspace program will be told the call has succeeded, and so we must make
 * it so. Therefore, when we are asked to wait for sync to complete (wait ==
 * 1), if zfs_sync() has returned an error we have no choice but to block,
 * regardless of the reason.
 *
 * The 5.17 change was backported to the 5.10, 5.15 and 5.16 series, and likely
 * to some vendor kernels. Meanwhile, s_wb_err is still in use in 6.15 (the
 * mainline Linux series at time of writing), and has likely been backported to
 * vendor kernels before 5.8. We don't really want to use a workaround when we
 * don't have to, but we can't really detect whether or not sync_filesystem()
 * will return our errors (without a difficult runtime test anyway). So, we use
 * a static version check: any kernel reporting its version as 5.17+ will use a
 * direct error return, otherwise, we'll either use s_wb_err if it was detected
 * at configure (5.8-5.16 + vendor backports). If it's unavailable, we will
 * block to ensure the correct semantics.
 *
 * See https://github.com/openzfs/zfs/issues/17416 for further discussion.
 */
static int
zpl_sync_fs(struct super_block *sb, int wait)
{
	fstrans_cookie_t cookie;
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	cookie = spl_fstrans_mark();
	/* zfs_sync() returns a positive errno; 'error' is <= 0. */
	error = -zfs_sync(sb, wait, cr);

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)
#ifdef HAVE_SUPER_BLOCK_S_WB_ERR
	/* 5.8-5.16: stash the (negative) error in the superblock's
	 * writeback error store so syncfs() can surface it. */
	if (error && wait)
		errseq_set(&sb->s_wb_err, error);
#else
	/* Pre-5.8: no way to report the error; block on a txg sync so
	 * the caller's success return is at least truthful. */
	if (error && wait) {
		zfsvfs_t *zfsvfs = sb->s_fs_info;
		ASSERT3P(zfsvfs, !=, NULL);
		if (zfs_enter(zfsvfs, FTAG) == 0) {
			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
			zfs_exit(zfsvfs, FTAG);
			error = 0;
		}
	}
#endif
#endif /* < 5.17.0 */

	spl_fstrans_unmark(cookie);
	crfree(cr);

	ASSERT3S(error, <=, 0);
	return (error);
}
188 
/*
 * ->statfs() callback: fill in 'statp' for the dataset backing
 * 'dentry', returning a negated errno from zfs_statvfs().
 */
static int
zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_statvfs(dentry->d_inode, statp);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	/*
	 * If required by a 32-bit system call, dynamically scale the
	 * block size up to 16MiB and decrease the block counts.  This
	 * allows for a maximum size of 64EiB to be reported.  The file
	 * counts must be artificially capped at 2^32-1.
	 */
	if (unlikely(zpl_is_32bit_api())) {
		while (statp->f_blocks > UINT32_MAX &&
		    statp->f_bsize < SPA_MAXBLOCKSIZE) {
			/* Double the block sizes, halve the counts. */
			statp->f_frsize <<= 1;
			statp->f_bsize <<= 1;

			statp->f_blocks >>= 1;
			statp->f_bfree >>= 1;
			statp->f_bavail >>= 1;
		}

		/* Cap free files so used + free fits in 32 bits. */
		uint64_t usedobjs = statp->f_files - statp->f_ffree;
		statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
		statp->f_files = statp->f_ffree + usedobjs;
	}

	return (error);
}
224 
225 static int
226 zpl_remount_fs(struct super_block *sb, int *flags, char *data)
227 {
228 	zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
229 	fstrans_cookie_t cookie;
230 	int error;
231 
232 	cookie = spl_fstrans_mark();
233 	error = -zfs_remount(sb, flags, &zm);
234 	spl_fstrans_unmark(cookie);
235 	ASSERT3S(error, <=, 0);
236 
237 	return (error);
238 }
239 
/*
 * Emit the dataset name for /proc/self/mounts, escaping spaces so
 * getmntent(3) can parse the result.  Returns 0 on success or the
 * error from zpl_enter().
 */
static int
__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	int error;
	if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
		return (error);

	char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	dmu_objset_name(zfsvfs->z_os, fsname);

	for (int i = 0; fsname[i] != 0; i++) {
		/*
		 * Spaces in the dataset name must be converted to their
		 * octal escape sequence for getmntent(3) to correctly
		 * parse the fsname portion of /proc/self/mounts.
		 */
		if (fsname[i] == ' ') {
			seq_puts(seq, "\\040");
		} else {
			seq_putc(seq, fsname[i]);
		}
	}

	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);

	zpl_exit(zfsvfs, FTAG);

	return (0);
}
269 
/* ->show_devname() callback: thin wrapper resolving the zfsvfs_t. */
static int
zpl_show_devname(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
}
275 
276 static int
277 __zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
278 {
279 	seq_printf(seq, ",%s",
280 	    zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");
281 
282 #ifdef CONFIG_FS_POSIX_ACL
283 	switch (zfsvfs->z_acl_type) {
284 	case ZFS_ACLTYPE_POSIX:
285 		seq_puts(seq, ",posixacl");
286 		break;
287 	default:
288 		seq_puts(seq, ",noacl");
289 		break;
290 	}
291 #endif /* CONFIG_FS_POSIX_ACL */
292 
293 	switch (zfsvfs->z_case) {
294 	case ZFS_CASE_SENSITIVE:
295 		seq_puts(seq, ",casesensitive");
296 		break;
297 	case ZFS_CASE_INSENSITIVE:
298 		seq_puts(seq, ",caseinsensitive");
299 		break;
300 	default:
301 		seq_puts(seq, ",casemixed");
302 		break;
303 	}
304 
305 	return (0);
306 }
307 
/* ->show_options() callback: thin wrapper resolving the zfsvfs_t. */
static int
zpl_show_options(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_options(seq, root->d_sb->s_fs_info));
}
313 
314 static int
315 zpl_fill_super(struct super_block *sb, void *data, int silent)
316 {
317 	zfs_mnt_t *zm = (zfs_mnt_t *)data;
318 	fstrans_cookie_t cookie;
319 	int error;
320 
321 	cookie = spl_fstrans_mark();
322 	error = -zfs_domount(sb, zm, silent);
323 	spl_fstrans_unmark(cookie);
324 	ASSERT3S(error, <=, 0);
325 
326 	return (error);
327 }
328 
/*
 * sget() comparison callback: report whether super_block 's' already
 * mounts the objset passed in 'data'.
 */
static int
zpl_test_super(struct super_block *s, void *data)
{
	zfsvfs_t *zfsvfs = s->s_fs_info;
	objset_t *os = data;
	/*
	 * If the os doesn't match the z_os in the super_block, assume it is
	 * not a match. Matching would imply a multimount of a dataset. It is
	 * possible that during a multimount, there is a simultaneous operation
	 * that changes the z_os, e.g., rollback, where the match will be
	 * missed, but in that case the user will get an EBUSY.
	 */
	return (zfsvfs != NULL && os == zfsvfs->z_os);
}
343 
/*
 * Locate or create the superblock for the dataset named in 'zm', being
 * careful about lock ordering between the DSL pool lock and s_umount.
 * On success the returned superblock is locked and active; on failure
 * an ERR_PTR-encoded negative errno is returned.
 */
static struct super_block *
zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
{
	struct super_block *s;
	objset_t *os;
	boolean_t issnap = B_FALSE;
	int err;

	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
	if (err)
		return (ERR_PTR(-err));

	/*
	 * The dsl pool lock must be released prior to calling sget().
	 * It is possible sget() may block on the lock in grab_super()
	 * while deactivate_super() holds that same lock and waits for
	 * a txg sync.  If the dsl_pool lock is held over sget()
	 * this can prevent the pool sync and cause a deadlock.
	 */
	dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
	dsl_pool_rele(dmu_objset_pool(os), FTAG);

	s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);

	/*
	 * Recheck with the lock held to prevent mounting the wrong dataset
	 * since z_os can be stale when the teardown lock is held.
	 *
	 * We can't do this in zpl_test_super in since it's under spinlock and
	 * also s_umount lock is not held there so it would race with
	 * zfs_umount and zfsvfs can be freed.
	 */
	if (!IS_ERR(s) && s->s_fs_info != NULL) {
		zfsvfs_t *zfsvfs = s->s_fs_info;
		if (zpl_enter(zfsvfs, FTAG) == 0) {
			if (os != zfsvfs->z_os)
				err = -SET_ERROR(EBUSY);
			issnap = zfsvfs->z_issnap;
			zpl_exit(zfsvfs, FTAG);
		} else {
			err = -SET_ERROR(EBUSY);
		}
	}
	/* The long hold is no longer needed once sget() has resolved. */
	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
	dsl_dataset_rele(dmu_objset_ds(os), FTAG);

	if (IS_ERR(s))
		return (ERR_CAST(s));

	if (err) {
		deactivate_locked_super(s);
		return (ERR_PTR(err));
	}

	if (s->s_root == NULL) {
		/* New superblock: perform the actual mount. */
		err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
		if (err) {
			deactivate_locked_super(s);
			return (ERR_PTR(err));
		}
		s->s_flags |= SB_ACTIVE;
	} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
		/*
		 * Skip ro check for snap since snap is always ro regardless
		 * ro flag is passed by mount or not.
		 */
		deactivate_locked_super(s);
		return (ERR_PTR(-EBUSY));
	}

	return (s);
}
416 
417 static struct dentry *
418 zpl_mount(struct file_system_type *fs_type, int flags,
419     const char *osname, void *data)
420 {
421 	zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };
422 
423 	struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
424 	if (IS_ERR(sb))
425 		return (ERR_CAST(sb));
426 
427 	return (dget(sb->s_root));
428 }
429 
/*
 * ->kill_sb() callback: give ZFS a chance to run pre-unmount work
 * before the generic anonymous superblock teardown.
 */
static void
zpl_kill_sb(struct super_block *sb)
{
	zfs_preumount(sb);
	kill_anon_super(sb);
}
436 
/*
 * Ask the filesystem to prune up to 'nr_to_scan' objects from the
 * superblock passed via 'arg'.  Pruning is skipped entirely if the
 * superblock's s_umount lock cannot be taken without blocking, or if
 * the superblock is not fully born or is being torn down.
 */
void
zpl_prune_sb(uint64_t nr_to_scan, void *arg)
{
	struct super_block *sb = (struct super_block *)arg;
	int objects = 0;

	/*
	 * Ensure the superblock is not in the process of being torn down.
	 */
#ifdef HAVE_SB_DYING
	/* Newer kernels expose SB_DYING for the teardown check. */
	if (down_read_trylock(&sb->s_umount)) {
		if (!(sb->s_flags & SB_DYING) && sb->s_root &&
		    (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#else
	/* Older kernels: an unhashed s_instances entry means teardown. */
	if (down_read_trylock(&sb->s_umount)) {
		if (!hlist_unhashed(&sb->s_instances) &&
		    sb->s_root && (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#endif
}
464 
/* VFS super_operations vector wired up for ZFS-backed superblocks. */
const struct super_operations zpl_super_operations = {
	.alloc_inode		= zpl_inode_alloc,
#ifdef HAVE_SOPS_FREE_INODE
	.free_inode		= zpl_inode_free,
#endif
	.destroy_inode		= zpl_inode_destroy,
	.dirty_inode		= zpl_dirty_inode,
	.write_inode		= NULL,
	.evict_inode		= zpl_evict_inode,
	.put_super		= zpl_put_super,
	.sync_fs		= zpl_sync_fs,
	.statfs			= zpl_statfs,
	.remount_fs		= zpl_remount_fs,
	.show_devname		= zpl_show_devname,
	.show_options		= zpl_show_options,
	.show_stats		= NULL,
};
482 
/* Filesystem type registration; FS_ALLOW_IDMAP only when the kernel
 * exposes the idmapped-mount API. */
struct file_system_type zpl_fs_type = {
	.owner			= THIS_MODULE,
	.name			= ZFS_DRIVER,
#if defined(HAVE_IDMAP_MNT_API)
	.fs_flags		= FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
#else
	.fs_flags		= FS_USERNS_MOUNT,
#endif
	.mount			= zpl_mount,
	.kill_sb		= zpl_kill_sb,
};
494