xref: /freebsd/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c (revision 3a8960711f4319f9b894ea2453c89065ee1b3a10)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
24  * Copyright (c) 2023, Datto Inc. All rights reserved.
25  */
26 
27 
28 #include <sys/zfs_znode.h>
29 #include <sys/zfs_vfsops.h>
30 #include <sys/zfs_vnops.h>
31 #include <sys/zfs_ctldir.h>
32 #include <sys/zpl.h>
33 #include <linux/iversion.h>
34 #include <linux/version.h>
35 
36 
37 static struct inode *
zpl_inode_alloc(struct super_block * sb)38 zpl_inode_alloc(struct super_block *sb)
39 {
40 	struct inode *ip;
41 
42 	VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
43 	inode_set_iversion(ip, 1);
44 
45 	return (ip);
46 }
47 
/*
 * Release a ZFS-backed inode.  The VFS guarantees the last reference has
 * been dropped before this is called, which the assertion checks.
 */
static void
zpl_inode_destroy(struct inode *ip)
{
	ASSERT(atomic_read(&ip->i_count) == 0);
	zfs_inode_destroy(ip);
}
54 
55 /*
56  * Called from __mark_inode_dirty() to reflect that something in the
57  * inode has changed.  We use it to ensure the znode system attributes
 * are always strictly up to date with respect to the inode.
59  */
60 static void
zpl_dirty_inode(struct inode * ip,int flags)61 zpl_dirty_inode(struct inode *ip, int flags)
62 {
63 	fstrans_cookie_t cookie;
64 
65 	cookie = spl_fstrans_mark();
66 	zfs_dirty_inode(ip, flags);
67 	spl_fstrans_unmark(cookie);
68 }
69 
70 /*
71  * When ->drop_inode() is called its return value indicates if the
72  * inode should be evicted from the inode cache.  If the inode is
73  * unhashed and has no links the default policy is to evict it
74  * immediately.
75  *
76  * The ->evict_inode() callback must minimally truncate the inode pages,
77  * and call clear_inode().  For 2.6.35 and later kernels this will
78  * simply update the inode state, with the sync occurring before the
79  * truncate in evict().  For earlier kernels clear_inode() maps to
80  * end_writeback() which is responsible for completing all outstanding
81  * write back.  In either case, once this is done it is safe to cleanup
 * any remaining inode specific data via zfs_inactive().
84  */
static void
zpl_evict_inode(struct inode *ip)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	/*
	 * Ordering matters here: drop the page cache for the whole file,
	 * mark the inode clear for the VFS, and only then release the
	 * remaining ZFS-side state via zfs_inactive().
	 */
	truncate_setsize(ip, 0);
	clear_inode(ip);
	zfs_inactive(ip);
	spl_fstrans_unmark(cookie);
}
96 
97 static void
zpl_put_super(struct super_block * sb)98 zpl_put_super(struct super_block *sb)
99 {
100 	fstrans_cookie_t cookie;
101 	int error;
102 
103 	cookie = spl_fstrans_mark();
104 	error = -zfs_umount(sb);
105 	spl_fstrans_unmark(cookie);
106 	ASSERT3S(error, <=, 0);
107 }
108 
109 /*
110  * zfs_sync() is the underlying implementation for the sync(2) and syncfs(2)
111  * syscalls, via sb->s_op->sync_fs().
112  *
113  * Before kernel 5.17 (torvalds/linux@5679897eb104), syncfs() ->
114  * sync_filesystem() would ignore the return from sync_fs(), instead only
 * considering the error from syncing the underlying block device (sb->s_dev).
116  * Since OpenZFS doesn't _have_ an underlying block device, there's no way for
117  * us to report a sync directly.
118  *
119  * However, in 5.8 (torvalds/linux@735e4ae5ba28) the superblock gained an extra
120  * error store `s_wb_err`, to carry errors seen on page writeback since the
121  * last call to syncfs(). If sync_filesystem() does not return an error, any
122  * existing writeback error on the superblock will be used instead (and cleared
123  * either way). We don't use this (page writeback is a different thing for us),
124  * so for 5.8-5.17 we can use that instead to get syncfs() to return the error.
125  *
126  * Before 5.8, we have no other good options - no matter what happens, the
127  * userspace program will be told the call has succeeded, and so we must make
128  * it so, Therefore, when we are asked to wait for sync to complete (wait ==
129  * 1), if zfs_sync() has returned an error we have no choice but to block,
130  * regardless of the reason.
131  *
132  * The 5.17 change was backported to the 5.10, 5.15 and 5.16 series, and likely
133  * to some vendor kernels. Meanwhile, s_wb_err is still in use in 6.15 (the
134  * mainline Linux series at time of writing), and has likely been backported to
135  * vendor kernels before 5.8. We don't really want to use a workaround when we
136  * don't have to, but we can't really detect whether or not sync_filesystem()
137  * will return our errors (without a difficult runtime test anyway). So, we use
138  * a static version check: any kernel reporting its version as 5.17+ will use a
139  * direct error return, otherwise, we'll either use s_wb_err if it was detected
140  * at configure (5.8-5.16 + vendor backports). If it's unavailable, we will
141  * block to ensure the correct semantics.
142  *
143  * See https://github.com/openzfs/zfs/issues/17416 for further discussion.
144  */
static int
zpl_sync_fs(struct super_block *sb, int wait)
{
	fstrans_cookie_t cookie;
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	cookie = spl_fstrans_mark();
	/* zfs_sync() returns a positive errno; negate for Linux. */
	error = -zfs_sync(sb, wait, cr);

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)
#ifdef HAVE_SUPER_BLOCK_S_WB_ERR
	/*
	 * 5.8-5.16: our return value is ignored by sync_filesystem(), but
	 * an error recorded in sb->s_wb_err is propagated to syncfs() (see
	 * the block comment above), so stash the error there.
	 */
	if (error && wait)
		errseq_set(&sb->s_wb_err, error);
#else
	/*
	 * Pre-5.8 (no s_wb_err): the error cannot reach userspace at all.
	 * For a waiting sync, block until the pool has synced the current
	 * txg so the "success" we are forced to report is at least true,
	 * then clear the error.
	 */
	if (error && wait) {
		zfsvfs_t *zfsvfs = sb->s_fs_info;
		ASSERT3P(zfsvfs, !=, NULL);
		if (zfs_enter(zfsvfs, FTAG) == 0) {
			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
			zfs_exit(zfsvfs, FTAG);
			error = 0;
		}
	}
#endif
#endif /* < 5.17.0 */

	spl_fstrans_unmark(cookie);
	crfree(cr);

	ASSERT3S(error, <=, 0);
	return (error);
}
179 
180 static int
zpl_statfs(struct dentry * dentry,struct kstatfs * statp)181 zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
182 {
183 	fstrans_cookie_t cookie;
184 	int error;
185 
186 	cookie = spl_fstrans_mark();
187 	error = -zfs_statvfs(dentry->d_inode, statp);
188 	spl_fstrans_unmark(cookie);
189 	ASSERT3S(error, <=, 0);
190 
191 	/*
192 	 * If required by a 32-bit system call, dynamically scale the
193 	 * block size up to 16MiB and decrease the block counts.  This
194 	 * allows for a maximum size of 64EiB to be reported.  The file
195 	 * counts must be artificially capped at 2^32-1.
196 	 */
197 	if (unlikely(zpl_is_32bit_api())) {
198 		while (statp->f_blocks > UINT32_MAX &&
199 		    statp->f_bsize < SPA_MAXBLOCKSIZE) {
200 			statp->f_frsize <<= 1;
201 			statp->f_bsize <<= 1;
202 
203 			statp->f_blocks >>= 1;
204 			statp->f_bfree >>= 1;
205 			statp->f_bavail >>= 1;
206 		}
207 
208 		uint64_t usedobjs = statp->f_files - statp->f_ffree;
209 		statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
210 		statp->f_files = statp->f_ffree + usedobjs;
211 	}
212 
213 	return (error);
214 }
215 
216 static int
zpl_remount_fs(struct super_block * sb,int * flags,char * data)217 zpl_remount_fs(struct super_block *sb, int *flags, char *data)
218 {
219 	zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
220 	fstrans_cookie_t cookie;
221 	int error;
222 
223 	cookie = spl_fstrans_mark();
224 	error = -zfs_remount(sb, flags, &zm);
225 	spl_fstrans_unmark(cookie);
226 	ASSERT3S(error, <=, 0);
227 
228 	return (error);
229 }
230 
/*
 * Write the dataset name for this mount to @seq (the fsname column of
 * /proc/self/mounts), escaping characters getmntent(3) cannot handle.
 * Returns 0 or the positive error from zpl_enter().
 */
static int
__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	int error;
	if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
		return (error);

	char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	dmu_objset_name(zfsvfs->z_os, fsname);

	for (int i = 0; fsname[i] != 0; i++) {
		/*
		 * Spaces in the dataset name must be converted to their
		 * octal escape sequence for getmntent(3) to correctly
		 * parse the fsname portion of /proc/self/mounts.
		 */
		if (fsname[i] == ' ') {
			seq_puts(seq, "\\040");
		} else {
			seq_putc(seq, fsname[i]);
		}
	}

	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);

	zpl_exit(zfsvfs, FTAG);

	return (0);
}
260 
261 static int
zpl_show_devname(struct seq_file * seq,struct dentry * root)262 zpl_show_devname(struct seq_file *seq, struct dentry *root)
263 {
264 	return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
265 }
266 
267 static int
__zpl_show_options(struct seq_file * seq,zfsvfs_t * zfsvfs)268 __zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
269 {
270 	seq_printf(seq, ",%s",
271 	    zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");
272 
273 #ifdef CONFIG_FS_POSIX_ACL
274 	switch (zfsvfs->z_acl_type) {
275 	case ZFS_ACLTYPE_POSIX:
276 		seq_puts(seq, ",posixacl");
277 		break;
278 	default:
279 		seq_puts(seq, ",noacl");
280 		break;
281 	}
282 #endif /* CONFIG_FS_POSIX_ACL */
283 
284 	switch (zfsvfs->z_case) {
285 	case ZFS_CASE_SENSITIVE:
286 		seq_puts(seq, ",casesensitive");
287 		break;
288 	case ZFS_CASE_INSENSITIVE:
289 		seq_puts(seq, ",caseinsensitive");
290 		break;
291 	default:
292 		seq_puts(seq, ",casemixed");
293 		break;
294 	}
295 
296 	return (0);
297 }
298 
299 static int
zpl_show_options(struct seq_file * seq,struct dentry * root)300 zpl_show_options(struct seq_file *seq, struct dentry *root)
301 {
302 	return (__zpl_show_options(seq, root->d_sb->s_fs_info));
303 }
304 
305 static int
zpl_fill_super(struct super_block * sb,void * data,int silent)306 zpl_fill_super(struct super_block *sb, void *data, int silent)
307 {
308 	zfs_mnt_t *zm = (zfs_mnt_t *)data;
309 	fstrans_cookie_t cookie;
310 	int error;
311 
312 	cookie = spl_fstrans_mark();
313 	error = -zfs_domount(sb, zm, silent);
314 	spl_fstrans_unmark(cookie);
315 	ASSERT3S(error, <=, 0);
316 
317 	return (error);
318 }
319 
320 static int
zpl_test_super(struct super_block * s,void * data)321 zpl_test_super(struct super_block *s, void *data)
322 {
323 	zfsvfs_t *zfsvfs = s->s_fs_info;
324 	objset_t *os = data;
325 	/*
326 	 * If the os doesn't match the z_os in the super_block, assume it is
327 	 * not a match. Matching would imply a multimount of a dataset. It is
328 	 * possible that during a multimount, there is a simultaneous operation
329 	 * that changes the z_os, e.g., rollback, where the match will be
330 	 * missed, but in that case the user will get an EBUSY.
331 	 */
332 	return (zfsvfs != NULL && os == zfsvfs->z_os);
333 }
334 
/*
 * Locate an existing superblock for the objset named in @zm, or create
 * and fill a new one.  Returns the superblock or an ERR_PTR() value.
 */
static struct super_block *
zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
{
	struct super_block *s;
	objset_t *os;
	boolean_t issnap = B_FALSE;
	int err;

	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
	if (err)
		return (ERR_PTR(-err));

	/*
	 * The dsl pool lock must be released prior to calling sget().
	 * It is possible sget() may block on the lock in grab_super()
	 * while deactivate_super() holds that same lock and waits for
	 * a txg sync.  If the dsl_pool lock is held over sget()
	 * this can prevent the pool sync and cause a deadlock.  The long
	 * hold keeps the dataset from going away meanwhile.
	 */
	dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
	dsl_pool_rele(dmu_objset_pool(os), FTAG);

	s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);

	/*
	 * Recheck with the lock held to prevent mounting the wrong dataset
	 * since z_os can be stale when the teardown lock is held.
	 *
	 * We can't do this in zpl_test_super since it runs under a spinlock
	 * and the s_umount lock is not held there, so it would race with
	 * zfs_umount and zfsvfs could be freed.
	 */
	if (!IS_ERR(s) && s->s_fs_info != NULL) {
		zfsvfs_t *zfsvfs = s->s_fs_info;
		if (zpl_enter(zfsvfs, FTAG) == 0) {
			if (os != zfsvfs->z_os)
				err = -SET_ERROR(EBUSY);
			issnap = zfsvfs->z_issnap;
			zpl_exit(zfsvfs, FTAG);
		} else {
			err = -SET_ERROR(EBUSY);
		}
	}
	/* Drop the dataset holds taken above. */
	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
	dsl_dataset_rele(dmu_objset_ds(os), FTAG);

	if (IS_ERR(s))
		return (ERR_CAST(s));

	if (err) {
		deactivate_locked_super(s);
		return (ERR_PTR(err));
	}

	if (s->s_root == NULL) {
		/* Freshly created superblock: perform the actual mount. */
		err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
		if (err) {
			deactivate_locked_super(s);
			return (ERR_PTR(err));
		}
		s->s_flags |= SB_ACTIVE;
	} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
		/*
		 * Skip ro check for snap since snap is always ro regardless
		 * ro flag is passed by mount or not.
		 */
		deactivate_locked_super(s);
		return (ERR_PTR(-EBUSY));
	}

	return (s);
}
407 
408 static struct dentry *
zpl_mount(struct file_system_type * fs_type,int flags,const char * osname,void * data)409 zpl_mount(struct file_system_type *fs_type, int flags,
410     const char *osname, void *data)
411 {
412 	zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };
413 
414 	struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
415 	if (IS_ERR(sb))
416 		return (ERR_CAST(sb));
417 
418 	return (dget(sb->s_root));
419 }
420 
/*
 * ->kill_sb() callback: run ZFS pre-unmount work before the generic
 * anonymous-superblock teardown.
 */
static void
zpl_kill_sb(struct super_block *sb)
{
	zfs_preumount(sb);
	kill_anon_super(sb);
}
427 
428 void
zpl_prune_sb(uint64_t nr_to_scan,void * arg)429 zpl_prune_sb(uint64_t nr_to_scan, void *arg)
430 {
431 	struct super_block *sb = (struct super_block *)arg;
432 	int objects = 0;
433 
434 	/*
435 	 * Ensure the superblock is not in the process of being torn down.
436 	 */
437 #ifdef HAVE_SB_DYING
438 	if (down_read_trylock(&sb->s_umount)) {
439 		if (!(sb->s_flags & SB_DYING) && sb->s_root &&
440 		    (sb->s_flags & SB_BORN)) {
441 			(void) zfs_prune(sb, nr_to_scan, &objects);
442 		}
443 		up_read(&sb->s_umount);
444 	}
445 #else
446 	if (down_read_trylock(&sb->s_umount)) {
447 		if (!hlist_unhashed(&sb->s_instances) &&
448 		    sb->s_root && (sb->s_flags & SB_BORN)) {
449 			(void) zfs_prune(sb, nr_to_scan, &objects);
450 		}
451 		up_read(&sb->s_umount);
452 	}
453 #endif
454 }
455 
/*
 * Superblock operations: thin Linux VFS shims around the common
 * zfs_vfsops implementations.  NULL entries use the kernel defaults.
 */
const struct super_operations zpl_super_operations = {
	.alloc_inode		= zpl_inode_alloc,
	.destroy_inode		= zpl_inode_destroy,
	.dirty_inode		= zpl_dirty_inode,
	.write_inode		= NULL,
	.evict_inode		= zpl_evict_inode,
	.put_super		= zpl_put_super,
	.sync_fs		= zpl_sync_fs,
	.statfs			= zpl_statfs,
	.remount_fs		= zpl_remount_fs,
	.show_devname		= zpl_show_devname,
	.show_options		= zpl_show_options,
	.show_stats		= NULL,
};
470 
/*
 * Filesystem type registration for the ZFS driver.  FS_USERNS_MOUNT
 * permits mounting inside user namespaces; FS_ALLOW_IDMAP additionally
 * enables idmapped mounts on kernels providing the idmapped-mount API.
 */
struct file_system_type zpl_fs_type = {
	.owner			= THIS_MODULE,
	.name			= ZFS_DRIVER,
#if defined(HAVE_IDMAP_MNT_API)
	.fs_flags		= FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
#else
	.fs_flags		= FS_USERNS_MOUNT,
#endif
	.mount			= zpl_mount,
	.kill_sb		= zpl_kill_sb,
};
482