// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2023, Datto Inc. All rights reserved.
 */


#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_ctldir.h>
#include <sys/zpl.h>
#include <linux/iversion.h>
#include <linux/version.h>

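/*
 * Allocate a new inode for this superblock. The allocation itself is
 * delegated to zfs_inode_alloc(), which hands back the inode embedded
 * in a freshly allocated znode; here we only seed the i_version change
 * counter so it starts from a valid value.
 */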
static struct inode *
zpl_inode_alloc(struct super_block *sb)
{
	struct inode *ip;

	VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
	inode_set_iversion(ip, 1);

	return (ip);
}

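/*
 * Release an inode previously allocated by zpl_inode_alloc(). The
 * inode must have no remaining references; the backing znode is torn
 * down by zfs_inode_destroy().
 */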
static void
zpl_inode_destroy(struct inode *ip)
{
	ASSERT(atomic_read(&ip->i_count) == 0);
	zfs_inode_destroy(ip);
}

/*
 * Called from __mark_inode_dirty() to reflect that something in the
 * inode has changed. We use it to ensure the znode system attributes
 * are always strictly up to date with respect to the inode.
 */
static void
zpl_dirty_inode(struct inode *ip, int flags)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	zfs_dirty_inode(ip, flags);
	spl_fstrans_unmark(cookie);
}

/*
 * When ->drop_inode() is called its return value indicates if the
 * inode should be evicted from the inode cache. If the inode is
 * unhashed and has no links the default policy is to evict it
 * immediately.
 *
 * The ->evict_inode() callback must minimally truncate the inode pages,
 * and call clear_inode(). For 2.6.35 and later kernels this will
 * simply update the inode state, with the sync occurring before the
 * truncate in evict(). For earlier kernels clear_inode() maps to
 * end_writeback() which is responsible for completing all outstanding
 * write back. In either case, once this is done it is safe to cleanup
 * any remaining inode specific data via zfs_inactive().
 */
static void
zpl_evict_inode(struct inode *ip)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	truncate_setsize(ip, 0);
	clear_inode(ip);
	zfs_inactive(ip);
	spl_fstrans_unmark(cookie);
}

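/*
 * Called by the VFS when the last reference to the superblock is
 * dropped. ->put_super has no way to report failure, so the result of
 * zfs_umount() is only sanity-checked here.
 */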
static void
zpl_put_super(struct super_block *sb)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_umount(sb);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);
}

/*
 * zfs_sync() is the underlying implementation for the sync(2) and syncfs(2)
 * syscalls, via sb->s_op->sync_fs().
 *
 * Before kernel 5.17 (torvalds/linux@5679897eb104), syncfs() ->
 * sync_filesystem() would ignore the return from sync_fs(), instead only
 * considering the error from syncing the underlying block device (sb->s_dev).
 * Since OpenZFS doesn't _have_ an underlying block device, there's no way for
 * us to report a sync error directly.
 *
 * However, in 5.8 (torvalds/linux@735e4ae5ba28) the superblock gained an extra
 * error store `s_wb_err`, to carry errors seen on page writeback since the
 * last call to syncfs(). If sync_filesystem() does not return an error, any
 * existing writeback error on the superblock will be used instead (and cleared
 * either way). We don't use this (page writeback is a different thing for us),
 * so for 5.8-5.16 we can use it to get syncfs() to return the error.
 *
 * Before 5.8, we have no other good options - no matter what happens, the
 * userspace program will be told the call has succeeded, and so we must make
 * it so. Therefore, when we are asked to wait for sync to complete (wait ==
 * 1), if zfs_sync() has returned an error we have no choice but to block,
 * regardless of the reason.
 *
 * The 5.17 change was backported to the 5.10, 5.15 and 5.16 series, and likely
 * to some vendor kernels. Meanwhile, s_wb_err is still in use in 6.15 (the
 * mainline Linux series at time of writing), and has likely been backported to
 * vendor kernels before 5.8. We don't really want to use a workaround when we
 * don't have to, but we can't really detect whether or not sync_filesystem()
 * will return our errors (without a difficult runtime test anyway). So, we use
 * a static version check: any kernel reporting its version as 5.17+ will use a
 * direct error return; otherwise, we'll use s_wb_err if it was detected at
 * configure (5.8-5.16 + vendor backports). If it's unavailable, we will block
 * to ensure the correct semantics.
 *
 * See https://github.com/openzfs/zfs/issues/17416 for further discussion.
 */
static int
zpl_sync_fs(struct super_block *sb, int wait)
{
	fstrans_cookie_t cookie;
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_sync(sb, wait, cr);

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)
#ifdef HAVE_SUPER_BLOCK_S_WB_ERR
	if (error && wait)
		errseq_set(&sb->s_wb_err, error);
#else
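	/*
	 * No direct error return and no s_wb_err available: block until
	 * the pool has synced the current txg, so the success we are
	 * forced to report is at least true by then.
	 */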
	if (error && wait) {
		zfsvfs_t *zfsvfs = sb->s_fs_info;
		ASSERT3P(zfsvfs, !=, NULL);
		if (zfs_enter(zfsvfs, FTAG) == 0) {
			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
			zfs_exit(zfsvfs, FTAG);
			error = 0;
		}
	}
#endif
#endif /* < 5.17.0 */

	spl_fstrans_unmark(cookie);
	crfree(cr);

	ASSERT3S(error, <=, 0);
	return (error);
}

static int
zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_statvfs(dentry->d_inode, statp);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	/*
	 * If required by a 32-bit system call, dynamically scale the
	 * block size up to 16MiB and decrease the block counts. This
	 * allows for a maximum size of 64EiB to be reported. The file
	 * counts must be artificially capped at 2^32-1.
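	 *
	 * For example, with f_bsize = 128KiB and f_blocks = 2^33 the
	 * loop below doubles the block size twice, reporting
	 * f_bsize = 512KiB and f_blocks = 2^31, which fits in 32 bits.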
	 */
	if (unlikely(zpl_is_32bit_api())) {
		while (statp->f_blocks > UINT32_MAX &&
		    statp->f_bsize < SPA_MAXBLOCKSIZE) {
			statp->f_frsize <<= 1;
			statp->f_bsize <<= 1;

			statp->f_blocks >>= 1;
			statp->f_bfree >>= 1;
			statp->f_bavail >>= 1;
		}

		uint64_t usedobjs = statp->f_files - statp->f_ffree;
		statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
		statp->f_files = statp->f_ffree + usedobjs;
	}

	return (error);
}

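/*
 * Called for mount(2) with MS_REMOUNT. Only the mount options are
 * reparsed; the dataset is unchanged, so mnt_osname is left NULL.
 */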
static int
zpl_remount_fs(struct super_block *sb, int *flags, char *data)
{
	zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_remount(sb, flags, &zm);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}

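/*
 * Emit the dataset name as the device name shown in /proc/self/mounts.
 */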
static int
__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	int error;
	if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
		return (error);

	char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	dmu_objset_name(zfsvfs->z_os, fsname);

	for (int i = 0; fsname[i] != 0; i++) {
		/*
		 * Spaces in the dataset name must be converted to their
		 * octal escape sequence for getmntent(3) to correctly
		 * parse the fsname portion of /proc/self/mounts.
		 */
		if (fsname[i] == ' ') {
			seq_puts(seq, "\\040");
		} else {
			seq_putc(seq, fsname[i]);
		}
	}

	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);

	zpl_exit(zfsvfs, FTAG);

	return (0);
}

static int
zpl_show_devname(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
}

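/*
 * Emit the mount options reflected in /proc/self/mounts: the xattr
 * setting, the ACL type, and the case sensitivity of the dataset.
 */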
static int
__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	seq_printf(seq, ",%s",
	    zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");

#ifdef CONFIG_FS_POSIX_ACL
	switch (zfsvfs->z_acl_type) {
	case ZFS_ACLTYPE_POSIX:
		seq_puts(seq, ",posixacl");
		break;
	default:
		seq_puts(seq, ",noacl");
		break;
	}
#endif /* CONFIG_FS_POSIX_ACL */

	switch (zfsvfs->z_case) {
	case ZFS_CASE_SENSITIVE:
		seq_puts(seq, ",casesensitive");
		break;
	case ZFS_CASE_INSENSITIVE:
		seq_puts(seq, ",caseinsensitive");
		break;
	default:
		seq_puts(seq, ",casemixed");
		break;
	}

	return (0);
}

static int
zpl_show_options(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_options(seq, root->d_sb->s_fs_info));
}

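/*
 * Populate a newly allocated superblock. Invoked from zpl_mount_impl()
 * when sget() returns a superblock without a root dentry; the heavy
 * lifting is done by zfs_domount().
 */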
static int
zpl_fill_super(struct super_block *sb, void *data, int silent)
{
	zfs_mnt_t *zm = (zfs_mnt_t *)data;
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_domount(sb, zm, silent);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_test_super(struct super_block *s, void *data)
{
	zfsvfs_t *zfsvfs = s->s_fs_info;
	objset_t *os = data;
	/*
	 * If the os doesn't match the z_os in the super_block, assume it is
	 * not a match. Matching would imply a multimount of a dataset. It is
	 * possible that during a multimount, there is a simultaneous operation
	 * that changes the z_os, e.g., rollback, where the match will be
	 * missed, but in that case the user will get an EBUSY.
	 */
	return (zfsvfs != NULL && os == zfsvfs->z_os);
}

static struct super_block *
zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
{
	struct super_block *s;
	objset_t *os;
	boolean_t issnap = B_FALSE;
	int err;

	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
	if (err)
		return (ERR_PTR(-err));

	/*
	 * The dsl pool lock must be released prior to calling sget().
	 * It is possible sget() may block on the lock in grab_super()
	 * while deactivate_super() holds that same lock and waits for
	 * a txg sync. If the dsl_pool lock is held over sget()
	 * this can prevent the pool sync and cause a deadlock.
	 */
	dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
	dsl_pool_rele(dmu_objset_pool(os), FTAG);

	s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);

	/*
	 * Recheck with the lock held to prevent mounting the wrong dataset
	 * since z_os can be stale when the teardown lock is held.
	 *
	 * We can't do this in zpl_test_super since it runs under a spinlock,
	 * and the s_umount lock is not held there, so it would race with
	 * zfs_umount and the zfsvfs could be freed.
	 */
	if (!IS_ERR(s) && s->s_fs_info != NULL) {
		zfsvfs_t *zfsvfs = s->s_fs_info;
		if (zpl_enter(zfsvfs, FTAG) == 0) {
			if (os != zfsvfs->z_os)
				err = -SET_ERROR(EBUSY);
			issnap = zfsvfs->z_issnap;
			zpl_exit(zfsvfs, FTAG);
		} else {
			err = -SET_ERROR(EBUSY);
		}
	}
	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
	dsl_dataset_rele(dmu_objset_ds(os), FTAG);

	if (IS_ERR(s))
		return (ERR_CAST(s));

	if (err) {
		deactivate_locked_super(s);
		return (ERR_PTR(err));
	}

	if (s->s_root == NULL) {
		err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
		if (err) {
			deactivate_locked_super(s);
			return (ERR_PTR(err));
		}
		s->s_flags |= SB_ACTIVE;
	} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
		/*
		 * Skip the read-only check for snapshots, since a snapshot
		 * is always read-only regardless of whether the ro flag was
		 * passed to mount.
		 */
		deactivate_locked_super(s);
		return (ERR_PTR(-EBUSY));
	}

	return (s);
}

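/*
 * The file_system_type ->mount callback. Packs the dataset name and
 * option string into a zfs_mnt_t and returns the root dentry of the
 * resulting superblock.
 */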
static struct dentry *
zpl_mount(struct file_system_type *fs_type, int flags,
    const char *osname, void *data)
{
	zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };

	struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
	if (IS_ERR(sb))
		return (ERR_CAST(sb));

	return (dget(sb->s_root));
}

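/*
 * Give ZFS a chance to tear down per-superblock state (notably the
 * .zfs control directory) via zfs_preumount() before the generic
 * anonymous superblock teardown runs.
 */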
static void
zpl_kill_sb(struct super_block *sb)
{
	zfs_preumount(sb);
	kill_anon_super(sb);
}

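/*
 * Prune dentry and inode caches for this superblock when memory is
 * low. This is typically invoked via the ARC prune callback registered
 * at mount time.
 */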
void
zpl_prune_sb(uint64_t nr_to_scan, void *arg)
{
	struct super_block *sb = (struct super_block *)arg;
	int objects = 0;

	/*
	 * Ensure the superblock is not in the process of being torn down.
	 */
#ifdef HAVE_SB_DYING
	if (down_read_trylock(&sb->s_umount)) {
		if (!(sb->s_flags & SB_DYING) && sb->s_root &&
		    (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#else
	if (down_read_trylock(&sb->s_umount)) {
		if (!hlist_unhashed(&sb->s_instances) &&
		    sb->s_root && (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#endif
}

455
456 const struct super_operations zpl_super_operations = {
457 .alloc_inode = zpl_inode_alloc,
458 .destroy_inode = zpl_inode_destroy,
459 .dirty_inode = zpl_dirty_inode,
460 .write_inode = NULL,
461 .evict_inode = zpl_evict_inode,
462 .put_super = zpl_put_super,
463 .sync_fs = zpl_sync_fs,
464 .statfs = zpl_statfs,
465 .remount_fs = zpl_remount_fs,
466 .show_devname = zpl_show_devname,
467 .show_options = zpl_show_options,
468 .show_stats = NULL,
469 };
470
471 struct file_system_type zpl_fs_type = {
472 .owner = THIS_MODULE,
473 .name = ZFS_DRIVER,
474 #if defined(HAVE_IDMAP_MNT_API)
475 .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
476 #else
477 .fs_flags = FS_USERNS_MOUNT,
478 #endif
479 .mount = zpl_mount,
480 .kill_sb = zpl_kill_sb,
481 };
482