// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2023, Datto Inc. All rights reserved.
 */


#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_ctldir.h>
#include <sys/zpl.h>
#include <linux/iversion.h>
#include <linux/version.h>


static struct inode *
zpl_inode_alloc(struct super_block *sb)
{
	struct inode *ip;

	VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
	inode_set_iversion(ip, 1);

	return (ip);
}

static void
zpl_inode_destroy(struct inode *ip)
{
	ASSERT(atomic_read(&ip->i_count) == 0);
	zfs_inode_destroy(ip);
}

/*
 * Called from __mark_inode_dirty() to reflect that something in the
 * inode has changed. We use it to ensure the znode system attributes
 * are always strictly up to date with respect to the inode.
 */
static void
zpl_dirty_inode(struct inode *ip, int flags)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	zfs_dirty_inode(ip, flags);
	spl_fstrans_unmark(cookie);
}

/*
 * When ->drop_inode() is called its return value indicates if the
 * inode should be evicted from the inode cache. If the inode is
 * unhashed and has no links the default policy is to evict it
 * immediately.
 *
 * The ->evict_inode() callback must minimally truncate the inode pages,
 * and call clear_inode(). For 2.6.35 and later kernels this will
 * simply update the inode state, with the sync occurring before the
 * truncate in evict(). For earlier kernels clear_inode() maps to
 * end_writeback() which is responsible for completing all outstanding
 * write back. In either case, once this is done it is safe to cleanup
 * any remaining inode specific data via zfs_inactive().
 */
static void
zpl_evict_inode(struct inode *ip)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	truncate_setsize(ip, 0);
	clear_inode(ip);
	zfs_inactive(ip);
	spl_fstrans_unmark(cookie);
}

static void
zpl_put_super(struct super_block *sb)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_umount(sb);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);
}

/*
 * zfs_sync() is the underlying implementation for the sync(2) and syncfs(2)
 * syscalls, via sb->s_op->sync_fs().
 *
 * Before kernel 5.17 (torvalds/linux@5679897eb104), syncfs() ->
 * sync_filesystem() would ignore the return from sync_fs(), instead only
 * considering the error from syncing the underlying block device (sb->s_dev).
 * Since OpenZFS doesn't _have_ an underlying block device, there's no way for
 * us to report a sync error directly.
 *
 * However, in 5.8 (torvalds/linux@735e4ae5ba28) the superblock gained an extra
 * error store `s_wb_err`, to carry errors seen on page writeback since the
 * last call to syncfs(). If sync_filesystem() does not return an error, any
 * existing writeback error on the superblock will be used instead (and cleared
 * either way). We don't use this (page writeback is a different thing for us),
 * so for 5.8-5.16 we can use it to get syncfs() to return the error.
 *
 * Before 5.8, we have no other good options - no matter what happens, the
 * userspace program will be told the call has succeeded, and so we must make
 * it so. Therefore, when we are asked to wait for sync to complete (wait ==
 * 1), if zfs_sync() has returned an error we have no choice but to block,
 * regardless of the reason.
 *
 * The 5.17 change was backported to the 5.10, 5.15 and 5.16 series, and likely
 * to some vendor kernels. Meanwhile, s_wb_err is still in use in 6.15 (the
 * mainline Linux series at time of writing), and has likely been backported to
 * vendor kernels before 5.8. We don't really want to use a workaround when we
 * don't have to, but we can't really detect whether or not sync_filesystem()
 * will return our errors (without a difficult runtime test anyway). So, we use
 * a static version check: any kernel reporting its version as 5.17+ will use a
 * direct error return; otherwise, we'll use s_wb_err if it was detected at
 * configure time (5.8-5.16 + vendor backports). If it's unavailable, we will
 * block to ensure the correct semantics.
 *
 * See https://github.com/openzfs/zfs/issues/17416 for further discussion.
 */
static int
zpl_sync_fs(struct super_block *sb, int wait)
{
	fstrans_cookie_t cookie;
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_sync(sb, wait, cr);

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)
#ifdef HAVE_SUPER_BLOCK_S_WB_ERR
	if (error && wait)
		errseq_set(&sb->s_wb_err, error);
#else
	if (error && wait) {
		zfsvfs_t *zfsvfs = sb->s_fs_info;
		ASSERT3P(zfsvfs, !=, NULL);
		if (zfs_enter(zfsvfs, FTAG) == 0) {
			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
			zfs_exit(zfsvfs, FTAG);
			error = 0;
		}
	}
#endif
#endif /* < 5.17.0 */

	spl_fstrans_unmark(cookie);
	crfree(cr);

	ASSERT3S(error, <=, 0);
	return (error);
}
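
/*
 * To summarize the strategy implemented above (an illustrative recap of
 * the cases already handled, not additional behavior):
 *
 *	kernel		error reporting for syncfs(2)
 *	>= 5.17		returned directly from ->sync_fs()
 *	5.8 - 5.16	stored in sb->s_wb_err via errseq_set()
 *	< 5.8		block in txg_wait_synced() and report success
 */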

static int
zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_statvfs(dentry->d_inode, statp);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	/*
	 * If required by a 32-bit system call, dynamically scale the
	 * block size up to 16MiB and decrease the block counts. This
	 * allows for a maximum size of 64PiB to be reported. The file
	 * counts must be artificially capped at 2^32-1.
	 */
	if (unlikely(zpl_is_32bit_api())) {
		while (statp->f_blocks > UINT32_MAX &&
		    statp->f_bsize < SPA_MAXBLOCKSIZE) {
			statp->f_frsize <<= 1;
			statp->f_bsize <<= 1;

			statp->f_blocks >>= 1;
			statp->f_bfree >>= 1;
			statp->f_bavail >>= 1;
		}

		uint64_t usedobjs = statp->f_files - statp->f_ffree;
		statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
		statp->f_files = statp->f_ffree + usedobjs;
	}

	return (error);
}
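
/*
 * Worked example of the scaling above (illustrative numbers only): a
 * 32TiB pool reported as 2^33 blocks of 4KiB overflows a 32-bit
 * f_blocks, so the loop doubles the block size and halves the counts
 * until everything fits:
 *
 *	f_bsize:  4KiB -> 8KiB -> 16KiB
 *	f_blocks: 2^33 -> 2^32 -> 2^31	(<= UINT32_MAX, done)
 *
 * With f_bsize capped at SPA_MAXBLOCKSIZE (16MiB), the largest size
 * representable is (2^32 - 1) * 2^24 bytes, just under 2^56 bytes
 * (64PiB); anything larger leaves f_blocks > UINT32_MAX.
 */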

static int
zpl_remount_fs(struct super_block *sb, int *flags, char *data)
{
	zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_remount(sb, flags, &zm);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	int error;
	if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
		return (error);

	char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	dmu_objset_name(zfsvfs->z_os, fsname);

	for (int i = 0; fsname[i] != 0; i++) {
		/*
		 * Spaces in the dataset name must be converted to their
		 * octal escape sequence for getmntent(3) to correctly
		 * parse the fsname portion of /proc/self/mounts.
		 */
		if (fsname[i] == ' ') {
			seq_puts(seq, "\\040");
		} else {
			seq_putc(seq, fsname[i]);
		}
	}

	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);

	zpl_exit(zfsvfs, FTAG);

	return (0);
}

static int
zpl_show_devname(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
}

static int
__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	seq_printf(seq, ",%s",
	    zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");

#ifdef CONFIG_FS_POSIX_ACL
	switch (zfsvfs->z_acl_type) {
	case ZFS_ACLTYPE_POSIX:
		seq_puts(seq, ",posixacl");
		break;
	default:
		seq_puts(seq, ",noacl");
		break;
	}
#endif /* CONFIG_FS_POSIX_ACL */

	switch (zfsvfs->z_case) {
	case ZFS_CASE_SENSITIVE:
		seq_puts(seq, ",casesensitive");
		break;
	case ZFS_CASE_INSENSITIVE:
		seq_puts(seq, ",caseinsensitive");
		break;
	default:
		seq_puts(seq, ",casemixed");
		break;
	}

	return (0);
}

static int
zpl_show_options(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_options(seq, root->d_sb->s_fs_info));
}

static int
zpl_fill_super(struct super_block *sb, void *data, int silent)
{
	zfs_mnt_t *zm = (zfs_mnt_t *)data;
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_domount(sb, zm, silent);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_test_super(struct super_block *s, void *data)
{
	zfsvfs_t *zfsvfs = s->s_fs_info;
	objset_t *os = data;
	/*
	 * If the os doesn't match the z_os in the super_block, assume it is
	 * not a match. Matching would imply a multimount of a dataset. It is
	 * possible that during a multimount, there is a simultaneous operation
	 * that changes the z_os, e.g., rollback, where the match will be
	 * missed, but in that case the user will get an EBUSY.
	 */
	return (zfsvfs != NULL && os == zfsvfs->z_os);
}

static struct super_block *
zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
{
	struct super_block *s;
	objset_t *os;
	boolean_t issnap = B_FALSE;
	int err;

	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
	if (err)
		return (ERR_PTR(-err));

	/*
	 * The dsl pool lock must be released prior to calling sget().
	 * It is possible sget() may block on the lock in grab_super()
	 * while deactivate_super() holds that same lock and waits for
	 * a txg sync. If the dsl_pool lock is held over sget()
	 * this can prevent the pool sync and cause a deadlock.
	 */
	dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
	dsl_pool_rele(dmu_objset_pool(os), FTAG);

	s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);

	/*
	 * Recheck with the lock held to prevent mounting the wrong dataset
	 * since z_os can be stale when the teardown lock is held.
	 *
	 * We can't do this in zpl_test_super since it runs under a spinlock
	 * there, and the s_umount lock is not held either, so it would race
	 * with zfs_umount and the zfsvfs could be freed.
	 */
	if (!IS_ERR(s) && s->s_fs_info != NULL) {
		zfsvfs_t *zfsvfs = s->s_fs_info;
		if (zpl_enter(zfsvfs, FTAG) == 0) {
			if (os != zfsvfs->z_os)
				err = -SET_ERROR(EBUSY);
			issnap = zfsvfs->z_issnap;
			zpl_exit(zfsvfs, FTAG);
		} else {
			err = -SET_ERROR(EBUSY);
		}
	}
	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
	dsl_dataset_rele(dmu_objset_ds(os), FTAG);

	if (IS_ERR(s))
		return (ERR_CAST(s));

	if (err) {
		deactivate_locked_super(s);
		return (ERR_PTR(err));
	}

	if (s->s_root == NULL) {
		err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
		if (err) {
			deactivate_locked_super(s);
			return (ERR_PTR(err));
		}
		s->s_flags |= SB_ACTIVE;
	} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
		/*
		 * Skip the read-only check for snapshots, since a snapshot
		 * is always read-only regardless of whether the ro flag is
		 * passed by mount or not.
		 */
		deactivate_locked_super(s);
		return (ERR_PTR(-EBUSY));
	}

	return (s);
}

static struct dentry *
zpl_mount(struct file_system_type *fs_type, int flags,
    const char *osname, void *data)
{
	zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };

	struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
	if (IS_ERR(sb))
		return (ERR_CAST(sb));

	return (dget(sb->s_root));
}

static void
zpl_kill_sb(struct super_block *sb)
{
	zfs_preumount(sb);
	kill_anon_super(sb);
}
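
/*
 * For orientation, a descriptive note on the mount path defined above:
 * a mount request arrives via zpl_mount(), which packs the dataset name
 * and options into a zfs_mnt_t and calls zpl_mount_impl(). That in turn
 * uses sget() to find or allocate the superblock (matching existing
 * mounts via zpl_test_super()), and for a new superblock calls
 * zpl_fill_super() -> zfs_domount() to complete the mount. Unmount runs
 * the path in reverse: zpl_kill_sb() -> zfs_preumount(), then
 * zpl_put_super() -> zfs_umount() via generic superblock shutdown.
 */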

void
zpl_prune_sb(uint64_t nr_to_scan, void *arg)
{
	struct super_block *sb = (struct super_block *)arg;
	int objects = 0;

	/*
	 * Ensure the superblock is not in the process of being torn down.
	 */
#ifdef HAVE_SB_DYING
	if (down_read_trylock(&sb->s_umount)) {
		if (!(sb->s_flags & SB_DYING) && sb->s_root &&
		    (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#else
	if (down_read_trylock(&sb->s_umount)) {
		if (!hlist_unhashed(&sb->s_instances) &&
		    sb->s_root && (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#endif
}

const struct super_operations zpl_super_operations = {
	.alloc_inode		= zpl_inode_alloc,
	.destroy_inode		= zpl_inode_destroy,
	.dirty_inode		= zpl_dirty_inode,
	.write_inode		= NULL,
	.evict_inode		= zpl_evict_inode,
	.put_super		= zpl_put_super,
	.sync_fs		= zpl_sync_fs,
	.statfs			= zpl_statfs,
	.remount_fs		= zpl_remount_fs,
	.show_devname		= zpl_show_devname,
	.show_options		= zpl_show_options,
	.show_stats		= NULL,
};

struct file_system_type zpl_fs_type = {
	.owner			= THIS_MODULE,
	.name			= ZFS_DRIVER,
#if defined(HAVE_IDMAP_MNT_API)
	.fs_flags		= FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
#else
	.fs_flags		= FS_USERNS_MOUNT,
#endif
	.mount			= zpl_mount,
	.kill_sb		= zpl_kill_sb,
};
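
/*
 * Note: zpl_fs_type is only defined here; hooking it into the VFS is
 * done elsewhere during module initialization. As a minimal sketch of
 * what that amounts to (assumed placement, not code from this file):
 *
 *	error = register_filesystem(&zpl_fs_type);
 *
 * with a matching unregister_filesystem(&zpl_fs_type) at module unload.
 */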