// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2023, Datto Inc. All rights reserved.
 */


#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_ctldir.h>
#include <sys/zpl.h>
#include <linux/iversion.h>
#include <linux/version.h>


/*
 * Allocate a new inode for the superblock.  Allocation is delegated to
 * zfs_inode_alloc() and must succeed (VERIFY); the inode version counter
 * is initialized for NFS/knfsd change detection.
 */
static struct inode *
zpl_inode_alloc(struct super_block *sb)
{
	struct inode *ip;

	VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
	inode_set_iversion(ip, 1);

	return (ip);
}

#ifdef HAVE_SOPS_FREE_INODE
/*
 * Final inode release via the ->free_inode() callback (RCU-delayed on
 * kernels that provide it).  The inode must no longer be referenced.
 */
static void
zpl_inode_free(struct inode *ip)
{
	ASSERT(atomic_read(&ip->i_count) == 0);
	zfs_inode_free(ip);
}
#endif

/*
 * Destroy an inode via the ->destroy_inode() callback.  The inode must
 * no longer be referenced.
 */
static void
zpl_inode_destroy(struct inode *ip)
{
	ASSERT(atomic_read(&ip->i_count) == 0);
	zfs_inode_destroy(ip);
}

/*
 * Called from __mark_inode_dirty() to reflect that something in the
 * inode has changed.  We use it to ensure the znode system attributes
 * are always strictly up to date with respect to the inode.
 */
static void
zpl_dirty_inode(struct inode *ip, int flags)
{
	fstrans_cookie_t cookie;

	/* Prevent fs-reentrant allocations while dirtying the znode. */
	cookie = spl_fstrans_mark();
	zfs_dirty_inode(ip, flags);
	spl_fstrans_unmark(cookie);
}

/*
 * When ->drop_inode() is called its return value indicates if the
 * inode should be evicted from the inode cache.  If the inode is
 * unhashed and has no links the default policy is to evict it
 * immediately.
 *
 * The ->evict_inode() callback must minimally truncate the inode pages,
 * and call clear_inode().  For 2.6.35 and later kernels this will
 * simply update the inode state, with the sync occurring before the
 * truncate in evict().  For earlier kernels clear_inode() maps to
 * end_writeback() which is responsible for completing all outstanding
 * write back.  In either case, once this is done it is safe to cleanup
 * any remaining inode specific data via zfs_inactive().
 */
static void
zpl_evict_inode(struct inode *ip)
{
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	truncate_setsize(ip, 0);
	clear_inode(ip);
	zfs_inactive(ip);
	spl_fstrans_unmark(cookie);
}

/*
 * ->put_super() callback: unmount the dataset backing this superblock.
 * zfs_umount() is expected to succeed here; the ASSERT only documents
 * the error-code convention (0 or negative errno).
 */
static void
zpl_put_super(struct super_block *sb)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_umount(sb);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);
}

/*
 * zfs_sync() is the underlying implementation for the sync(2) and syncfs(2)
 * syscalls, via sb->s_op->sync_fs().
 *
 * Before kernel 5.17 (torvalds/linux@5679897eb104), syncfs() ->
 * sync_filesystem() would ignore the return from sync_fs(), instead only
 * considering the error from syncing the underlying block device (sb->s_dev).
 * Since OpenZFS doesn't _have_ an underlying block device, there's no way for
 * us to report a sync directly.
 *
 * However, in 5.8 (torvalds/linux@735e4ae5ba28) the superblock gained an extra
 * error store `s_wb_err`, to carry errors seen on page writeback since the
 * last call to syncfs().  If sync_filesystem() does not return an error, any
 * existing writeback error on the superblock will be used instead (and cleared
 * either way).  We don't use this (page writeback is a different thing for
 * us), so for 5.8-5.17 we can use that instead to get syncfs() to return the
 * error.
 *
 * Before 5.8, we have no other good options - no matter what happens, the
 * userspace program will be told the call has succeeded, and so we must make
 * it so.  Therefore, when we are asked to wait for sync to complete (wait ==
 * 1), if zfs_sync() has returned an error we have no choice but to block,
 * regardless of the reason.
 *
 * The 5.17 change was backported to the 5.10, 5.15 and 5.16 series, and likely
 * to some vendor kernels.  Meanwhile, s_wb_err is still in use in 6.15 (the
 * mainline Linux series at time of writing), and has likely been backported to
 * vendor kernels before 5.8.  We don't really want to use a workaround when we
 * don't have to, but we can't really detect whether or not sync_filesystem()
 * will return our errors (without a difficult runtime test anyway).  So, we
 * use a static version check: any kernel reporting its version as 5.17+ will
 * use a direct error return, otherwise, we'll either use s_wb_err if it was
 * detected at configure (5.8-5.16 + vendor backports).  If it's unavailable,
 * we will block to ensure the correct semantics.
 *
 * See https://github.com/openzfs/zfs/issues/17416 for further discussion.
 */
static int
zpl_sync_fs(struct super_block *sb, int wait)
{
	fstrans_cookie_t cookie;
	cred_t *cr = CRED();
	int error;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_sync(sb, wait, cr);

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0)
#ifdef HAVE_SUPER_BLOCK_S_WB_ERR
	/* 5.8-5.16: stash the error in s_wb_err so syncfs() reports it. */
	if (error && wait)
		errseq_set(&sb->s_wb_err, error);
#else
	/* Pre-5.8: no way to report; block until the txg has synced. */
	if (error && wait) {
		zfsvfs_t *zfsvfs = sb->s_fs_info;
		ASSERT3P(zfsvfs, !=, NULL);
		if (zfs_enter(zfsvfs, FTAG) == 0) {
			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
			zfs_exit(zfsvfs, FTAG);
			error = 0;
		}
	}
#endif
#endif /* < 5.17.0 */

	spl_fstrans_unmark(cookie);
	crfree(cr);

	ASSERT3S(error, <=, 0);
	return (error);
}

/*
 * ->statfs() callback: fill in filesystem statistics for statfs(2).
 */
static int
zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
{
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_statvfs(dentry->d_inode, statp);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	/*
	 * If required by a 32-bit system call, dynamically scale the
	 * block size up to 16MiB and decrease the block counts.  This
	 * allows for a maximum size of 64EiB to be reported.  The file
	 * counts must be artificially capped at 2^32-1.
	 */
	if (unlikely(zpl_is_32bit_api())) {
		while (statp->f_blocks > UINT32_MAX &&
		    statp->f_bsize < SPA_MAXBLOCKSIZE) {
			statp->f_frsize <<= 1;
			statp->f_bsize <<= 1;

			statp->f_blocks >>= 1;
			statp->f_bfree >>= 1;
			statp->f_bavail >>= 1;
		}

		/* Cap file counts while preserving the used-object count. */
		uint64_t usedobjs = statp->f_files - statp->f_ffree;
		statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
		statp->f_files = statp->f_ffree + usedobjs;
	}

	return (error);
}

/*
 * ->remount_fs() callback: re-apply mount options on mount -o remount.
 * The dataset name is not needed (NULL), only the new option string.
 */
static int
zpl_remount_fs(struct super_block *sb, int *flags, char *data)
{
	zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_remount(sb, flags, &zm);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}

/*
 * Emit the dataset name as the device name for /proc/self/mounts,
 * escaping spaces for getmntent(3).  Returns 0 or a negative errno if
 * the filesystem is being torn down.
 */
static int
__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	int error;
	if ((error = zpl_enter(zfsvfs, FTAG)) != 0)
		return (error);

	char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
	dmu_objset_name(zfsvfs->z_os, fsname);

	for (int i = 0; fsname[i] != 0; i++) {
		/*
		 * Spaces in the dataset name must be converted to their
		 * octal escape sequence for getmntent(3) to correctly
		 * parse the fsname portion of /proc/self/mounts.
		 */
		if (fsname[i] == ' ') {
			seq_puts(seq, "\\040");
		} else {
			seq_putc(seq, fsname[i]);
		}
	}

	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);

	zpl_exit(zfsvfs, FTAG);

	return (0);
}

/* ->show_devname() callback wrapper. */
static int
zpl_show_devname(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
}

/*
 * Emit the per-mount options (xattr, acl type, case sensitivity) for
 * /proc/self/mounts.
 */
static int
__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
{
	seq_printf(seq, ",%s",
	    zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");

#ifdef CONFIG_FS_POSIX_ACL
	switch (zfsvfs->z_acl_type) {
	case ZFS_ACLTYPE_POSIX:
		seq_puts(seq, ",posixacl");
		break;
	default:
		seq_puts(seq, ",noacl");
		break;
	}
#endif /* CONFIG_FS_POSIX_ACL */

	switch (zfsvfs->z_case) {
	case ZFS_CASE_SENSITIVE:
		seq_puts(seq, ",casesensitive");
		break;
	case ZFS_CASE_INSENSITIVE:
		seq_puts(seq, ",caseinsensitive");
		break;
	default:
		seq_puts(seq, ",casemixed");
		break;
	}

	return (0);
}

/* ->show_options() callback wrapper. */
static int
zpl_show_options(struct seq_file *seq, struct dentry *root)
{
	return (__zpl_show_options(seq, root->d_sb->s_fs_info));
}

/*
 * Populate a freshly allocated superblock by mounting the dataset
 * described by the zfs_mnt_t passed through sget()'s data pointer.
 */
static int
zpl_fill_super(struct super_block *sb, void *data, int silent)
{
	zfs_mnt_t *zm = (zfs_mnt_t *)data;
	fstrans_cookie_t cookie;
	int error;

	cookie = spl_fstrans_mark();
	error = -zfs_domount(sb, zm, silent);
	spl_fstrans_unmark(cookie);
	ASSERT3S(error, <=, 0);

	return (error);
}

/*
 * sget() comparison callback: does this existing superblock already
 * mount the objset being requested?
 */
static int
zpl_test_super(struct super_block *s, void *data)
{
	zfsvfs_t *zfsvfs = s->s_fs_info;
	objset_t *os = data;
	/*
	 * If the os doesn't match the z_os in the super_block, assume it is
	 * not a match.  Matching would imply a multimount of a dataset.  It is
	 * possible that during a multimount, there is a simultaneous operation
	 * that changes the z_os, e.g., rollback, where the match will be
	 * missed, but in that case the user will get an EBUSY.
	 */
	return (zfsvfs != NULL && os == zfsvfs->z_os);
}

/*
 * Locate (or create via sget()) the superblock for the dataset named in
 * zm->mnt_osname, filling it in on first mount.  Returns the active
 * superblock or an ERR_PTR on failure.
 */
static struct super_block *
zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
{
	struct super_block *s;
	objset_t *os;
	boolean_t issnap = B_FALSE;
	int err;

	err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
	if (err)
		return (ERR_PTR(-err));

	/*
	 * The dsl pool lock must be released prior to calling sget().
	 * It is possible sget() may block on the lock in grab_super()
	 * while deactivate_super() holds that same lock and waits for
	 * a txg sync.  If the dsl_pool lock is held over sget()
	 * this can prevent the pool sync and cause a deadlock.
	 */
	dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
	dsl_pool_rele(dmu_objset_pool(os), FTAG);

	s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);

	/*
	 * Recheck with the lock held to prevent mounting the wrong dataset
	 * since z_os can be stale when the teardown lock is held.
	 *
	 * We can't do this in zpl_test_super since it's under spinlock and
	 * also s_umount lock is not held there so it would race with
	 * zfs_umount and zfsvfs can be freed.
	 */
	if (!IS_ERR(s) && s->s_fs_info != NULL) {
		zfsvfs_t *zfsvfs = s->s_fs_info;
		if (zpl_enter(zfsvfs, FTAG) == 0) {
			if (os != zfsvfs->z_os)
				err = -SET_ERROR(EBUSY);
			issnap = zfsvfs->z_issnap;
			zpl_exit(zfsvfs, FTAG);
		} else {
			err = -SET_ERROR(EBUSY);
		}
	}
	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
	dsl_dataset_rele(dmu_objset_ds(os), FTAG);

	if (IS_ERR(s))
		return (ERR_CAST(s));

	if (err) {
		deactivate_locked_super(s);
		return (ERR_PTR(err));
	}

	if (s->s_root == NULL) {
		/* First mount of this dataset: fill in the superblock. */
		err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
		if (err) {
			deactivate_locked_super(s);
			return (ERR_PTR(err));
		}
		s->s_flags |= SB_ACTIVE;
	} else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) {
		/*
		 * Skip ro check for snap since snap is always ro regardless
		 * ro flag is passed by mount or not.
		 */
		deactivate_locked_super(s);
		return (ERR_PTR(-EBUSY));
	}

	return (s);
}

/*
 * ->mount() callback: mount the named dataset and return a reference to
 * its root dentry (or an ERR_PTR).
 */
static struct dentry *
zpl_mount(struct file_system_type *fs_type, int flags,
    const char *osname, void *data)
{
	zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };

	struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
	if (IS_ERR(sb))
		return (ERR_CAST(sb));

	return (dget(sb->s_root));
}

/*
 * ->kill_sb() callback: run ZFS pre-unmount work, then release the
 * anonymous superblock.
 */
static void
zpl_kill_sb(struct super_block *sb)
{
	zfs_preumount(sb);
	kill_anon_super(sb);
}

/*
 * Shrinker entry point: ask the filesystem to prune up to nr_to_scan
 * objects from its caches.  arg is the superblock.
 */
void
zpl_prune_sb(uint64_t nr_to_scan, void *arg)
{
	struct super_block *sb = (struct super_block *)arg;
	int objects = 0;

	/*
	 * Ensure the superblock is not in the process of being torn down.
	 */
#ifdef HAVE_SB_DYING
	if (down_read_trylock(&sb->s_umount)) {
		if (!(sb->s_flags & SB_DYING) && sb->s_root &&
		    (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#else
	if (down_read_trylock(&sb->s_umount)) {
		if (!hlist_unhashed(&sb->s_instances) &&
		    sb->s_root && (sb->s_flags & SB_BORN)) {
			(void) zfs_prune(sb, nr_to_scan, &objects);
		}
		up_read(&sb->s_umount);
	}
#endif
}

const struct super_operations zpl_super_operations = {
	.alloc_inode		= zpl_inode_alloc,
#ifdef HAVE_SOPS_FREE_INODE
	.free_inode		= zpl_inode_free,
#endif
	.destroy_inode		= zpl_inode_destroy,
	.dirty_inode		= zpl_dirty_inode,
	.write_inode		= NULL,
	.evict_inode		= zpl_evict_inode,
	.put_super		= zpl_put_super,
	.sync_fs		= zpl_sync_fs,
	.statfs			= zpl_statfs,
	.remount_fs		= zpl_remount_fs,
	.show_devname		= zpl_show_devname,
	.show_options		= zpl_show_options,
	.show_stats		= NULL,
};

struct file_system_type zpl_fs_type = {
	.owner			= THIS_MODULE,
	.name			= ZFS_DRIVER,
#if defined(HAVE_IDMAP_MNT_API)
	.fs_flags		= FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
#else
	.fs_flags		= FS_USERNS_MOUNT,
#endif
	.mount			= zpl_mount,
	.kill_sb		= zpl_kill_sb,
};