1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * 24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 25 * Copyright (C) 2011 Lawrence Livermore National Security, LLC. 26 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 27 * LLNL-CODE-403049. 28 * Rewritten for Linux by: 29 * Rohan Puri <rohan.puri15@gmail.com> 30 * Brian Behlendorf <behlendorf1@llnl.gov> 31 * Copyright (c) 2013 by Delphix. All rights reserved. 32 * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. 33 * Copyright (c) 2018 George Melikov. All Rights Reserved. 34 * Copyright (c) 2019 Datto, Inc. All rights reserved. 35 * Copyright (c) 2020 The MathWorks, Inc. All rights reserved. 36 */ 37 38 /* 39 * ZFS control directory (a.k.a. ".zfs") 40 * 41 * This directory provides a common location for all ZFS meta-objects. 42 * Currently, this is only the 'snapshot' and 'shares' directory, but this may 43 * expand in the future. The elements are built dynamically, as the hierarchy 44 * does not actually exist on disk. 45 * 46 * For 'snapshot', we don't want to have all snapshots always mounted, because 47 * this would take up a huge amount of space in /etc/mnttab. We have three 48 * types of objects: 49 * 50 * ctldir ------> snapshotdir -------> snapshot 51 * | 52 * | 53 * V 54 * mounted fs 55 * 56 * The 'snapshot' node contains just enough information to lookup '..' and act 57 * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we 58 * perform an automount of the underlying filesystem and return the 59 * corresponding inode. 60 * 61 * All mounts are handled automatically by an user mode helper which invokes 62 * the mount procedure. Unmounts are handled by allowing the mount 63 * point to expire so the kernel may automatically unmount it. 64 * 65 * The '.zfs', '.zfs/snapshot', and all directories created under 66 * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') all share the same 67 * zfsvfs_t as the head filesystem (what '.zfs' lives under). 68 * 69 * File systems mounted on top of the '.zfs/snapshot/<snapname>' paths 70 * (ie: snapshots) are complete ZFS filesystems and have their own unique 71 * zfsvfs_t. However, the fsid reported by these mounts will be the same 72 * as that used by the parent zfsvfs_t to make NFS happy. 73 */ 74 75 #include <sys/types.h> 76 #include <sys/param.h> 77 #include <sys/time.h> 78 #include <sys/sysmacros.h> 79 #include <sys/pathname.h> 80 #include <sys/vfs.h> 81 #include <sys/zfs_ctldir.h> 82 #include <sys/zfs_ioctl.h> 83 #include <sys/zfs_vfsops.h> 84 #include <sys/zfs_vnops.h> 85 #include <sys/stat.h> 86 #include <sys/dmu.h> 87 #include <sys/dmu_objset.h> 88 #include <sys/dsl_destroy.h> 89 #include <sys/dsl_deleg.h> 90 #include <sys/zpl.h> 91 #include <sys/mntent.h> 92 #include "zfs_namecheck.h" 93 94 /* 95 * Two AVL trees are maintained which contain all currently automounted 96 * snapshots. Every automounted snapshots maps to a single zfs_snapentry_t 97 * entry which MUST: 98 * 99 * - be attached to both trees, and 100 * - be unique, no duplicate entries are allowed. 101 * 102 * The zfs_snapshots_by_name tree is indexed by the full dataset name 103 * while the zfs_snapshots_by_objsetid tree is indexed by the unique 104 * objsetid. This allows for fast lookups either by name or objsetid. 105 */ 106 static avl_tree_t zfs_snapshots_by_name; 107 static avl_tree_t zfs_snapshots_by_objsetid; 108 static krwlock_t zfs_snapshot_lock; 109 110 /* 111 * Control Directory Tunables (.zfs) 112 */ 113 int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT; 114 static int zfs_admin_snapshot = 0; 115 static int zfs_snapshot_no_setuid = 0; 116 117 typedef struct { 118 char *se_name; /* full snapshot name */ 119 char *se_path; /* full mount path */ 120 spa_t *se_spa; /* pool spa (NULL if pending) */ 121 uint64_t se_objsetid; /* snapshot objset id */ 122 struct dentry *se_root_dentry; /* snapshot root dentry */ 123 taskqid_t se_taskqid; /* scheduled unmount taskqid */ 124 avl_node_t se_node_name; /* zfs_snapshots_by_name link */ 125 avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */ 126 zfs_refcount_t se_refcount; /* reference count */ 127 kmutex_t se_mtx; /* protects se_mounting and se_cv */ 128 kcondvar_t se_cv; /* signal mount completion */ 129 boolean_t se_mounting; /* mount operation in progress */ 130 int se_mount_error; /* error from failed mount */ 131 } zfs_snapentry_t; 132 133 static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay); 134 135 /* 136 * Allocate a new zfs_snapentry_t being careful to make a copy of the 137 * the snapshot name and provided mount point. No reference is taken. 138 */ 139 static zfs_snapentry_t * 140 zfsctl_snapshot_alloc(const char *full_name, const char *full_path, spa_t *spa, 141 uint64_t objsetid, struct dentry *root_dentry) 142 { 143 zfs_snapentry_t *se; 144 145 se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP); 146 147 se->se_name = kmem_strdup(full_name); 148 se->se_path = kmem_strdup(full_path); 149 se->se_spa = spa; 150 se->se_objsetid = objsetid; 151 se->se_root_dentry = root_dentry; 152 se->se_taskqid = TASKQID_INVALID; 153 mutex_init(&se->se_mtx, NULL, MUTEX_DEFAULT, NULL); 154 cv_init(&se->se_cv, NULL, CV_DEFAULT, NULL); 155 se->se_mounting = B_FALSE; 156 se->se_mount_error = 0; 157 158 zfs_refcount_create(&se->se_refcount); 159 160 return (se); 161 } 162 163 /* 164 * Free a zfs_snapentry_t the caller must ensure there are no active 165 * references. 166 */ 167 static void 168 zfsctl_snapshot_free(zfs_snapentry_t *se) 169 { 170 zfs_refcount_destroy(&se->se_refcount); 171 kmem_strfree(se->se_name); 172 kmem_strfree(se->se_path); 173 mutex_destroy(&se->se_mtx); 174 cv_destroy(&se->se_cv); 175 176 kmem_free(se, sizeof (zfs_snapentry_t)); 177 } 178 179 /* 180 * Hold a reference on the zfs_snapentry_t. 181 */ 182 static void 183 zfsctl_snapshot_hold(zfs_snapentry_t *se) 184 { 185 zfs_refcount_add(&se->se_refcount, NULL); 186 } 187 188 /* 189 * Release a reference on the zfs_snapentry_t. When the number of 190 * references drops to zero the structure will be freed. 191 */ 192 static void 193 zfsctl_snapshot_rele(zfs_snapentry_t *se) 194 { 195 if (zfs_refcount_remove(&se->se_refcount, NULL) == 0) 196 zfsctl_snapshot_free(se); 197 } 198 199 /* 200 * Add a zfs_snapentry_t to the zfs_snapshots_by_name tree. If the entry 201 * is not pending (se_spa != NULL), also add to zfs_snapshots_by_objsetid. 202 * While the zfs_snapentry_t is part of the trees a reference is held. 203 */ 204 static void 205 zfsctl_snapshot_add(zfs_snapentry_t *se) 206 { 207 ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); 208 zfsctl_snapshot_hold(se); 209 avl_add(&zfs_snapshots_by_name, se); 210 if (se->se_spa != NULL) 211 avl_add(&zfs_snapshots_by_objsetid, se); 212 } 213 214 /* 215 * Remove a zfs_snapentry_t from the zfs_snapshots_by_name tree and 216 * zfs_snapshots_by_objsetid tree (if not pending). Upon removal a 217 * reference is dropped, this can result in the structure being freed 218 * if that was the last remaining reference. 219 */ 220 static void 221 zfsctl_snapshot_remove(zfs_snapentry_t *se) 222 { 223 ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); 224 avl_remove(&zfs_snapshots_by_name, se); 225 if (se->se_spa != NULL) 226 avl_remove(&zfs_snapshots_by_objsetid, se); 227 zfsctl_snapshot_rele(se); 228 } 229 230 /* 231 * Fill a pending zfs_snapentry_t after mount succeeds. Fills in the 232 * remaining fields and adds the entry to the zfs_snapshots_by_objsetid tree. 233 */ 234 static void 235 zfsctl_snapshot_fill(zfs_snapentry_t *se, spa_t *spa, uint64_t objsetid, 236 struct dentry *root_dentry) 237 { 238 ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); 239 ASSERT3P(se->se_spa, ==, NULL); 240 se->se_spa = spa; 241 se->se_objsetid = objsetid; 242 se->se_root_dentry = root_dentry; 243 avl_add(&zfs_snapshots_by_objsetid, se); 244 } 245 246 /* 247 * Snapshot name comparison function for the zfs_snapshots_by_name. 248 */ 249 static int 250 snapentry_compare_by_name(const void *a, const void *b) 251 { 252 const zfs_snapentry_t *se_a = a; 253 const zfs_snapentry_t *se_b = b; 254 return (TREE_ISIGN(strcmp(se_a->se_name, se_b->se_name))); 255 } 256 257 /* 258 * Snapshot name comparison function for the zfs_snapshots_by_objsetid. 259 */ 260 static int 261 snapentry_compare_by_objsetid(const void *a, const void *b) 262 { 263 const zfs_snapentry_t *se_a = a; 264 const zfs_snapentry_t *se_b = b; 265 266 int cmp = TREE_PCMP(se_a->se_spa, se_b->se_spa); 267 if (cmp != 0) 268 return (cmp); 269 return (TREE_CMP(se_a->se_objsetid, se_b->se_objsetid)); 270 } 271 272 /* 273 * Find a zfs_snapentry_t in zfs_snapshots_by_name. If the snapname 274 * is found a pointer to the zfs_snapentry_t is returned and a reference 275 * taken on the structure. The caller is responsible for dropping the 276 * reference with zfsctl_snapshot_rele(). If the snapname is not found 277 * NULL will be returned. 278 */ 279 static zfs_snapentry_t * 280 zfsctl_snapshot_find_by_name(const char *snapname) 281 { 282 zfs_snapentry_t *se, search; 283 284 ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); 285 286 search.se_name = (char *)snapname; 287 se = avl_find(&zfs_snapshots_by_name, &search, NULL); 288 if (se) 289 zfsctl_snapshot_hold(se); 290 291 return (se); 292 } 293 294 /* 295 * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id 296 * rather than the snapname. In all other respects it behaves the same 297 * as zfsctl_snapshot_find_by_name(). 298 */ 299 static zfs_snapentry_t * 300 zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid) 301 { 302 zfs_snapentry_t *se, search; 303 304 ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); 305 306 search.se_spa = spa; 307 search.se_objsetid = objsetid; 308 se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL); 309 if (se) 310 zfsctl_snapshot_hold(se); 311 312 return (se); 313 } 314 315 /* 316 * Rename a zfs_snapentry_t in the zfs_snapshots_by_name. The structure is 317 * removed, renamed, and added back to the new correct location in the tree. 318 */ 319 static int 320 zfsctl_snapshot_rename(const char *old_snapname, const char *new_snapname) 321 { 322 zfs_snapentry_t *se; 323 324 ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); 325 326 se = zfsctl_snapshot_find_by_name(old_snapname); 327 if (se == NULL) 328 return (SET_ERROR(ENOENT)); 329 if (se->se_spa == NULL) { 330 /* Snapshot mount is in progress */ 331 zfsctl_snapshot_rele(se); 332 return (SET_ERROR(EBUSY)); 333 } 334 335 zfsctl_snapshot_remove(se); 336 kmem_strfree(se->se_name); 337 se->se_name = kmem_strdup(new_snapname); 338 zfsctl_snapshot_add(se); 339 zfsctl_snapshot_rele(se); 340 341 return (0); 342 } 343 344 /* 345 * Delayed task responsible for unmounting an expired automounted snapshot. 346 */ 347 static void 348 snapentry_expire(void *data) 349 { 350 zfs_snapentry_t *se = (zfs_snapentry_t *)data; 351 spa_t *spa = se->se_spa; 352 uint64_t objsetid = se->se_objsetid; 353 354 if (zfs_expire_snapshot <= 0) { 355 zfsctl_snapshot_rele(se); 356 return; 357 } 358 359 (void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE); 360 361 /* 362 * Clear taskqid and reschedule if the snapshot wasn't removed. 363 * This can occur when the snapshot is busy. 364 */ 365 rw_enter(&zfs_snapshot_lock, RW_WRITER); 366 se->se_taskqid = TASKQID_INVALID; 367 zfsctl_snapshot_rele(se); 368 if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { 369 zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot); 370 zfsctl_snapshot_rele(se); 371 } 372 rw_exit(&zfs_snapshot_lock); 373 } 374 375 /* 376 * Cancel an automatic unmount of a snapname. This callback is responsible 377 * for dropping the reference on the zfs_snapentry_t which was taken when 378 * during dispatch. 379 */ 380 static void 381 zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se) 382 { 383 int err = 0; 384 385 ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); 386 387 err = taskq_cancel_id(system_delay_taskq, se->se_taskqid, B_FALSE); 388 /* 389 * Clear taskqid only if we successfully cancelled before execution. 390 * For ENOENT, task already cleared it. For EBUSY, task will clear 391 * it when done. 392 */ 393 if (err == 0) { 394 se->se_taskqid = TASKQID_INVALID; 395 zfsctl_snapshot_rele(se); 396 } 397 } 398 399 /* 400 * Dispatch the unmount task for delayed handling with a hold protecting it. 401 */ 402 static void 403 zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay) 404 { 405 ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); 406 407 if (delay <= 0) 408 return; 409 410 /* 411 * If this condition happens, we managed to: 412 * - dispatch once 413 * - want to dispatch _again_ before it returned 414 * 415 * So let's just return - if that task fails at unmounting, 416 * we'll eventually dispatch again, and if it succeeds, 417 * no problem. 418 */ 419 if (se->se_taskqid != TASKQID_INVALID) { 420 return; 421 } 422 423 zfsctl_snapshot_hold(se); 424 se->se_taskqid = taskq_dispatch_delay(system_delay_taskq, 425 snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ); 426 } 427 428 /* 429 * Schedule an automatic unmount of objset id to occur in delay seconds from 430 * now. Any previous delayed unmount will be cancelled in favor of the 431 * updated deadline. A reference is taken by zfsctl_snapshot_find_by_name() 432 * and held until the outstanding task is handled or cancelled. 433 */ 434 int 435 zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay) 436 { 437 zfs_snapentry_t *se; 438 int error = ENOENT; 439 440 rw_enter(&zfs_snapshot_lock, RW_WRITER); 441 if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { 442 zfsctl_snapshot_unmount_cancel(se); 443 zfsctl_snapshot_unmount_delay_impl(se, delay); 444 zfsctl_snapshot_rele(se); 445 error = 0; 446 } 447 rw_exit(&zfs_snapshot_lock); 448 449 return (error); 450 } 451 452 /* 453 * Check if the given inode is a part of the virtual .zfs directory. 454 */ 455 boolean_t 456 zfsctl_is_node(struct inode *ip) 457 { 458 return (ITOZ(ip)->z_is_ctldir); 459 } 460 461 /* 462 * Check if the given inode is a .zfs/snapshots/snapname directory. 463 */ 464 boolean_t 465 zfsctl_is_snapdir(struct inode *ip) 466 { 467 return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS)); 468 } 469 470 /* 471 * Allocate a new inode with the passed id and ops. 472 */ 473 static struct inode * 474 zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, 475 const struct file_operations *fops, const struct inode_operations *ops, 476 uint64_t creation) 477 { 478 struct inode *ip; 479 znode_t *zp; 480 inode_timespec_t now = {.tv_sec = creation}; 481 482 ip = new_inode(zfsvfs->z_sb); 483 if (ip == NULL) 484 return (NULL); 485 486 if (!creation) 487 now = current_time(ip); 488 zp = ITOZ(ip); 489 ASSERT0P(zp->z_dirlocks); 490 ASSERT0P(zp->z_acl_cached); 491 ASSERT0P(zp->z_xattr_cached); 492 zp->z_id = id; 493 zp->z_unlinked = B_FALSE; 494 zp->z_atime_dirty = B_FALSE; 495 zp->z_zn_prefetch = B_FALSE; 496 zp->z_is_sa = B_FALSE; 497 zp->z_is_ctldir = B_TRUE; 498 zp->z_xattr_dir_absent = B_FALSE; 499 zp->z_sa_hdl = NULL; 500 zp->z_blksz = 0; 501 zp->z_seq = 0; 502 zp->z_mapcnt = 0; 503 zp->z_size = 0; 504 zp->z_pflags = 0; 505 zp->z_mode = 0; 506 zp->z_sync_cnt = 0; 507 ip->i_generation = 0; 508 ip->i_ino = id; 509 ip->i_mode = (S_IFDIR | S_IRWXUGO); 510 ip->i_uid = SUID_TO_KUID(0); 511 ip->i_gid = SGID_TO_KGID(0); 512 ip->i_blkbits = SPA_MINBLOCKSHIFT; 513 zpl_inode_set_atime_to_ts(ip, now); 514 zpl_inode_set_mtime_to_ts(ip, now); 515 zpl_inode_set_ctime_to_ts(ip, now); 516 ip->i_fop = fops; 517 ip->i_op = ops; 518 #if defined(IOP_XATTR) 519 ip->i_opflags &= ~IOP_XATTR; 520 #endif 521 522 if (insert_inode_locked(ip)) { 523 unlock_new_inode(ip); 524 iput(ip); 525 return (NULL); 526 } 527 528 mutex_enter(&zfsvfs->z_znodes_lock); 529 list_insert_tail(&zfsvfs->z_all_znodes, zp); 530 membar_producer(); 531 mutex_exit(&zfsvfs->z_znodes_lock); 532 533 unlock_new_inode(ip); 534 535 return (ip); 536 } 537 538 /* 539 * Lookup the inode with given id, it will be allocated if needed. 540 */ 541 static struct inode * 542 zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id, 543 const struct file_operations *fops, const struct inode_operations *ops) 544 { 545 struct inode *ip = NULL; 546 uint64_t creation = 0; 547 dsl_dataset_t *snap_ds; 548 dsl_pool_t *pool; 549 550 while (ip == NULL) { 551 ip = ilookup(zfsvfs->z_sb, (unsigned long)id); 552 if (ip) 553 break; 554 555 if (id <= ZFSCTL_INO_SNAPDIRS && !creation) { 556 pool = dmu_objset_pool(zfsvfs->z_os); 557 dsl_pool_config_enter(pool, FTAG); 558 if (!dsl_dataset_hold_obj(pool, 559 ZFSCTL_INO_SNAPDIRS - id, FTAG, &snap_ds)) { 560 creation = dsl_get_creation(snap_ds); 561 dsl_dataset_rele(snap_ds, FTAG); 562 } 563 dsl_pool_config_exit(pool, FTAG); 564 } 565 566 /* May fail due to concurrent zfsctl_inode_alloc() */ 567 ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops, creation); 568 } 569 570 return (ip); 571 } 572 573 /* 574 * Create the '.zfs' directory. This directory is cached as part of the VFS 575 * structure. This results in a hold on the zfsvfs_t. The code in zfs_umount() 576 * therefore checks against a vfs_count of 2 instead of 1. This reference 577 * is removed when the ctldir is destroyed in the unmount. All other entities 578 * under the '.zfs' directory are created dynamically as needed. 579 * 580 * Because the dynamically created '.zfs' directory entries assume the use 581 * of 64-bit inode numbers this support must be disabled on 32-bit systems. 582 */ 583 int 584 zfsctl_create(zfsvfs_t *zfsvfs) 585 { 586 ASSERT0P(zfsvfs->z_ctldir); 587 588 zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT, 589 &zpl_fops_root, &zpl_ops_root, 0); 590 if (zfsvfs->z_ctldir == NULL) 591 return (SET_ERROR(ENOENT)); 592 593 return (0); 594 } 595 596 /* 597 * Destroy the '.zfs' directory or remove a snapshot from zfs_snapshots_by_name. 598 * Only called when the filesystem is unmounted. 599 */ 600 void 601 zfsctl_destroy(zfsvfs_t *zfsvfs) 602 { 603 if (zfsvfs->z_issnap) { 604 zfs_snapentry_t *se; 605 spa_t *spa = zfsvfs->z_os->os_spa; 606 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); 607 608 rw_enter(&zfs_snapshot_lock, RW_WRITER); 609 se = zfsctl_snapshot_find_by_objsetid(spa, objsetid); 610 if (se != NULL) { 611 zfsctl_snapshot_remove(se); 612 /* 613 * Don't wait if snapentry_expire task is calling 614 * umount, which may have resulted in this destroy 615 * call. Waiting would deadlock: snapentry_expire 616 * waits for umount while umount waits for task. 617 */ 618 zfsctl_snapshot_unmount_cancel(se); 619 zfsctl_snapshot_rele(se); 620 } 621 rw_exit(&zfs_snapshot_lock); 622 } else if (zfsvfs->z_ctldir) { 623 iput(zfsvfs->z_ctldir); 624 zfsvfs->z_ctldir = NULL; 625 } 626 } 627 628 /* 629 * Given a root znode, retrieve the associated .zfs directory. 630 * Add a hold to the vnode and return it. 631 */ 632 struct inode * 633 zfsctl_root(znode_t *zp) 634 { 635 ASSERT(zfs_has_ctldir(zp)); 636 /* Must have an existing ref, so igrab() cannot return NULL */ 637 VERIFY3P(igrab(ZTOZSB(zp)->z_ctldir), !=, NULL); 638 return (ZTOZSB(zp)->z_ctldir); 639 } 640 641 /* 642 * Generate a long fid to indicate a snapdir. We encode whether snapdir is 643 * already mounted in gen field. We do this because nfsd lookup will not 644 * trigger automount. Next time the nfsd does fh_to_dentry, we will notice 645 * this and do automount and return ESTALE to force nfsd revalidate and follow 646 * mount. 647 */ 648 static int 649 zfsctl_snapdir_fid(struct inode *ip, fid_t *fidp) 650 { 651 zfid_short_t *zfid = (zfid_short_t *)fidp; 652 zfid_long_t *zlfid = (zfid_long_t *)fidp; 653 uint32_t gen = 0; 654 uint64_t object; 655 uint64_t objsetid; 656 int i; 657 struct dentry *dentry; 658 659 if (fidp->fid_len < LONG_FID_LEN) { 660 fidp->fid_len = LONG_FID_LEN; 661 return (SET_ERROR(ENOSPC)); 662 } 663 664 object = ip->i_ino; 665 objsetid = ZFSCTL_INO_SNAPDIRS - ip->i_ino; 666 zfid->zf_len = LONG_FID_LEN; 667 668 dentry = d_obtain_alias(igrab(ip)); 669 if (!IS_ERR(dentry)) { 670 gen = !!d_mountpoint(dentry); 671 dput(dentry); 672 } 673 674 for (i = 0; i < sizeof (zfid->zf_object); i++) 675 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 676 677 for (i = 0; i < sizeof (zfid->zf_gen); i++) 678 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 679 680 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 681 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); 682 683 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 684 zlfid->zf_setgen[i] = 0; 685 686 return (0); 687 } 688 689 /* 690 * Generate an appropriate fid for an entry in the .zfs directory. 691 */ 692 int 693 zfsctl_fid(struct inode *ip, fid_t *fidp) 694 { 695 znode_t *zp = ITOZ(ip); 696 zfsvfs_t *zfsvfs = ITOZSB(ip); 697 uint64_t object = zp->z_id; 698 zfid_short_t *zfid; 699 int i; 700 int error; 701 702 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 703 return (error); 704 705 if (zfsctl_is_snapdir(ip)) { 706 zfs_exit(zfsvfs, FTAG); 707 return (zfsctl_snapdir_fid(ip, fidp)); 708 } 709 710 if (fidp->fid_len < SHORT_FID_LEN) { 711 fidp->fid_len = SHORT_FID_LEN; 712 zfs_exit(zfsvfs, FTAG); 713 return (SET_ERROR(ENOSPC)); 714 } 715 716 zfid = (zfid_short_t *)fidp; 717 718 zfid->zf_len = SHORT_FID_LEN; 719 720 for (i = 0; i < sizeof (zfid->zf_object); i++) 721 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 722 723 /* .zfs znodes always have a generation number of 0 */ 724 for (i = 0; i < sizeof (zfid->zf_gen); i++) 725 zfid->zf_gen[i] = 0; 726 727 zfs_exit(zfsvfs, FTAG); 728 return (0); 729 } 730 731 /* 732 * Construct a full dataset name in full_name: "pool/dataset@snap_name" 733 */ 734 static int 735 zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len, 736 char *full_name) 737 { 738 objset_t *os = zfsvfs->z_os; 739 740 if (zfs_component_namecheck(snap_name, NULL, NULL) != 0) 741 return (SET_ERROR(EILSEQ)); 742 743 dmu_objset_name(os, full_name); 744 if ((strlen(full_name) + 1 + strlen(snap_name)) >= len) 745 return (SET_ERROR(ENAMETOOLONG)); 746 747 (void) strcat(full_name, "@"); 748 (void) strcat(full_name, snap_name); 749 750 return (0); 751 } 752 753 /* 754 * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/" 755 */ 756 static int 757 zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid, 758 int path_len, char *full_path) 759 { 760 objset_t *os = zfsvfs->z_os; 761 fstrans_cookie_t cookie; 762 char *snapname; 763 boolean_t case_conflict; 764 uint64_t id, pos = 0; 765 int error = 0; 766 767 cookie = spl_fstrans_mark(); 768 snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); 769 770 while (error == 0) { 771 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 772 error = dmu_snapshot_list_next(zfsvfs->z_os, 773 ZFS_MAX_DATASET_NAME_LEN, snapname, &id, &pos, 774 &case_conflict); 775 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 776 if (error) 777 goto out; 778 779 if (id == objsetid) 780 break; 781 } 782 783 mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock); 784 if (zfsvfs->z_vfs->vfs_mntpoint != NULL) { 785 snprintf(full_path, path_len, "%s/.zfs/snapshot/%s", 786 zfsvfs->z_vfs->vfs_mntpoint, snapname); 787 } else 788 error = SET_ERROR(ENOENT); 789 mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock); 790 791 out: 792 kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); 793 spl_fstrans_unmark(cookie); 794 795 return (error); 796 } 797 798 /* 799 * Special case the handling of "..". 800 */ 801 int 802 zfsctl_root_lookup(struct inode *dip, const char *name, struct inode **ipp, 803 int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) 804 { 805 zfsvfs_t *zfsvfs = ITOZSB(dip); 806 int error = 0; 807 808 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 809 return (error); 810 811 if (zfsvfs->z_show_ctldir == ZFS_SNAPDIR_DISABLED) { 812 *ipp = NULL; 813 } else if (strcmp(name, "..") == 0) { 814 *ipp = dip->i_sb->s_root->d_inode; 815 } else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) { 816 *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIR, 817 &zpl_fops_snapdir, &zpl_ops_snapdir); 818 } else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) { 819 *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SHARES, 820 &zpl_fops_shares, &zpl_ops_shares); 821 } else { 822 *ipp = NULL; 823 } 824 825 if (*ipp == NULL) 826 error = SET_ERROR(ENOENT); 827 828 zfs_exit(zfsvfs, FTAG); 829 830 return (error); 831 } 832 833 /* 834 * Lookup entry point for the 'snapshot' directory. Try to open the 835 * snapshot if it exist, creating the pseudo filesystem inode as necessary. 836 */ 837 int 838 zfsctl_snapdir_lookup(struct inode *dip, const char *name, struct inode **ipp, 839 int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) 840 { 841 zfsvfs_t *zfsvfs = ITOZSB(dip); 842 uint64_t id; 843 int error; 844 845 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 846 return (error); 847 848 error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id); 849 if (error) { 850 zfs_exit(zfsvfs, FTAG); 851 return (error); 852 } 853 854 *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIRS - id, 855 &simple_dir_operations, &simple_dir_inode_operations); 856 if (*ipp == NULL) 857 error = SET_ERROR(ENOENT); 858 859 zfs_exit(zfsvfs, FTAG); 860 861 return (error); 862 } 863 864 /* 865 * Renaming a directory under '.zfs/snapshot' will automatically trigger 866 * a rename of the snapshot to the new given name. The rename is confined 867 * to the '.zfs/snapshot' directory snapshots cannot be moved elsewhere. 868 */ 869 int 870 zfsctl_snapdir_rename(struct inode *sdip, const char *snm, 871 struct inode *tdip, const char *tnm, cred_t *cr, int flags) 872 { 873 zfsvfs_t *zfsvfs = ITOZSB(sdip); 874 char *to, *from, *real, *fsname; 875 int error; 876 877 if (!zfs_admin_snapshot) 878 return (SET_ERROR(EACCES)); 879 880 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 881 return (error); 882 883 to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); 884 from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); 885 real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); 886 fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); 887 888 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { 889 error = dmu_snapshot_realname(zfsvfs->z_os, snm, real, 890 ZFS_MAX_DATASET_NAME_LEN, NULL); 891 if (error == 0) { 892 snm = real; 893 } else if (error != ENOTSUP) { 894 goto out; 895 } 896 } 897 898 dmu_objset_name(zfsvfs->z_os, fsname); 899 900 error = zfsctl_snapshot_name(ITOZSB(sdip), snm, 901 ZFS_MAX_DATASET_NAME_LEN, from); 902 if (error == 0) 903 error = zfsctl_snapshot_name(ITOZSB(tdip), tnm, 904 ZFS_MAX_DATASET_NAME_LEN, to); 905 if (error == 0) 906 error = zfs_secpolicy_rename_perms(from, to, cr); 907 if (error != 0) 908 goto out; 909 910 /* 911 * Cannot move snapshots out of the snapdir. 912 */ 913 if (sdip != tdip) { 914 error = SET_ERROR(EINVAL); 915 goto out; 916 } 917 918 /* 919 * No-op when names are identical. 920 */ 921 if (strcmp(snm, tnm) == 0) { 922 error = 0; 923 goto out; 924 } 925 926 rw_enter(&zfs_snapshot_lock, RW_WRITER); 927 928 error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE); 929 if (error == 0) 930 (void) zfsctl_snapshot_rename(snm, tnm); 931 932 rw_exit(&zfs_snapshot_lock); 933 out: 934 kmem_free(from, ZFS_MAX_DATASET_NAME_LEN); 935 kmem_free(to, ZFS_MAX_DATASET_NAME_LEN); 936 kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); 937 kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN); 938 939 zfs_exit(zfsvfs, FTAG); 940 941 return (error); 942 } 943 944 /* 945 * Removing a directory under '.zfs/snapshot' will automatically trigger 946 * the removal of the snapshot with the given name. 947 */ 948 int 949 zfsctl_snapdir_remove(struct inode *dip, const char *name, cred_t *cr, 950 int flags) 951 { 952 zfsvfs_t *zfsvfs = ITOZSB(dip); 953 char *snapname, *real; 954 int error; 955 956 if (!zfs_admin_snapshot) 957 return (SET_ERROR(EACCES)); 958 959 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 960 return (error); 961 962 snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); 963 real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); 964 965 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { 966 error = dmu_snapshot_realname(zfsvfs->z_os, name, real, 967 ZFS_MAX_DATASET_NAME_LEN, NULL); 968 if (error == 0) { 969 name = real; 970 } else if (error != ENOTSUP) { 971 goto out; 972 } 973 } 974 975 error = zfsctl_snapshot_name(ITOZSB(dip), name, 976 ZFS_MAX_DATASET_NAME_LEN, snapname); 977 if (error == 0) 978 error = zfs_secpolicy_destroy_perms(snapname, cr); 979 if (error != 0) 980 goto out; 981 982 error = zfsctl_snapshot_unmount(snapname, MNT_FORCE); 983 if ((error == 0) || (error == ENOENT)) 984 error = dsl_destroy_snapshot(snapname, B_FALSE); 985 out: 986 kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); 987 kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); 988 989 zfs_exit(zfsvfs, FTAG); 990 991 return (error); 992 } 993 994 /* 995 * Creating a directory under '.zfs/snapshot' will automatically trigger 996 * the creation of a new snapshot with the given name. 997 */ 998 int 999 zfsctl_snapdir_mkdir(struct inode *dip, const char *dirname, vattr_t *vap, 1000 struct inode **ipp, cred_t *cr, int flags) 1001 { 1002 zfsvfs_t *zfsvfs = ITOZSB(dip); 1003 char *dsname; 1004 int error; 1005 1006 if (!zfs_admin_snapshot) 1007 return (SET_ERROR(EACCES)); 1008 1009 dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); 1010 1011 if (zfs_component_namecheck(dirname, NULL, NULL) != 0) { 1012 error = SET_ERROR(EILSEQ); 1013 goto out; 1014 } 1015 1016 dmu_objset_name(zfsvfs->z_os, dsname); 1017 1018 error = zfs_secpolicy_snapshot_perms(dsname, cr); 1019 if (error != 0) 1020 goto out; 1021 1022 if (error == 0) { 1023 error = dmu_objset_snapshot_one(dsname, dirname); 1024 if (error != 0) 1025 goto out; 1026 1027 error = zfsctl_snapdir_lookup(dip, dirname, ipp, 1028 0, cr, NULL, NULL); 1029 } 1030 out: 1031 kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); 1032 1033 return (error); 1034 } 1035 1036 /* 1037 * Flush everything out of the kernel's export table and such. 1038 * This is needed as once the snapshot is used over NFS, its 1039 * entries in svc_export and svc_expkey caches hold reference 1040 * to the snapshot mount point. There is no known way of flushing 1041 * only the entries related to the snapshot. 1042 */ 1043 static void 1044 exportfs_flush(void) 1045 { 1046 char *argv[] = { "/usr/sbin/exportfs", "-f", NULL }; 1047 char *envp[] = { NULL }; 1048 1049 (void) call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); 1050 } 1051 1052 /* 1053 * Returns the path in char format for given struct path. Uses 1054 * d_path exported by kernel to convert struct path to char 1055 * format. Returns the correct path for mountpoints and chroot 1056 * environments. 1057 * 1058 * If chroot environment has directories that are mounted with 1059 * --bind or --rbind flag, d_path returns the complete path inside 1060 * chroot environment but does not return the absolute path, i.e. 1061 * the path to chroot environment is missing. 1062 */ 1063 static int 1064 get_root_path(struct path *path, char *buff, int len) 1065 { 1066 char *path_buffer, *path_ptr; 1067 int error = 0; 1068 1069 path_get(path); 1070 path_buffer = kmem_zalloc(len, KM_SLEEP); 1071 path_ptr = d_path(path, path_buffer, len); 1072 if (IS_ERR(path_ptr)) 1073 error = SET_ERROR(-PTR_ERR(path_ptr)); 1074 else 1075 strcpy(buff, path_ptr); 1076 1077 kmem_free(path_buffer, len); 1078 path_put(path); 1079 return (error); 1080 } 1081 1082 /* 1083 * Returns if the current process root is chrooted or not. Linux 1084 * kernel exposes the task_struct for current process and init. 1085 * Since init process root points to actual root filesystem when 1086 * Linux runtime is reached, we can compare the current process 1087 * root with init process root to determine if root of the current 1088 * process is different from init, which can reliably determine if 1089 * current process is in chroot context or not. 1090 */ 1091 static int 1092 is_current_chrooted(void) 1093 { 1094 struct task_struct *curr = current, *global = &init_task; 1095 struct path cr_root, gl_root; 1096 1097 task_lock(curr); 1098 get_fs_root(curr->fs, &cr_root); 1099 task_unlock(curr); 1100 1101 task_lock(global); 1102 get_fs_root(global->fs, &gl_root); 1103 task_unlock(global); 1104 1105 int chrooted = !path_equal(&cr_root, &gl_root); 1106 path_put(&gl_root); 1107 path_put(&cr_root); 1108 1109 return (chrooted); 1110 } 1111 1112 /* 1113 * Attempt to unmount a snapshot by making a call to user space. 1114 * There is no assurance that this can or will succeed, is just a 1115 * best effort. In the case where it does fail, perhaps because 1116 * it's in use, the unmount will fail harmlessly. 1117 */ 1118 int 1119 zfsctl_snapshot_unmount(const char *snapname, int flags) 1120 { 1121 char *argv[] = { "/usr/bin/env", "umount", "-t", "zfs", "-n", NULL, 1122 NULL }; 1123 char *envp[] = { NULL }; 1124 zfs_snapentry_t *se; 1125 int error; 1126 1127 rw_enter(&zfs_snapshot_lock, RW_READER); 1128 if ((se = zfsctl_snapshot_find_by_name(snapname)) == NULL) { 1129 rw_exit(&zfs_snapshot_lock); 1130 return (SET_ERROR(ENOENT)); 1131 } 1132 rw_exit(&zfs_snapshot_lock); 1133 1134 /* 1135 * Wait for any pending auto-mount to complete before unmounting. 1136 */ 1137 mutex_enter(&se->se_mtx); 1138 while (se->se_mounting) 1139 cv_wait(&se->se_cv, &se->se_mtx); 1140 mutex_exit(&se->se_mtx); 1141 1142 if (flags & MNT_FORCE) 1143 argv[4] = "-fn"; 1144 argv[5] = se->se_path; 1145 dprintf("unmount; path=%s\n", se->se_path); 1146 error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); 1147 1148 /* 1149 * The kernel's NFS export cache can hold references to the 1150 * snapshot mountpoint and cause umount to fail. ZFS cannot 1151 * invalidate individual entries because the relevant kernel 1152 * APIs are exported GPL-only, so we issue a global flush 1153 * instead. To avoid impacting unrelated snapshots, the flush 1154 * runs only on umount failure. Not perfect, but better than 1155 * flushing unconditionally. 1156 */ 1157 if (error) { 1158 exportfs_flush(); 1159 error = call_usermodehelper(argv[0], argv, envp, 1160 UMH_WAIT_PROC); 1161 } 1162 1163 zfsctl_snapshot_rele(se); 1164 1165 /* 1166 * The umount system utility will return 256 on error. We must 1167 * assume this error is because the file system is busy so it is 1168 * converted to the more sensible EBUSY. 1169 */ 1170 if (error) 1171 error = SET_ERROR(EBUSY); 1172 1173 return (error); 1174 } 1175 1176 int 1177 zfsctl_snapshot_mount(struct path *path, int flags) 1178 { 1179 struct dentry *dentry = path->dentry; 1180 struct inode *ip = dentry->d_inode; 1181 zfsvfs_t *zfsvfs; 1182 zfsvfs_t *snap_zfsvfs; 1183 zfs_snapentry_t *se; 1184 char *full_name, *full_path, *options; 1185 char *argv[] = { "/usr/bin/env", "mount", "-i", "-t", "zfs", "-n", 1186 "-o", NULL, NULL, NULL, NULL }; 1187 char *envp[] = { NULL }; 1188 int error; 1189 struct path spath; 1190 1191 if (ip == NULL) 1192 return (SET_ERROR(EISDIR)); 1193 1194 zfsvfs = ITOZSB(ip); 1195 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 1196 return (error); 1197 1198 full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); 1199 full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 1200 options = kmem_zalloc(7, KM_SLEEP); 1201 1202 error = zfsctl_snapshot_name(zfsvfs, dname(dentry), 1203 ZFS_MAX_DATASET_NAME_LEN, full_name); 1204 if (error) { 1205 zfs_exit(zfsvfs, FTAG); 1206 goto error; 1207 } 1208 1209 if (is_current_chrooted() == 0) { 1210 /* 1211 * Current process is not in chroot context 1212 */ 1213 1214 char *m = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 1215 struct path mnt_path; 1216 mnt_path.mnt = path->mnt; 1217 mnt_path.dentry = path->mnt->mnt_root; 1218 1219 /* 1220 * Get path to current mountpoint 1221 */ 1222 error = get_root_path(&mnt_path, m, MAXPATHLEN); 1223 if (error != 0) { 1224 kmem_free(m, MAXPATHLEN); 1225 zfs_exit(zfsvfs, FTAG); 1226 goto error; 1227 } 1228 mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock); 1229 if (zfsvfs->z_vfs->vfs_mntpoint != NULL) { 1230 /* 1231 * If current mnountpoint and vfs_mntpoint are not same, 1232 * store current mountpoint in vfs_mntpoint. 1233 */ 1234 if (strcmp(zfsvfs->z_vfs->vfs_mntpoint, m) != 0) { 1235 kmem_strfree(zfsvfs->z_vfs->vfs_mntpoint); 1236 zfsvfs->z_vfs->vfs_mntpoint = kmem_strdup(m); 1237 } 1238 } else 1239 zfsvfs->z_vfs->vfs_mntpoint = kmem_strdup(m); 1240 mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock); 1241 kmem_free(m, MAXPATHLEN); 1242 } 1243 1244 /* 1245 * Construct a mount point path from sb of the ctldir inode and dirent 1246 * name, instead of from d_path(), so that chroot'd process doesn't fail 1247 * on mount.zfs(8). 1248 */ 1249 mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock); 1250 snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s", 1251 zfsvfs->z_vfs->vfs_mntpoint ? zfsvfs->z_vfs->vfs_mntpoint : "", 1252 dname(dentry)); 1253 mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock); 1254 1255 snprintf(options, 7, "%s", 1256 zfs_snapshot_no_setuid ? "nosuid" : "suid"); 1257 1258 /* 1259 * Release z_teardown_lock before potentially blocking operations 1260 * (cv_wait for concurrent mounts, call_usermodehelper for the mount 1261 * helper). Holding z_teardown_lock(R) across call_usermodehelper 1262 * deadlocks with namespace_sem: the mount helper needs 1263 * namespace_sem(W) via move_mount, while /proc/self/mountinfo 1264 * readers hold namespace_sem(R) and need z_teardown_lock(R) via 1265 * zpl_show_devname. A concurrent zfs_suspend_fs queuing 1266 * z_teardown_lock(W) blocks new readers, completing the cycle. 1267 * See https://github.com/openzfs/zfs/issues/18409 1268 * 1269 * Releasing the lock allows zfs_suspend_fs to proceed during 1270 * the mount, so dmu_objset_hold in zpl_get_tree can transiently 1271 * fail with ENOENT during the clone swap. The mount helper 1272 * fails, this function returns EISDIR, and the VFS silently 1273 * falls back to the ctldir stub (empty directory). The caller 1274 * gets the stub inode instead of the real snapshot root until 1275 * the next access retries the automount. 1276 * 1277 * Safe because everything below operates on local string copies 1278 * (full_name, full_path) or uses its own synchronization 1279 * (zfs_snapshot_lock, se_mtx). The parent zfsvfs pointer 1280 * remains valid because we hold a path reference to the 1281 * automount trigger dentry. 1282 */ 1283 zfs_exit(zfsvfs, FTAG); 1284 1285 /* 1286 * Check if snapshot is already being mounted. If found, wait for 1287 * pending mount to complete before returning success. 1288 */ 1289 rw_enter(&zfs_snapshot_lock, RW_WRITER); 1290 if ((se = zfsctl_snapshot_find_by_name(full_name)) != NULL) { 1291 rw_exit(&zfs_snapshot_lock); 1292 mutex_enter(&se->se_mtx); 1293 while (se->se_mounting) 1294 cv_wait(&se->se_cv, &se->se_mtx); 1295 1296 /* 1297 * Return the same error as the first mount attempt (0 if 1298 * succeeded, error code if failed). 1299 */ 1300 error = se->se_mount_error; 1301 mutex_exit(&se->se_mtx); 1302 zfsctl_snapshot_rele(se); 1303 goto error; 1304 } 1305 1306 /* 1307 * Create pending entry and mark mount in progress. 1308 */ 1309 se = zfsctl_snapshot_alloc(full_name, full_path, NULL, 0, NULL); 1310 se->se_mounting = B_TRUE; 1311 zfsctl_snapshot_add(se); 1312 zfsctl_snapshot_hold(se); 1313 rw_exit(&zfs_snapshot_lock); 1314 1315 /* 1316 * Attempt to mount the snapshot from user space. Normally this 1317 * would be done using the vfs_kern_mount() function, however that 1318 * function is marked GPL-only and cannot be used. On error we 1319 * careful to log the real error to the console and return EISDIR 1320 * to safely abort the automount. This should be very rare. 1321 * 1322 * If the user mode helper happens to return EBUSY, a concurrent 1323 * mount is already in progress in which case the error is ignored. 1324 * Take note that if the program was executed successfully the return 1325 * value from call_usermodehelper() will be (exitcode << 8 + signal). 1326 */ 1327 dprintf("mount; name=%s path=%s\n", full_name, full_path); 1328 argv[7] = options; 1329 argv[8] = full_name; 1330 argv[9] = full_path; 1331 error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); 1332 if (error) { 1333 /* 1334 * Mount failed - cleanup pending entry and signal waiters. 1335 */ 1336 if (!(error & MOUNT_BUSY << 8)) { 1337 zfs_dbgmsg("Unable to automount %s error=%d", 1338 full_path, error); 1339 error = SET_ERROR(EISDIR); 1340 } else { 1341 /* 1342 * EBUSY, this could mean a concurrent mount, or the 1343 * snapshot has already been mounted at completely 1344 * different place. We return 0 so VFS will retry. For 1345 * the latter case the VFS will retry several times 1346 * and return ELOOP, which is probably not a very good 1347 * behavior. 1348 */ 1349 error = 0; 1350 } 1351 1352 rw_enter(&zfs_snapshot_lock, RW_WRITER); 1353 zfsctl_snapshot_remove(se); 1354 rw_exit(&zfs_snapshot_lock); 1355 mutex_enter(&se->se_mtx); 1356 se->se_mount_error = error; 1357 se->se_mounting = B_FALSE; 1358 cv_broadcast(&se->se_cv); 1359 mutex_exit(&se->se_mtx); 1360 zfsctl_snapshot_rele(se); 1361 goto error; 1362 } 1363 1364 /* 1365 * Follow down in to the mounted snapshot and set MNT_SHRINKABLE 1366 * to identify this as an automounted filesystem. 1367 */ 1368 spath = *path; 1369 path_get(&spath); 1370 if (follow_down_one(&spath)) { 1371 snap_zfsvfs = ITOZSB(spath.dentry->d_inode); 1372 snap_zfsvfs->z_parent = zfsvfs; 1373 dentry = spath.dentry; 1374 spath.mnt->mnt_flags |= MNT_SHRINKABLE; 1375 1376 rw_enter(&zfs_snapshot_lock, RW_WRITER); 1377 zfsctl_snapshot_fill(se, snap_zfsvfs->z_os->os_spa, 1378 dmu_objset_id(snap_zfsvfs->z_os), dentry); 1379 zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot); 1380 rw_exit(&zfs_snapshot_lock); 1381 } else { 1382 rw_enter(&zfs_snapshot_lock, RW_WRITER); 1383 zfsctl_snapshot_remove(se); 1384 rw_exit(&zfs_snapshot_lock); 1385 } 1386 path_put(&spath); 1387 1388 /* 1389 * Signal mount completion and cleanup. 1390 */ 1391 mutex_enter(&se->se_mtx); 1392 se->se_mounting = B_FALSE; 1393 cv_broadcast(&se->se_cv); 1394 mutex_exit(&se->se_mtx); 1395 zfsctl_snapshot_rele(se); 1396 error: 1397 kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN); 1398 kmem_free(full_path, MAXPATHLEN); 1399 kmem_free(options, 7); 1400 1401 return (error); 1402 } 1403 1404 /* 1405 * Get the snapdir inode from fid 1406 */ 1407 int 1408 zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, int gen, 1409 struct inode **ipp) 1410 { 1411 zfsvfs_t *zfsvfs = sb->s_fs_info; 1412 int error; 1413 struct path path; 1414 char *mnt; 1415 struct dentry *dentry; 1416 zfs_snapentry_t *se; 1417 1418 mnt = kmem_alloc(MAXPATHLEN, KM_SLEEP); 1419 1420 /* 1421 * Try the in-memory AVL tree first for previously mounted 1422 * snapshots, falling back to the on-disk scan if not found. 1423 */ 1424 rw_enter(&zfs_snapshot_lock, RW_READER); 1425 se = zfsctl_snapshot_find_by_objsetid(zfsvfs->z_os->os_spa, objsetid); 1426 rw_exit(&zfs_snapshot_lock); 1427 if (se != NULL) { 1428 strlcpy(mnt, se->se_path, MAXPATHLEN); 1429 zfsctl_snapshot_rele(se); 1430 } else { 1431 error = zfsctl_snapshot_path_objset(zfsvfs, objsetid, 1432 MAXPATHLEN, mnt); 1433 if (error) 1434 goto out; 1435 } 1436 1437 /* Trigger automount */ 1438 error = -kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path); 1439 if (error) 1440 goto out; 1441 1442 path_put(&path); 1443 /* 1444 * Get the snapdir inode. Note, we don't want to use the above 1445 * path because it contains the root of the snapshot rather 1446 * than the snapdir. 1447 */ 1448 *ipp = ilookup(sb, ZFSCTL_INO_SNAPDIRS - objsetid); 1449 if (*ipp == NULL) { 1450 error = SET_ERROR(ENOENT); 1451 goto out; 1452 } 1453 1454 /* check gen, see zfsctl_snapdir_fid */ 1455 dentry = d_obtain_alias(igrab(*ipp)); 1456 if (gen != (!IS_ERR(dentry) && d_mountpoint(dentry))) { 1457 iput(*ipp); 1458 *ipp = NULL; 1459 error = SET_ERROR(ENOENT); 1460 } 1461 if (!IS_ERR(dentry)) 1462 dput(dentry); 1463 out: 1464 kmem_free(mnt, MAXPATHLEN); 1465 return (error); 1466 } 1467 1468 int 1469 zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp, 1470 int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) 1471 { 1472 zfsvfs_t *zfsvfs = ITOZSB(dip); 1473 znode_t *zp; 1474 znode_t *dzp; 1475 int error; 1476 1477 if ((error = zfs_enter(zfsvfs, FTAG)) != 0) 1478 return (error); 1479 1480 if (zfsvfs->z_shares_dir == 0) { 1481 zfs_exit(zfsvfs, FTAG); 1482 return (SET_ERROR(ENOTSUP)); 1483 } 1484 1485 if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { 1486 error = zfs_lookup(dzp, name, &zp, 0, cr, NULL, NULL); 1487 zrele(dzp); 1488 } 1489 1490 zfs_exit(zfsvfs, FTAG); 1491 1492 return (error); 1493 } 1494 1495 /* 1496 * Initialize the various pieces we'll need to create and manipulate .zfs 1497 * directories. Currently this is unused but available. 1498 */ 1499 void 1500 zfsctl_init(void) 1501 { 1502 avl_create(&zfs_snapshots_by_name, snapentry_compare_by_name, 1503 sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, 1504 se_node_name)); 1505 avl_create(&zfs_snapshots_by_objsetid, snapentry_compare_by_objsetid, 1506 sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, 1507 se_node_objsetid)); 1508 rw_init(&zfs_snapshot_lock, NULL, RW_DEFAULT, NULL); 1509 } 1510 1511 /* 1512 * Cleanup the various pieces we needed for .zfs directories. In particular 1513 * ensure the expiry timer is canceled safely. 1514 */ 1515 void 1516 zfsctl_fini(void) 1517 { 1518 avl_destroy(&zfs_snapshots_by_name); 1519 avl_destroy(&zfs_snapshots_by_objsetid); 1520 rw_destroy(&zfs_snapshot_lock); 1521 } 1522 1523 module_param(zfs_admin_snapshot, int, 0644); 1524 MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot"); 1525 1526 module_param(zfs_expire_snapshot, int, 0644); 1527 MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot"); 1528 1529 module_param(zfs_snapshot_no_setuid, int, 0644); 1530 MODULE_PARM_DESC(zfs_snapshot_no_setuid, 1531 "Disable setuid/setgid for automounts in .zfs/snapshot"); 1532