/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1999-2004 Poul-Henning Kamp
 * Copyright (c) 1999 Michael Smith
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/conf.h>
#include <sys/smp.h>
#include <sys/devctl.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/libkern.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/filedesc.h>
#include <sys/reboot.h>
#include <sys/sbuf.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/vnode.h>
#include <vm/uma.h>

#include <geom/geom.h>

#include <machine/stdarg.h>

#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>

#define	VFS_MOUNTARG_SIZE_MAX	(1024 * 64)

static int	vfs_domount(struct thread *td, const char *fstype, char *fspath,
		    uint64_t fsflags, bool jail_export,
		    struct vfsoptlist **optlist);
static void	free_mntarg(struct mntarg *ma);

static int	usermount = 0;
SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
    "Unprivileged users may mount and unmount file systems");

static bool	default_autoro = false;
SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0,
    "Retry failed r/w mount as r/o if no explicit ro/rw option is specified");

static bool	recursive_forced_unmount = false;
SYSCTL_BOOL(_vfs, OID_AUTO, recursive_forced_unmount, CTLFLAG_RW,
    &recursive_forced_unmount, 0, "Recursively unmount stacked upper mounts"
    " when a file system is forcibly unmounted");

static SYSCTL_NODE(_vfs, OID_AUTO, deferred_unmount,
    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "deferred unmount controls");

static unsigned int	deferred_unmount_retry_limit = 10;
SYSCTL_UINT(_vfs_deferred_unmount, OID_AUTO, retry_limit, CTLFLAG_RW,
    &deferred_unmount_retry_limit, 0,
    "Maximum number of retries for deferred unmount failure");

static int	deferred_unmount_retry_delay_hz;
SYSCTL_INT(_vfs_deferred_unmount, OID_AUTO, retry_delay_hz, CTLFLAG_RW,
    &deferred_unmount_retry_delay_hz, 0,
    "Delay in units of [1/kern.hz]s when retrying a failed deferred unmount");

static int	deferred_unmount_total_retries = 0;
SYSCTL_INT(_vfs_deferred_unmount, OID_AUTO, total_retries, CTLFLAG_RD,
    &deferred_unmount_total_retries, 0,
    "Total number of retried deferred unmounts");

MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure");
static uma_zone_t mount_zone;

/* List of mounted filesystems. */
struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);

/* For any iteration/modification of mountlist */
struct mtx_padalign __exclusive_cache_line	mountlist_mtx;

EVENTHANDLER_LIST_DEFINE(vfs_mounted);
EVENTHANDLER_LIST_DEFINE(vfs_unmounted);

static void vfs_deferred_unmount(void *arg, int pending);
static struct timeout_task deferred_unmount_task;
static struct mtx deferred_unmount_lock;
MTX_SYSINIT(deferred_unmount, &deferred_unmount_lock, "deferred_unmount",
    MTX_DEF);
static STAILQ_HEAD(, mount) deferred_unmount_list =
    STAILQ_HEAD_INITIALIZER(deferred_unmount_list);
TASKQUEUE_DEFINE_THREAD(deferred_unmount);

static void mount_devctl_event(const char *type, struct mount *mp, bool donew);

/*
 * Global opts, taken by all filesystems
 */
static const char *global_opts[] = {
	"errmsg",
	"fstype",
	"fspath",
	"ro",
	"rw",
	"nosuid",
	"noexec",
	NULL
};

static int
mount_init(void *mem, int size, int flags)
{
	struct mount *mp;

	mp = (struct mount *)mem;
	mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
	mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF);
	lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
	mp->mnt_pcpu = uma_zalloc_pcpu(pcpu_zone_16, M_WAITOK | M_ZERO);
	mp->mnt_ref = 0;
	mp->mnt_vfs_ops = 1;
	mp->mnt_rootvnode = NULL;
	return (0);
}

static void
mount_fini(void *mem, int size)
{
	struct mount *mp;

	mp = (struct mount *)mem;
	uma_zfree_pcpu(pcpu_zone_16, mp->mnt_pcpu);
	lockdestroy(&mp->mnt_explock);
	mtx_destroy(&mp->mnt_listmtx);
	mtx_destroy(&mp->mnt_mtx);
}

static void
vfs_mount_init(void *dummy __unused)
{
	TIMEOUT_TASK_INIT(taskqueue_deferred_unmount, &deferred_unmount_task,
	    0, vfs_deferred_unmount, NULL);
	deferred_unmount_retry_delay_hz = hz;
	mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL,
	    NULL, mount_init, mount_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
	mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF);
}
SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL);

/*
 * ---------------------------------------------------------------------
 * Functions for building and sanitizing the mount options
 */

/* Remove one mount option. */
static void
vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt)
{

	TAILQ_REMOVE(opts, opt, link);
	free(opt->name, M_MOUNT);
	if (opt->value != NULL)
		free(opt->value, M_MOUNT);
	free(opt, M_MOUNT);
}

/* Release all resources related to the mount options. */
void
vfs_freeopts(struct vfsoptlist *opts)
{
	struct vfsopt *opt;

	while (!TAILQ_EMPTY(opts)) {
		opt = TAILQ_FIRST(opts);
		vfs_freeopt(opts, opt);
	}
	free(opts, M_MOUNT);
}

void
vfs_deleteopt(struct vfsoptlist *opts, const char *name)
{
	struct vfsopt *opt, *temp;

	if (opts == NULL)
		return;
	TAILQ_FOREACH_SAFE(opt, opts, link, temp) {
		if (strcmp(opt->name, name) == 0)
			vfs_freeopt(opts, opt);
	}
}

static int
vfs_isopt_ro(const char *opt)
{

	if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 ||
	    strcmp(opt, "norw") == 0)
		return (1);
	return (0);
}

static int
vfs_isopt_rw(const char *opt)
{

	if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0)
		return (1);
	return (0);
}

/*
 * Check if options are equal (with or without the "no" prefix).
 */
static int
vfs_equalopts(const char *opt1, const char *opt2)
{
	char *p;

	/* "opt" vs. "opt" or "noopt" vs. "noopt" */
	if (strcmp(opt1, opt2) == 0)
		return (1);
	/* "noopt" vs. "opt" */
	if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
		return (1);
	/* "opt" vs. "noopt" */
	if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
		return (1);
	while ((p = strchr(opt1, '.')) != NULL &&
	    !strncmp(opt1, opt2, ++p - opt1)) {
		opt2 += p - opt1;
		opt1 = p;
		/* "foo.noopt" vs. "foo.opt" */
		if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
			return (1);
		/* "foo.opt" vs. "foo.noopt" */
		if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
			return (1);
	}
	/* "ro" / "rdonly" / "norw" / "rw" / "noro" */
	if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) &&
	    (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2)))
		return (1);
	return (0);
}

/*
 * If a mount option is specified several times
 * (with or without the "no" prefix), only keep
 * the last occurrence of it.
 */
static void
vfs_sanitizeopts(struct vfsoptlist *opts)
{
	struct vfsopt *opt, *opt2, *tmp;

	TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) {
		opt2 = TAILQ_PREV(opt, vfsoptlist, link);
		while (opt2 != NULL) {
			if (vfs_equalopts(opt->name, opt2->name)) {
				tmp = TAILQ_PREV(opt2, vfsoptlist, link);
				vfs_freeopt(opts, opt2);
				opt2 = tmp;
			} else {
				opt2 = TAILQ_PREV(opt2, vfsoptlist, link);
			}
		}
	}
}

/*
 * Build a linked list of mount options from a struct uio.
 */
int
vfs_buildopts(struct uio *auio, struct vfsoptlist **options)
{
	struct vfsoptlist *opts;
	struct vfsopt *opt;
	size_t memused, namelen, optlen;
	unsigned int i, iovcnt;
	int error;

	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
	TAILQ_INIT(opts);
	memused = 0;
	iovcnt = auio->uio_iovcnt;
	for (i = 0; i < iovcnt; i += 2) {
		namelen = auio->uio_iov[i].iov_len;
		optlen = auio->uio_iov[i + 1].iov_len;
		memused += sizeof(struct vfsopt) + optlen + namelen;
		/*
		 * Avoid consuming too much memory, and attempts to overflow
		 * memused.
		 */
		if (memused > VFS_MOUNTARG_SIZE_MAX ||
		    optlen > VFS_MOUNTARG_SIZE_MAX ||
		    namelen > VFS_MOUNTARG_SIZE_MAX) {
			error = EINVAL;
			goto bad;
		}

		opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
		opt->name = malloc(namelen, M_MOUNT, M_WAITOK);
		opt->value = NULL;
		opt->len = 0;
		opt->pos = i / 2;
		opt->seen = 0;

		/*
		 * Do this early, so jumps to "bad" will free the current
		 * option.
		 */
		TAILQ_INSERT_TAIL(opts, opt, link);

		if (auio->uio_segflg == UIO_SYSSPACE) {
			bcopy(auio->uio_iov[i].iov_base, opt->name, namelen);
		} else {
			error = copyin(auio->uio_iov[i].iov_base, opt->name,
			    namelen);
			if (error)
				goto bad;
		}
		/* Ensure names are null-terminated strings. */
		if (namelen == 0 || opt->name[namelen - 1] != '\0') {
			error = EINVAL;
			goto bad;
		}
		if (optlen != 0) {
			opt->len = optlen;
			opt->value = malloc(optlen, M_MOUNT, M_WAITOK);
			if (auio->uio_segflg == UIO_SYSSPACE) {
				bcopy(auio->uio_iov[i + 1].iov_base, opt->value,
				    optlen);
			} else {
				error = copyin(auio->uio_iov[i + 1].iov_base,
				    opt->value, optlen);
				if (error)
					goto bad;
			}
		}
	}
	vfs_sanitizeopts(opts);
	*options = opts;
	return (0);
bad:
	vfs_freeopts(opts);
	return (error);
}

/*
 * Merge the old mount options with the new ones passed
 * in the MNT_UPDATE case.
 *
 * XXX: This function will keep a "nofoo" option in the new
 * options.  E.g., if the option's canonical name is "foo",
 * "nofoo" ends up in the mount point's active options.
 */
static void
vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts)
{
	struct vfsopt *opt, *new;

	TAILQ_FOREACH(opt, oldopts, link) {
		new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
		new->name = strdup(opt->name, M_MOUNT);
		if (opt->len != 0) {
			new->value = malloc(opt->len, M_MOUNT, M_WAITOK);
			bcopy(opt->value, new->value, opt->len);
		} else
			new->value = NULL;
		new->len = opt->len;
		new->seen = opt->seen;
		TAILQ_INSERT_HEAD(toopts, new, link);
	}
	vfs_sanitizeopts(toopts);
}

/*
 * Mount a filesystem.
 */
#ifndef _SYS_SYSPROTO_H_
struct nmount_args {
	struct iovec	*iovp;
	unsigned int	iovcnt;
	int		flags;
};
#endif
int
sys_nmount(struct thread *td, struct nmount_args *uap)
{
	struct uio *auio;
	int error;
	u_int iovcnt;
	uint64_t flags;

	/*
	 * Mount flags are now 64-bits. On 32-bit architectures only
	 * 32-bits are passed in, but from here on everything handles
	 * 64-bit flags correctly.
	 */
	flags = uap->flags;

	AUDIT_ARG_FFLAGS(flags);
	CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__,
	    uap->iovp, uap->iovcnt, flags);

	/*
	 * Filter out MNT_ROOTFS.  We do not want clients of nmount() in
	 * userspace to set this flag, but we must filter it out if we want
	 * MNT_UPDATE on the root file system to work.
	 * MNT_ROOTFS should only be set by the kernel when mounting its
	 * root file system.
	 */
	flags &= ~MNT_ROOTFS;

	iovcnt = uap->iovcnt;
	/*
	 * Check that we have an even number of iovecs
	 * and that we have at least two options.
	 */
	if ((iovcnt & 1) || (iovcnt < 4)) {
		CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__,
		    uap->iovcnt);
		return (EINVAL);
	}

	error = copyinuio(uap->iovp, iovcnt, &auio);
	if (error) {
		CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno",
		    __func__, error);
		return (error);
	}
	error = vfs_donmount(td, flags, auio);

	free(auio, M_IOV);
	return (error);
}

/*
 * ---------------------------------------------------------------------
 * Various utility functions
 */

/*
 * Get a reference on a mount point from a vnode.
 *
 * The vnode is allowed to be passed unlocked and race against dooming. Note in
 * such case there are no guarantees the referenced mount point will still be
 * associated with it after the function returns.
 */
struct mount *
vfs_ref_from_vp(struct vnode *vp)
{
	struct mount *mp;
	struct mount_pcpu *mpcpu;

	mp = atomic_load_ptr(&vp->v_mount);
	if (__predict_false(mp == NULL)) {
		return (mp);
	}
	if (vfs_op_thread_enter(mp, mpcpu)) {
		if (__predict_true(mp == vp->v_mount)) {
			vfs_mp_count_add_pcpu(mpcpu, ref, 1);
			vfs_op_thread_exit(mp, mpcpu);
		} else {
			vfs_op_thread_exit(mp, mpcpu);
			mp = NULL;
		}
	} else {
		MNT_ILOCK(mp);
		if (mp == vp->v_mount) {
			MNT_REF(mp);
			MNT_IUNLOCK(mp);
		} else {
			MNT_IUNLOCK(mp);
			mp = NULL;
		}
	}
	return (mp);
}

void
vfs_ref(struct mount *mp)
{
	struct mount_pcpu *mpcpu;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
	if (vfs_op_thread_enter(mp, mpcpu)) {
		vfs_mp_count_add_pcpu(mpcpu, ref, 1);
		vfs_op_thread_exit(mp, mpcpu);
		return;
	}

	MNT_ILOCK(mp);
	MNT_REF(mp);
	MNT_IUNLOCK(mp);
}

/*
 * Register ump as an upper mount of the mount associated with
 * vnode vp.  This registration will be tracked through
 * mount_upper_node upper, which should be allocated by the
 * caller and stored in per-mount data associated with mp.
 *
 * If successful, this function will return the mount associated
 * with vp, and will ensure that it cannot be unmounted until
 * ump has been unregistered as one of its upper mounts.
 *
 * Upon failure this function will return NULL.
 */
struct mount *
vfs_register_upper_from_vp(struct vnode *vp, struct mount *ump,
    struct mount_upper_node *upper)
{
	struct mount *mp;

	mp = atomic_load_ptr(&vp->v_mount);
	if (mp == NULL)
		return (NULL);
	MNT_ILOCK(mp);
	if (mp != vp->v_mount ||
	    ((mp->mnt_kern_flag & (MNTK_UNMOUNT | MNTK_RECURSE)) != 0)) {
		MNT_IUNLOCK(mp);
		return (NULL);
	}
	KASSERT(ump != mp, ("upper and lower mounts are identical"));
	upper->mp = ump;
	MNT_REF(mp);
	TAILQ_INSERT_TAIL(&mp->mnt_uppers, upper, mnt_upper_link);
	MNT_IUNLOCK(mp);
	return (mp);
}

/*
 * Register upper mount ump to receive vnode unlink/reclaim
 * notifications from lower mount mp.  This registration will
 * be tracked through mount_upper_node upper, which should be
 * allocated by the caller and stored in per-mount data
 * associated with mp.
 *
 * ump must already be registered as an upper mount of mp
 * through a call to vfs_register_upper_from_vp().
 */
void
vfs_register_for_notification(struct mount *mp, struct mount *ump,
    struct mount_upper_node *upper)
{
	upper->mp = ump;
	MNT_ILOCK(mp);
	TAILQ_INSERT_TAIL(&mp->mnt_notify, upper, mnt_upper_link);
	MNT_IUNLOCK(mp);
}

static void
vfs_drain_upper_locked(struct mount *mp)
{
	mtx_assert(MNT_MTX(mp), MA_OWNED);
	while (mp->mnt_upper_pending != 0) {
		mp->mnt_kern_flag |= MNTK_UPPER_WAITER;
		msleep(&mp->mnt_uppers, MNT_MTX(mp), 0, "mntupw", 0);
	}
}

/*
 * Undo a previous call to vfs_register_for_notification().
 * The mount represented by upper must be currently registered
 * as an upper mount for mp.
 */
void
vfs_unregister_for_notification(struct mount *mp,
    struct mount_upper_node *upper)
{
	MNT_ILOCK(mp);
	vfs_drain_upper_locked(mp);
	TAILQ_REMOVE(&mp->mnt_notify, upper, mnt_upper_link);
	MNT_IUNLOCK(mp);
}

/*
 * Undo a previous call to vfs_register_upper_from_vp().
 * This must be done before mp can be unmounted.
 */
void
vfs_unregister_upper(struct mount *mp, struct mount_upper_node *upper)
{
	MNT_ILOCK(mp);
	KASSERT((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0,
	    ("registered upper with pending unmount"));
	vfs_drain_upper_locked(mp);
	TAILQ_REMOVE(&mp->mnt_uppers, upper, mnt_upper_link);
	if ((mp->mnt_kern_flag & MNTK_TASKQUEUE_WAITER) != 0 &&
	    TAILQ_EMPTY(&mp->mnt_uppers)) {
		mp->mnt_kern_flag &= ~MNTK_TASKQUEUE_WAITER;
		wakeup(&mp->mnt_taskqueue_link);
	}
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
}

void
vfs_rel(struct mount *mp)
{
	struct mount_pcpu *mpcpu;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
	if (vfs_op_thread_enter(mp, mpcpu)) {
		vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
		vfs_op_thread_exit(mp, mpcpu);
		return;
	}

	MNT_ILOCK(mp);
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
}

/*
 * Allocate and initialize the mount point struct.
 */
struct mount *
vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
    struct ucred *cred)
{
	struct mount *mp;

	mp = uma_zalloc(mount_zone, M_WAITOK);
	bzero(&mp->mnt_startzero,
	    __rangeof(struct mount, mnt_startzero, mnt_endzero));
	mp->mnt_kern_flag = 0;
	mp->mnt_flag = 0;
	mp->mnt_rootvnode = NULL;
	mp->mnt_vnodecovered = NULL;
	mp->mnt_op = NULL;
	mp->mnt_vfc = NULL;
	TAILQ_INIT(&mp->mnt_nvnodelist);
	mp->mnt_nvnodelistsize = 0;
	TAILQ_INIT(&mp->mnt_lazyvnodelist);
	mp->mnt_lazyvnodelistsize = 0;
	MPPASS(mp->mnt_ref == 0 && mp->mnt_lockref == 0 &&
	    mp->mnt_writeopcount == 0, mp);
	MPASSERT(mp->mnt_vfs_ops == 1, mp,
	    ("vfs_ops should be 1 but %d found", mp->mnt_vfs_ops));
	(void) vfs_busy(mp, MBF_NOWAIT);
	atomic_add_acq_int(&vfsp->vfc_refcount, 1);
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vfc = vfsp;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_gen++;
	strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_vnodecovered = vp;
	mp->mnt_cred = crdup(cred);
	mp->mnt_stat.f_owner = cred->cr_uid;
	strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
	mp->mnt_iosize_max = DFLTPHYS;
#ifdef MAC
	mac_mount_init(mp);
	mac_mount_create(cred, mp);
#endif
	arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
	mp->mnt_upper_pending = 0;
	TAILQ_INIT(&mp->mnt_uppers);
	TAILQ_INIT(&mp->mnt_notify);
	mp->mnt_taskqueue_flags = 0;
	mp->mnt_unmount_retries = 0;
	return (mp);
}

/*
 * Destroy the mount struct previously allocated by vfs_mount_alloc().
 */
void
vfs_mount_destroy(struct mount *mp)
{

	MPPASS(mp->mnt_vfs_ops != 0, mp);

	vfs_assert_mount_counters(mp);

	MNT_ILOCK(mp);
	mp->mnt_kern_flag |= MNTK_REFEXPIRE;
	if (mp->mnt_kern_flag & MNTK_MWAIT) {
		mp->mnt_kern_flag &= ~MNTK_MWAIT;
		wakeup(mp);
	}
	while (mp->mnt_ref)
		msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0);
	KASSERT(mp->mnt_ref == 0,
	    ("%s: invalid refcount in the drain path @ %s:%d", __func__,
	    __FILE__, __LINE__));
	MPPASS(mp->mnt_writeopcount == 0, mp);
	MPPASS(mp->mnt_secondary_writes == 0, mp);
	atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1);
	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
		struct vnode *vp;

		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes)
			vn_printf(vp, "dangling vnode ");
		panic("unmount: dangling vnode");
	}
	KASSERT(mp->mnt_upper_pending == 0, ("mnt_upper_pending"));
	KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers"));
	KASSERT(TAILQ_EMPTY(&mp->mnt_notify), ("mnt_notify"));
	MPPASS(mp->mnt_nvnodelistsize == 0, mp);
	MPPASS(mp->mnt_lazyvnodelistsize == 0, mp);
	MPPASS(mp->mnt_lockref == 0, mp);
	MNT_IUNLOCK(mp);

	MPASSERT(mp->mnt_vfs_ops == 1, mp,
	    ("vfs_ops should be 1 but %d found", mp->mnt_vfs_ops));

	MPASSERT(mp->mnt_rootvnode == NULL, mp,
	    ("mount point still has a root vnode %p", mp->mnt_rootvnode));

	if (mp->mnt_vnodecovered != NULL)
		vrele(mp->mnt_vnodecovered);
#ifdef MAC
	mac_mount_destroy(mp);
#endif
	if (mp->mnt_opt != NULL)
		vfs_freeopts(mp->mnt_opt);
	if (mp->mnt_exjail != NULL) {
		atomic_subtract_int(&mp->mnt_exjail->cr_prison->pr_exportcnt,
		    1);
		crfree(mp->mnt_exjail);
	}
	if (mp->mnt_export != NULL) {
		vfs_free_addrlist(mp->mnt_export);
		free(mp->mnt_export, M_MOUNT);
	}
	crfree(mp->mnt_cred);
	uma_zfree(mount_zone, mp);
}
static bool
vfs_should_downgrade_to_ro_mount(uint64_t fsflags, int error)
{
	/* This is an upgrade of an existing mount. */
	if ((fsflags & MNT_UPDATE) != 0)
		return (false);
	/* This is already an R/O mount. */
	if ((fsflags & MNT_RDONLY) != 0)
		return (false);

	switch (error) {
	case ENODEV:		/* generic, geom, ... */
	case EACCES:		/* cam/scsi, ... */
	case EROFS:		/* md, mmcsd, ... */
		/*
		 * These errors can be returned by the storage layer to signal
		 * that the media is read-only.  No harm in the R/O mount
		 * attempt if the error was returned for some other reason.
		 */
		return (true);
	default:
		return (false);
	}
}

int
vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions)
{
	struct vfsoptlist *optlist;
	struct vfsopt *opt, *tmp_opt;
	char *fstype, *fspath, *errmsg;
	int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
	bool autoro, has_nonexport, jail_export;

	errmsg = fspath = NULL;
	errmsg_len = fspathlen = 0;
	errmsg_pos = -1;
	autoro = default_autoro;

	error = vfs_buildopts(fsoptions, &optlist);
	if (error)
		return (error);

	if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
		errmsg_pos = vfs_getopt_pos(optlist, "errmsg");

	/*
	 * We need these two options before the others,
	 * and they are mandatory for any filesystem.
	 * Ensure they are NUL terminated as well.
	 */
	fstypelen = 0;
	error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
	if (error || fstypelen <= 0 || fstype[fstypelen - 1] != '\0') {
		error = EINVAL;
		if (errmsg != NULL)
			strncpy(errmsg, "Invalid fstype", errmsg_len);
		goto bail;
	}
	fspathlen = 0;
	error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
	if (error || fspathlen <= 0 || fspath[fspathlen - 1] != '\0') {
		error = EINVAL;
		if (errmsg != NULL)
			strncpy(errmsg, "Invalid fspath", errmsg_len);
		goto bail;
	}

	/*
	 * Check to see that "export" is only used with the "update", "fstype",
	 * "fspath", "from" and "errmsg" options when in a vnet jail.
	 * These are the ones used to set/update exports by mountd(8).
	 * If only the above options are set in a jail that can run mountd(8),
	 * then the jail_export argument of vfs_domount() will be true.
	 * When jail_export is true, the vfs_suser() check does not cause
	 * failure, but limits the update to exports only.
	 * This allows mountd(8) running within the vnet jail
	 * to export file systems visible within the jail, but
	 * mounted outside of the jail.
	 */
	/*
	 * We need to see if we have the "update" option
	 * before we call vfs_domount(), since vfs_domount() has special
	 * logic based on MNT_UPDATE.  This is very important
	 * when we want to update the root filesystem.
	 */
	has_nonexport = false;
	jail_export = false;
	TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) {
		int do_freeopt = 0;

		if (jailed(td->td_ucred) &&
		    strcmp(opt->name, "export") != 0 &&
		    strcmp(opt->name, "update") != 0 &&
		    strcmp(opt->name, "fstype") != 0 &&
		    strcmp(opt->name, "fspath") != 0 &&
		    strcmp(opt->name, "from") != 0 &&
		    strcmp(opt->name, "errmsg") != 0)
			has_nonexport = true;
		if (strcmp(opt->name, "update") == 0) {
			fsflags |= MNT_UPDATE;
			do_freeopt = 1;
		}
		else if (strcmp(opt->name, "async") == 0)
			fsflags |= MNT_ASYNC;
		else if (strcmp(opt->name, "force") == 0) {
			fsflags |= MNT_FORCE;
			do_freeopt = 1;
		}
		else if (strcmp(opt->name, "reload") == 0) {
			fsflags |= MNT_RELOAD;
			do_freeopt = 1;
		}
		else if (strcmp(opt->name, "multilabel") == 0)
			fsflags |= MNT_MULTILABEL;
		else if (strcmp(opt->name, "noasync") == 0)
			fsflags &= ~MNT_ASYNC;
		else if (strcmp(opt->name, "noatime") == 0)
			fsflags |= MNT_NOATIME;
		else if (strcmp(opt->name, "atime") == 0) {
			free(opt->name, M_MOUNT);
			opt->name = strdup("nonoatime", M_MOUNT);
		}
		else if (strcmp(opt->name, "noclusterr") == 0)
			fsflags |= MNT_NOCLUSTERR;
		else if (strcmp(opt->name, "clusterr") == 0) {
			free(opt->name, M_MOUNT);
			opt->name = strdup("nonoclusterr", M_MOUNT);
		}
		else if (strcmp(opt->name, "noclusterw") == 0)
			fsflags |= MNT_NOCLUSTERW;
		else if (strcmp(opt->name, "clusterw") == 0) {
			free(opt->name, M_MOUNT);
			opt->name = strdup("nonoclusterw", M_MOUNT);
		}
		else if (strcmp(opt->name, "noexec") == 0)
			fsflags |= MNT_NOEXEC;
		else if (strcmp(opt->name, "exec") == 0) {
			free(opt->name, M_MOUNT);
			opt->name = strdup("nonoexec", M_MOUNT);
		}
		else if (strcmp(opt->name, "nosuid") == 0)
			fsflags |= MNT_NOSUID;
		else if (strcmp(opt->name, "suid") == 0) {
			free(opt->name, M_MOUNT);
			opt->name = strdup("nonosuid", M_MOUNT);
		}
		else if (strcmp(opt->name, "nosymfollow") == 0)
			fsflags |= MNT_NOSYMFOLLOW;
		else if (strcmp(opt->name, "symfollow") == 0) {
			free(opt->name, M_MOUNT);
			opt->name = strdup("nonosymfollow", M_MOUNT);
		}
		else if (strcmp(opt->name, "noro") == 0) {
			fsflags &= ~MNT_RDONLY;
			autoro = false;
		}
		else if (strcmp(opt->name, "rw") == 0) {
			fsflags &= ~MNT_RDONLY;
			autoro = false;
		}
		else if (strcmp(opt->name, "ro") == 0) {
			fsflags |= MNT_RDONLY;
			autoro = false;
		}
		else if (strcmp(opt->name, "rdonly") == 0) {
			free(opt->name, M_MOUNT);
			opt->name = strdup("ro", M_MOUNT);
			fsflags |= MNT_RDONLY;
			autoro = false;
		}
		else if (strcmp(opt->name, "autoro") == 0) {
			do_freeopt = 1;
			autoro = true;
		}
		else if (strcmp(opt->name, "suiddir") == 0)
			fsflags |= MNT_SUIDDIR;
		else if (strcmp(opt->name, "sync") == 0)
			fsflags |= MNT_SYNCHRONOUS;
		else if (strcmp(opt->name, "union") == 0)
			fsflags |= MNT_UNION;
		else if (strcmp(opt->name, "export") == 0) {
			fsflags |= MNT_EXPORTED;
			jail_export = true;
		} else if (strcmp(opt->name, "automounted") == 0) {
			fsflags |= MNT_AUTOMOUNTED;
			do_freeopt = 1;
		} else if (strcmp(opt->name, "nocover") == 0) {
			fsflags |= MNT_NOCOVER;
			do_freeopt = 1;
		} else if (strcmp(opt->name, "cover") == 0) {
			fsflags &= ~MNT_NOCOVER;
			do_freeopt = 1;
		} else if (strcmp(opt->name, "emptydir") == 0) {
			fsflags |= MNT_EMPTYDIR;
			do_freeopt = 1;
		} else if (strcmp(opt->name, "noemptydir") == 0) {
			fsflags &= ~MNT_EMPTYDIR;
			do_freeopt = 1;
		}
		if (do_freeopt)
			vfs_freeopt(optlist, opt);
	}

	/*
	 * Be ultra-paranoid about making sure the type and fspath
	 * variables will fit in our mp buffers, including the
	 * terminating NUL.
	 */
	if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) {
		error = ENAMETOOLONG;
		goto bail;
	}

	/*
	 * If has_nonexport is true or the caller is not running within a
	 * vnet prison that can run mountd(8), set jail_export false.
	 */
	if (has_nonexport || !jailed(td->td_ucred) ||
	    !prison_check_nfsd(td->td_ucred))
		jail_export = false;

	error = vfs_domount(td, fstype, fspath, fsflags, jail_export, &optlist);
	if (error == ENODEV) {
		error = EINVAL;
		if (errmsg != NULL)
			strncpy(errmsg, "Invalid fstype", errmsg_len);
		goto bail;
	}

	/*
	 * See if we can mount in the read-only mode if the error code suggests
	 * that it could be possible and the mount options allow for that.
	 * Never try it if "[no]{ro|rw}" has been explicitly requested and not
	 * overridden by "autoro".
	 */
	if (autoro && vfs_should_downgrade_to_ro_mount(fsflags, error)) {
		printf("%s: R/W mount failed, possibly R/O media,"
		    " trying R/O mount\n", __func__);
		fsflags |= MNT_RDONLY;
		error = vfs_domount(td, fstype, fspath, fsflags, jail_export,
		    &optlist);
	}
bail:
	/* copyout the errmsg */
	if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
	    && errmsg_len > 0 && errmsg != NULL) {
		if (fsoptions->uio_segflg == UIO_SYSSPACE) {
			bcopy(errmsg,
			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
		} else {
			(void)copyout(errmsg,
			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
		}
	}

	if (optlist != NULL)
		vfs_freeopts(optlist);
	return (error);
}

/*
 * Old mount API.
 */
#ifndef _SYS_SYSPROTO_H_
struct mount_args {
	char	*type;
	char	*path;
	int	flags;
	caddr_t	data;
};
#endif
/* ARGSUSED */
int
sys_mount(struct thread *td, struct mount_args *uap)
{
	char *fstype;
	struct vfsconf *vfsp = NULL;
	struct mntarg *ma = NULL;
	uint64_t flags;
	int error;

	/*
	 * Mount flags are now 64-bits. On 32-bit architectures only
	 * 32-bits are passed in, but from here on everything handles
	 * 64-bit flags correctly.
	 */
	flags = uap->flags;

	AUDIT_ARG_FFLAGS(flags);

	/*
	 * Filter out MNT_ROOTFS.  We do not want clients of mount() in
	 * userspace to set this flag, but we must filter it out if we want
	 * MNT_UPDATE on the root file system to work.
	 * MNT_ROOTFS should only be set by the kernel when mounting its
	 * root file system.
	 */
	flags &= ~MNT_ROOTFS;

	fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
	error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
	if (error) {
		free(fstype, M_TEMP);
		return (error);
	}

	AUDIT_ARG_TEXT(fstype);
	vfsp = vfs_byname_kld(fstype, td, &error);
	free(fstype, M_TEMP);
	if (vfsp == NULL)
		return (EINVAL);
	if (((vfsp->vfc_flags & VFCF_SBDRY) != 0 &&
	    vfsp->vfc_vfsops_sd->vfs_cmount == NULL) ||
	    ((vfsp->vfc_flags & VFCF_SBDRY) == 0 &&
	    vfsp->vfc_vfsops->vfs_cmount == NULL))
		return (EOPNOTSUPP);

	ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN);
	ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
	ma = mount_argb(ma, flags & MNT_RDONLY, "noro");
	ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid");
	ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec");

	if ((vfsp->vfc_flags & VFCF_SBDRY) != 0)
		return (vfsp->vfc_vfsops_sd->vfs_cmount(ma, uap->data, flags));
	return (vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags));
}

/*
 * vfs_domount_first(): first file system mount (not update)
 */
static int
vfs_domount_first(
	struct thread *td,		/* Calling thread. */
	struct vfsconf *vfsp,		/* File system type. */
	char *fspath,			/* Mount path. */
	struct vnode *vp,		/* Vnode to be covered. */
	uint64_t fsflags,		/* Flags common to all filesystems. */
	struct vfsoptlist **optlist	/* Options local to the filesystem. */
	)
{
	struct vattr va;
	struct mount *mp;
	struct vnode *newdp, *rootvp;
	int error, error1;
	bool unmounted;

	ASSERT_VOP_ELOCKED(vp, __func__);
	KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here"));

	/*
	 * If the jail of the calling thread lacks permission for this type of
	 * file system, or is trying to cover its own root, deny immediately.
	 */
	if (jailed(td->td_ucred) && (!prison_allow(td->td_ucred,
	    vfsp->vfc_prison_flag) || vp == td->td_ucred->cr_prison->pr_root)) {
		vput(vp);
		return (EPERM);
	}

	/*
	 * If the user is not root, ensure that they own the directory
	 * onto which we are attempting to mount.
	 */
	error = VOP_GETATTR(vp, &va, td->td_ucred);
	if (error == 0 && va.va_uid != td->td_ucred->cr_uid)
		error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN);
	if (error == 0)
		error = vinvalbuf(vp, V_SAVE, 0, 0);
	if (vfsp->vfc_flags & VFCF_FILEMOUNT) {
		if (error == 0 && vp->v_type != VDIR && vp->v_type != VREG)
			error = EINVAL;
		/*
		 * For file mounts, ensure that there is only one hardlink to
		 * the file.
		 */
		if (error == 0 && vp->v_type == VREG && va.va_nlink != 1)
			error = EINVAL;
	} else {
		if (error == 0 && vp->v_type != VDIR)
			error = ENOTDIR;
	}
	if (error == 0 && (fsflags & MNT_EMPTYDIR) != 0)
		error = vn_dir_check_empty(vp);
	if (error == 0) {
		VI_LOCK(vp);
		if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
			vp->v_iflag |= VI_MOUNT;
		else
			error = EBUSY;
		VI_UNLOCK(vp);
	}
	if (error != 0) {
		vput(vp);
		return (error);
	}
	vn_seqc_write_begin(vp);
	VOP_UNLOCK(vp);

	/* Allocate and initialize the filesystem. */
	mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred);
	/* XXXMAC: pass to vfs_mount_alloc? */
	mp->mnt_optnew = *optlist;
	/* Set the mount level flags. */
	mp->mnt_flag = (fsflags &
	    (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY | MNT_FORCE));

	/*
	 * Mount the filesystem.
	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
	 * get.  No freeing of cn_pnbuf.
	 */
	error1 = 0;
	unmounted = true;
	if ((error = VFS_MOUNT(mp)) != 0 ||
	    (error1 = VFS_STATFS(mp, &mp->mnt_stat)) != 0 ||
	    (error1 = VFS_ROOT(mp, LK_EXCLUSIVE, &newdp)) != 0) {
		rootvp = NULL;
		if (error1 != 0) {
			MPASS(error == 0);
			rootvp = vfs_cache_root_clear(mp);
			if (rootvp != NULL) {
				vhold(rootvp);
				vrele(rootvp);
			}
			(void)vn_start_write(NULL, &mp, V_WAIT);
			MNT_ILOCK(mp);
			mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_UNMOUNTF;
			MNT_IUNLOCK(mp);
			VFS_PURGE(mp);
			error = VFS_UNMOUNT(mp, 0);
			vn_finished_write(mp);
			if (error != 0) {
				printf(
		    "failed post-mount (%d): rollback unmount returned %d\n",
				    error1, error);
				unmounted = false;
			}
			error = error1;
		}
		vfs_unbusy(mp);
		mp->mnt_vnodecovered = NULL;
		if (unmounted) {
			/* XXXKIB wait for mnt_lockref drain? */
			vfs_mount_destroy(mp);
		}
		VI_LOCK(vp);
		vp->v_iflag &= ~VI_MOUNT;
		VI_UNLOCK(vp);
		if (rootvp != NULL) {
			vn_seqc_write_end(rootvp);
			vdrop(rootvp);
		}
		vn_seqc_write_end(vp);
		vrele(vp);
		return (error);
	}
	vn_seqc_write_begin(newdp);
	VOP_UNLOCK(newdp);

	if (mp->mnt_opt != NULL)
		vfs_freeopts(mp->mnt_opt);
	mp->mnt_opt = mp->mnt_optnew;
	*optlist = NULL;

	/*
	 * Prevent external consumers of mount options from reading mnt_optnew.
	 */
	mp->mnt_optnew = NULL;

	MNT_ILOCK(mp);
	if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
	    (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
		mp->mnt_kern_flag |= MNTK_ASYNC;
	else
		mp->mnt_kern_flag &= ~MNTK_ASYNC;
	MNT_IUNLOCK(mp);

	/*
	 * VIRF_MOUNTPOINT and v_mountedhere need to be set under the
	 * vp lock to satisfy vfs_lookup() requirements.
	 */
	VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY);
	VI_LOCK(vp);
	vn_irflag_set_locked(vp, VIRF_MOUNTPOINT);
	vp->v_mountedhere = mp;
	VI_UNLOCK(vp);
	VOP_UNLOCK(vp);
	cache_purge(vp);

	/*
	 * We need to lock both vnodes.
	 *
	 * Use vn_lock_pair to avoid establishing an ordering between vnodes
	 * from different filesystems.
	 */
	vn_lock_pair(vp, false, LK_EXCLUSIVE, newdp, false, LK_EXCLUSIVE);

	VI_LOCK(vp);
	vp->v_iflag &= ~VI_MOUNT;
	VI_UNLOCK(vp);
	/* Place the new filesystem at the end of the mount list. */
	mtx_lock(&mountlist_mtx);
	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
	mtx_unlock(&mountlist_mtx);
	vfs_event_signal(NULL, VQ_MOUNT, 0);
	VOP_UNLOCK(vp);
	EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td);
	VOP_UNLOCK(newdp);
	mount_devctl_event("MOUNT", mp, false);
	mountcheckdirs(vp, newdp);
	vn_seqc_write_end(vp);
	vn_seqc_write_end(newdp);
	vrele(newdp);
	if ((mp->mnt_flag & MNT_RDONLY) == 0)
		vfs_allocate_syncvnode(mp);
	vfs_op_exit(mp);
	vfs_unbusy(mp);
	return (0);
}

/*
 * vfs_domount_update(): update of mounted file system
 */
static int
vfs_domount_update(
	struct thread *td,		/* Calling thread. */
	struct vnode *vp,		/* Mount point vnode. */
	uint64_t fsflags,		/* Flags common to all filesystems. */
	bool jail_export,		/* Got export option in vnet prison. */
	struct vfsoptlist **optlist	/* Options local to the filesystem. */
	)
{
	struct export_args export;
	struct o2export_args o2export;
	struct vnode *rootvp;
	void *bufp;
	struct mount *mp;
	int error, export_error, i, len, fsid_up_len;
	uint64_t flag;
	gid_t *grps;
	fsid_t *fsid_up;
	bool vfs_suser_failed;

	ASSERT_VOP_ELOCKED(vp, __func__);
	KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here"));
	mp = vp->v_mount;

	if ((vp->v_vflag & VV_ROOT) == 0) {
		if (vfs_copyopt(*optlist, "export", &export, sizeof(export))
		    == 0)
			error = EXDEV;
		else
			error = EINVAL;
		vput(vp);
		return (error);
	}

	/*
	 * We only allow the filesystem to be reloaded if it
	 * is currently mounted read-only.
	 */
	flag = mp->mnt_flag;
	if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) {
		vput(vp);
		return (EOPNOTSUPP);	/* Needs translation */
	}
	/*
	 * Only privileged root, or (if MNT_USER is set) the user that
	 * did the original mount is permitted to update it.
	 */
	/*
	 * For the case of mountd(8) doing exports in a jail, the vfs_suser()
	 * call does not cause failure.  vfs_domount() has already checked
	 * that "root" is doing this and vfs_suser() will fail when
	 * the file system has been mounted outside the jail.
	 * jail_export set true indicates that "export" is not mixed
	 * with other options that change mount behaviour.
	 */
	vfs_suser_failed = false;
	error = vfs_suser(mp, td);
	if (jail_export && error != 0) {
		error = 0;
		vfs_suser_failed = true;
	}
	if (error != 0) {
		vput(vp);
		return (error);
	}
	if (vfs_busy(mp, MBF_NOWAIT)) {
		vput(vp);
		return (EBUSY);
	}
	VI_LOCK(vp);
	if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) {
		VI_UNLOCK(vp);
		vfs_unbusy(mp);
		vput(vp);
		return (EBUSY);
	}
	vp->v_iflag |= VI_MOUNT;
	VI_UNLOCK(vp);
	VOP_UNLOCK(vp);

	rootvp = NULL;
	vfs_op_enter(mp);
	vn_seqc_write_begin(vp);

	if (vfs_getopt(*optlist, "fsid", (void **)&fsid_up,
	    &fsid_up_len) == 0) {
		if (fsid_up_len != sizeof(*fsid_up)) {
			error = EINVAL;
			goto end;
		}
		if (fsidcmp(fsid_up, &mp->mnt_stat.f_fsid) != 0) {
			error = ENOENT;
			goto end;
		}
		vfs_deleteopt(*optlist, "fsid");
	}

	MNT_ILOCK(mp);
	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
		MNT_IUNLOCK(mp);
		error = EBUSY;
		goto end;
	}
	if (vfs_suser_failed) {
		KASSERT((fsflags & (MNT_EXPORTED | MNT_UPDATE)) ==
		    (MNT_EXPORTED | MNT_UPDATE),
		    ("%s: jailed export did not set expected fsflags",
		    __func__));
		/*
		 * For this case, only MNT_UPDATE and
		 * MNT_EXPORTED have been set in fsflags
		 * by the options.  Only set MNT_UPDATE,
		 * since that is the one that would be set
		 * when set in fsflags, below.
		 */
		mp->mnt_flag |= MNT_UPDATE;
	} else {
		mp->mnt_flag &= ~MNT_UPDATEMASK;
		mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE |
		    MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY);
		if ((mp->mnt_flag & MNT_ASYNC) == 0)
			mp->mnt_kern_flag &= ~MNTK_ASYNC;
	}
	rootvp = vfs_cache_root_clear(mp);
	MNT_IUNLOCK(mp);
	mp->mnt_optnew = *optlist;
	vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);

	/*
	 * Mount the filesystem.
	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
	 * get.  No freeing of cn_pnbuf.
	 */
	/*
	 * For the case of mountd(8) doing exports from within a vnet jail,
	 * "from" is typically not set correctly such that VFS_MOUNT() will
	 * return ENOENT. It is not obvious that VFS_MOUNT() ever needs to be
	 * called when mountd is doing exports, but this check only applies to
	 * the specific case where it is running inside a vnet jail, to
	 * avoid any POLA violation.
	 */
	error = 0;
	if (!jail_export)
		error = VFS_MOUNT(mp);

	export_error = 0;
	/* Process the export option. */
	if (error == 0 && vfs_getopt(mp->mnt_optnew, "export", &bufp,
	    &len) == 0) {
		/* Assume that there is only 1 ABI for each length. */
		switch (len) {
		case (sizeof(struct oexport_args)):
			bzero(&o2export, sizeof(o2export));
			/* FALLTHROUGH */
		case (sizeof(o2export)):
			bcopy(bufp, &o2export, len);
			export.ex_flags = (uint64_t)o2export.ex_flags;
			export.ex_root = o2export.ex_root;
			export.ex_uid = o2export.ex_anon.cr_uid;
			export.ex_groups = NULL;
			export.ex_ngroups = o2export.ex_anon.cr_ngroups;
			if (export.ex_ngroups > 0) {
				if (export.ex_ngroups <= XU_NGROUPS) {
					export.ex_groups = malloc(
					    export.ex_ngroups * sizeof(gid_t),
					    M_TEMP, M_WAITOK);
					for (i = 0; i < export.ex_ngroups; i++)
						export.ex_groups[i] =
						    o2export.ex_anon.cr_groups[i];
				} else
					export_error = EINVAL;
			} else if (export.ex_ngroups < 0)
				export_error = EINVAL;
			export.ex_addr = o2export.ex_addr;
			export.ex_addrlen = o2export.ex_addrlen;
			export.ex_mask = o2export.ex_mask;
			export.ex_masklen = o2export.ex_masklen;
			export.ex_indexfile = o2export.ex_indexfile;
			export.ex_numsecflavors = o2export.ex_numsecflavors;
			if (export.ex_numsecflavors < MAXSECFLAVORS) {
				for (i = 0; i < export.ex_numsecflavors; i++)
					export.ex_secflavors[i] =
					    o2export.ex_secflavors[i];
			} else
				export_error = EINVAL;
			if (export_error == 0)
				export_error = vfs_export(mp, &export, true);
			free(export.ex_groups, M_TEMP);
			break;
		case (sizeof(export)):
			bcopy(bufp, &export, len);
			grps = NULL;
			if (export.ex_ngroups > 0) {
				if (export.ex_ngroups <= NGROUPS_MAX) {
					grps = malloc(export.ex_ngroups *
					    sizeof(gid_t), M_TEMP, M_WAITOK);
					export_error = copyin(export.ex_groups,
					    grps, export.ex_ngroups *
					    sizeof(gid_t));
					if (export_error == 0)
						export.ex_groups = grps;
				} else
					export_error = EINVAL;
			} else if (export.ex_ngroups == 0)
				export.ex_groups = NULL;
			else
				export_error = EINVAL;
			if (export_error == 0)
				export_error = vfs_export(mp, &export, true);
			free(grps, M_TEMP);
			break;
		default:
			export_error = EINVAL;
			break;
		}
	}

	MNT_ILOCK(mp);
	if (error == 0) {
		mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE |
		    MNT_SNAPSHOT);
	} else {
		/*
		 * If we fail, restore old mount flags. MNT_QUOTA is special,
		 * because it is not part of MNT_UPDATEMASK, but it could have
		 * changed in the meantime if quotactl(2) was called.
		 * All in all we want current value of MNT_QUOTA, not the old
		 * one.
		 */
		mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
	}
	if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
	    (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
		mp->mnt_kern_flag |= MNTK_ASYNC;
	else
		mp->mnt_kern_flag &= ~MNTK_ASYNC;
	MNT_IUNLOCK(mp);

	if (error != 0)
		goto end;

	mount_devctl_event("REMOUNT", mp, true);
	if (mp->mnt_opt != NULL)
		vfs_freeopts(mp->mnt_opt);
	mp->mnt_opt = mp->mnt_optnew;
	*optlist = NULL;
	(void)VFS_STATFS(mp, &mp->mnt_stat);
	/*
	 * Prevent external consumers of mount options from reading
	 * mnt_optnew.
	 */
	mp->mnt_optnew = NULL;

	if ((mp->mnt_flag & MNT_RDONLY) == 0)
		vfs_allocate_syncvnode(mp);
	else
		vfs_deallocate_syncvnode(mp);
end:
	vfs_op_exit(mp);
	if (rootvp != NULL) {
		vn_seqc_write_end(rootvp);
		vrele(rootvp);
	}
	vn_seqc_write_end(vp);
	vfs_unbusy(mp);
	VI_LOCK(vp);
	vp->v_iflag &= ~VI_MOUNT;
	VI_UNLOCK(vp);
	vrele(vp);
	return (error != 0 ? error : export_error);
}

/*
 * vfs_domount(): actually attempt a filesystem mount.
 */
static int
vfs_domount(
	struct thread *td,		/* Calling thread. */
	const char *fstype,		/* Filesystem type. */
	char *fspath,			/* Mount path. */
	uint64_t fsflags,		/* Flags common to all filesystems. */
	bool jail_export,		/* Got export option in vnet prison. */
	struct vfsoptlist **optlist	/* Options local to the filesystem. */
	)
{
	struct vfsconf *vfsp;
	struct nameidata nd;
	struct vnode *vp;
	char *pathbuf;
	int error;

	/*
	 * Be ultra-paranoid about making sure the type and fspath
	 * variables will fit in our mp buffers, including the
	 * terminating NUL.
	 */
	if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
		return (ENAMETOOLONG);

	if (jail_export) {
		error = priv_check(td, PRIV_NFS_DAEMON);
		if (error)
			return (error);
	} else if (jailed(td->td_ucred) || usermount == 0) {
		if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
			return (error);
	}

	/*
	 * Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
	 */
	if (fsflags & MNT_EXPORTED) {
		error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
		if (error)
			return (error);
	}
	if (fsflags & MNT_SUIDDIR) {
		error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
		if (error)
			return (error);
	}
	/*
	 * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
	 */
	if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
		if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
			fsflags |= MNT_NOSUID | MNT_USER;
	}

	/* Load KLDs before we lock the covered vnode to avoid reversals. */
	vfsp = NULL;
	if ((fsflags & MNT_UPDATE) == 0) {
		/* Don't try to load KLDs if we're mounting the root. */
		if (fsflags & MNT_ROOTFS) {
			if ((vfsp = vfs_byname(fstype)) == NULL)
				return (ENODEV);
		} else {
			if ((vfsp = vfs_byname_kld(fstype, td, &error)) == NULL)
				return (error);
		}
	}

	/*
	 * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE.
	 */
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1 | WANTPARENT,
	    UIO_SYSSPACE, fspath);
	error = namei(&nd);
	if (error != 0)
		return (error);
	vp = nd.ni_vp;
	/*
	 * Don't allow stacking file mounts to work around problems with the
	 * way that namei sets nd.ni_dvp to vp_crossmp for these.
	 */
	if (vp->v_type == VREG)
		fsflags |= MNT_NOCOVER;
	if ((fsflags & MNT_UPDATE) == 0) {
		if ((vp->v_vflag & VV_ROOT) != 0 &&
		    (fsflags & MNT_NOCOVER) != 0) {
			vput(vp);
			error = EBUSY;
			goto out;
		}
		pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
		strcpy(pathbuf, fspath);
		/*
		 * Note: we allow any vnode type here. If the path sanity check
		 * succeeds, the type will be validated in vfs_domount_first
		 * above.
		 */
		if (vp->v_type == VDIR)
			error = vn_path_to_global_path(td, vp, pathbuf,
			    MNAMELEN);
		else
			error = vn_path_to_global_path_hardlink(td, vp,
			    nd.ni_dvp, pathbuf, MNAMELEN,
			    nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen);
		if (error == 0) {
			error = vfs_domount_first(td, vfsp, pathbuf, vp,
			    fsflags, optlist);
		}
		free(pathbuf, M_TEMP);
	} else
		error = vfs_domount_update(td, vp, fsflags, jail_export,
		    optlist);

out:
	NDFREE_PNBUF(&nd);
	vrele(nd.ni_dvp);

	return (error);
}

/*
 * Unmount a filesystem.
 *
 * Note: unmount takes a path to the vnode mounted on as argument, not
 * special file (as before).
 */
#ifndef _SYS_SYSPROTO_H_
struct unmount_args {
	char	*path;
	int	flags;
};
#endif
/* ARGSUSED */
int
sys_unmount(struct thread *td, struct unmount_args *uap)
{

	return (kern_unmount(td, uap->path, uap->flags));
}

int
kern_unmount(struct thread *td, const char *path, int flags)
{
	struct nameidata nd;
	struct mount *mp;
	char *fsidbuf, *pathbuf;
	fsid_t fsid;
	int error;

	AUDIT_ARG_VALUE(flags);
	if (jailed(td->td_ucred) || usermount == 0) {
		error = priv_check(td, PRIV_VFS_UNMOUNT);
		if (error)
			return (error);
	}

	if (flags & MNT_BYFSID) {
		fsidbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
		error = copyinstr(path, fsidbuf, MNAMELEN, NULL);
		if (error) {
			free(fsidbuf, M_TEMP);
			return (error);
		}

		AUDIT_ARG_TEXT(fsidbuf);
		/* Decode the filesystem ID. */
		if (sscanf(fsidbuf, "FSID:%d:%d", &fsid.val[0],
		    &fsid.val[1]) != 2) {
			free(fsidbuf, M_TEMP);
			return (EINVAL);
		}

		mp = vfs_getvfs(&fsid);
		free(fsidbuf, M_TEMP);
		if (mp == NULL) {
			return (ENOENT);
		}
	} else {
		pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
		error = copyinstr(path, pathbuf, MNAMELEN, NULL);
		if (error) {
			free(pathbuf, M_TEMP);
			return (error);
		}

		/*
		 * Try to find global path for path argument.
		 */
		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
		    UIO_SYSSPACE, pathbuf);
		if (namei(&nd) == 0) {
			NDFREE_PNBUF(&nd);
			error = vn_path_to_global_path(td, nd.ni_vp, pathbuf,
			    MNAMELEN);
			if (error == 0)
				vput(nd.ni_vp);
		}
		mtx_lock(&mountlist_mtx);
		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
			if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) {
				vfs_ref(mp);
				break;
			}
		}
		mtx_unlock(&mountlist_mtx);
		free(pathbuf, M_TEMP);
		if (mp == NULL) {
			/*
			 * Previously we returned ENOENT for a nonexistent path
			 * and EINVAL for a non-mountpoint.  We cannot tell
			 * these apart now, so in the !MNT_BYFSID case return
			 * the more likely EINVAL for compatibility.
			 */
			return (EINVAL);
		}
	}

	/*
	 * Don't allow unmounting the root filesystem.
	 */
	if (mp->mnt_flag & MNT_ROOTFS) {
		vfs_rel(mp);
		return (EINVAL);
	}
	error = dounmount(mp, flags, td);
	return (error);
}

/*
 * Return error if any of the vnodes, ignoring the root vnode
 * and the syncer vnode, have non-zero usecount.
 *
 * This function is purely advisory - it can return false positives
 * and negatives.
 */
static int
vfs_check_usecounts(struct mount *mp)
{
	struct vnode *vp, *mvp;

	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
		if ((vp->v_vflag & VV_ROOT) == 0 && vp->v_type != VNON &&
		    vp->v_usecount != 0) {
			VI_UNLOCK(vp);
			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
			return (EBUSY);
		}
		VI_UNLOCK(vp);
	}

	return (0);
}

static void
dounmount_cleanup(struct mount *mp, struct vnode *coveredvp, int mntkflags)
{

	mtx_assert(MNT_MTX(mp), MA_OWNED);
	mp->mnt_kern_flag &= ~mntkflags;
	if ((mp->mnt_kern_flag & MNTK_MWAIT) != 0) {
		mp->mnt_kern_flag &= ~MNTK_MWAIT;
		wakeup(mp);
	}
	vfs_op_exit_locked(mp);
	MNT_IUNLOCK(mp);
	if (coveredvp != NULL) {
		VOP_UNLOCK(coveredvp);
		vdrop(coveredvp);
	}
	vn_finished_write(mp);
	vfs_rel(mp);
}

/*
 * There are various reference counters associated with the mount point.
 * Normally it is permitted to modify them without taking the mnt ilock,
 * but this behavior can be temporarily disabled if stable value is needed
 * or callers are expected to block (e.g. to not allow new users during
 * forced unmount).
 */
void
vfs_op_enter(struct mount *mp)
{
	struct mount_pcpu *mpcpu;
	int cpu;

	MNT_ILOCK(mp);
	mp->mnt_vfs_ops++;
	if (mp->mnt_vfs_ops > 1) {
		MNT_IUNLOCK(mp);
		return;
	}
	vfs_op_barrier_wait(mp);
	CPU_FOREACH(cpu) {
		mpcpu = vfs_mount_pcpu_remote(mp, cpu);

		mp->mnt_ref += mpcpu->mntp_ref;
		mpcpu->mntp_ref = 0;

		mp->mnt_lockref += mpcpu->mntp_lockref;
		mpcpu->mntp_lockref = 0;

		mp->mnt_writeopcount += mpcpu->mntp_writeopcount;
		mpcpu->mntp_writeopcount = 0;
	}
	MPASSERT(mp->mnt_ref > 0 && mp->mnt_lockref >= 0 &&
	    mp->mnt_writeopcount >= 0, mp,
	    ("invalid count(s): ref %d lockref %d writeopcount %d",
	    mp->mnt_ref, mp->mnt_lockref, mp->mnt_writeopcount));
	MNT_IUNLOCK(mp);
	vfs_assert_mount_counters(mp);
}

void
vfs_op_exit_locked(struct mount *mp)
{

	mtx_assert(MNT_MTX(mp), MA_OWNED);

	MPASSERT(mp->mnt_vfs_ops > 0, mp,
	    ("invalid vfs_ops count %d", mp->mnt_vfs_ops));
	MPASSERT(mp->mnt_vfs_ops > 1 ||
	    (mp->mnt_kern_flag & (MNTK_UNMOUNT | MNTK_SUSPEND)) == 0, mp,
	    ("vfs_ops too low %d in unmount or suspend", mp->mnt_vfs_ops));
	mp->mnt_vfs_ops--;
}

void
vfs_op_exit(struct mount *mp)
{

	MNT_ILOCK(mp);
	vfs_op_exit_locked(mp);
	MNT_IUNLOCK(mp);
}

struct vfs_op_barrier_ipi {
	struct mount *mp;
	struct smp_rendezvous_cpus_retry_arg srcra;
};

static void
vfs_op_action_func(void *arg)
{
	struct vfs_op_barrier_ipi *vfsopipi;
	struct mount *mp;

	vfsopipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra);
	mp = vfsopipi->mp;

	if (!vfs_op_thread_entered(mp))
		smp_rendezvous_cpus_done(arg);
}

static void
vfs_op_wait_func(void *arg, int cpu)
{
	struct vfs_op_barrier_ipi *vfsopipi;
	struct mount *mp;
	struct mount_pcpu *mpcpu;

	vfsopipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra);
	mp = vfsopipi->mp;

	mpcpu = vfs_mount_pcpu_remote(mp, cpu);
	while (atomic_load_int(&mpcpu->mntp_thread_in_ops))
		cpu_spinwait();
}

void
vfs_op_barrier_wait(struct mount *mp)
{
	struct vfs_op_barrier_ipi vfsopipi;

	vfsopipi.mp = mp;

	smp_rendezvous_cpus_retry(all_cpus,
	    smp_no_rendezvous_barrier,
	    vfs_op_action_func,
	    smp_no_rendezvous_barrier,
	    vfs_op_wait_func,
	    &vfsopipi.srcra);
}

#ifdef DIAGNOSTIC
void
vfs_assert_mount_counters(struct mount *mp)
{
	struct mount_pcpu *mpcpu;
	int cpu;

	if (mp->mnt_vfs_ops == 0)
		return;

	CPU_FOREACH(cpu) {
		mpcpu = vfs_mount_pcpu_remote(mp, cpu);
		if (mpcpu->mntp_ref != 0 ||
		    mpcpu->mntp_lockref != 0 ||
		    mpcpu->mntp_writeopcount != 0)
			vfs_dump_mount_counters(mp);
	}
}

void
vfs_dump_mount_counters(struct mount *mp)
{
	struct mount_pcpu *mpcpu;
	int ref, lockref, writeopcount;
	int cpu;

	printf("%s: mp %p vfs_ops %d\n", __func__, mp, mp->mnt_vfs_ops);

	printf("         ref : ");
	ref = mp->mnt_ref;
	CPU_FOREACH(cpu) {
		mpcpu = vfs_mount_pcpu_remote(mp, cpu);
		printf("%d ", mpcpu->mntp_ref);
		ref += mpcpu->mntp_ref;
	}
	printf("\n");
	printf("     lockref : ");
	lockref = mp->mnt_lockref;
	CPU_FOREACH(cpu) {
		mpcpu = vfs_mount_pcpu_remote(mp, cpu);
printf("%d ", mpcpu->mntp_lockref); 1999 lockref += mpcpu->mntp_lockref; 2000 } 2001 printf("\n"); 2002 printf("writeopcount: "); 2003 writeopcount = mp->mnt_writeopcount; 2004 CPU_FOREACH(cpu) { 2005 mpcpu = vfs_mount_pcpu_remote(mp, cpu); 2006 printf("%d ", mpcpu->mntp_writeopcount); 2007 writeopcount += mpcpu->mntp_writeopcount; 2008 } 2009 printf("\n"); 2010 2011 printf("counter struct total\n"); 2012 printf("ref %-5d %-5d\n", mp->mnt_ref, ref); 2013 printf("lockref %-5d %-5d\n", mp->mnt_lockref, lockref); 2014 printf("writeopcount %-5d %-5d\n", mp->mnt_writeopcount, writeopcount); 2015 2016 panic("invalid counts on struct mount"); 2017 } 2018 #endif 2019 2020 int 2021 vfs_mount_fetch_counter(struct mount *mp, enum mount_counter which) 2022 { 2023 struct mount_pcpu *mpcpu; 2024 int cpu, sum; 2025 2026 switch (which) { 2027 case MNT_COUNT_REF: 2028 sum = mp->mnt_ref; 2029 break; 2030 case MNT_COUNT_LOCKREF: 2031 sum = mp->mnt_lockref; 2032 break; 2033 case MNT_COUNT_WRITEOPCOUNT: 2034 sum = mp->mnt_writeopcount; 2035 break; 2036 } 2037 2038 CPU_FOREACH(cpu) { 2039 mpcpu = vfs_mount_pcpu_remote(mp, cpu); 2040 switch (which) { 2041 case MNT_COUNT_REF: 2042 sum += mpcpu->mntp_ref; 2043 break; 2044 case MNT_COUNT_LOCKREF: 2045 sum += mpcpu->mntp_lockref; 2046 break; 2047 case MNT_COUNT_WRITEOPCOUNT: 2048 sum += mpcpu->mntp_writeopcount; 2049 break; 2050 } 2051 } 2052 return (sum); 2053 } 2054 2055 static bool 2056 deferred_unmount_enqueue(struct mount *mp, uint64_t flags, bool requeue, 2057 int timeout_ticks) 2058 { 2059 bool enqueued; 2060 2061 enqueued = false; 2062 mtx_lock(&deferred_unmount_lock); 2063 if ((mp->mnt_taskqueue_flags & MNT_DEFERRED) == 0 || requeue) { 2064 mp->mnt_taskqueue_flags = flags | MNT_DEFERRED; 2065 STAILQ_INSERT_TAIL(&deferred_unmount_list, mp, 2066 mnt_taskqueue_link); 2067 enqueued = true; 2068 } 2069 mtx_unlock(&deferred_unmount_lock); 2070 2071 if (enqueued) { 2072 taskqueue_enqueue_timeout(taskqueue_deferred_unmount, 2073 &deferred_unmount_task, timeout_ticks); 2074 } 2075 2076 return (enqueued); 2077 } 2078 2079 /* 2080 * Taskqueue handler for processing async/recursive unmounts 2081 */ 2082 static void 2083 vfs_deferred_unmount(void *argi __unused, int pending __unused) 2084 { 2085 STAILQ_HEAD(, mount) local_unmounts; 2086 uint64_t flags; 2087 struct mount *mp, *tmp; 2088 int error; 2089 unsigned int retries; 2090 bool unmounted; 2091 2092 STAILQ_INIT(&local_unmounts); 2093 mtx_lock(&deferred_unmount_lock); 2094 STAILQ_CONCAT(&local_unmounts, &deferred_unmount_list); 2095 mtx_unlock(&deferred_unmount_lock); 2096 2097 STAILQ_FOREACH_SAFE(mp, &local_unmounts, mnt_taskqueue_link, tmp) { 2098 flags = mp->mnt_taskqueue_flags; 2099 KASSERT((flags & MNT_DEFERRED) != 0, 2100 ("taskqueue unmount without MNT_DEFERRED")); 2101 error = dounmount(mp, flags, curthread); 2102 if (error != 0) { 2103 MNT_ILOCK(mp); 2104 unmounted = ((mp->mnt_kern_flag & MNTK_REFEXPIRE) != 0); 2105 MNT_IUNLOCK(mp); 2106 2107 /* 2108 * The deferred unmount thread is the only thread that 2109 * modifies the retry counts, so locking/atomics aren't 2110 * needed here. 
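 *
 * For reference, work normally arrives here from dounmount(): a
 * kernel-initiated caller (the sketch below is hypothetical, not taken
 * from an existing consumer) holds a reference on the mount and requests
 * a deferred, recursive forced unmount; dounmount() then returns
 * EINPROGRESS once the request has been queued:
 *
 *	vfs_ref(mp);
 *	error = dounmount(mp, MNT_FORCE | MNT_RECURSE | MNT_DEFERRED,
 *	    curthread);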
2111 */ 2112 retries = (mp->mnt_unmount_retries)++; 2113 deferred_unmount_total_retries++; 2114 if (!unmounted && retries < deferred_unmount_retry_limit) { 2115 deferred_unmount_enqueue(mp, flags, true, 2116 -deferred_unmount_retry_delay_hz); 2117 } else { 2118 if (retries >= deferred_unmount_retry_limit) { 2119 printf("giving up on deferred unmount " 2120 "of %s after %d retries, error %d\n", 2121 mp->mnt_stat.f_mntonname, retries, error); 2122 } 2123 vfs_rel(mp); 2124 } 2125 } 2126 } 2127 } 2128 2129 /* 2130 * Do the actual filesystem unmount. 2131 */ 2132 int 2133 dounmount(struct mount *mp, uint64_t flags, struct thread *td) 2134 { 2135 struct mount_upper_node *upper; 2136 struct vnode *coveredvp, *rootvp; 2137 int error; 2138 uint64_t async_flag; 2139 int mnt_gen_r; 2140 unsigned int retries; 2141 2142 KASSERT((flags & MNT_DEFERRED) == 0 || 2143 (flags & (MNT_RECURSE | MNT_FORCE)) == (MNT_RECURSE | MNT_FORCE), 2144 ("MNT_DEFERRED requires MNT_RECURSE | MNT_FORCE")); 2145 2146 /* 2147 * If the caller has explicitly requested the unmount to be handled by 2148 * the taskqueue and we're not already in taskqueue context, queue 2149 * up the unmount request and exit. This is done prior to any 2150 * credential checks; MNT_DEFERRED should be used only for kernel- 2151 * initiated unmounts and will therefore be processed with the 2152 * (kernel) credentials of the taskqueue thread. Still, callers 2153 * should be sure this is the behavior they want. 2154 */ 2155 if ((flags & MNT_DEFERRED) != 0 && 2156 taskqueue_member(taskqueue_deferred_unmount, curthread) == 0) { 2157 if (!deferred_unmount_enqueue(mp, flags, false, 0)) 2158 vfs_rel(mp); 2159 return (EINPROGRESS); 2160 } 2161 2162 /* 2163 * Only privileged root, or (if MNT_USER is set) the user that did the 2164 * original mount is permitted to unmount this filesystem. 2165 * This check should be made prior to queueing up any recursive 2166 * unmounts of upper filesystems. Those unmounts will be executed 2167 * with kernel thread credentials and are expected to succeed, so 2168 * we must at least ensure the originating context has sufficient 2169 * privilege to unmount the base filesystem before proceeding with 2170 * the uppers. 2171 */ 2172 error = vfs_suser(mp, td); 2173 if (error != 0) { 2174 KASSERT((flags & MNT_DEFERRED) == 0, 2175 ("taskqueue unmount with insufficient privilege")); 2176 vfs_rel(mp); 2177 return (error); 2178 } 2179 2180 if (recursive_forced_unmount && ((flags & MNT_FORCE) != 0)) 2181 flags |= MNT_RECURSE; 2182 2183 if ((flags & MNT_RECURSE) != 0) { 2184 KASSERT((flags & MNT_FORCE) != 0, 2185 ("MNT_RECURSE requires MNT_FORCE")); 2186 2187 MNT_ILOCK(mp); 2188 /* 2189 * Set MNTK_RECURSE to prevent new upper mounts from being 2190 * added, and note that an operation on the uppers list is in 2191 * progress. This will ensure that unregistration from the 2192 * uppers list, and therefore any pending unmount of the upper 2193 * FS, can't complete until after we finish walking the list. 
2194 */ 2195 mp->mnt_kern_flag |= MNTK_RECURSE; 2196 mp->mnt_upper_pending++; 2197 TAILQ_FOREACH(upper, &mp->mnt_uppers, mnt_upper_link) { 2198 retries = upper->mp->mnt_unmount_retries; 2199 if (retries > deferred_unmount_retry_limit) { 2200 error = EBUSY; 2201 continue; 2202 } 2203 MNT_IUNLOCK(mp); 2204 2205 vfs_ref(upper->mp); 2206 if (!deferred_unmount_enqueue(upper->mp, flags, 2207 false, 0)) 2208 vfs_rel(upper->mp); 2209 MNT_ILOCK(mp); 2210 } 2211 mp->mnt_upper_pending--; 2212 if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && 2213 mp->mnt_upper_pending == 0) { 2214 mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; 2215 wakeup(&mp->mnt_uppers); 2216 } 2217 2218 /* 2219 * If we're not on the taskqueue, wait until the uppers list 2220 * is drained before proceeding with unmount. Otherwise, if 2221 * we are on the taskqueue and there are still pending uppers, 2222 * just re-enqueue on the end of the taskqueue. 2223 */ 2224 if ((flags & MNT_DEFERRED) == 0) { 2225 while (error == 0 && !TAILQ_EMPTY(&mp->mnt_uppers)) { 2226 mp->mnt_kern_flag |= MNTK_TASKQUEUE_WAITER; 2227 error = msleep(&mp->mnt_taskqueue_link, 2228 MNT_MTX(mp), PCATCH, "umntqw", 0); 2229 } 2230 if (error != 0) { 2231 MNT_REL(mp); 2232 MNT_IUNLOCK(mp); 2233 return (error); 2234 } 2235 } else if (!TAILQ_EMPTY(&mp->mnt_uppers)) { 2236 MNT_IUNLOCK(mp); 2237 if (error == 0) 2238 deferred_unmount_enqueue(mp, flags, true, 0); 2239 return (error); 2240 } 2241 MNT_IUNLOCK(mp); 2242 KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers not empty")); 2243 } 2244 2245 /* Allow the taskqueue to safely re-enqueue on failure */ 2246 if ((flags & MNT_DEFERRED) != 0) 2247 vfs_ref(mp); 2248 2249 if ((coveredvp = mp->mnt_vnodecovered) != NULL) { 2250 mnt_gen_r = mp->mnt_gen; 2251 VI_LOCK(coveredvp); 2252 vholdl(coveredvp); 2253 vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY); 2254 /* 2255 * Check for mp being unmounted while waiting for the 2256 * covered vnode lock. 2257 */ 2258 if (coveredvp->v_mountedhere != mp || 2259 coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) { 2260 VOP_UNLOCK(coveredvp); 2261 vdrop(coveredvp); 2262 vfs_rel(mp); 2263 return (EBUSY); 2264 } 2265 } 2266 2267 vfs_op_enter(mp); 2268 2269 vn_start_write(NULL, &mp, V_WAIT); 2270 MNT_ILOCK(mp); 2271 if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 || 2272 (mp->mnt_flag & MNT_UPDATE) != 0 || 2273 !TAILQ_EMPTY(&mp->mnt_uppers)) { 2274 dounmount_cleanup(mp, coveredvp, 0); 2275 return (EBUSY); 2276 } 2277 mp->mnt_kern_flag |= MNTK_UNMOUNT; 2278 rootvp = vfs_cache_root_clear(mp); 2279 if (coveredvp != NULL) 2280 vn_seqc_write_begin(coveredvp); 2281 if (flags & MNT_NONBUSY) { 2282 MNT_IUNLOCK(mp); 2283 error = vfs_check_usecounts(mp); 2284 MNT_ILOCK(mp); 2285 if (error != 0) { 2286 vn_seqc_write_end(coveredvp); 2287 dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT); 2288 if (rootvp != NULL) { 2289 vn_seqc_write_end(rootvp); 2290 vrele(rootvp); 2291 } 2292 return (error); 2293 } 2294 } 2295 /* Allow filesystems to detect that a forced unmount is in progress. */ 2296 if (flags & MNT_FORCE) { 2297 mp->mnt_kern_flag |= MNTK_UNMOUNTF; 2298 MNT_IUNLOCK(mp); 2299 /* 2300 * Must be done after setting MNTK_UNMOUNTF and before 2301 * waiting for mnt_lockref to become 0. 
2302 */ 2303 VFS_PURGE(mp); 2304 MNT_ILOCK(mp); 2305 } 2306 error = 0; 2307 if (mp->mnt_lockref) { 2308 mp->mnt_kern_flag |= MNTK_DRAINING; 2309 error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS, 2310 "mount drain", 0); 2311 } 2312 MNT_IUNLOCK(mp); 2313 KASSERT(mp->mnt_lockref == 0, 2314 ("%s: invalid lock refcount in the drain path @ %s:%d", 2315 __func__, __FILE__, __LINE__)); 2316 KASSERT(error == 0, 2317 ("%s: invalid return value for msleep in the drain path @ %s:%d", 2318 __func__, __FILE__, __LINE__)); 2319 2320 /* 2321 * We want to keep the vnode around so that we can vn_seqc_write_end 2322 * after we are done with unmount. Downgrade our reference to a mere 2323 * hold count so that we don't interefere with anything. 2324 */ 2325 if (rootvp != NULL) { 2326 vhold(rootvp); 2327 vrele(rootvp); 2328 } 2329 2330 if (mp->mnt_flag & MNT_EXPUBLIC) 2331 vfs_setpublicfs(NULL, NULL, NULL); 2332 2333 vfs_periodic(mp, MNT_WAIT); 2334 MNT_ILOCK(mp); 2335 async_flag = mp->mnt_flag & MNT_ASYNC; 2336 mp->mnt_flag &= ~MNT_ASYNC; 2337 mp->mnt_kern_flag &= ~MNTK_ASYNC; 2338 MNT_IUNLOCK(mp); 2339 vfs_deallocate_syncvnode(mp); 2340 error = VFS_UNMOUNT(mp, flags); 2341 vn_finished_write(mp); 2342 vfs_rel(mp); 2343 /* 2344 * If we failed to flush the dirty blocks for this mount point, 2345 * undo all the cdir/rdir and rootvnode changes we made above. 2346 * Unless we failed to do so because the device is reporting that 2347 * it doesn't exist anymore. 2348 */ 2349 if (error && error != ENXIO) { 2350 MNT_ILOCK(mp); 2351 if ((mp->mnt_flag & MNT_RDONLY) == 0) { 2352 MNT_IUNLOCK(mp); 2353 vfs_allocate_syncvnode(mp); 2354 MNT_ILOCK(mp); 2355 } 2356 mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); 2357 mp->mnt_flag |= async_flag; 2358 if ((mp->mnt_flag & MNT_ASYNC) != 0 && 2359 (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) 2360 mp->mnt_kern_flag |= MNTK_ASYNC; 2361 if (mp->mnt_kern_flag & MNTK_MWAIT) { 2362 mp->mnt_kern_flag &= ~MNTK_MWAIT; 2363 wakeup(mp); 2364 } 2365 vfs_op_exit_locked(mp); 2366 MNT_IUNLOCK(mp); 2367 if (coveredvp) { 2368 vn_seqc_write_end(coveredvp); 2369 VOP_UNLOCK(coveredvp); 2370 vdrop(coveredvp); 2371 } 2372 if (rootvp != NULL) { 2373 vn_seqc_write_end(rootvp); 2374 vdrop(rootvp); 2375 } 2376 return (error); 2377 } 2378 2379 mtx_lock(&mountlist_mtx); 2380 TAILQ_REMOVE(&mountlist, mp, mnt_list); 2381 mtx_unlock(&mountlist_mtx); 2382 EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td); 2383 if (coveredvp != NULL) { 2384 VI_LOCK(coveredvp); 2385 vn_irflag_unset_locked(coveredvp, VIRF_MOUNTPOINT); 2386 coveredvp->v_mountedhere = NULL; 2387 vn_seqc_write_end_locked(coveredvp); 2388 VI_UNLOCK(coveredvp); 2389 VOP_UNLOCK(coveredvp); 2390 vdrop(coveredvp); 2391 } 2392 mount_devctl_event("UNMOUNT", mp, false); 2393 if (rootvp != NULL) { 2394 vn_seqc_write_end(rootvp); 2395 vdrop(rootvp); 2396 } 2397 vfs_event_signal(NULL, VQ_UNMOUNT, 0); 2398 if (rootvnode != NULL && mp == rootvnode->v_mount) { 2399 vrele(rootvnode); 2400 rootvnode = NULL; 2401 } 2402 if (mp == rootdevmp) 2403 rootdevmp = NULL; 2404 if ((flags & MNT_DEFERRED) != 0) 2405 vfs_rel(mp); 2406 vfs_mount_destroy(mp); 2407 return (0); 2408 } 2409 2410 /* 2411 * Report errors during filesystem mounting. 2412 */ 2413 void 2414 vfs_mount_error(struct mount *mp, const char *fmt, ...) 
2415 { 2416 struct vfsoptlist *moptlist = mp->mnt_optnew; 2417 va_list ap; 2418 int error, len; 2419 char *errmsg; 2420 2421 error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len); 2422 if (error || errmsg == NULL || len <= 0) 2423 return; 2424 2425 va_start(ap, fmt); 2426 vsnprintf(errmsg, (size_t)len, fmt, ap); 2427 va_end(ap); 2428 } 2429 2430 void 2431 vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...) 2432 { 2433 va_list ap; 2434 int error, len; 2435 char *errmsg; 2436 2437 error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len); 2438 if (error || errmsg == NULL || len <= 0) 2439 return; 2440 2441 va_start(ap, fmt); 2442 vsnprintf(errmsg, (size_t)len, fmt, ap); 2443 va_end(ap); 2444 } 2445 2446 /* 2447 * --------------------------------------------------------------------- 2448 * Functions for querying mount options/arguments from filesystems. 2449 */ 2450 2451 /* 2452 * Check that no unknown options are given 2453 */ 2454 int 2455 vfs_filteropt(struct vfsoptlist *opts, const char **legal) 2456 { 2457 struct vfsopt *opt; 2458 char errmsg[255]; 2459 const char **t, *p, *q; 2460 int ret = 0; 2461 2462 TAILQ_FOREACH(opt, opts, link) { 2463 p = opt->name; 2464 q = NULL; 2465 if (p[0] == 'n' && p[1] == 'o') 2466 q = p + 2; 2467 for(t = global_opts; *t != NULL; t++) { 2468 if (strcmp(*t, p) == 0) 2469 break; 2470 if (q != NULL) { 2471 if (strcmp(*t, q) == 0) 2472 break; 2473 } 2474 } 2475 if (*t != NULL) 2476 continue; 2477 for(t = legal; *t != NULL; t++) { 2478 if (strcmp(*t, p) == 0) 2479 break; 2480 if (q != NULL) { 2481 if (strcmp(*t, q) == 0) 2482 break; 2483 } 2484 } 2485 if (*t != NULL) 2486 continue; 2487 snprintf(errmsg, sizeof(errmsg), 2488 "mount option <%s> is unknown", p); 2489 ret = EINVAL; 2490 } 2491 if (ret != 0) { 2492 TAILQ_FOREACH(opt, opts, link) { 2493 if (strcmp(opt->name, "errmsg") == 0) { 2494 strncpy((char *)opt->value, errmsg, opt->len); 2495 break; 2496 } 2497 } 2498 if (opt == NULL) 2499 printf("%s\n", errmsg); 2500 } 2501 return (ret); 2502 } 2503 2504 /* 2505 * Get a mount option by its name. 2506 * 2507 * Return 0 if the option was found, ENOENT otherwise. 2508 * If len is non-NULL it will be filled with the length 2509 * of the option. If buf is non-NULL, it will be filled 2510 * with the address of the option. 
2511 */ 2512 int 2513 vfs_getopt(struct vfsoptlist *opts, const char *name, void **buf, int *len) 2514 { 2515 struct vfsopt *opt; 2516 2517 KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); 2518 2519 TAILQ_FOREACH(opt, opts, link) { 2520 if (strcmp(name, opt->name) == 0) { 2521 opt->seen = 1; 2522 if (len != NULL) 2523 *len = opt->len; 2524 if (buf != NULL) 2525 *buf = opt->value; 2526 return (0); 2527 } 2528 } 2529 return (ENOENT); 2530 } 2531 2532 int 2533 vfs_getopt_pos(struct vfsoptlist *opts, const char *name) 2534 { 2535 struct vfsopt *opt; 2536 2537 if (opts == NULL) 2538 return (-1); 2539 2540 TAILQ_FOREACH(opt, opts, link) { 2541 if (strcmp(name, opt->name) == 0) { 2542 opt->seen = 1; 2543 return (opt->pos); 2544 } 2545 } 2546 return (-1); 2547 } 2548 2549 int 2550 vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value) 2551 { 2552 char *opt_value, *vtp; 2553 quad_t iv; 2554 int error, opt_len; 2555 2556 error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len); 2557 if (error != 0) 2558 return (error); 2559 if (opt_len == 0 || opt_value == NULL) 2560 return (EINVAL); 2561 if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0') 2562 return (EINVAL); 2563 iv = strtoq(opt_value, &vtp, 0); 2564 if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0')) 2565 return (EINVAL); 2566 if (iv < 0) 2567 return (EINVAL); 2568 switch (vtp[0]) { 2569 case 't': case 'T': 2570 iv *= 1024; 2571 /* FALLTHROUGH */ 2572 case 'g': case 'G': 2573 iv *= 1024; 2574 /* FALLTHROUGH */ 2575 case 'm': case 'M': 2576 iv *= 1024; 2577 /* FALLTHROUGH */ 2578 case 'k': case 'K': 2579 iv *= 1024; 2580 case '\0': 2581 break; 2582 default: 2583 return (EINVAL); 2584 } 2585 *value = iv; 2586 2587 return (0); 2588 } 2589 2590 char * 2591 vfs_getopts(struct vfsoptlist *opts, const char *name, int *error) 2592 { 2593 struct vfsopt *opt; 2594 2595 *error = 0; 2596 TAILQ_FOREACH(opt, opts, link) { 2597 if (strcmp(name, opt->name) != 0) 2598 continue; 2599 opt->seen = 1; 2600 if (opt->len == 0 || 2601 ((char *)opt->value)[opt->len - 1] != '\0') { 2602 *error = EINVAL; 2603 return (NULL); 2604 } 2605 return (opt->value); 2606 } 2607 *error = ENOENT; 2608 return (NULL); 2609 } 2610 2611 int 2612 vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w, 2613 uint64_t val) 2614 { 2615 struct vfsopt *opt; 2616 2617 TAILQ_FOREACH(opt, opts, link) { 2618 if (strcmp(name, opt->name) == 0) { 2619 opt->seen = 1; 2620 if (w != NULL) 2621 *w |= val; 2622 return (1); 2623 } 2624 } 2625 if (w != NULL) 2626 *w &= ~val; 2627 return (0); 2628 } 2629 2630 int 2631 vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...) 
2632 { 2633 va_list ap; 2634 struct vfsopt *opt; 2635 int ret; 2636 2637 KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); 2638 2639 TAILQ_FOREACH(opt, opts, link) { 2640 if (strcmp(name, opt->name) != 0) 2641 continue; 2642 opt->seen = 1; 2643 if (opt->len == 0 || opt->value == NULL) 2644 return (0); 2645 if (((char *)opt->value)[opt->len - 1] != '\0') 2646 return (0); 2647 va_start(ap, fmt); 2648 ret = vsscanf(opt->value, fmt, ap); 2649 va_end(ap); 2650 return (ret); 2651 } 2652 return (0); 2653 } 2654 2655 int 2656 vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len) 2657 { 2658 struct vfsopt *opt; 2659 2660 TAILQ_FOREACH(opt, opts, link) { 2661 if (strcmp(name, opt->name) != 0) 2662 continue; 2663 opt->seen = 1; 2664 if (opt->value == NULL) 2665 opt->len = len; 2666 else { 2667 if (opt->len != len) 2668 return (EINVAL); 2669 bcopy(value, opt->value, len); 2670 } 2671 return (0); 2672 } 2673 return (ENOENT); 2674 } 2675 2676 int 2677 vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len) 2678 { 2679 struct vfsopt *opt; 2680 2681 TAILQ_FOREACH(opt, opts, link) { 2682 if (strcmp(name, opt->name) != 0) 2683 continue; 2684 opt->seen = 1; 2685 if (opt->value == NULL) 2686 opt->len = len; 2687 else { 2688 if (opt->len < len) 2689 return (EINVAL); 2690 opt->len = len; 2691 bcopy(value, opt->value, len); 2692 } 2693 return (0); 2694 } 2695 return (ENOENT); 2696 } 2697 2698 int 2699 vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value) 2700 { 2701 struct vfsopt *opt; 2702 2703 TAILQ_FOREACH(opt, opts, link) { 2704 if (strcmp(name, opt->name) != 0) 2705 continue; 2706 opt->seen = 1; 2707 if (opt->value == NULL) 2708 opt->len = strlen(value) + 1; 2709 else if (strlcpy(opt->value, value, opt->len) >= opt->len) 2710 return (EINVAL); 2711 return (0); 2712 } 2713 return (ENOENT); 2714 } 2715 2716 /* 2717 * Find and copy a mount option. 2718 * 2719 * The size of the buffer has to be specified 2720 * in len, if it is not the same length as the 2721 * mount option, EINVAL is returned. 2722 * Returns ENOENT if the option is not found. 2723 */ 2724 int 2725 vfs_copyopt(struct vfsoptlist *opts, const char *name, void *dest, int len) 2726 { 2727 struct vfsopt *opt; 2728 2729 KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL")); 2730 2731 TAILQ_FOREACH(opt, opts, link) { 2732 if (strcmp(name, opt->name) == 0) { 2733 opt->seen = 1; 2734 if (len != opt->len) 2735 return (EINVAL); 2736 bcopy(opt->value, dest, opt->len); 2737 return (0); 2738 } 2739 } 2740 return (ENOENT); 2741 } 2742 2743 int 2744 __vfs_statfs(struct mount *mp, struct statfs *sbp) 2745 { 2746 /* 2747 * Filesystems only fill in part of the structure for updates, we 2748 * have to read the entirety first to get all content. 2749 */ 2750 if (sbp != &mp->mnt_stat) 2751 memcpy(sbp, &mp->mnt_stat, sizeof(*sbp)); 2752 2753 /* 2754 * Set these in case the underlying filesystem fails to do so. 
2755 */ 2756 sbp->f_version = STATFS_VERSION; 2757 sbp->f_namemax = NAME_MAX; 2758 sbp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; 2759 sbp->f_nvnodelistsize = mp->mnt_nvnodelistsize; 2760 2761 return (mp->mnt_op->vfs_statfs(mp, sbp)); 2762 } 2763 2764 void 2765 vfs_mountedfrom(struct mount *mp, const char *from) 2766 { 2767 2768 bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname); 2769 strlcpy(mp->mnt_stat.f_mntfromname, from, 2770 sizeof mp->mnt_stat.f_mntfromname); 2771 } 2772 2773 /* 2774 * --------------------------------------------------------------------- 2775 * This is the api for building mount args and mounting filesystems from 2776 * inside the kernel. 2777 * 2778 * The API works by accumulation of individual args. First error is 2779 * latched. 2780 * 2781 * XXX: should be documented in new manpage kernel_mount(9) 2782 */ 2783 2784 /* A memory allocation which must be freed when we are done */ 2785 struct mntaarg { 2786 SLIST_ENTRY(mntaarg) next; 2787 }; 2788 2789 /* The header for the mount arguments */ 2790 struct mntarg { 2791 struct iovec *v; 2792 int len; 2793 int error; 2794 SLIST_HEAD(, mntaarg) list; 2795 }; 2796 2797 /* 2798 * Add a boolean argument. 2799 * 2800 * flag is the boolean value. 2801 * name must start with "no". 2802 */ 2803 struct mntarg * 2804 mount_argb(struct mntarg *ma, int flag, const char *name) 2805 { 2806 2807 KASSERT(name[0] == 'n' && name[1] == 'o', 2808 ("mount_argb(...,%s): name must start with 'no'", name)); 2809 2810 return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0)); 2811 } 2812 2813 /* 2814 * Add an argument printf style 2815 */ 2816 struct mntarg * 2817 mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...) 2818 { 2819 va_list ap; 2820 struct mntaarg *maa; 2821 struct sbuf *sb; 2822 int len; 2823 2824 if (ma == NULL) { 2825 ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); 2826 SLIST_INIT(&ma->list); 2827 } 2828 if (ma->error) 2829 return (ma); 2830 2831 ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), 2832 M_MOUNT, M_WAITOK); 2833 ma->v[ma->len].iov_base = (void *)(uintptr_t)name; 2834 ma->v[ma->len].iov_len = strlen(name) + 1; 2835 ma->len++; 2836 2837 sb = sbuf_new_auto(); 2838 va_start(ap, fmt); 2839 sbuf_vprintf(sb, fmt, ap); 2840 va_end(ap); 2841 sbuf_finish(sb); 2842 len = sbuf_len(sb) + 1; 2843 maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); 2844 SLIST_INSERT_HEAD(&ma->list, maa, next); 2845 bcopy(sbuf_data(sb), maa + 1, len); 2846 sbuf_delete(sb); 2847 2848 ma->v[ma->len].iov_base = maa + 1; 2849 ma->v[ma->len].iov_len = len; 2850 ma->len++; 2851 2852 return (ma); 2853 } 2854 2855 /* 2856 * Add an argument which is a userland string. 2857 */ 2858 struct mntarg * 2859 mount_argsu(struct mntarg *ma, const char *name, const void *val, int len) 2860 { 2861 struct mntaarg *maa; 2862 char *tbuf; 2863 2864 if (val == NULL) 2865 return (ma); 2866 if (ma == NULL) { 2867 ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); 2868 SLIST_INIT(&ma->list); 2869 } 2870 if (ma->error) 2871 return (ma); 2872 maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); 2873 SLIST_INSERT_HEAD(&ma->list, maa, next); 2874 tbuf = (void *)(maa + 1); 2875 ma->error = copyinstr(val, tbuf, len, NULL); 2876 return (mount_arg(ma, name, tbuf, -1)); 2877 } 2878 2879 /* 2880 * Plain argument. 2881 * 2882 * If length is -1, treat value as a C string. 
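 *
 * As with the other helpers above, calls may be chained: the mntarg is
 * allocated on first use and the first error is latched, so intermediate
 * return values need not be checked.  A hypothetical in-kernel mount
 * built this way might look like the following sketch (the filesystem
 * type, path and option are made-up examples):
 *
 *	struct mntarg *ma;
 *	int error;
 *
 *	ma = mount_arg(NULL, "fstype", "tmpfs", -1);
 *	ma = mount_arg(ma, "fspath", "/mnt/example", -1);
 *	ma = mount_argf(ma, "size", "%ju", (uintmax_t)(32 * 1024 * 1024));
 *	error = kernel_mount(ma, MNT_NOSUID);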
2883 */ 2884 struct mntarg * 2885 mount_arg(struct mntarg *ma, const char *name, const void *val, int len) 2886 { 2887 2888 if (ma == NULL) { 2889 ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); 2890 SLIST_INIT(&ma->list); 2891 } 2892 if (ma->error) 2893 return (ma); 2894 2895 ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), 2896 M_MOUNT, M_WAITOK); 2897 ma->v[ma->len].iov_base = (void *)(uintptr_t)name; 2898 ma->v[ma->len].iov_len = strlen(name) + 1; 2899 ma->len++; 2900 2901 ma->v[ma->len].iov_base = (void *)(uintptr_t)val; 2902 if (len < 0) 2903 ma->v[ma->len].iov_len = strlen(val) + 1; 2904 else 2905 ma->v[ma->len].iov_len = len; 2906 ma->len++; 2907 return (ma); 2908 } 2909 2910 /* 2911 * Free a mntarg structure 2912 */ 2913 static void 2914 free_mntarg(struct mntarg *ma) 2915 { 2916 struct mntaarg *maa; 2917 2918 while (!SLIST_EMPTY(&ma->list)) { 2919 maa = SLIST_FIRST(&ma->list); 2920 SLIST_REMOVE_HEAD(&ma->list, next); 2921 free(maa, M_MOUNT); 2922 } 2923 free(ma->v, M_MOUNT); 2924 free(ma, M_MOUNT); 2925 } 2926 2927 /* 2928 * Mount a filesystem 2929 */ 2930 int 2931 kernel_mount(struct mntarg *ma, uint64_t flags) 2932 { 2933 struct uio auio; 2934 int error; 2935 2936 KASSERT(ma != NULL, ("kernel_mount NULL ma")); 2937 KASSERT(ma->error != 0 || ma->v != NULL, ("kernel_mount NULL ma->v")); 2938 KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len)); 2939 2940 error = ma->error; 2941 if (error == 0) { 2942 auio.uio_iov = ma->v; 2943 auio.uio_iovcnt = ma->len; 2944 auio.uio_segflg = UIO_SYSSPACE; 2945 error = vfs_donmount(curthread, flags, &auio); 2946 } 2947 free_mntarg(ma); 2948 return (error); 2949 } 2950 2951 /* Map from mount options to printable formats. */ 2952 static struct mntoptnames optnames[] = { 2953 MNTOPT_NAMES 2954 }; 2955 2956 #define DEVCTL_LEN 1024 2957 static void 2958 mount_devctl_event(const char *type, struct mount *mp, bool donew) 2959 { 2960 const uint8_t *cp; 2961 struct mntoptnames *fp; 2962 struct sbuf sb; 2963 struct statfs *sfp = &mp->mnt_stat; 2964 char *buf; 2965 2966 buf = malloc(DEVCTL_LEN, M_MOUNT, M_NOWAIT); 2967 if (buf == NULL) 2968 return; 2969 sbuf_new(&sb, buf, DEVCTL_LEN, SBUF_FIXEDLEN); 2970 sbuf_cpy(&sb, "mount-point=\""); 2971 devctl_safe_quote_sb(&sb, sfp->f_mntonname); 2972 sbuf_cat(&sb, "\" mount-dev=\""); 2973 devctl_safe_quote_sb(&sb, sfp->f_mntfromname); 2974 sbuf_cat(&sb, "\" mount-type=\""); 2975 devctl_safe_quote_sb(&sb, sfp->f_fstypename); 2976 sbuf_cat(&sb, "\" fsid=0x"); 2977 cp = (const uint8_t *)&sfp->f_fsid.val[0]; 2978 for (int i = 0; i < sizeof(sfp->f_fsid); i++) 2979 sbuf_printf(&sb, "%02x", cp[i]); 2980 sbuf_printf(&sb, " owner=%u flags=\"", sfp->f_owner); 2981 for (fp = optnames; fp->o_opt != 0; fp++) { 2982 if ((mp->mnt_flag & fp->o_opt) != 0) { 2983 sbuf_cat(&sb, fp->o_name); 2984 sbuf_putc(&sb, ';'); 2985 } 2986 } 2987 sbuf_putc(&sb, '"'); 2988 sbuf_finish(&sb); 2989 2990 /* 2991 * Options are not published because the form of the options depends on 2992 * the file system and may include binary data. In addition, they don't 2993 * necessarily provide enough useful information to be actionable when 2994 * devd processes them. 2995 */ 2996 2997 if (sbuf_error(&sb) == 0) 2998 devctl_notify("VFS", "FS", type, sbuf_data(&sb)); 2999 sbuf_delete(&sb); 3000 free(buf, M_MOUNT); 3001 } 3002 3003 /* 3004 * Force remount specified mount point to read-only. The argument 3005 * must be busied to avoid parallel unmount attempts. 
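 *
 * A minimal calling sketch (illustrative only, assuming the caller is
 * otherwise entitled to operate on mp):
 *
 *	error = vfs_busy(mp, 0);
 *	if (error == 0) {
 *		error = vfs_remount_ro(mp);
 *		vfs_unbusy(mp);
 *	}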
3006 * 3007 * Intended use is to prevent further writes if some metadata 3008 * inconsistency is detected. Note that the function still flushes 3009 * all cached metadata and data for the mount point, which might be 3010 * not always suitable. 3011 */ 3012 int 3013 vfs_remount_ro(struct mount *mp) 3014 { 3015 struct vfsoptlist *opts; 3016 struct vfsopt *opt; 3017 struct vnode *vp_covered, *rootvp; 3018 int error; 3019 3020 vfs_op_enter(mp); 3021 KASSERT(mp->mnt_lockref > 0, 3022 ("vfs_remount_ro: mp %p is not busied", mp)); 3023 KASSERT((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0, 3024 ("vfs_remount_ro: mp %p is being unmounted (and busy?)", mp)); 3025 3026 rootvp = NULL; 3027 vp_covered = mp->mnt_vnodecovered; 3028 error = vget(vp_covered, LK_EXCLUSIVE | LK_NOWAIT); 3029 if (error != 0) { 3030 vfs_op_exit(mp); 3031 return (error); 3032 } 3033 VI_LOCK(vp_covered); 3034 if ((vp_covered->v_iflag & VI_MOUNT) != 0) { 3035 VI_UNLOCK(vp_covered); 3036 vput(vp_covered); 3037 vfs_op_exit(mp); 3038 return (EBUSY); 3039 } 3040 vp_covered->v_iflag |= VI_MOUNT; 3041 VI_UNLOCK(vp_covered); 3042 vn_seqc_write_begin(vp_covered); 3043 3044 MNT_ILOCK(mp); 3045 if ((mp->mnt_flag & MNT_RDONLY) != 0) { 3046 MNT_IUNLOCK(mp); 3047 error = EBUSY; 3048 goto out; 3049 } 3050 mp->mnt_flag |= MNT_UPDATE | MNT_FORCE | MNT_RDONLY; 3051 rootvp = vfs_cache_root_clear(mp); 3052 MNT_IUNLOCK(mp); 3053 3054 opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK | M_ZERO); 3055 TAILQ_INIT(opts); 3056 opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK | M_ZERO); 3057 opt->name = strdup("ro", M_MOUNT); 3058 opt->value = NULL; 3059 TAILQ_INSERT_TAIL(opts, opt, link); 3060 vfs_mergeopts(opts, mp->mnt_opt); 3061 mp->mnt_optnew = opts; 3062 3063 error = VFS_MOUNT(mp); 3064 3065 if (error == 0) { 3066 MNT_ILOCK(mp); 3067 mp->mnt_flag &= ~(MNT_UPDATE | MNT_FORCE); 3068 MNT_IUNLOCK(mp); 3069 vfs_deallocate_syncvnode(mp); 3070 if (mp->mnt_opt != NULL) 3071 vfs_freeopts(mp->mnt_opt); 3072 mp->mnt_opt = mp->mnt_optnew; 3073 } else { 3074 MNT_ILOCK(mp); 3075 mp->mnt_flag &= ~(MNT_UPDATE | MNT_FORCE | MNT_RDONLY); 3076 MNT_IUNLOCK(mp); 3077 vfs_freeopts(mp->mnt_optnew); 3078 } 3079 mp->mnt_optnew = NULL; 3080 3081 out: 3082 vfs_op_exit(mp); 3083 VI_LOCK(vp_covered); 3084 vp_covered->v_iflag &= ~VI_MOUNT; 3085 VI_UNLOCK(vp_covered); 3086 vput(vp_covered); 3087 vn_seqc_write_end(vp_covered); 3088 if (rootvp != NULL) { 3089 vn_seqc_write_end(rootvp); 3090 vrele(rootvp); 3091 } 3092 return (error); 3093 } 3094 3095 /* 3096 * Suspend write operations on all local writeable filesystems. Does 3097 * full sync of them in the process. 3098 * 3099 * Iterate over the mount points in reverse order, suspending most 3100 * recently mounted filesystems first. It handles a case where a 3101 * filesystem mounted from a md(4) vnode-backed device should be 3102 * suspended before the filesystem that owns the vnode. 
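 *
 * Callers are expected to pair this with resume_all_fs() once the
 * quiescent period is over, e.g. (illustrative sketch):
 *
 *	suspend_all_fs();
 *	(perform the work that requires quiesced local filesystems)
 *	resume_all_fs();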
3103 */ 3104 void 3105 suspend_all_fs(void) 3106 { 3107 struct mount *mp; 3108 int error; 3109 3110 mtx_lock(&mountlist_mtx); 3111 TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { 3112 error = vfs_busy(mp, MBF_MNTLSTLOCK | MBF_NOWAIT); 3113 if (error != 0) 3114 continue; 3115 if ((mp->mnt_flag & (MNT_RDONLY | MNT_LOCAL)) != MNT_LOCAL || 3116 (mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { 3117 mtx_lock(&mountlist_mtx); 3118 vfs_unbusy(mp); 3119 continue; 3120 } 3121 error = vfs_write_suspend(mp, 0); 3122 if (error == 0) { 3123 MNT_ILOCK(mp); 3124 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND_ALL) == 0); 3125 mp->mnt_kern_flag |= MNTK_SUSPEND_ALL; 3126 MNT_IUNLOCK(mp); 3127 mtx_lock(&mountlist_mtx); 3128 } else { 3129 printf("suspend of %s failed, error %d\n", 3130 mp->mnt_stat.f_mntonname, error); 3131 mtx_lock(&mountlist_mtx); 3132 vfs_unbusy(mp); 3133 } 3134 } 3135 mtx_unlock(&mountlist_mtx); 3136 } 3137 3138 /* 3139 * Clone the mnt_exjail field to a new mount point. 3140 */ 3141 void 3142 vfs_exjail_clone(struct mount *inmp, struct mount *outmp) 3143 { 3144 struct ucred *cr; 3145 struct prison *pr; 3146 3147 MNT_ILOCK(inmp); 3148 cr = inmp->mnt_exjail; 3149 if (cr != NULL) { 3150 crhold(cr); 3151 MNT_IUNLOCK(inmp); 3152 pr = cr->cr_prison; 3153 sx_slock(&allprison_lock); 3154 if (!prison_isalive(pr)) { 3155 sx_sunlock(&allprison_lock); 3156 crfree(cr); 3157 return; 3158 } 3159 MNT_ILOCK(outmp); 3160 if (outmp->mnt_exjail == NULL) { 3161 outmp->mnt_exjail = cr; 3162 atomic_add_int(&pr->pr_exportcnt, 1); 3163 cr = NULL; 3164 } 3165 MNT_IUNLOCK(outmp); 3166 sx_sunlock(&allprison_lock); 3167 if (cr != NULL) 3168 crfree(cr); 3169 } else 3170 MNT_IUNLOCK(inmp); 3171 } 3172 3173 void 3174 resume_all_fs(void) 3175 { 3176 struct mount *mp; 3177 3178 mtx_lock(&mountlist_mtx); 3179 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3180 if ((mp->mnt_kern_flag & MNTK_SUSPEND_ALL) == 0) 3181 continue; 3182 mtx_unlock(&mountlist_mtx); 3183 MNT_ILOCK(mp); 3184 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) != 0); 3185 mp->mnt_kern_flag &= ~MNTK_SUSPEND_ALL; 3186 MNT_IUNLOCK(mp); 3187 vfs_write_resume(mp, 0); 3188 mtx_lock(&mountlist_mtx); 3189 vfs_unbusy(mp); 3190 } 3191 mtx_unlock(&mountlist_mtx); 3192 } 3193