1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1999-2004 Poul-Henning Kamp 5 * Copyright (c) 1999 Michael Smith 6 * Copyright (c) 1989, 1993 7 * The Regents of the University of California. All rights reserved. 8 * (c) UNIX System Laboratories, Inc. 9 * All or some portions of this file are derived from material licensed 10 * to the University of California by American Telephone and Telegraph 11 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 12 * the permission of UNIX System Laboratories, Inc. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 */ 38 39 #include <sys/param.h> 40 #include <sys/conf.h> 41 #include <sys/smp.h> 42 #include <sys/devctl.h> 43 #include <sys/eventhandler.h> 44 #include <sys/fcntl.h> 45 #include <sys/jail.h> 46 #include <sys/kernel.h> 47 #include <sys/ktr.h> 48 #include <sys/libkern.h> 49 #include <sys/limits.h> 50 #include <sys/malloc.h> 51 #include <sys/mount.h> 52 #include <sys/mutex.h> 53 #include <sys/namei.h> 54 #include <sys/priv.h> 55 #include <sys/proc.h> 56 #include <sys/filedesc.h> 57 #include <sys/reboot.h> 58 #include <sys/sbuf.h> 59 #include <sys/syscallsubr.h> 60 #include <sys/sysproto.h> 61 #include <sys/sx.h> 62 #include <sys/sysctl.h> 63 #include <sys/systm.h> 64 #include <sys/taskqueue.h> 65 #include <sys/vnode.h> 66 #include <vm/uma.h> 67 68 #include <geom/geom.h> 69 70 #include <machine/stdarg.h> 71 72 #include <security/audit/audit.h> 73 #include <security/mac/mac_framework.h> 74 75 #define VFS_MOUNTARG_SIZE_MAX (1024 * 64) 76 77 static int vfs_domount(struct thread *td, const char *fstype, char *fspath, 78 uint64_t fsflags, bool jail_export, 79 struct vfsoptlist **optlist); 80 static void free_mntarg(struct mntarg *ma); 81 82 static int usermount = 0; 83 SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, 84 "Unprivileged users may mount and unmount file systems"); 85 86 static bool default_autoro = false; 87 SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0, 88 "Retry failed r/w mount as 
r/o if no explicit ro/rw option is specified"); 89 90 static bool recursive_forced_unmount = false; 91 SYSCTL_BOOL(_vfs, OID_AUTO, recursive_forced_unmount, CTLFLAG_RW, 92 &recursive_forced_unmount, 0, "Recursively unmount stacked upper mounts" 93 " when a file system is forcibly unmounted"); 94 95 static SYSCTL_NODE(_vfs, OID_AUTO, deferred_unmount, 96 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "deferred unmount controls"); 97 98 static unsigned int deferred_unmount_retry_limit = 10; 99 SYSCTL_UINT(_vfs_deferred_unmount, OID_AUTO, retry_limit, CTLFLAG_RW, 100 &deferred_unmount_retry_limit, 0, 101 "Maximum number of retries for deferred unmount failure"); 102 103 static int deferred_unmount_retry_delay_hz; 104 SYSCTL_INT(_vfs_deferred_unmount, OID_AUTO, retry_delay_hz, CTLFLAG_RW, 105 &deferred_unmount_retry_delay_hz, 0, 106 "Delay in units of [1/kern.hz]s when retrying a failed deferred unmount"); 107 108 static int deferred_unmount_total_retries = 0; 109 SYSCTL_INT(_vfs_deferred_unmount, OID_AUTO, total_retries, CTLFLAG_RD, 110 &deferred_unmount_total_retries, 0, 111 "Total number of retried deferred unmounts"); 112 113 MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); 114 MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure"); 115 static uma_zone_t mount_zone; 116 117 /* List of mounted filesystems. 
*/ 118 struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); 119 120 /* For any iteration/modification of mountlist */ 121 struct mtx_padalign __exclusive_cache_line mountlist_mtx; 122 123 EVENTHANDLER_LIST_DEFINE(vfs_mounted); 124 EVENTHANDLER_LIST_DEFINE(vfs_unmounted); 125 126 static void vfs_deferred_unmount(void *arg, int pending); 127 static struct timeout_task deferred_unmount_task; 128 static struct mtx deferred_unmount_lock; 129 MTX_SYSINIT(deferred_unmount, &deferred_unmount_lock, "deferred_unmount", 130 MTX_DEF); 131 static STAILQ_HEAD(, mount) deferred_unmount_list = 132 STAILQ_HEAD_INITIALIZER(deferred_unmount_list); 133 TASKQUEUE_DEFINE_THREAD(deferred_unmount); 134 135 static void mount_devctl_event(const char *type, struct mount *mp, bool donew); 136 137 /* 138 * Global opts, taken by all filesystems 139 */ 140 static const char *global_opts[] = { 141 "errmsg", 142 "fstype", 143 "fspath", 144 "ro", 145 "rw", 146 "nosuid", 147 "noexec", 148 NULL 149 }; 150 151 static int 152 mount_init(void *mem, int size, int flags) 153 { 154 struct mount *mp; 155 156 mp = (struct mount *)mem; 157 mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF); 158 mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF); 159 lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0); 160 mp->mnt_pcpu = uma_zalloc_pcpu(pcpu_zone_16, M_WAITOK | M_ZERO); 161 mp->mnt_ref = 0; 162 mp->mnt_vfs_ops = 1; 163 mp->mnt_rootvnode = NULL; 164 return (0); 165 } 166 167 static void 168 mount_fini(void *mem, int size) 169 { 170 struct mount *mp; 171 172 mp = (struct mount *)mem; 173 uma_zfree_pcpu(pcpu_zone_16, mp->mnt_pcpu); 174 lockdestroy(&mp->mnt_explock); 175 mtx_destroy(&mp->mnt_listmtx); 176 mtx_destroy(&mp->mnt_mtx); 177 } 178 179 static void 180 vfs_mount_init(void *dummy __unused) 181 { 182 TIMEOUT_TASK_INIT(taskqueue_deferred_unmount, &deferred_unmount_task, 183 0, vfs_deferred_unmount, NULL); 184 deferred_unmount_retry_delay_hz = hz; 185 mount_zone = 
uma_zcreate("Mountpoints", sizeof(struct mount), NULL, 186 NULL, mount_init, mount_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE); 187 mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF); 188 } 189 SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL); 190 191 /* 192 * --------------------------------------------------------------------- 193 * Functions for building and sanitizing the mount options 194 */ 195 196 /* Remove one mount option. */ 197 static void 198 vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt) 199 { 200 201 TAILQ_REMOVE(opts, opt, link); 202 free(opt->name, M_MOUNT); 203 if (opt->value != NULL) 204 free(opt->value, M_MOUNT); 205 free(opt, M_MOUNT); 206 } 207 208 /* Release all resources related to the mount options. */ 209 void 210 vfs_freeopts(struct vfsoptlist *opts) 211 { 212 struct vfsopt *opt; 213 214 while (!TAILQ_EMPTY(opts)) { 215 opt = TAILQ_FIRST(opts); 216 vfs_freeopt(opts, opt); 217 } 218 free(opts, M_MOUNT); 219 } 220 221 void 222 vfs_deleteopt(struct vfsoptlist *opts, const char *name) 223 { 224 struct vfsopt *opt, *temp; 225 226 if (opts == NULL) 227 return; 228 TAILQ_FOREACH_SAFE(opt, opts, link, temp) { 229 if (strcmp(opt->name, name) == 0) 230 vfs_freeopt(opts, opt); 231 } 232 } 233 234 static int 235 vfs_isopt_ro(const char *opt) 236 { 237 238 if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 || 239 strcmp(opt, "norw") == 0) 240 return (1); 241 return (0); 242 } 243 244 static int 245 vfs_isopt_rw(const char *opt) 246 { 247 248 if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0) 249 return (1); 250 return (0); 251 } 252 253 /* 254 * Check if options are equal (with or without the "no" prefix). 255 */ 256 static int 257 vfs_equalopts(const char *opt1, const char *opt2) 258 { 259 char *p; 260 261 /* "opt" vs. "opt" or "noopt" vs. "noopt" */ 262 if (strcmp(opt1, opt2) == 0) 263 return (1); 264 /* "noopt" vs. 
"opt" */ 265 if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) 266 return (1); 267 /* "opt" vs. "noopt" */ 268 if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) 269 return (1); 270 while ((p = strchr(opt1, '.')) != NULL && 271 !strncmp(opt1, opt2, ++p - opt1)) { 272 opt2 += p - opt1; 273 opt1 = p; 274 /* "foo.noopt" vs. "foo.opt" */ 275 if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) 276 return (1); 277 /* "foo.opt" vs. "foo.noopt" */ 278 if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) 279 return (1); 280 } 281 /* "ro" / "rdonly" / "norw" / "rw" / "noro" */ 282 if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) && 283 (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2))) 284 return (1); 285 return (0); 286 } 287 288 /* 289 * If a mount option is specified several times, 290 * (with or without the "no" prefix) only keep 291 * the last occurrence of it. 292 */ 293 static void 294 vfs_sanitizeopts(struct vfsoptlist *opts) 295 { 296 struct vfsopt *opt, *opt2, *tmp; 297 298 TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) { 299 opt2 = TAILQ_PREV(opt, vfsoptlist, link); 300 while (opt2 != NULL) { 301 if (vfs_equalopts(opt->name, opt2->name)) { 302 tmp = TAILQ_PREV(opt2, vfsoptlist, link); 303 vfs_freeopt(opts, opt2); 304 opt2 = tmp; 305 } else { 306 opt2 = TAILQ_PREV(opt2, vfsoptlist, link); 307 } 308 } 309 } 310 } 311 312 /* 313 * Build a linked list of mount options from a struct uio. 
314 */ 315 int 316 vfs_buildopts(struct uio *auio, struct vfsoptlist **options) 317 { 318 struct vfsoptlist *opts; 319 struct vfsopt *opt; 320 size_t memused, namelen, optlen; 321 unsigned int i, iovcnt; 322 int error; 323 324 opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); 325 TAILQ_INIT(opts); 326 memused = 0; 327 iovcnt = auio->uio_iovcnt; 328 for (i = 0; i < iovcnt; i += 2) { 329 namelen = auio->uio_iov[i].iov_len; 330 optlen = auio->uio_iov[i + 1].iov_len; 331 memused += sizeof(struct vfsopt) + optlen + namelen; 332 /* 333 * Avoid consuming too much memory, and attempts to overflow 334 * memused. 335 */ 336 if (memused > VFS_MOUNTARG_SIZE_MAX || 337 optlen > VFS_MOUNTARG_SIZE_MAX || 338 namelen > VFS_MOUNTARG_SIZE_MAX) { 339 error = EINVAL; 340 goto bad; 341 } 342 343 opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); 344 opt->name = malloc(namelen, M_MOUNT, M_WAITOK); 345 opt->value = NULL; 346 opt->len = 0; 347 opt->pos = i / 2; 348 opt->seen = 0; 349 350 /* 351 * Do this early, so jumps to "bad" will free the current 352 * option. 353 */ 354 TAILQ_INSERT_TAIL(opts, opt, link); 355 356 if (auio->uio_segflg == UIO_SYSSPACE) { 357 bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); 358 } else { 359 error = copyin(auio->uio_iov[i].iov_base, opt->name, 360 namelen); 361 if (error) 362 goto bad; 363 } 364 /* Ensure names are null-terminated strings. 
*/ 365 if (namelen == 0 || opt->name[namelen - 1] != '\0') { 366 error = EINVAL; 367 goto bad; 368 } 369 if (optlen != 0) { 370 opt->len = optlen; 371 opt->value = malloc(optlen, M_MOUNT, M_WAITOK); 372 if (auio->uio_segflg == UIO_SYSSPACE) { 373 bcopy(auio->uio_iov[i + 1].iov_base, opt->value, 374 optlen); 375 } else { 376 error = copyin(auio->uio_iov[i + 1].iov_base, 377 opt->value, optlen); 378 if (error) 379 goto bad; 380 } 381 } 382 } 383 vfs_sanitizeopts(opts); 384 *options = opts; 385 return (0); 386 bad: 387 vfs_freeopts(opts); 388 return (error); 389 } 390 391 /* 392 * Merge the old mount options with the new ones passed 393 * in the MNT_UPDATE case. 394 * 395 * XXX: This function will keep a "nofoo" option in the new 396 * options. E.g, if the option's canonical name is "foo", 397 * "nofoo" ends up in the mount point's active options. 398 */ 399 static void 400 vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts) 401 { 402 struct vfsopt *opt, *new; 403 404 TAILQ_FOREACH(opt, oldopts, link) { 405 new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); 406 new->name = strdup(opt->name, M_MOUNT); 407 if (opt->len != 0) { 408 new->value = malloc(opt->len, M_MOUNT, M_WAITOK); 409 bcopy(opt->value, new->value, opt->len); 410 } else 411 new->value = NULL; 412 new->len = opt->len; 413 new->seen = opt->seen; 414 TAILQ_INSERT_HEAD(toopts, new, link); 415 } 416 vfs_sanitizeopts(toopts); 417 } 418 419 /* 420 * Mount a filesystem. 421 */ 422 #ifndef _SYS_SYSPROTO_H_ 423 struct nmount_args { 424 struct iovec *iovp; 425 unsigned int iovcnt; 426 int flags; 427 }; 428 #endif 429 int 430 sys_nmount(struct thread *td, struct nmount_args *uap) 431 { 432 struct uio *auio; 433 int error; 434 u_int iovcnt; 435 uint64_t flags; 436 437 /* 438 * Mount flags are now 64-bits. On 32-bit archtectures only 439 * 32-bits are passed in, but from here on everything handles 440 * 64-bit flags correctly. 
441 */ 442 flags = uap->flags; 443 444 AUDIT_ARG_FFLAGS(flags); 445 CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__, 446 uap->iovp, uap->iovcnt, flags); 447 448 /* 449 * Filter out MNT_ROOTFS. We do not want clients of nmount() in 450 * userspace to set this flag, but we must filter it out if we want 451 * MNT_UPDATE on the root file system to work. 452 * MNT_ROOTFS should only be set by the kernel when mounting its 453 * root file system. 454 */ 455 flags &= ~MNT_ROOTFS; 456 457 iovcnt = uap->iovcnt; 458 /* 459 * Check that we have an even number of iovec's 460 * and that we have at least two options. 461 */ 462 if ((iovcnt & 1) || (iovcnt < 4)) { 463 CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__, 464 uap->iovcnt); 465 return (EINVAL); 466 } 467 468 error = copyinuio(uap->iovp, iovcnt, &auio); 469 if (error) { 470 CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno", 471 __func__, error); 472 return (error); 473 } 474 error = vfs_donmount(td, flags, auio); 475 476 freeuio(auio); 477 return (error); 478 } 479 480 /* 481 * --------------------------------------------------------------------- 482 * Various utility functions 483 */ 484 485 /* 486 * Get a reference on a mount point from a vnode. 487 * 488 * The vnode is allowed to be passed unlocked and race against dooming. Note in 489 * such case there are no guarantees the referenced mount point will still be 490 * associated with it after the function returns. 
491 */ 492 struct mount * 493 vfs_ref_from_vp(struct vnode *vp) 494 { 495 struct mount *mp; 496 struct mount_pcpu *mpcpu; 497 498 mp = atomic_load_ptr(&vp->v_mount); 499 if (__predict_false(mp == NULL)) { 500 return (mp); 501 } 502 if (vfs_op_thread_enter(mp, mpcpu)) { 503 if (__predict_true(mp == vp->v_mount)) { 504 vfs_mp_count_add_pcpu(mpcpu, ref, 1); 505 vfs_op_thread_exit(mp, mpcpu); 506 } else { 507 vfs_op_thread_exit(mp, mpcpu); 508 mp = NULL; 509 } 510 } else { 511 MNT_ILOCK(mp); 512 if (mp == vp->v_mount) { 513 MNT_REF(mp); 514 MNT_IUNLOCK(mp); 515 } else { 516 MNT_IUNLOCK(mp); 517 mp = NULL; 518 } 519 } 520 return (mp); 521 } 522 523 void 524 vfs_ref(struct mount *mp) 525 { 526 struct mount_pcpu *mpcpu; 527 528 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 529 if (vfs_op_thread_enter(mp, mpcpu)) { 530 vfs_mp_count_add_pcpu(mpcpu, ref, 1); 531 vfs_op_thread_exit(mp, mpcpu); 532 return; 533 } 534 535 MNT_ILOCK(mp); 536 MNT_REF(mp); 537 MNT_IUNLOCK(mp); 538 } 539 540 /* 541 * Register ump as an upper mount of the mount associated with 542 * vnode vp. This registration will be tracked through 543 * mount_upper_node upper, which should be allocated by the 544 * caller and stored in per-mount data associated with mp. 545 * 546 * If successful, this function will return the mount associated 547 * with vp, and will ensure that it cannot be unmounted until 548 * ump has been unregistered as one of its upper mounts. 549 * 550 * Upon failure this function will return NULL. 
551 */ 552 struct mount * 553 vfs_register_upper_from_vp(struct vnode *vp, struct mount *ump, 554 struct mount_upper_node *upper) 555 { 556 struct mount *mp; 557 558 mp = atomic_load_ptr(&vp->v_mount); 559 if (mp == NULL) 560 return (NULL); 561 MNT_ILOCK(mp); 562 if (mp != vp->v_mount || 563 ((mp->mnt_kern_flag & (MNTK_UNMOUNT | MNTK_RECURSE)) != 0)) { 564 MNT_IUNLOCK(mp); 565 return (NULL); 566 } 567 KASSERT(ump != mp, ("upper and lower mounts are identical")); 568 upper->mp = ump; 569 MNT_REF(mp); 570 TAILQ_INSERT_TAIL(&mp->mnt_uppers, upper, mnt_upper_link); 571 MNT_IUNLOCK(mp); 572 return (mp); 573 } 574 575 /* 576 * Register upper mount ump to receive vnode unlink/reclaim 577 * notifications from lower mount mp. This registration will 578 * be tracked through mount_upper_node upper, which should be 579 * allocated by the caller and stored in per-mount data 580 * associated with mp. 581 * 582 * ump must already be registered as an upper mount of mp 583 * through a call to vfs_register_upper_from_vp(). 584 */ 585 void 586 vfs_register_for_notification(struct mount *mp, struct mount *ump, 587 struct mount_upper_node *upper) 588 { 589 upper->mp = ump; 590 MNT_ILOCK(mp); 591 TAILQ_INSERT_TAIL(&mp->mnt_notify, upper, mnt_upper_link); 592 MNT_IUNLOCK(mp); 593 } 594 595 static void 596 vfs_drain_upper_locked(struct mount *mp) 597 { 598 mtx_assert(MNT_MTX(mp), MA_OWNED); 599 while (mp->mnt_upper_pending != 0) { 600 mp->mnt_kern_flag |= MNTK_UPPER_WAITER; 601 msleep(&mp->mnt_uppers, MNT_MTX(mp), 0, "mntupw", 0); 602 } 603 } 604 605 /* 606 * Undo a previous call to vfs_register_for_notification(). 607 * The mount represented by upper must be currently registered 608 * as an upper mount for mp. 
609 */ 610 void 611 vfs_unregister_for_notification(struct mount *mp, 612 struct mount_upper_node *upper) 613 { 614 MNT_ILOCK(mp); 615 vfs_drain_upper_locked(mp); 616 TAILQ_REMOVE(&mp->mnt_notify, upper, mnt_upper_link); 617 MNT_IUNLOCK(mp); 618 } 619 620 /* 621 * Undo a previous call to vfs_register_upper_from_vp(). 622 * This must be done before mp can be unmounted. 623 */ 624 void 625 vfs_unregister_upper(struct mount *mp, struct mount_upper_node *upper) 626 { 627 MNT_ILOCK(mp); 628 KASSERT((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0, 629 ("registered upper with pending unmount")); 630 vfs_drain_upper_locked(mp); 631 TAILQ_REMOVE(&mp->mnt_uppers, upper, mnt_upper_link); 632 if ((mp->mnt_kern_flag & MNTK_TASKQUEUE_WAITER) != 0 && 633 TAILQ_EMPTY(&mp->mnt_uppers)) { 634 mp->mnt_kern_flag &= ~MNTK_TASKQUEUE_WAITER; 635 wakeup(&mp->mnt_taskqueue_link); 636 } 637 MNT_REL(mp); 638 MNT_IUNLOCK(mp); 639 } 640 641 void 642 vfs_rel(struct mount *mp) 643 { 644 struct mount_pcpu *mpcpu; 645 646 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 647 if (vfs_op_thread_enter(mp, mpcpu)) { 648 vfs_mp_count_sub_pcpu(mpcpu, ref, 1); 649 vfs_op_thread_exit(mp, mpcpu); 650 return; 651 } 652 653 MNT_ILOCK(mp); 654 MNT_REL(mp); 655 MNT_IUNLOCK(mp); 656 } 657 658 /* 659 * Allocate and initialize the mount point struct. 
660 */ 661 struct mount * 662 vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, 663 struct ucred *cred) 664 { 665 struct mount *mp; 666 667 mp = uma_zalloc(mount_zone, M_WAITOK); 668 bzero(&mp->mnt_startzero, 669 __rangeof(struct mount, mnt_startzero, mnt_endzero)); 670 mp->mnt_kern_flag = 0; 671 mp->mnt_flag = 0; 672 mp->mnt_rootvnode = NULL; 673 mp->mnt_vnodecovered = NULL; 674 mp->mnt_op = NULL; 675 mp->mnt_vfc = NULL; 676 TAILQ_INIT(&mp->mnt_nvnodelist); 677 mp->mnt_nvnodelistsize = 0; 678 TAILQ_INIT(&mp->mnt_lazyvnodelist); 679 mp->mnt_lazyvnodelistsize = 0; 680 MPPASS(mp->mnt_ref == 0 && mp->mnt_lockref == 0 && 681 mp->mnt_writeopcount == 0, mp); 682 MPASSERT(mp->mnt_vfs_ops == 1, mp, 683 ("vfs_ops should be 1 but %d found", mp->mnt_vfs_ops)); 684 (void) vfs_busy(mp, MBF_NOWAIT); 685 atomic_add_acq_int(&vfsp->vfc_refcount, 1); 686 mp->mnt_op = vfsp->vfc_vfsops; 687 mp->mnt_vfc = vfsp; 688 mp->mnt_stat.f_type = vfsp->vfc_typenum; 689 mp->mnt_gen++; 690 strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); 691 mp->mnt_vnodecovered = vp; 692 mp->mnt_cred = crdup(cred); 693 mp->mnt_stat.f_owner = cred->cr_uid; 694 strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); 695 mp->mnt_iosize_max = DFLTPHYS; 696 #ifdef MAC 697 mac_mount_init(mp); 698 mac_mount_create(cred, mp); 699 #endif 700 arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0); 701 mp->mnt_upper_pending = 0; 702 TAILQ_INIT(&mp->mnt_uppers); 703 TAILQ_INIT(&mp->mnt_notify); 704 mp->mnt_taskqueue_flags = 0; 705 mp->mnt_unmount_retries = 0; 706 return (mp); 707 } 708 709 /* 710 * Destroy the mount struct previously allocated by vfs_mount_alloc(). 
711 */ 712 void 713 vfs_mount_destroy(struct mount *mp) 714 { 715 716 MPPASS(mp->mnt_vfs_ops != 0, mp); 717 718 vfs_assert_mount_counters(mp); 719 720 MNT_ILOCK(mp); 721 mp->mnt_kern_flag |= MNTK_REFEXPIRE; 722 if (mp->mnt_kern_flag & MNTK_MWAIT) { 723 mp->mnt_kern_flag &= ~MNTK_MWAIT; 724 wakeup(mp); 725 } 726 while (mp->mnt_ref) 727 msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0); 728 KASSERT(mp->mnt_ref == 0, 729 ("%s: invalid refcount in the drain path @ %s:%d", __func__, 730 __FILE__, __LINE__)); 731 MPPASS(mp->mnt_writeopcount == 0, mp); 732 MPPASS(mp->mnt_secondary_writes == 0, mp); 733 atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1); 734 if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) { 735 struct vnode *vp; 736 737 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) 738 vn_printf(vp, "dangling vnode "); 739 panic("unmount: dangling vnode"); 740 } 741 KASSERT(mp->mnt_upper_pending == 0, ("mnt_upper_pending")); 742 KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers")); 743 KASSERT(TAILQ_EMPTY(&mp->mnt_notify), ("mnt_notify")); 744 MPPASS(mp->mnt_nvnodelistsize == 0, mp); 745 MPPASS(mp->mnt_lazyvnodelistsize == 0, mp); 746 MPPASS(mp->mnt_lockref == 0, mp); 747 MNT_IUNLOCK(mp); 748 749 MPASSERT(mp->mnt_vfs_ops == 1, mp, 750 ("vfs_ops should be 1 but %d found", mp->mnt_vfs_ops)); 751 752 MPASSERT(mp->mnt_rootvnode == NULL, mp, 753 ("mount point still has a root vnode %p", mp->mnt_rootvnode)); 754 755 if (mp->mnt_vnodecovered != NULL) 756 vrele(mp->mnt_vnodecovered); 757 #ifdef MAC 758 mac_mount_destroy(mp); 759 #endif 760 if (mp->mnt_opt != NULL) 761 vfs_freeopts(mp->mnt_opt); 762 if (mp->mnt_exjail != NULL) { 763 atomic_subtract_int(&mp->mnt_exjail->cr_prison->pr_exportcnt, 764 1); 765 crfree(mp->mnt_exjail); 766 } 767 if (mp->mnt_export != NULL) { 768 vfs_free_addrlist(mp->mnt_export); 769 free(mp->mnt_export, M_MOUNT); 770 } 771 crfree(mp->mnt_cred); 772 uma_zfree(mount_zone, mp); 773 } 774 775 static bool 776 vfs_should_downgrade_to_ro_mount(uint64_t fsflags, int 
error) 777 { 778 /* This is an upgrade of an exisiting mount. */ 779 if ((fsflags & MNT_UPDATE) != 0) 780 return (false); 781 /* This is already an R/O mount. */ 782 if ((fsflags & MNT_RDONLY) != 0) 783 return (false); 784 785 switch (error) { 786 case ENODEV: /* generic, geom, ... */ 787 case EACCES: /* cam/scsi, ... */ 788 case EROFS: /* md, mmcsd, ... */ 789 /* 790 * These errors can be returned by the storage layer to signal 791 * that the media is read-only. No harm in the R/O mount 792 * attempt if the error was returned for some other reason. 793 */ 794 return (true); 795 default: 796 return (false); 797 } 798 } 799 800 int 801 vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions) 802 { 803 struct vfsoptlist *optlist; 804 struct vfsopt *opt, *tmp_opt; 805 char *fstype, *fspath, *errmsg; 806 int error, fstypelen, fspathlen, errmsg_len, errmsg_pos; 807 bool autoro, has_nonexport, jail_export; 808 809 errmsg = fspath = NULL; 810 errmsg_len = fspathlen = 0; 811 errmsg_pos = -1; 812 autoro = default_autoro; 813 814 error = vfs_buildopts(fsoptions, &optlist); 815 if (error) 816 return (error); 817 818 if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0) 819 errmsg_pos = vfs_getopt_pos(optlist, "errmsg"); 820 821 /* 822 * We need these two options before the others, 823 * and they are mandatory for any filesystem. 824 * Ensure they are NUL terminated as well. 
825 */ 826 fstypelen = 0; 827 error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen); 828 if (error || fstypelen <= 0 || fstype[fstypelen - 1] != '\0') { 829 error = EINVAL; 830 if (errmsg != NULL) 831 strncpy(errmsg, "Invalid fstype", errmsg_len); 832 goto bail; 833 } 834 fspathlen = 0; 835 error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen); 836 if (error || fspathlen <= 0 || fspath[fspathlen - 1] != '\0') { 837 error = EINVAL; 838 if (errmsg != NULL) 839 strncpy(errmsg, "Invalid fspath", errmsg_len); 840 goto bail; 841 } 842 843 /* 844 * Check to see that "export" is only used with the "update", "fstype", 845 * "fspath", "from" and "errmsg" options when in a vnet jail. 846 * These are the ones used to set/update exports by mountd(8). 847 * If only the above options are set in a jail that can run mountd(8), 848 * then the jail_export argument of vfs_domount() will be true. 849 * When jail_export is true, the vfs_suser() check does not cause 850 * failure, but limits the update to exports only. 851 * This allows mountd(8) running within the vnet jail 852 * to export file systems visible within the jail, but 853 * mounted outside of the jail. 854 */ 855 /* 856 * We need to see if we have the "update" option 857 * before we call vfs_domount(), since vfs_domount() has special 858 * logic based on MNT_UPDATE. This is very important 859 * when we want to update the root filesystem. 
860 */ 861 has_nonexport = false; 862 jail_export = false; 863 TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) { 864 int do_freeopt = 0; 865 866 if (jailed(td->td_ucred) && 867 strcmp(opt->name, "export") != 0 && 868 strcmp(opt->name, "update") != 0 && 869 strcmp(opt->name, "fstype") != 0 && 870 strcmp(opt->name, "fspath") != 0 && 871 strcmp(opt->name, "from") != 0 && 872 strcmp(opt->name, "errmsg") != 0) 873 has_nonexport = true; 874 if (strcmp(opt->name, "update") == 0) { 875 fsflags |= MNT_UPDATE; 876 do_freeopt = 1; 877 } 878 else if (strcmp(opt->name, "async") == 0) 879 fsflags |= MNT_ASYNC; 880 else if (strcmp(opt->name, "force") == 0) { 881 fsflags |= MNT_FORCE; 882 do_freeopt = 1; 883 } 884 else if (strcmp(opt->name, "reload") == 0) { 885 fsflags |= MNT_RELOAD; 886 do_freeopt = 1; 887 } 888 else if (strcmp(opt->name, "multilabel") == 0) 889 fsflags |= MNT_MULTILABEL; 890 else if (strcmp(opt->name, "noasync") == 0) 891 fsflags &= ~MNT_ASYNC; 892 else if (strcmp(opt->name, "noatime") == 0) 893 fsflags |= MNT_NOATIME; 894 else if (strcmp(opt->name, "atime") == 0) { 895 free(opt->name, M_MOUNT); 896 opt->name = strdup("nonoatime", M_MOUNT); 897 } 898 else if (strcmp(opt->name, "noclusterr") == 0) 899 fsflags |= MNT_NOCLUSTERR; 900 else if (strcmp(opt->name, "clusterr") == 0) { 901 free(opt->name, M_MOUNT); 902 opt->name = strdup("nonoclusterr", M_MOUNT); 903 } 904 else if (strcmp(opt->name, "noclusterw") == 0) 905 fsflags |= MNT_NOCLUSTERW; 906 else if (strcmp(opt->name, "clusterw") == 0) { 907 free(opt->name, M_MOUNT); 908 opt->name = strdup("nonoclusterw", M_MOUNT); 909 } 910 else if (strcmp(opt->name, "noexec") == 0) 911 fsflags |= MNT_NOEXEC; 912 else if (strcmp(opt->name, "exec") == 0) { 913 free(opt->name, M_MOUNT); 914 opt->name = strdup("nonoexec", M_MOUNT); 915 } 916 else if (strcmp(opt->name, "nosuid") == 0) 917 fsflags |= MNT_NOSUID; 918 else if (strcmp(opt->name, "suid") == 0) { 919 free(opt->name, M_MOUNT); 920 opt->name = strdup("nonosuid", 
M_MOUNT); 921 } 922 else if (strcmp(opt->name, "nosymfollow") == 0) 923 fsflags |= MNT_NOSYMFOLLOW; 924 else if (strcmp(opt->name, "symfollow") == 0) { 925 free(opt->name, M_MOUNT); 926 opt->name = strdup("nonosymfollow", M_MOUNT); 927 } 928 else if (strcmp(opt->name, "noro") == 0) { 929 fsflags &= ~MNT_RDONLY; 930 autoro = false; 931 } 932 else if (strcmp(opt->name, "rw") == 0) { 933 fsflags &= ~MNT_RDONLY; 934 autoro = false; 935 } 936 else if (strcmp(opt->name, "ro") == 0) { 937 fsflags |= MNT_RDONLY; 938 autoro = false; 939 } 940 else if (strcmp(opt->name, "rdonly") == 0) { 941 free(opt->name, M_MOUNT); 942 opt->name = strdup("ro", M_MOUNT); 943 fsflags |= MNT_RDONLY; 944 autoro = false; 945 } 946 else if (strcmp(opt->name, "autoro") == 0) { 947 do_freeopt = 1; 948 autoro = true; 949 } 950 else if (strcmp(opt->name, "suiddir") == 0) 951 fsflags |= MNT_SUIDDIR; 952 else if (strcmp(opt->name, "sync") == 0) 953 fsflags |= MNT_SYNCHRONOUS; 954 else if (strcmp(opt->name, "union") == 0) 955 fsflags |= MNT_UNION; 956 else if (strcmp(opt->name, "export") == 0) { 957 fsflags |= MNT_EXPORTED; 958 jail_export = true; 959 } else if (strcmp(opt->name, "automounted") == 0) { 960 fsflags |= MNT_AUTOMOUNTED; 961 do_freeopt = 1; 962 } else if (strcmp(opt->name, "nocover") == 0) { 963 fsflags |= MNT_NOCOVER; 964 do_freeopt = 1; 965 } else if (strcmp(opt->name, "cover") == 0) { 966 fsflags &= ~MNT_NOCOVER; 967 do_freeopt = 1; 968 } else if (strcmp(opt->name, "emptydir") == 0) { 969 fsflags |= MNT_EMPTYDIR; 970 do_freeopt = 1; 971 } else if (strcmp(opt->name, "noemptydir") == 0) { 972 fsflags &= ~MNT_EMPTYDIR; 973 do_freeopt = 1; 974 } 975 if (do_freeopt) 976 vfs_freeopt(optlist, opt); 977 } 978 979 /* 980 * Be ultra-paranoid about making sure the type and fspath 981 * variables will fit in our mp buffers, including the 982 * terminating NUL. 
983 */ 984 if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) { 985 error = ENAMETOOLONG; 986 goto bail; 987 } 988 989 /* 990 * If has_nonexport is true or the caller is not running within a 991 * vnet prison that can run mountd(8), set jail_export false. 992 */ 993 if (has_nonexport || !jailed(td->td_ucred) || 994 !prison_check_nfsd(td->td_ucred)) 995 jail_export = false; 996 997 error = vfs_domount(td, fstype, fspath, fsflags, jail_export, &optlist); 998 if (error == ENODEV) { 999 error = EINVAL; 1000 if (errmsg != NULL) 1001 strncpy(errmsg, "Invalid fstype", errmsg_len); 1002 goto bail; 1003 } 1004 1005 /* 1006 * See if we can mount in the read-only mode if the error code suggests 1007 * that it could be possible and the mount options allow for that. 1008 * Never try it if "[no]{ro|rw}" has been explicitly requested and not 1009 * overridden by "autoro". 1010 */ 1011 if (autoro && vfs_should_downgrade_to_ro_mount(fsflags, error)) { 1012 printf("%s: R/W mount failed, possibly R/O media," 1013 " trying R/O mount\n", __func__); 1014 fsflags |= MNT_RDONLY; 1015 error = vfs_domount(td, fstype, fspath, fsflags, jail_export, 1016 &optlist); 1017 } 1018 bail: 1019 /* copyout the errmsg */ 1020 if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt) 1021 && errmsg_len > 0 && errmsg != NULL) { 1022 if (fsoptions->uio_segflg == UIO_SYSSPACE) { 1023 bcopy(errmsg, 1024 fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, 1025 fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); 1026 } else { 1027 (void)copyout(errmsg, 1028 fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, 1029 fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); 1030 } 1031 } 1032 1033 if (optlist != NULL) 1034 vfs_freeopts(optlist); 1035 return (error); 1036 } 1037 1038 /* 1039 * Old mount API. 
1040 */ 1041 #ifndef _SYS_SYSPROTO_H_ 1042 struct mount_args { 1043 char *type; 1044 char *path; 1045 int flags; 1046 caddr_t data; 1047 }; 1048 #endif 1049 /* ARGSUSED */ 1050 int 1051 sys_mount(struct thread *td, struct mount_args *uap) 1052 { 1053 char *fstype; 1054 struct vfsconf *vfsp = NULL; 1055 struct mntarg *ma = NULL; 1056 uint64_t flags; 1057 int error; 1058 1059 /* 1060 * Mount flags are now 64-bits. On 32-bit architectures only 1061 * 32-bits are passed in, but from here on everything handles 1062 * 64-bit flags correctly. 1063 */ 1064 flags = uap->flags; 1065 1066 AUDIT_ARG_FFLAGS(flags); 1067 1068 /* 1069 * Filter out MNT_ROOTFS. We do not want clients of mount() in 1070 * userspace to set this flag, but we must filter it out if we want 1071 * MNT_UPDATE on the root file system to work. 1072 * MNT_ROOTFS should only be set by the kernel when mounting its 1073 * root file system. 1074 */ 1075 flags &= ~MNT_ROOTFS; 1076 1077 fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); 1078 error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL); 1079 if (error) { 1080 free(fstype, M_TEMP); 1081 return (error); 1082 } 1083 1084 AUDIT_ARG_TEXT(fstype); 1085 vfsp = vfs_byname_kld(fstype, td, &error); 1086 free(fstype, M_TEMP); 1087 if (vfsp == NULL) 1088 return (EINVAL); 1089 if (((vfsp->vfc_flags & VFCF_SBDRY) != 0 && 1090 vfsp->vfc_vfsops_sd->vfs_cmount == NULL) || 1091 ((vfsp->vfc_flags & VFCF_SBDRY) == 0 && 1092 vfsp->vfc_vfsops->vfs_cmount == NULL)) 1093 return (EOPNOTSUPP); 1094 1095 ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN); 1096 ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN); 1097 ma = mount_argb(ma, flags & MNT_RDONLY, "noro"); 1098 ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid"); 1099 ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec"); 1100 1101 if ((vfsp->vfc_flags & VFCF_SBDRY) != 0) 1102 return (vfsp->vfc_vfsops_sd->vfs_cmount(ma, uap->data, flags)); 1103 return (vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags)); 1104 } 1105 1106 
/*
 * vfs_domount_first(): first file system mount (not update)
 *
 * Expects vp to be exclusively locked (asserted) and MNT_UPDATE to be
 * clear in fsflags (asserted).  On every failure path the caller's hold on
 * vp is released here: vput() while the vnode is still locked, vrele()
 * once it has been unlocked.  On success the new mount has been appended
 * to the global mount list, *optlist has been set to NULL because the
 * options now belong to the mount, vp is left unlocked and 0 is returned.
 */
static int
vfs_domount_first(
	struct thread *td,		/* Calling thread. */
	struct vfsconf *vfsp,		/* File system type. */
	char *fspath,			/* Mount path. */
	struct vnode *vp,		/* Vnode to be covered. */
	uint64_t fsflags,		/* Flags common to all filesystems. */
	struct vfsoptlist **optlist	/* Options local to the filesystem. */
	)
{
	struct vattr va;
	struct mount *mp;
	struct vnode *newdp, *rootvp;
	int error, error1;
	bool unmounted;

	ASSERT_VOP_ELOCKED(vp, __func__);
	KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here"));

	/*
	 * If the jail of the calling thread lacks permission for this type of
	 * file system, or is trying to cover its own root, deny immediately.
	 */
	if (jailed(td->td_ucred) && (!prison_allow(td->td_ucred,
	    vfsp->vfc_prison_flag) || vp == td->td_ucred->cr_prison->pr_root)) {
		vput(vp);
		return (EPERM);
	}

	/*
	 * If the user is not root, ensure that they own the directory
	 * onto which we are attempting to mount.
	 */
	error = VOP_GETATTR(vp, &va, td->td_ucred);
	if (error == 0 && va.va_uid != td->td_ucred->cr_uid)
		error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN);
	if (error == 0)
		error = vinvalbuf(vp, V_SAVE, 0, 0);
	/*
	 * Validate the type of the vnode being covered: file systems flagged
	 * VFCF_FILEMOUNT accept a directory or a regular file, all others
	 * require a directory.
	 */
	if (vfsp->vfc_flags & VFCF_FILEMOUNT) {
		if (error == 0 && vp->v_type != VDIR && vp->v_type != VREG)
			error = EINVAL;
		/*
		 * For file mounts, ensure that there is only one hardlink to the file.
		 */
		if (error == 0 && vp->v_type == VREG && va.va_nlink != 1)
			error = EINVAL;
	} else {
		if (error == 0 && vp->v_type != VDIR)
			error = ENOTDIR;
	}
	if (error == 0 && (fsflags & MNT_EMPTYDIR) != 0)
		error = vn_dir_check_empty(vp);
	if (error == 0) {
		/*
		 * Claim the vnode for this mount attempt.  VI_MOUNT already
		 * set, or something already mounted here, means EBUSY.
		 */
		VI_LOCK(vp);
		if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
			vp->v_iflag |= VI_MOUNT;
		else
			error = EBUSY;
		VI_UNLOCK(vp);
	}
	if (error != 0) {
		vput(vp);
		return (error);
	}
	vn_seqc_write_begin(vp);
	VOP_UNLOCK(vp);

	/* Allocate and initialize the filesystem. */
	mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred);
	/* XXXMAC: pass to vfs_mount_alloc? */
	mp->mnt_optnew = *optlist;
	/* Set the mount level flags. */
	mp->mnt_flag = (fsflags &
	    (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY | MNT_FORCE));

	/*
	 * Mount the filesystem.
	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
	 * get.  No freeing of cn_pnbuf.
	 */
	error1 = 0;
	unmounted = true;
	if ((error = VFS_MOUNT(mp)) != 0 ||
	    (error1 = VFS_STATFS(mp, &mp->mnt_stat)) != 0 ||
	    (error1 = VFS_ROOT(mp, LK_EXCLUSIVE, &newdp)) != 0) {
		rootvp = NULL;
		/*
		 * error1 != 0 means VFS_MOUNT() itself succeeded but one of
		 * the follow-up calls failed, so the file system has to be
		 * unmounted again before the mount structure can go away.
		 */
		if (error1 != 0) {
			MPASS(error == 0);
			rootvp = vfs_cache_root_clear(mp);
			if (rootvp != NULL) {
				/* Trade the use reference for a hold. */
				vhold(rootvp);
				vrele(rootvp);
			}
			(void)vn_start_write(NULL, &mp, V_WAIT);
			MNT_ILOCK(mp);
			mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_UNMOUNTF;
			MNT_IUNLOCK(mp);
			VFS_PURGE(mp);
			error = VFS_UNMOUNT(mp, 0);
			vn_finished_write(mp);
			if (error != 0) {
				printf(
		    "failed post-mount (%d): rollback unmount returned %d\n",
				    error1, error);
				/* Rollback failed: mp must not be destroyed. */
				unmounted = false;
			}
			/* Report the original post-mount failure. */
			error = error1;
		}
		vfs_unbusy(mp);
		mp->mnt_vnodecovered = NULL;
		if (unmounted) {
			/* XXXKIB wait for mnt_lockref drain? */
			vfs_mount_destroy(mp);
		}
		VI_LOCK(vp);
		vp->v_iflag &= ~VI_MOUNT;
		VI_UNLOCK(vp);
		if (rootvp != NULL) {
			vn_seqc_write_end(rootvp);
			vdrop(rootvp);
		}
		vn_seqc_write_end(vp);
		vrele(vp);
		return (error);
	}
	vn_seqc_write_begin(newdp);
	VOP_UNLOCK(newdp);

	/* The mount succeeded: the new options become the active ones. */
	if (mp->mnt_opt != NULL)
		vfs_freeopts(mp->mnt_opt);
	mp->mnt_opt = mp->mnt_optnew;
	*optlist = NULL;

	/*
	 * Prevent external consumers of mount options from reading mnt_optnew.
	 */
	mp->mnt_optnew = NULL;

	/* MNTK_ASYNC follows MNT_ASYNC unless MNTK_NOASYNC is set. */
	MNT_ILOCK(mp);
	if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
	    (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
		mp->mnt_kern_flag |= MNTK_ASYNC;
	else
		mp->mnt_kern_flag &= ~MNTK_ASYNC;
	MNT_IUNLOCK(mp);

	/*
	 * VIRF_MOUNTPOINT and v_mountedhere need to be set under the
	 * vp lock to satisfy vfs_lookup() requirements.
	 */
	VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY);
	VI_LOCK(vp);
	vn_irflag_set_locked(vp, VIRF_MOUNTPOINT);
	vp->v_mountedhere = mp;
	VI_UNLOCK(vp);
	VOP_UNLOCK(vp);
	cache_purge(vp);

	/*
	 * We need to lock both vnodes.
	 *
	 * Use vn_lock_pair to avoid establishing an ordering between vnodes
	 * from different filesystems.
	 */
	vn_lock_pair(vp, false, LK_EXCLUSIVE, newdp, false, LK_EXCLUSIVE);

	VI_LOCK(vp);
	vp->v_iflag &= ~VI_MOUNT;
	VI_UNLOCK(vp);
	/* Place the new filesystem at the end of the mount list. */
	mtx_lock(&mountlist_mtx);
	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
	mtx_unlock(&mountlist_mtx);
	vfs_event_signal(NULL, VQ_MOUNT, 0);
	VOP_UNLOCK(vp);
	EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td);
	VOP_UNLOCK(newdp);
	mount_devctl_event("MOUNT", mp, false);
	mountcheckdirs(vp, newdp);
	vn_seqc_write_end(vp);
	vn_seqc_write_end(newdp);
	vrele(newdp);
	/* Only a writable mount gets a syncer vnode. */
	if ((mp->mnt_flag & MNT_RDONLY) == 0)
		vfs_allocate_syncvnode(mp);
	vfs_op_exit(mp);
	vfs_unbusy(mp);
	return (0);
}

/*
 * vfs_domount_update(): update of mounted file system
 *
 * Expects vp to be exclusively locked (asserted) and MNT_UPDATE to be set
 * in fsflags (asserted).  vp must be the root vnode of the mount being
 * updated, otherwise EXDEV (an "export" option of the expected size was
 * supplied) or EINVAL is returned.  The caller's hold on vp is released on
 * every path, by vput() before the vnode is unlocked and by vrele()
 * afterwards.
 *
 * Returns the error from the update itself if there is one, otherwise the
 * result of processing the "export" option.  On success *optlist is set to
 * NULL because the merged options now belong to the mount.
 */
static int
vfs_domount_update(
	struct thread *td,		/* Calling thread. */
	struct vnode *vp,		/* Mount point vnode. */
	uint64_t fsflags,		/* Flags common to all filesystems. */
	bool jail_export,		/* Got export option in vnet prison. */
	struct vfsoptlist **optlist	/* Options local to the filesystem. */
	)
{
	struct export_args export;
	struct o2export_args o2export;
	struct vnode *rootvp;
	void *bufp;
	struct mount *mp;
	int error, export_error, i, len, fsid_up_len;
	uint64_t flag, mnt_union;
	gid_t *grps;
	fsid_t *fsid_up;
	bool vfs_suser_failed;

	ASSERT_VOP_ELOCKED(vp, __func__);
	KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here"));
	mp = vp->v_mount;

	/* Updates must name the root vnode of the mounted file system. */
	if ((vp->v_vflag & VV_ROOT) == 0) {
		if (vfs_copyopt(*optlist, "export", &export, sizeof(export))
		    == 0)
			error = EXDEV;
		else
			error = EINVAL;
		vput(vp);
		return (error);
	}

	/*
	 * We only allow the filesystem to be reloaded if it
	 * is currently mounted read-only.
	 */
	flag = mp->mnt_flag;	/* Saved so a failed update can be undone. */
	if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) {
		vput(vp);
		return (EOPNOTSUPP);	/* Needs translation */
	}
	/*
	 * Only privileged root, or (if MNT_USER is set) the user that
	 * did the original mount is permitted to update it.
	 */
	/*
	 * For the case of mountd(8) doing exports in a jail, the vfs_suser()
	 * call does not cause failure.  vfs_domount() has already checked
	 * that "root" is doing this and vfs_suser() will fail when
	 * the file system has been mounted outside the jail.
	 * jail_export set true indicates that "export" is not mixed
	 * with other options that change mount behaviour.
	 */
	vfs_suser_failed = false;
	error = vfs_suser(mp, td);
	if (jail_export && error != 0) {
		error = 0;
		vfs_suser_failed = true;
	}
	if (error != 0) {
		vput(vp);
		return (error);
	}
	if (vfs_busy(mp, MBF_NOWAIT)) {
		vput(vp);
		return (EBUSY);
	}
	/* Refuse if another mount operation already claimed this vnode. */
	VI_LOCK(vp);
	if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) {
		VI_UNLOCK(vp);
		vfs_unbusy(mp);
		vput(vp);
		return (EBUSY);
	}
	vp->v_iflag |= VI_MOUNT;
	VI_UNLOCK(vp);
	VOP_UNLOCK(vp);

	/* From here on all exits go through the "end" label. */
	rootvp = NULL;
	vfs_op_enter(mp);
	vn_seqc_write_begin(vp);

	/*
	 * An optional "fsid" option must have the right size and match the
	 * file system being updated; it is consumed here.
	 */
	if (vfs_getopt(*optlist, "fsid", (void **)&fsid_up,
	    &fsid_up_len) == 0) {
		if (fsid_up_len != sizeof(*fsid_up)) {
			error = EINVAL;
			goto end;
		}
		if (fsidcmp(fsid_up, &mp->mnt_stat.f_fsid) != 0) {
			error = ENOENT;
			goto end;
		}
		vfs_deleteopt(*optlist, "fsid");
	}

	mnt_union = 0;
	MNT_ILOCK(mp);
	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
		MNT_IUNLOCK(mp);
		error = EBUSY;
		goto end;
	}
	if (vfs_suser_failed) {
		KASSERT((fsflags & (MNT_EXPORTED | MNT_UPDATE)) ==
		    (MNT_EXPORTED | MNT_UPDATE),
		    ("%s: jailed export did not set expected fsflags",
		     __func__));
		/*
		 * For this case, only MNT_UPDATE and
		 * MNT_EXPORTED have been set in fsflags
		 * by the options.  Only set MNT_UPDATE,
		 * since that is the one that would be set
		 * when set in fsflags, below.
		 */
		mp->mnt_flag |= MNT_UPDATE;
	} else {
		mp->mnt_flag &= ~MNT_UPDATEMASK;
		/*
		 * A newly requested MNT_UNION is held back in mnt_union and
		 * only applied once the update has succeeded.
		 */
		if ((mp->mnt_flag & MNT_UNION) == 0 &&
		    (fsflags & MNT_UNION) != 0) {
			fsflags &= ~MNT_UNION;
			mnt_union = MNT_UNION;
		}
		mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE |
		    MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY);
		if ((mp->mnt_flag & MNT_ASYNC) == 0)
			mp->mnt_kern_flag &= ~MNTK_ASYNC;
	}
	rootvp = vfs_cache_root_clear(mp);
	MNT_IUNLOCK(mp);
	mp->mnt_optnew = *optlist;
	vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);

	/*
	 * Mount the filesystem.
	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
	 * get.  No freeing of cn_pnbuf.
	 */
	/*
	 * For the case of mountd(8) doing exports from within a vnet jail,
	 * "from" is typically not set correctly such that VFS_MOUNT() will
	 * return ENOENT. It is not obvious that VFS_MOUNT() ever needs to be
	 * called when mountd is doing exports, but this check only applies to
	 * the specific case where it is running inside a vnet jail, to
	 * avoid any POLA violation.
	 */
	error = 0;
	if (!jail_export)
		error = VFS_MOUNT(mp);

	export_error = 0;
	/* Process the export option. */
	if (error == 0 && vfs_getopt(mp->mnt_optnew, "export", &bufp,
	    &len) == 0) {
		/* Assume that there is only 1 ABI for each length. */
		switch (len) {
		case (sizeof(struct oexport_args)):
			/* Zero first: the bcopy below fills only len bytes. */
			bzero(&o2export, sizeof(o2export));
			/* FALLTHROUGH */
		case (sizeof(o2export)):
			/* Older layouts: convert field by field. */
			bcopy(bufp, &o2export, len);
			export.ex_flags = (uint64_t)o2export.ex_flags;
			export.ex_root = o2export.ex_root;
			export.ex_uid = o2export.ex_anon.cr_uid;
			export.ex_groups = NULL;
			export.ex_ngroups = o2export.ex_anon.cr_ngroups;
			if (export.ex_ngroups > 0) {
				if (export.ex_ngroups <= XU_NGROUPS) {
					export.ex_groups = malloc(
					    export.ex_ngroups * sizeof(gid_t),
					    M_TEMP, M_WAITOK);
					for (i = 0; i < export.ex_ngroups; i++)
						export.ex_groups[i] =
						  o2export.ex_anon.cr_groups[i];
				} else
					export_error = EINVAL;
			} else if (export.ex_ngroups < 0)
				export_error = EINVAL;
			export.ex_addr = o2export.ex_addr;
			export.ex_addrlen = o2export.ex_addrlen;
			export.ex_mask = o2export.ex_mask;
			export.ex_masklen = o2export.ex_masklen;
			export.ex_indexfile = o2export.ex_indexfile;
			export.ex_numsecflavors = o2export.ex_numsecflavors;
			if (export.ex_numsecflavors < MAXSECFLAVORS) {
				for (i = 0; i < export.ex_numsecflavors; i++)
					export.ex_secflavors[i] =
					    o2export.ex_secflavors[i];
			} else
				export_error = EINVAL;
			if (export_error == 0)
				export_error = vfs_export(mp, &export, true);
			free(export.ex_groups, M_TEMP);
			break;
		case (sizeof(export)):
			/*
			 * Current layout: the group list is a pointer that
			 * still has to be fetched with copyin().
			 */
			bcopy(bufp, &export, len);
			grps = NULL;
			if (export.ex_ngroups > 0) {
				if (export.ex_ngroups <= ngroups_max + 1) {
					grps = malloc(export.ex_ngroups *
					    sizeof(gid_t), M_TEMP, M_WAITOK);
					export_error = copyin(export.ex_groups,
					    grps, export.ex_ngroups *
					    sizeof(gid_t));
					if (export_error == 0)
						export.ex_groups = grps;
				} else
					export_error = EINVAL;
			} else if (export.ex_ngroups == 0)
				export.ex_groups = NULL;
			else
				export_error = EINVAL;
			if (export_error == 0)
				export_error = vfs_export(mp, &export, true);
			free(grps, M_TEMP);
			break;
		default:
			export_error = EINVAL;
			break;
		}
	}

	MNT_ILOCK(mp);
	if (error == 0) {
		/* Clear the transient request flags, apply a deferred union. */
		mp->mnt_flag &=	~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE |
		    MNT_SNAPSHOT);
		mp->mnt_flag |= mnt_union;
	} else {
		/*
		 * If we fail, restore old mount flags. MNT_QUOTA is special,
		 * because it is not part of MNT_UPDATEMASK, but it could have
		 * changed in the meantime if quotactl(2) was called.
		 * All in all we want current value of MNT_QUOTA, not the old
		 * one.
		 */
		mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
	}
	if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
	    (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
		mp->mnt_kern_flag |= MNTK_ASYNC;
	else
		mp->mnt_kern_flag &= ~MNTK_ASYNC;
	MNT_IUNLOCK(mp);

	if (error != 0)
		goto end;

	mount_devctl_event("REMOUNT", mp, true);
	if (mp->mnt_opt != NULL)
		vfs_freeopts(mp->mnt_opt);
	mp->mnt_opt = mp->mnt_optnew;
	*optlist = NULL;
	(void)VFS_STATFS(mp, &mp->mnt_stat);
	/*
	 * Prevent external consumers of mount options from reading
	 * mnt_optnew.
	 */
	mp->mnt_optnew = NULL;

	/* Keep the syncer vnode in step with the read-only state. */
	if ((mp->mnt_flag & MNT_RDONLY) == 0)
		vfs_allocate_syncvnode(mp);
	else
		vfs_deallocate_syncvnode(mp);
end:
	vfs_op_exit(mp);
	if (rootvp != NULL) {
		vn_seqc_write_end(rootvp);
		vrele(rootvp);
	}
	vn_seqc_write_end(vp);
	vfs_unbusy(mp);
	VI_LOCK(vp);
	vp->v_iflag &= ~VI_MOUNT;
	VI_UNLOCK(vp);
	vrele(vp);
	/* A failure of the update itself takes precedence. */
	return (error != 0 ? error : export_error);
}

/*
 * vfs_domount(): actually attempt a filesystem mount.
 *
 * Validates the name lengths and the caller's privileges, resolves the
 * file system type for a new mount, looks up fspath and then dispatches to
 * vfs_domount_first() or vfs_domount_update() depending on MNT_UPDATE.
 * Returns 0 or an errno value.
 */
static int
vfs_domount(
	struct thread *td,		/* Calling thread. */
	const char *fstype,		/* Filesystem type. */
	char *fspath,			/* Mount path. */
	uint64_t fsflags,		/* Flags common to all filesystems. */
	bool jail_export,		/* Got export option in vnet prison. */
	struct vfsoptlist **optlist	/* Options local to the filesystem. */
	)
{
	struct vfsconf *vfsp;
	struct nameidata nd;
	struct vnode *vp;
	char *pathbuf;
	int error;

	/*
	 * Be ultra-paranoid about making sure the type and fspath
	 * variables will fit in our mp buffers, including the
	 * terminating NUL.
	 */
	if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
		return (ENAMETOOLONG);

	/*
	 * A jailed export needs PRIV_NFS_DAEMON; otherwise PRIV_VFS_MOUNT is
	 * required unless the caller is unjailed and vfs.usermount is set.
	 */
	if (jail_export) {
		error = priv_check(td, PRIV_NFS_DAEMON);
		if (error)
			return (error);
	} else if (jailed(td->td_ucred) || usermount == 0) {
		if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
			return (error);
	}

	/*
	 * Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
	 */
	if (fsflags & MNT_EXPORTED) {
		error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
		if (error)
			return (error);
	}
	if (fsflags & MNT_SUIDDIR) {
		error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
		if (error)
			return (error);
	}
	/*
	 * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
	 */
	if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
		if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
			fsflags |= MNT_NOSUID | MNT_USER;
	}

	/* Load KLDs before we lock the covered vnode to avoid reversals. */
	vfsp = NULL;
	if ((fsflags & MNT_UPDATE) == 0) {
		/* Don't try to load KLDs if we're mounting the root. */
		if (fsflags & MNT_ROOTFS) {
			if ((vfsp = vfs_byname(fstype)) == NULL)
				return (ENODEV);
		} else {
			if ((vfsp = vfs_byname_kld(fstype, td, &error)) == NULL)
				return (error);
		}
	}

	/*
	 * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE.
	 */
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1 | WANTPARENT,
	    UIO_SYSSPACE, fspath);
	error = namei(&nd);
	if (error != 0)
		return (error);
	vp = nd.ni_vp;
	/*
	 * Don't allow stacking file mounts to work around problems with the way
	 * that namei sets nd.ni_dvp to vp_crossmp for these.
	 */
	if (vp->v_type == VREG)
		fsflags |= MNT_NOCOVER;
	if ((fsflags & MNT_UPDATE) == 0) {
		/* MNT_NOCOVER: refuse to mount over an existing mount root. */
		if ((vp->v_vflag & VV_ROOT) != 0 &&
		    (fsflags & MNT_NOCOVER) != 0) {
			vput(vp);
			error = EBUSY;
			goto out;
		}
		/* fspath fits: its length was checked against MNAMELEN. */
		pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
		strcpy(pathbuf, fspath);
		/*
		 * Note: we allow any vnode type here. If the path sanity check
		 * succeeds, the type will be validated in vfs_domount_first
		 * above.
		 */
		if (vp->v_type == VDIR)
			error = vn_path_to_global_path(td, vp, pathbuf,
			    MNAMELEN);
		else
			error = vn_path_to_global_path_hardlink(td, vp,
			    nd.ni_dvp, pathbuf, MNAMELEN,
			    nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen);
		if (error == 0) {
			error = vfs_domount_first(td, vfsp, pathbuf, vp,
			    fsflags, optlist);
		}
		free(pathbuf, M_TEMP);
	} else
		error = vfs_domount_update(td, vp, fsflags, jail_export,
		    optlist);

out:
	/* The lookup asked for the parent (WANTPARENT); release it here. */
	NDFREE_PNBUF(&nd);
	vrele(nd.ni_dvp);

	return (error);
}

/*
 * Unmount a filesystem.
 *
 * Note: unmount takes a path to the vnode mounted on as argument, not
 * special file (as before).
1707 */ 1708 #ifndef _SYS_SYSPROTO_H_ 1709 struct unmount_args { 1710 char *path; 1711 int flags; 1712 }; 1713 #endif 1714 /* ARGSUSED */ 1715 int 1716 sys_unmount(struct thread *td, struct unmount_args *uap) 1717 { 1718 1719 return (kern_unmount(td, uap->path, uap->flags)); 1720 } 1721 1722 int 1723 kern_unmount(struct thread *td, const char *path, int flags) 1724 { 1725 struct nameidata nd; 1726 struct mount *mp; 1727 char *fsidbuf, *pathbuf; 1728 fsid_t fsid; 1729 int error; 1730 1731 AUDIT_ARG_VALUE(flags); 1732 if (jailed(td->td_ucred) || usermount == 0) { 1733 error = priv_check(td, PRIV_VFS_UNMOUNT); 1734 if (error) 1735 return (error); 1736 } 1737 1738 if (flags & MNT_BYFSID) { 1739 fsidbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); 1740 error = copyinstr(path, fsidbuf, MNAMELEN, NULL); 1741 if (error) { 1742 free(fsidbuf, M_TEMP); 1743 return (error); 1744 } 1745 1746 AUDIT_ARG_TEXT(fsidbuf); 1747 /* Decode the filesystem ID. */ 1748 if (sscanf(fsidbuf, "FSID:%d:%d", &fsid.val[0], &fsid.val[1]) != 2) { 1749 free(fsidbuf, M_TEMP); 1750 return (EINVAL); 1751 } 1752 1753 mp = vfs_getvfs(&fsid); 1754 free(fsidbuf, M_TEMP); 1755 if (mp == NULL) { 1756 return (ENOENT); 1757 } 1758 } else { 1759 pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); 1760 error = copyinstr(path, pathbuf, MNAMELEN, NULL); 1761 if (error) { 1762 free(pathbuf, M_TEMP); 1763 return (error); 1764 } 1765 1766 /* 1767 * Try to find global path for path argument. 
1768 */ 1769 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 1770 UIO_SYSSPACE, pathbuf); 1771 if (namei(&nd) == 0) { 1772 NDFREE_PNBUF(&nd); 1773 error = vn_path_to_global_path(td, nd.ni_vp, pathbuf, 1774 MNAMELEN); 1775 if (error == 0) 1776 vput(nd.ni_vp); 1777 } 1778 mtx_lock(&mountlist_mtx); 1779 TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { 1780 if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) { 1781 vfs_ref(mp); 1782 break; 1783 } 1784 } 1785 mtx_unlock(&mountlist_mtx); 1786 free(pathbuf, M_TEMP); 1787 if (mp == NULL) { 1788 /* 1789 * Previously we returned ENOENT for a nonexistent path and 1790 * EINVAL for a non-mountpoint. We cannot tell these apart 1791 * now, so in the !MNT_BYFSID case return the more likely 1792 * EINVAL for compatibility. 1793 */ 1794 return (EINVAL); 1795 } 1796 } 1797 1798 /* 1799 * Don't allow unmounting the root filesystem. 1800 */ 1801 if (mp->mnt_flag & MNT_ROOTFS) { 1802 vfs_rel(mp); 1803 return (EINVAL); 1804 } 1805 error = dounmount(mp, flags, td); 1806 return (error); 1807 } 1808 1809 /* 1810 * Return error if any of the vnodes, ignoring the root vnode 1811 * and the syncer vnode, have non-zero usecount. 1812 * 1813 * This function is purely advisory - it can return false positives 1814 * and negatives. 
1815 */ 1816 static int 1817 vfs_check_usecounts(struct mount *mp) 1818 { 1819 struct vnode *vp, *mvp; 1820 1821 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 1822 if ((vp->v_vflag & VV_ROOT) == 0 && vp->v_type != VNON && 1823 vp->v_usecount != 0) { 1824 VI_UNLOCK(vp); 1825 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 1826 return (EBUSY); 1827 } 1828 VI_UNLOCK(vp); 1829 } 1830 1831 return (0); 1832 } 1833 1834 static void 1835 dounmount_cleanup(struct mount *mp, struct vnode *coveredvp, int mntkflags) 1836 { 1837 1838 mtx_assert(MNT_MTX(mp), MA_OWNED); 1839 mp->mnt_kern_flag &= ~mntkflags; 1840 if ((mp->mnt_kern_flag & MNTK_MWAIT) != 0) { 1841 mp->mnt_kern_flag &= ~MNTK_MWAIT; 1842 wakeup(mp); 1843 } 1844 vfs_op_exit_locked(mp); 1845 MNT_IUNLOCK(mp); 1846 if (coveredvp != NULL) { 1847 VOP_UNLOCK(coveredvp); 1848 vdrop(coveredvp); 1849 } 1850 vn_finished_write(mp); 1851 vfs_rel(mp); 1852 } 1853 1854 /* 1855 * There are various reference counters associated with the mount point. 1856 * Normally it is permitted to modify them without taking the mnt ilock, 1857 * but this behavior can be temporarily disabled if stable value is needed 1858 * or callers are expected to block (e.g. to not allow new users during 1859 * forced unmount). 
*/
void
vfs_op_enter(struct mount *mp)
{
	struct mount_pcpu *mpcpu;
	int cpu;

	MNT_ILOCK(mp);
	mp->mnt_vfs_ops++;
	/* Only the first entrant has to collect the per-CPU counters. */
	if (mp->mnt_vfs_ops > 1) {
		MNT_IUNLOCK(mp);
		return;
	}
	vfs_op_barrier_wait(mp);
	/* Fold every CPU's private counts into the mount's own fields. */
	CPU_FOREACH(cpu) {
		mpcpu = vfs_mount_pcpu_remote(mp, cpu);

		mp->mnt_ref += mpcpu->mntp_ref;
		mpcpu->mntp_ref = 0;

		mp->mnt_lockref += mpcpu->mntp_lockref;
		mpcpu->mntp_lockref = 0;

		mp->mnt_writeopcount += mpcpu->mntp_writeopcount;
		mpcpu->mntp_writeopcount = 0;
	}
	MPASSERT(mp->mnt_ref > 0 && mp->mnt_lockref >= 0 &&
	    mp->mnt_writeopcount >= 0, mp,
	    ("invalid count(s): ref %d lockref %d writeopcount %d",
	    mp->mnt_ref, mp->mnt_lockref, mp->mnt_writeopcount));
	MNT_IUNLOCK(mp);
	vfs_assert_mount_counters(mp);
}

/*
 * Leave a section entered with vfs_op_enter().  The caller must hold the
 * mount interlock (asserted).
 */
void
vfs_op_exit_locked(struct mount *mp)
{

	mtx_assert(MNT_MTX(mp), MA_OWNED);

	MPASSERT(mp->mnt_vfs_ops > 0, mp,
	    ("invalid vfs_ops count %d", mp->mnt_vfs_ops));
	MPASSERT(mp->mnt_vfs_ops > 1 ||
	    (mp->mnt_kern_flag & (MNTK_UNMOUNT | MNTK_SUSPEND)) == 0, mp,
	    ("vfs_ops too low %d in unmount or suspend", mp->mnt_vfs_ops));
	mp->mnt_vfs_ops--;
}

/*
 * Same as vfs_op_exit_locked(), but takes and releases the mount interlock
 * itself.
 */
void
vfs_op_exit(struct mount *mp)
{

	MNT_ILOCK(mp);
	vfs_op_exit_locked(mp);
	MNT_IUNLOCK(mp);
}

/*
 * Argument bundle for the rendezvous in vfs_op_barrier_wait().  The
 * callbacks are handed a pointer to srcra and recover the enclosing
 * structure, and with it mp, via __containerof().
 */
struct vfs_op_barrier_ipi {
	struct mount *mp;
	struct smp_rendezvous_cpus_retry_arg srcra;
};

/*
 * Rendezvous action callback: reports completion via
 * smp_rendezvous_cpus_done() unless vfs_op_thread_entered(mp) is true.
 */
static void
vfs_op_action_func(void *arg)
{
	struct vfs_op_barrier_ipi *vfsopipi;
	struct mount *mp;

	vfsopipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra);
	mp = vfsopipi->mp;

	if (!vfs_op_thread_entered(mp))
		smp_rendezvous_cpus_done(arg);
}

/*
 * Rendezvous wait callback: spin until the given CPU's
 * mntp_thread_in_ops for this mount reads as zero.
 */
static void
vfs_op_wait_func(void *arg, int cpu)
{
	struct vfs_op_barrier_ipi *vfsopipi;
	struct mount *mp;
	struct mount_pcpu *mpcpu;

	vfsopipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra);
	mp = vfsopipi->mp;

	mpcpu = vfs_mount_pcpu_remote(mp, cpu);
	while (atomic_load_int(&mpcpu->mntp_thread_in_ops))
		cpu_spinwait();
}

/*
 * Run the vfs_op action/wait callbacks across all CPUs for this mount
 * using smp_rendezvous_cpus_retry().
 */
void
vfs_op_barrier_wait(struct mount *mp)
{
	struct vfs_op_barrier_ipi vfsopipi;

	vfsopipi.mp = mp;

	smp_rendezvous_cpus_retry(all_cpus,
	    smp_no_rendezvous_barrier,
	    vfs_op_action_func,
	    smp_no_rendezvous_barrier,
	    vfs_op_wait_func,
	    &vfsopipi.srcra);
}

#ifdef DIAGNOSTIC
/*
 * While mnt_vfs_ops is non-zero all per-CPU counters are expected to be
 * zero; dump the counters (which panics) if any of them is not.
 */
void
vfs_assert_mount_counters(struct mount *mp)
{
	struct mount_pcpu *mpcpu;
	int cpu;

	if (mp->mnt_vfs_ops == 0)
		return;

	CPU_FOREACH(cpu) {
		mpcpu = vfs_mount_pcpu_remote(mp, cpu);
		if (mpcpu->mntp_ref != 0 ||
		    mpcpu->mntp_lockref != 0 ||
		    mpcpu->mntp_writeopcount != 0)
			vfs_dump_mount_counters(mp);
	}
}

/*
 * Print the per-CPU values of each counter, followed by a summary of the
 * value stored in struct mount next to the total including the per-CPU
 * parts, then panic.
 */
void
vfs_dump_mount_counters(struct mount *mp)
{
	struct mount_pcpu *mpcpu;
	int ref, lockref, writeopcount;
	int cpu;

	printf("%s: mp %p vfs_ops %d\n", __func__, mp, mp->mnt_vfs_ops);

	printf(" ref : ");
	ref = mp->mnt_ref;
	CPU_FOREACH(cpu) {
		mpcpu = vfs_mount_pcpu_remote(mp, cpu);
		printf("%d ", mpcpu->mntp_ref);
		ref += mpcpu->mntp_ref;
	}
	printf("\n");
	printf(" lockref : ");
	lockref = mp->mnt_lockref;
	CPU_FOREACH(cpu) {
		mpcpu = vfs_mount_pcpu_remote(mp, cpu);
		printf("%d ", mpcpu->mntp_lockref);
		lockref += mpcpu->mntp_lockref;
	}
	printf("\n");
	printf("writeopcount: ");
	writeopcount = mp->mnt_writeopcount;
	CPU_FOREACH(cpu) {
		mpcpu = vfs_mount_pcpu_remote(mp, cpu);
		printf("%d ", mpcpu->mntp_writeopcount);
		writeopcount += mpcpu->mntp_writeopcount;
	}
	printf("\n");

	printf("counter struct total\n");
	printf("ref %-5d %-5d\n", mp->mnt_ref, ref);
	printf("lockref %-5d %-5d\n", mp->mnt_lockref, lockref);
	printf("writeopcount %-5d %-5d\n", mp->mnt_writeopcount, writeopcount);

	panic("invalid counts on struct mount");
}
#endif

/*
 * Return the current value of one of the mount counters: the value stored
 * in struct mount plus the contribution of every CPU.  No lock is taken
 * here.
 */
int
vfs_mount_fetch_counter(struct mount *mp, enum mount_counter which)
{
	struct mount_pcpu *mpcpu;
	int cpu, sum;

	/* Start from the part kept in the mount structure itself. */
	switch (which) {
	case MNT_COUNT_REF:
		sum = mp->mnt_ref;
		break;
	case MNT_COUNT_LOCKREF:
		sum = mp->mnt_lockref;
		break;
	case MNT_COUNT_WRITEOPCOUNT:
		sum = mp->mnt_writeopcount;
		break;
	}

	/* Then add what each CPU holds privately. */
	CPU_FOREACH(cpu) {
		mpcpu = vfs_mount_pcpu_remote(mp, cpu);
		switch (which) {
		case MNT_COUNT_REF:
			sum += mpcpu->mntp_ref;
			break;
		case MNT_COUNT_LOCKREF:
			sum += mpcpu->mntp_lockref;
			break;
		case MNT_COUNT_WRITEOPCOUNT:
			sum += mpcpu->mntp_writeopcount;
			break;
		}
	}
	return (sum);
}

/*
 * Put mp on the deferred unmount list and schedule the deferred unmount
 * task with the given timeout.  A mount that already has MNT_DEFERRED set
 * in mnt_taskqueue_flags is queued again only when requeue is true.
 * Returns true if the mount was queued by this call.
 */
static bool
deferred_unmount_enqueue(struct mount *mp, uint64_t flags, bool requeue,
    int timeout_ticks)
{
	bool enqueued;

	enqueued = false;
	mtx_lock(&deferred_unmount_lock);
	if ((mp->mnt_taskqueue_flags & MNT_DEFERRED) == 0 || requeue) {
		mp->mnt_taskqueue_flags = flags | MNT_DEFERRED;
		STAILQ_INSERT_TAIL(&deferred_unmount_list, mp,
		    mnt_taskqueue_link);
		enqueued = true;
	}
	mtx_unlock(&deferred_unmount_lock);

	/* The task is scheduled after the list lock has been dropped. */
	if (enqueued) {
		taskqueue_enqueue_timeout(taskqueue_deferred_unmount,
		    &deferred_unmount_task, timeout_ticks);
	}

	return (enqueued);
}

/*
 * Taskqueue handler for processing async/recursive unmounts
 */
static void
vfs_deferred_unmount(void *argi __unused, int pending __unused)
{
	STAILQ_HEAD(, mount) local_unmounts;
	uint64_t flags;
	struct mount *mp, *tmp;
	int error;
	unsigned int retries;
	bool unmounted;

	/*
	 * Move everything queued so far onto a private list so that the
	 * unmounts themselves run without deferred_unmount_lock held.
	 */
	STAILQ_INIT(&local_unmounts);
	mtx_lock(&deferred_unmount_lock);
	STAILQ_CONCAT(&local_unmounts, &deferred_unmount_list);
	mtx_unlock(&deferred_unmount_lock);

	STAILQ_FOREACH_SAFE(mp, &local_unmounts, mnt_taskqueue_link, tmp) {
		flags = mp->mnt_taskqueue_flags;
		KASSERT((flags & MNT_DEFERRED) != 0,
		    ("taskqueue unmount without MNT_DEFERRED"));
		error = dounmount(mp, flags, curthread);
		if (error != 0) {
			/* MNTK_REFEXPIRE is treated as "already unmounted". */
			MNT_ILOCK(mp);
			unmounted = ((mp->mnt_kern_flag & MNTK_REFEXPIRE) != 0);
			MNT_IUNLOCK(mp);

			/*
			 * The deferred unmount thread is the only thread that
			 * modifies the retry counts, so locking/atomics aren't
			 * needed here.
			 */
			retries = (mp->mnt_unmount_retries)++;
			deferred_unmount_total_retries++;
			if (!unmounted && retries < deferred_unmount_retry_limit) {
				deferred_unmount_enqueue(mp, flags, true,
				    -deferred_unmount_retry_delay_hz);
			} else {
				if (retries >= deferred_unmount_retry_limit) {
					printf("giving up on deferred unmount "
					    "of %s after %d retries, error %d\n",
					    mp->mnt_stat.f_mntonname, retries, error);
				}
				/* Not retrying: drop the mount reference. */
				vfs_rel(mp);
			}
		}
	}
}

/*
 * Do the actual filesystem unmount.
 */
int
dounmount(struct mount *mp, uint64_t flags, struct thread *td)
{
	struct mount_upper_node *upper;
	struct vnode *coveredvp, *rootvp;
	int error;
	uint64_t async_flag;
	int mnt_gen_r;
	unsigned int retries;

	KASSERT((flags & MNT_DEFERRED) == 0 ||
	    (flags & (MNT_RECURSE | MNT_FORCE)) == (MNT_RECURSE | MNT_FORCE),
	    ("MNT_DEFERRED requires MNT_RECURSE | MNT_FORCE"));

	/*
	 * If the caller has explicitly requested the unmount to be handled by
	 * the taskqueue and we're not already in taskqueue context, queue
	 * up the unmount request and exit.
This is done prior to any 2157 * credential checks; MNT_DEFERRED should be used only for kernel- 2158 * initiated unmounts and will therefore be processed with the 2159 * (kernel) credentials of the taskqueue thread. Still, callers 2160 * should be sure this is the behavior they want. 2161 */ 2162 if ((flags & MNT_DEFERRED) != 0 && 2163 taskqueue_member(taskqueue_deferred_unmount, curthread) == 0) { 2164 if (!deferred_unmount_enqueue(mp, flags, false, 0)) 2165 vfs_rel(mp); 2166 return (EINPROGRESS); 2167 } 2168 2169 /* 2170 * Only privileged root, or (if MNT_USER is set) the user that did the 2171 * original mount is permitted to unmount this filesystem. 2172 * This check should be made prior to queueing up any recursive 2173 * unmounts of upper filesystems. Those unmounts will be executed 2174 * with kernel thread credentials and are expected to succeed, so 2175 * we must at least ensure the originating context has sufficient 2176 * privilege to unmount the base filesystem before proceeding with 2177 * the uppers. 2178 */ 2179 error = vfs_suser(mp, td); 2180 if (error != 0) { 2181 KASSERT((flags & MNT_DEFERRED) == 0, 2182 ("taskqueue unmount with insufficient privilege")); 2183 vfs_rel(mp); 2184 return (error); 2185 } 2186 2187 if (recursive_forced_unmount && ((flags & MNT_FORCE) != 0)) 2188 flags |= MNT_RECURSE; 2189 2190 if ((flags & MNT_RECURSE) != 0) { 2191 KASSERT((flags & MNT_FORCE) != 0, 2192 ("MNT_RECURSE requires MNT_FORCE")); 2193 2194 MNT_ILOCK(mp); 2195 /* 2196 * Set MNTK_RECURSE to prevent new upper mounts from being 2197 * added, and note that an operation on the uppers list is in 2198 * progress. This will ensure that unregistration from the 2199 * uppers list, and therefore any pending unmount of the upper 2200 * FS, can't complete until after we finish walking the list. 
2201 */ 2202 mp->mnt_kern_flag |= MNTK_RECURSE; 2203 mp->mnt_upper_pending++; 2204 TAILQ_FOREACH(upper, &mp->mnt_uppers, mnt_upper_link) { 2205 retries = upper->mp->mnt_unmount_retries; 2206 if (retries > deferred_unmount_retry_limit) { 2207 error = EBUSY; 2208 continue; 2209 } 2210 MNT_IUNLOCK(mp); 2211 2212 vfs_ref(upper->mp); 2213 if (!deferred_unmount_enqueue(upper->mp, flags, 2214 false, 0)) 2215 vfs_rel(upper->mp); 2216 MNT_ILOCK(mp); 2217 } 2218 mp->mnt_upper_pending--; 2219 if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && 2220 mp->mnt_upper_pending == 0) { 2221 mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; 2222 wakeup(&mp->mnt_uppers); 2223 } 2224 2225 /* 2226 * If we're not on the taskqueue, wait until the uppers list 2227 * is drained before proceeding with unmount. Otherwise, if 2228 * we are on the taskqueue and there are still pending uppers, 2229 * just re-enqueue on the end of the taskqueue. 2230 */ 2231 if ((flags & MNT_DEFERRED) == 0) { 2232 while (error == 0 && !TAILQ_EMPTY(&mp->mnt_uppers)) { 2233 mp->mnt_kern_flag |= MNTK_TASKQUEUE_WAITER; 2234 error = msleep(&mp->mnt_taskqueue_link, 2235 MNT_MTX(mp), PCATCH, "umntqw", 0); 2236 } 2237 if (error != 0) { 2238 MNT_REL(mp); 2239 MNT_IUNLOCK(mp); 2240 return (error); 2241 } 2242 } else if (!TAILQ_EMPTY(&mp->mnt_uppers)) { 2243 MNT_IUNLOCK(mp); 2244 if (error == 0) 2245 deferred_unmount_enqueue(mp, flags, true, 0); 2246 return (error); 2247 } 2248 MNT_IUNLOCK(mp); 2249 KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers not empty")); 2250 } 2251 2252 /* Allow the taskqueue to safely re-enqueue on failure */ 2253 if ((flags & MNT_DEFERRED) != 0) 2254 vfs_ref(mp); 2255 2256 if ((coveredvp = mp->mnt_vnodecovered) != NULL) { 2257 mnt_gen_r = mp->mnt_gen; 2258 VI_LOCK(coveredvp); 2259 vholdl(coveredvp); 2260 vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY); 2261 /* 2262 * Check for mp being unmounted while waiting for the 2263 * covered vnode lock. 
2264 */ 2265 if (coveredvp->v_mountedhere != mp || 2266 coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) { 2267 VOP_UNLOCK(coveredvp); 2268 vdrop(coveredvp); 2269 vfs_rel(mp); 2270 return (EBUSY); 2271 } 2272 } 2273 2274 vfs_op_enter(mp); 2275 2276 vn_start_write(NULL, &mp, V_WAIT); 2277 MNT_ILOCK(mp); 2278 if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 || 2279 (mp->mnt_flag & MNT_UPDATE) != 0 || 2280 !TAILQ_EMPTY(&mp->mnt_uppers)) { 2281 dounmount_cleanup(mp, coveredvp, 0); 2282 return (EBUSY); 2283 } 2284 mp->mnt_kern_flag |= MNTK_UNMOUNT; 2285 rootvp = vfs_cache_root_clear(mp); 2286 if (coveredvp != NULL) 2287 vn_seqc_write_begin(coveredvp); 2288 if (flags & MNT_NONBUSY) { 2289 MNT_IUNLOCK(mp); 2290 error = vfs_check_usecounts(mp); 2291 MNT_ILOCK(mp); 2292 if (error != 0) { 2293 vn_seqc_write_end(coveredvp); 2294 dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT); 2295 if (rootvp != NULL) { 2296 vn_seqc_write_end(rootvp); 2297 vrele(rootvp); 2298 } 2299 return (error); 2300 } 2301 } 2302 /* Allow filesystems to detect that a forced unmount is in progress. */ 2303 if (flags & MNT_FORCE) { 2304 mp->mnt_kern_flag |= MNTK_UNMOUNTF; 2305 MNT_IUNLOCK(mp); 2306 /* 2307 * Must be done after setting MNTK_UNMOUNTF and before 2308 * waiting for mnt_lockref to become 0. 2309 */ 2310 VFS_PURGE(mp); 2311 MNT_ILOCK(mp); 2312 } 2313 error = 0; 2314 if (mp->mnt_lockref) { 2315 mp->mnt_kern_flag |= MNTK_DRAINING; 2316 error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS, 2317 "mount drain", 0); 2318 } 2319 MNT_IUNLOCK(mp); 2320 KASSERT(mp->mnt_lockref == 0, 2321 ("%s: invalid lock refcount in the drain path @ %s:%d", 2322 __func__, __FILE__, __LINE__)); 2323 KASSERT(error == 0, 2324 ("%s: invalid return value for msleep in the drain path @ %s:%d", 2325 __func__, __FILE__, __LINE__)); 2326 2327 /* 2328 * We want to keep the vnode around so that we can vn_seqc_write_end 2329 * after we are done with unmount. 
Downgrade our reference to a mere 2330 * hold count so that we don't interefere with anything. 2331 */ 2332 if (rootvp != NULL) { 2333 vhold(rootvp); 2334 vrele(rootvp); 2335 } 2336 2337 if (mp->mnt_flag & MNT_EXPUBLIC) 2338 vfs_setpublicfs(NULL, NULL, NULL); 2339 2340 vfs_periodic(mp, MNT_WAIT); 2341 MNT_ILOCK(mp); 2342 async_flag = mp->mnt_flag & MNT_ASYNC; 2343 mp->mnt_flag &= ~MNT_ASYNC; 2344 mp->mnt_kern_flag &= ~MNTK_ASYNC; 2345 MNT_IUNLOCK(mp); 2346 vfs_deallocate_syncvnode(mp); 2347 error = VFS_UNMOUNT(mp, flags); 2348 vn_finished_write(mp); 2349 vfs_rel(mp); 2350 /* 2351 * If we failed to flush the dirty blocks for this mount point, 2352 * undo all the cdir/rdir and rootvnode changes we made above. 2353 * Unless we failed to do so because the device is reporting that 2354 * it doesn't exist anymore. 2355 */ 2356 if (error && error != ENXIO) { 2357 MNT_ILOCK(mp); 2358 if ((mp->mnt_flag & MNT_RDONLY) == 0) { 2359 MNT_IUNLOCK(mp); 2360 vfs_allocate_syncvnode(mp); 2361 MNT_ILOCK(mp); 2362 } 2363 mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF); 2364 mp->mnt_flag |= async_flag; 2365 if ((mp->mnt_flag & MNT_ASYNC) != 0 && 2366 (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) 2367 mp->mnt_kern_flag |= MNTK_ASYNC; 2368 if (mp->mnt_kern_flag & MNTK_MWAIT) { 2369 mp->mnt_kern_flag &= ~MNTK_MWAIT; 2370 wakeup(mp); 2371 } 2372 vfs_op_exit_locked(mp); 2373 MNT_IUNLOCK(mp); 2374 if (coveredvp) { 2375 vn_seqc_write_end(coveredvp); 2376 VOP_UNLOCK(coveredvp); 2377 vdrop(coveredvp); 2378 } 2379 if (rootvp != NULL) { 2380 vn_seqc_write_end(rootvp); 2381 vdrop(rootvp); 2382 } 2383 return (error); 2384 } 2385 2386 mtx_lock(&mountlist_mtx); 2387 TAILQ_REMOVE(&mountlist, mp, mnt_list); 2388 mtx_unlock(&mountlist_mtx); 2389 EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td); 2390 if (coveredvp != NULL) { 2391 VI_LOCK(coveredvp); 2392 vn_irflag_unset_locked(coveredvp, VIRF_MOUNTPOINT); 2393 coveredvp->v_mountedhere = NULL; 2394 vn_seqc_write_end_locked(coveredvp); 2395 
VI_UNLOCK(coveredvp); 2396 VOP_UNLOCK(coveredvp); 2397 vdrop(coveredvp); 2398 } 2399 mount_devctl_event("UNMOUNT", mp, false); 2400 if (rootvp != NULL) { 2401 vn_seqc_write_end(rootvp); 2402 vdrop(rootvp); 2403 } 2404 vfs_event_signal(NULL, VQ_UNMOUNT, 0); 2405 if (rootvnode != NULL && mp == rootvnode->v_mount) { 2406 vrele(rootvnode); 2407 rootvnode = NULL; 2408 } 2409 if (mp == rootdevmp) 2410 rootdevmp = NULL; 2411 if ((flags & MNT_DEFERRED) != 0) 2412 vfs_rel(mp); 2413 vfs_mount_destroy(mp); 2414 return (0); 2415 } 2416 2417 /* 2418 * Report errors during filesystem mounting. 2419 */ 2420 void 2421 vfs_mount_error(struct mount *mp, const char *fmt, ...) 2422 { 2423 struct vfsoptlist *moptlist = mp->mnt_optnew; 2424 va_list ap; 2425 int error, len; 2426 char *errmsg; 2427 2428 error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len); 2429 if (error || errmsg == NULL || len <= 0) 2430 return; 2431 2432 va_start(ap, fmt); 2433 vsnprintf(errmsg, (size_t)len, fmt, ap); 2434 va_end(ap); 2435 } 2436 2437 void 2438 vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...) 2439 { 2440 va_list ap; 2441 int error, len; 2442 char *errmsg; 2443 2444 error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len); 2445 if (error || errmsg == NULL || len <= 0) 2446 return; 2447 2448 va_start(ap, fmt); 2449 vsnprintf(errmsg, (size_t)len, fmt, ap); 2450 va_end(ap); 2451 } 2452 2453 /* 2454 * --------------------------------------------------------------------- 2455 * Functions for querying mount options/arguments from filesystems. 
2456 */ 2457 2458 /* 2459 * Check that no unknown options are given 2460 */ 2461 int 2462 vfs_filteropt(struct vfsoptlist *opts, const char **legal) 2463 { 2464 struct vfsopt *opt; 2465 char errmsg[255]; 2466 const char **t, *p, *q; 2467 int ret = 0; 2468 2469 TAILQ_FOREACH(opt, opts, link) { 2470 p = opt->name; 2471 q = NULL; 2472 if (p[0] == 'n' && p[1] == 'o') 2473 q = p + 2; 2474 for(t = global_opts; *t != NULL; t++) { 2475 if (strcmp(*t, p) == 0) 2476 break; 2477 if (q != NULL) { 2478 if (strcmp(*t, q) == 0) 2479 break; 2480 } 2481 } 2482 if (*t != NULL) 2483 continue; 2484 for(t = legal; *t != NULL; t++) { 2485 if (strcmp(*t, p) == 0) 2486 break; 2487 if (q != NULL) { 2488 if (strcmp(*t, q) == 0) 2489 break; 2490 } 2491 } 2492 if (*t != NULL) 2493 continue; 2494 snprintf(errmsg, sizeof(errmsg), 2495 "mount option <%s> is unknown", p); 2496 ret = EINVAL; 2497 } 2498 if (ret != 0) { 2499 TAILQ_FOREACH(opt, opts, link) { 2500 if (strcmp(opt->name, "errmsg") == 0) { 2501 strncpy((char *)opt->value, errmsg, opt->len); 2502 break; 2503 } 2504 } 2505 if (opt == NULL) 2506 printf("%s\n", errmsg); 2507 } 2508 return (ret); 2509 } 2510 2511 /* 2512 * Get a mount option by its name. 2513 * 2514 * Return 0 if the option was found, ENOENT otherwise. 2515 * If len is non-NULL it will be filled with the length 2516 * of the option. If buf is non-NULL, it will be filled 2517 * with the address of the option. 
2518 */ 2519 int 2520 vfs_getopt(struct vfsoptlist *opts, const char *name, void **buf, int *len) 2521 { 2522 struct vfsopt *opt; 2523 2524 KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); 2525 2526 TAILQ_FOREACH(opt, opts, link) { 2527 if (strcmp(name, opt->name) == 0) { 2528 opt->seen = 1; 2529 if (len != NULL) 2530 *len = opt->len; 2531 if (buf != NULL) 2532 *buf = opt->value; 2533 return (0); 2534 } 2535 } 2536 return (ENOENT); 2537 } 2538 2539 int 2540 vfs_getopt_pos(struct vfsoptlist *opts, const char *name) 2541 { 2542 struct vfsopt *opt; 2543 2544 if (opts == NULL) 2545 return (-1); 2546 2547 TAILQ_FOREACH(opt, opts, link) { 2548 if (strcmp(name, opt->name) == 0) { 2549 opt->seen = 1; 2550 return (opt->pos); 2551 } 2552 } 2553 return (-1); 2554 } 2555 2556 int 2557 vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value) 2558 { 2559 char *opt_value, *vtp; 2560 quad_t iv; 2561 int error, opt_len; 2562 2563 error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len); 2564 if (error != 0) 2565 return (error); 2566 if (opt_len == 0 || opt_value == NULL) 2567 return (EINVAL); 2568 if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0') 2569 return (EINVAL); 2570 iv = strtoq(opt_value, &vtp, 0); 2571 if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0')) 2572 return (EINVAL); 2573 if (iv < 0) 2574 return (EINVAL); 2575 switch (vtp[0]) { 2576 case 't': case 'T': 2577 iv *= 1024; 2578 /* FALLTHROUGH */ 2579 case 'g': case 'G': 2580 iv *= 1024; 2581 /* FALLTHROUGH */ 2582 case 'm': case 'M': 2583 iv *= 1024; 2584 /* FALLTHROUGH */ 2585 case 'k': case 'K': 2586 iv *= 1024; 2587 case '\0': 2588 break; 2589 default: 2590 return (EINVAL); 2591 } 2592 *value = iv; 2593 2594 return (0); 2595 } 2596 2597 char * 2598 vfs_getopts(struct vfsoptlist *opts, const char *name, int *error) 2599 { 2600 struct vfsopt *opt; 2601 2602 *error = 0; 2603 TAILQ_FOREACH(opt, opts, link) { 2604 if (strcmp(name, opt->name) != 0) 2605 continue; 
2606 opt->seen = 1; 2607 if (opt->len == 0 || 2608 ((char *)opt->value)[opt->len - 1] != '\0') { 2609 *error = EINVAL; 2610 return (NULL); 2611 } 2612 return (opt->value); 2613 } 2614 *error = ENOENT; 2615 return (NULL); 2616 } 2617 2618 int 2619 vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w, 2620 uint64_t val) 2621 { 2622 struct vfsopt *opt; 2623 2624 TAILQ_FOREACH(opt, opts, link) { 2625 if (strcmp(name, opt->name) == 0) { 2626 opt->seen = 1; 2627 if (w != NULL) 2628 *w |= val; 2629 return (1); 2630 } 2631 } 2632 if (w != NULL) 2633 *w &= ~val; 2634 return (0); 2635 } 2636 2637 int 2638 vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...) 2639 { 2640 va_list ap; 2641 struct vfsopt *opt; 2642 int ret; 2643 2644 KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL")); 2645 2646 TAILQ_FOREACH(opt, opts, link) { 2647 if (strcmp(name, opt->name) != 0) 2648 continue; 2649 opt->seen = 1; 2650 if (opt->len == 0 || opt->value == NULL) 2651 return (0); 2652 if (((char *)opt->value)[opt->len - 1] != '\0') 2653 return (0); 2654 va_start(ap, fmt); 2655 ret = vsscanf(opt->value, fmt, ap); 2656 va_end(ap); 2657 return (ret); 2658 } 2659 return (0); 2660 } 2661 2662 int 2663 vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len) 2664 { 2665 struct vfsopt *opt; 2666 2667 TAILQ_FOREACH(opt, opts, link) { 2668 if (strcmp(name, opt->name) != 0) 2669 continue; 2670 opt->seen = 1; 2671 if (opt->value == NULL) 2672 opt->len = len; 2673 else { 2674 if (opt->len != len) 2675 return (EINVAL); 2676 bcopy(value, opt->value, len); 2677 } 2678 return (0); 2679 } 2680 return (ENOENT); 2681 } 2682 2683 int 2684 vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len) 2685 { 2686 struct vfsopt *opt; 2687 2688 TAILQ_FOREACH(opt, opts, link) { 2689 if (strcmp(name, opt->name) != 0) 2690 continue; 2691 opt->seen = 1; 2692 if (opt->value == NULL) 2693 opt->len = len; 2694 else { 2695 if (opt->len < 
len) 2696 return (EINVAL); 2697 opt->len = len; 2698 bcopy(value, opt->value, len); 2699 } 2700 return (0); 2701 } 2702 return (ENOENT); 2703 } 2704 2705 int 2706 vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value) 2707 { 2708 struct vfsopt *opt; 2709 2710 TAILQ_FOREACH(opt, opts, link) { 2711 if (strcmp(name, opt->name) != 0) 2712 continue; 2713 opt->seen = 1; 2714 if (opt->value == NULL) 2715 opt->len = strlen(value) + 1; 2716 else if (strlcpy(opt->value, value, opt->len) >= opt->len) 2717 return (EINVAL); 2718 return (0); 2719 } 2720 return (ENOENT); 2721 } 2722 2723 /* 2724 * Find and copy a mount option. 2725 * 2726 * The size of the buffer has to be specified 2727 * in len, if it is not the same length as the 2728 * mount option, EINVAL is returned. 2729 * Returns ENOENT if the option is not found. 2730 */ 2731 int 2732 vfs_copyopt(struct vfsoptlist *opts, const char *name, void *dest, int len) 2733 { 2734 struct vfsopt *opt; 2735 2736 KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL")); 2737 2738 TAILQ_FOREACH(opt, opts, link) { 2739 if (strcmp(name, opt->name) == 0) { 2740 opt->seen = 1; 2741 if (len != opt->len) 2742 return (EINVAL); 2743 bcopy(opt->value, dest, opt->len); 2744 return (0); 2745 } 2746 } 2747 return (ENOENT); 2748 } 2749 2750 int 2751 __vfs_statfs(struct mount *mp, struct statfs *sbp) 2752 { 2753 /* 2754 * Filesystems only fill in part of the structure for updates, we 2755 * have to read the entirety first to get all content. 2756 */ 2757 if (sbp != &mp->mnt_stat) 2758 memcpy(sbp, &mp->mnt_stat, sizeof(*sbp)); 2759 2760 /* 2761 * Set these in case the underlying filesystem fails to do so. 
2762 */ 2763 sbp->f_version = STATFS_VERSION; 2764 sbp->f_namemax = NAME_MAX; 2765 sbp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; 2766 sbp->f_nvnodelistsize = mp->mnt_nvnodelistsize; 2767 2768 return (mp->mnt_op->vfs_statfs(mp, sbp)); 2769 } 2770 2771 void 2772 vfs_mountedfrom(struct mount *mp, const char *from) 2773 { 2774 2775 bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname); 2776 strlcpy(mp->mnt_stat.f_mntfromname, from, 2777 sizeof mp->mnt_stat.f_mntfromname); 2778 } 2779 2780 /* 2781 * --------------------------------------------------------------------- 2782 * This is the api for building mount args and mounting filesystems from 2783 * inside the kernel. 2784 * 2785 * The API works by accumulation of individual args. First error is 2786 * latched. 2787 * 2788 * XXX: should be documented in new manpage kernel_mount(9) 2789 */ 2790 2791 /* A memory allocation which must be freed when we are done */ 2792 struct mntaarg { 2793 SLIST_ENTRY(mntaarg) next; 2794 }; 2795 2796 /* The header for the mount arguments */ 2797 struct mntarg { 2798 struct iovec *v; 2799 int len; 2800 int error; 2801 SLIST_HEAD(, mntaarg) list; 2802 }; 2803 2804 /* 2805 * Add a boolean argument. 2806 * 2807 * flag is the boolean value. 2808 * name must start with "no". 2809 */ 2810 struct mntarg * 2811 mount_argb(struct mntarg *ma, int flag, const char *name) 2812 { 2813 2814 KASSERT(name[0] == 'n' && name[1] == 'o', 2815 ("mount_argb(...,%s): name must start with 'no'", name)); 2816 2817 return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0)); 2818 } 2819 2820 /* 2821 * Add an argument printf style 2822 */ 2823 struct mntarg * 2824 mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...) 
2825 { 2826 va_list ap; 2827 struct mntaarg *maa; 2828 struct sbuf *sb; 2829 int len; 2830 2831 if (ma == NULL) { 2832 ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); 2833 SLIST_INIT(&ma->list); 2834 } 2835 if (ma->error) 2836 return (ma); 2837 2838 ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), 2839 M_MOUNT, M_WAITOK); 2840 ma->v[ma->len].iov_base = (void *)(uintptr_t)name; 2841 ma->v[ma->len].iov_len = strlen(name) + 1; 2842 ma->len++; 2843 2844 sb = sbuf_new_auto(); 2845 va_start(ap, fmt); 2846 sbuf_vprintf(sb, fmt, ap); 2847 va_end(ap); 2848 sbuf_finish(sb); 2849 len = sbuf_len(sb) + 1; 2850 maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); 2851 SLIST_INSERT_HEAD(&ma->list, maa, next); 2852 bcopy(sbuf_data(sb), maa + 1, len); 2853 sbuf_delete(sb); 2854 2855 ma->v[ma->len].iov_base = maa + 1; 2856 ma->v[ma->len].iov_len = len; 2857 ma->len++; 2858 2859 return (ma); 2860 } 2861 2862 /* 2863 * Add an argument which is a userland string. 2864 */ 2865 struct mntarg * 2866 mount_argsu(struct mntarg *ma, const char *name, const void *val, int len) 2867 { 2868 struct mntaarg *maa; 2869 char *tbuf; 2870 2871 if (val == NULL) 2872 return (ma); 2873 if (ma == NULL) { 2874 ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); 2875 SLIST_INIT(&ma->list); 2876 } 2877 if (ma->error) 2878 return (ma); 2879 maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO); 2880 SLIST_INSERT_HEAD(&ma->list, maa, next); 2881 tbuf = (void *)(maa + 1); 2882 ma->error = copyinstr(val, tbuf, len, NULL); 2883 return (mount_arg(ma, name, tbuf, -1)); 2884 } 2885 2886 /* 2887 * Plain argument. 2888 * 2889 * If length is -1, treat value as a C string. 
2890 */ 2891 struct mntarg * 2892 mount_arg(struct mntarg *ma, const char *name, const void *val, int len) 2893 { 2894 2895 if (ma == NULL) { 2896 ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO); 2897 SLIST_INIT(&ma->list); 2898 } 2899 if (ma->error) 2900 return (ma); 2901 2902 ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2), 2903 M_MOUNT, M_WAITOK); 2904 ma->v[ma->len].iov_base = (void *)(uintptr_t)name; 2905 ma->v[ma->len].iov_len = strlen(name) + 1; 2906 ma->len++; 2907 2908 ma->v[ma->len].iov_base = (void *)(uintptr_t)val; 2909 if (len < 0) 2910 ma->v[ma->len].iov_len = strlen(val) + 1; 2911 else 2912 ma->v[ma->len].iov_len = len; 2913 ma->len++; 2914 return (ma); 2915 } 2916 2917 /* 2918 * Free a mntarg structure 2919 */ 2920 static void 2921 free_mntarg(struct mntarg *ma) 2922 { 2923 struct mntaarg *maa; 2924 2925 while (!SLIST_EMPTY(&ma->list)) { 2926 maa = SLIST_FIRST(&ma->list); 2927 SLIST_REMOVE_HEAD(&ma->list, next); 2928 free(maa, M_MOUNT); 2929 } 2930 free(ma->v, M_MOUNT); 2931 free(ma, M_MOUNT); 2932 } 2933 2934 /* 2935 * Mount a filesystem 2936 */ 2937 int 2938 kernel_mount(struct mntarg *ma, uint64_t flags) 2939 { 2940 struct uio auio; 2941 int error; 2942 2943 KASSERT(ma != NULL, ("kernel_mount NULL ma")); 2944 KASSERT(ma->error != 0 || ma->v != NULL, ("kernel_mount NULL ma->v")); 2945 KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len)); 2946 2947 error = ma->error; 2948 if (error == 0) { 2949 auio.uio_iov = ma->v; 2950 auio.uio_iovcnt = ma->len; 2951 auio.uio_segflg = UIO_SYSSPACE; 2952 error = vfs_donmount(curthread, flags, &auio); 2953 } 2954 free_mntarg(ma); 2955 return (error); 2956 } 2957 2958 /* Map from mount options to printable formats. 
*/ 2959 static struct mntoptnames optnames[] = { 2960 MNTOPT_NAMES 2961 }; 2962 2963 #define DEVCTL_LEN 1024 2964 static void 2965 mount_devctl_event(const char *type, struct mount *mp, bool donew) 2966 { 2967 const uint8_t *cp; 2968 struct mntoptnames *fp; 2969 struct sbuf sb; 2970 struct statfs *sfp = &mp->mnt_stat; 2971 char *buf; 2972 2973 buf = malloc(DEVCTL_LEN, M_MOUNT, M_NOWAIT); 2974 if (buf == NULL) 2975 return; 2976 sbuf_new(&sb, buf, DEVCTL_LEN, SBUF_FIXEDLEN); 2977 sbuf_cpy(&sb, "mount-point=\""); 2978 devctl_safe_quote_sb(&sb, sfp->f_mntonname); 2979 sbuf_cat(&sb, "\" mount-dev=\""); 2980 devctl_safe_quote_sb(&sb, sfp->f_mntfromname); 2981 sbuf_cat(&sb, "\" mount-type=\""); 2982 devctl_safe_quote_sb(&sb, sfp->f_fstypename); 2983 sbuf_cat(&sb, "\" fsid=0x"); 2984 cp = (const uint8_t *)&sfp->f_fsid.val[0]; 2985 for (int i = 0; i < sizeof(sfp->f_fsid); i++) 2986 sbuf_printf(&sb, "%02x", cp[i]); 2987 sbuf_printf(&sb, " owner=%u flags=\"", sfp->f_owner); 2988 for (fp = optnames; fp->o_opt != 0; fp++) { 2989 if ((mp->mnt_flag & fp->o_opt) != 0) { 2990 sbuf_cat(&sb, fp->o_name); 2991 sbuf_putc(&sb, ';'); 2992 } 2993 } 2994 sbuf_putc(&sb, '"'); 2995 sbuf_finish(&sb); 2996 2997 /* 2998 * Options are not published because the form of the options depends on 2999 * the file system and may include binary data. In addition, they don't 3000 * necessarily provide enough useful information to be actionable when 3001 * devd processes them. 3002 */ 3003 3004 if (sbuf_error(&sb) == 0) 3005 devctl_notify("VFS", "FS", type, sbuf_data(&sb)); 3006 sbuf_delete(&sb); 3007 free(buf, M_MOUNT); 3008 } 3009 3010 /* 3011 * Force remount specified mount point to read-only. The argument 3012 * must be busied to avoid parallel unmount attempts. 3013 * 3014 * Intended use is to prevent further writes if some metadata 3015 * inconsistency is detected. Note that the function still flushes 3016 * all cached metadata and data for the mount point, which might be 3017 * not always suitable. 
*/
/*
 * Returns 0 on success; EBUSY if the covered vnode already has VI_MOUNT
 * set or the mount is already read-only; otherwise the error from vget()
 * on the covered vnode or from VFS_MOUNT().
 *
 * NOTE(review): mp->mnt_vnodecovered is passed to vget() without a NULL
 * check, while dounmount() treats the same field as possibly NULL --
 * confirm no caller passes a mount without a covered vnode.
 */
int
vfs_remount_ro(struct mount *mp)
{
	struct vfsoptlist *opts;
	struct vfsopt *opt;
	struct vnode *vp_covered, *rootvp;
	int error;

	vfs_op_enter(mp);
	KASSERT(mp->mnt_lockref > 0,
	    ("vfs_remount_ro: mp %p is not busied", mp));
	KASSERT((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0,
	    ("vfs_remount_ro: mp %p is being unmounted (and busy?)", mp));

	rootvp = NULL;
	vp_covered = mp->mnt_vnodecovered;
	/* LK_NOWAIT: fail instead of sleeping for the covered vnode lock. */
	error = vget(vp_covered, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		vfs_op_exit(mp);
		return (error);
	}
	/* Claim VI_MOUNT on the covered vnode, or give up if already set. */
	VI_LOCK(vp_covered);
	if ((vp_covered->v_iflag & VI_MOUNT) != 0) {
		VI_UNLOCK(vp_covered);
		vput(vp_covered);
		vfs_op_exit(mp);
		return (EBUSY);
	}
	vp_covered->v_iflag |= VI_MOUNT;
	VI_UNLOCK(vp_covered);
	vn_seqc_write_begin(vp_covered);

	MNT_ILOCK(mp);
	/* Nothing to do if the mount is already read-only. */
	if ((mp->mnt_flag & MNT_RDONLY) != 0) {
		MNT_IUNLOCK(mp);
		error = EBUSY;
		goto out;
	}
	/* Present the operation to the filesystem as a forced r/o update. */
	mp->mnt_flag |= MNT_UPDATE | MNT_FORCE | MNT_RDONLY;
	rootvp = vfs_cache_root_clear(mp);
	MNT_IUNLOCK(mp);

	/*
	 * Build the new option list: a value-less "ro" option merged with
	 * the options the mount currently has.
	 */
	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK | M_ZERO);
	TAILQ_INIT(opts);
	opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK | M_ZERO);
	opt->name = strdup("ro", M_MOUNT);
	opt->value = NULL;
	TAILQ_INSERT_TAIL(opts, opt, link);
	vfs_mergeopts(opts, mp->mnt_opt);
	mp->mnt_optnew = opts;

	error = VFS_MOUNT(mp);

	if (error == 0) {
		/* Keep MNT_RDONLY; the new options replace the old ones. */
		MNT_ILOCK(mp);
		mp->mnt_flag &= ~(MNT_UPDATE | MNT_FORCE);
		MNT_IUNLOCK(mp);
		vfs_deallocate_syncvnode(mp);
		if (mp->mnt_opt != NULL)
			vfs_freeopts(mp->mnt_opt);
		mp->mnt_opt = mp->mnt_optnew;
	} else {
		/* Roll back every flag set above and drop the new options. */
		MNT_ILOCK(mp);
		mp->mnt_flag &= ~(MNT_UPDATE | MNT_FORCE | MNT_RDONLY);
		MNT_IUNLOCK(mp);
		vfs_freeopts(mp->mnt_optnew);
	}
	mp->mnt_optnew = NULL;

out:
	vfs_op_exit(mp);
	VI_LOCK(vp_covered);
	vp_covered->v_iflag &= ~VI_MOUNT;
	VI_UNLOCK(vp_covered);
	vput(vp_covered);
	/*
	 * NOTE(review): vp_covered is used after vput() here; presumably
	 * the mount's own reference keeps the vnode alive -- confirm.
	 */
	vn_seqc_write_end(vp_covered);
	if (rootvp != NULL) {
		vn_seqc_write_end(rootvp);
		vrele(rootvp);
	}
	return (error);
}

/*
 * Suspend write operations on all local writeable filesystems. Does
 * full sync of them in the process.
 *
 * Iterate over the mount points in reverse order, suspending most
 * recently mounted filesystems first. It handles a case where a
 * filesystem mounted from a md(4) vnode-backed device should be
 * suspended before the filesystem that owns the vnode.
 *
 * Each mount that is suspended successfully is marked with
 * MNTK_SUSPEND_ALL and is left busied; resume_all_fs() clears the flag
 * and unbusies it.
 */
void
suspend_all_fs(void)
{
	struct mount *mp;
	int error;

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
		/*
		 * Mounts that cannot be busied without waiting are skipped.
		 * Presumably MBF_MNTLSTLOCK makes a successful vfs_busy()
		 * drop mountlist_mtx, since every path below re-acquires it
		 * -- TODO confirm.
		 */
		error = vfs_busy(mp, MBF_MNTLSTLOCK | MBF_NOWAIT);
		if (error != 0)
			continue;
		/*
		 * Skip anything that is not a local read-write mount, and
		 * mounts that are already suspended.
		 */
		if ((mp->mnt_flag & (MNT_RDONLY | MNT_LOCAL)) != MNT_LOCAL ||
		    (mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
			mtx_lock(&mountlist_mtx);
			vfs_unbusy(mp);
			continue;
		}
		error = vfs_write_suspend(mp, 0);
		if (error == 0) {
			MNT_ILOCK(mp);
			MPASS((mp->mnt_kern_flag & MNTK_SUSPEND_ALL) == 0);
			mp->mnt_kern_flag |= MNTK_SUSPEND_ALL;
			MNT_IUNLOCK(mp);
			/* The mount intentionally stays busied. */
			mtx_lock(&mountlist_mtx);
		} else {
			printf("suspend of %s failed, error %d\n",
			    mp->mnt_stat.f_mntonname, error);
			mtx_lock(&mountlist_mtx);
			vfs_unbusy(mp);
		}
	}
	mtx_unlock(&mountlist_mtx);
}

/*
 * Clone the mnt_exjail field to a new mount point.
3147 */ 3148 void 3149 vfs_exjail_clone(struct mount *inmp, struct mount *outmp) 3150 { 3151 struct ucred *cr; 3152 struct prison *pr; 3153 3154 MNT_ILOCK(inmp); 3155 cr = inmp->mnt_exjail; 3156 if (cr != NULL) { 3157 crhold(cr); 3158 MNT_IUNLOCK(inmp); 3159 pr = cr->cr_prison; 3160 sx_slock(&allprison_lock); 3161 if (!prison_isalive(pr)) { 3162 sx_sunlock(&allprison_lock); 3163 crfree(cr); 3164 return; 3165 } 3166 MNT_ILOCK(outmp); 3167 if (outmp->mnt_exjail == NULL) { 3168 outmp->mnt_exjail = cr; 3169 atomic_add_int(&pr->pr_exportcnt, 1); 3170 cr = NULL; 3171 } 3172 MNT_IUNLOCK(outmp); 3173 sx_sunlock(&allprison_lock); 3174 if (cr != NULL) 3175 crfree(cr); 3176 } else 3177 MNT_IUNLOCK(inmp); 3178 } 3179 3180 void 3181 resume_all_fs(void) 3182 { 3183 struct mount *mp; 3184 3185 mtx_lock(&mountlist_mtx); 3186 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3187 if ((mp->mnt_kern_flag & MNTK_SUSPEND_ALL) == 0) 3188 continue; 3189 mtx_unlock(&mountlist_mtx); 3190 MNT_ILOCK(mp); 3191 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) != 0); 3192 mp->mnt_kern_flag &= ~MNTK_SUSPEND_ALL; 3193 MNT_IUNLOCK(mp); 3194 vfs_write_resume(mp, 0); 3195 mtx_lock(&mountlist_mtx); 3196 vfs_unbusy(mp); 3197 } 3198 mtx_unlock(&mountlist_mtx); 3199 } 3200