1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1999-2004 Poul-Henning Kamp 5 * Copyright (c) 1999 Michael Smith 6 * Copyright (c) 1989, 1993 7 * The Regents of the University of California. All rights reserved. 8 * (c) UNIX System Laboratories, Inc. 9 * All or some portions of this file are derived from material licensed 10 * to the University of California by American Telephone and Telegraph 11 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 12 * the permission of UNIX System Laboratories, Inc. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 
37 */ 38 39 #include <sys/cdefs.h> 40 __FBSDID("$FreeBSD$"); 41 42 #include <sys/param.h> 43 #include <sys/conf.h> 44 #include <sys/smp.h> 45 #include <sys/devctl.h> 46 #include <sys/eventhandler.h> 47 #include <sys/fcntl.h> 48 #include <sys/jail.h> 49 #include <sys/kernel.h> 50 #include <sys/ktr.h> 51 #include <sys/libkern.h> 52 #include <sys/limits.h> 53 #include <sys/malloc.h> 54 #include <sys/mount.h> 55 #include <sys/mutex.h> 56 #include <sys/namei.h> 57 #include <sys/priv.h> 58 #include <sys/proc.h> 59 #include <sys/filedesc.h> 60 #include <sys/reboot.h> 61 #include <sys/sbuf.h> 62 #include <sys/syscallsubr.h> 63 #include <sys/sysproto.h> 64 #include <sys/sx.h> 65 #include <sys/sysctl.h> 66 #include <sys/sysent.h> 67 #include <sys/systm.h> 68 #include <sys/taskqueue.h> 69 #include <sys/vnode.h> 70 #include <vm/uma.h> 71 72 #include <geom/geom.h> 73 74 #include <machine/stdarg.h> 75 76 #include <security/audit/audit.h> 77 #include <security/mac/mac_framework.h> 78 79 #define VFS_MOUNTARG_SIZE_MAX (1024 * 64) 80 81 static int vfs_domount(struct thread *td, const char *fstype, char *fspath, 82 uint64_t fsflags, struct vfsoptlist **optlist); 83 static void free_mntarg(struct mntarg *ma); 84 85 static int usermount = 0; 86 SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, 87 "Unprivileged users may mount and unmount file systems"); 88 89 static bool default_autoro = false; 90 SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0, 91 "Retry failed r/w mount as r/o if no explicit ro/rw option is specified"); 92 93 static bool recursive_forced_unmount = false; 94 SYSCTL_BOOL(_vfs, OID_AUTO, recursive_forced_unmount, CTLFLAG_RW, 95 &recursive_forced_unmount, 0, "Recursively unmount stacked upper mounts" 96 " when a file system is forcibly unmounted"); 97 98 MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); 99 MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure"); 100 static uma_zone_t mount_zone; 101 102 /* List of mounted filesystems. 
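 * New mounts are appended to the tail of this list by vfs_domount_first()
 * and removed again in dounmount().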
 */
struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);

/* For any iteration/modification of mountlist */
struct mtx_padalign __exclusive_cache_line mountlist_mtx;
MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF);

EVENTHANDLER_LIST_DEFINE(vfs_mounted);
EVENTHANDLER_LIST_DEFINE(vfs_unmounted);

static void vfs_deferred_unmount(void *arg, int pending);
static struct task deferred_unmount_task =
    TASK_INITIALIZER(0, vfs_deferred_unmount, NULL);
static struct mtx deferred_unmount_lock;
MTX_SYSINIT(deferred_unmount, &deferred_unmount_lock, "deferred_unmount",
    MTX_DEF);
static STAILQ_HEAD(, mount) deferred_unmount_list =
    STAILQ_HEAD_INITIALIZER(deferred_unmount_list);
TASKQUEUE_DEFINE_THREAD(deferred_unmount);

static void mount_devctl_event(const char *type, struct mount *mp, bool donew);

/*
 * Global opts, taken by all filesystems
 */
static const char *global_opts[] = {
	"errmsg",
	"fstype",
	"fspath",
	"ro",
	"rw",
	"nosuid",
	"noexec",
	NULL
};

static int
mount_init(void *mem, int size, int flags)
{
	struct mount *mp;

	mp = (struct mount *)mem;
	mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
	mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF);
	lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
	mp->mnt_pcpu = uma_zalloc_pcpu(pcpu_zone_16, M_WAITOK | M_ZERO);
	mp->mnt_ref = 0;
	mp->mnt_vfs_ops = 1;
	mp->mnt_rootvnode = NULL;
	return (0);
}

static void
mount_fini(void *mem, int size)
{
	struct mount *mp;

	mp = (struct mount *)mem;
	uma_zfree_pcpu(pcpu_zone_16, mp->mnt_pcpu);
	lockdestroy(&mp->mnt_explock);
	mtx_destroy(&mp->mnt_listmtx);
	mtx_destroy(&mp->mnt_mtx);
}

static void
vfs_mount_init(void *dummy __unused)
{

	mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL,
	    NULL, mount_init, mount_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
}
SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL);

/*
 * ---------------------------------------------------------------------
 * Functions for building and sanitizing the mount options
 */

/* Remove one mount option. */
static void
vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt)
{

	TAILQ_REMOVE(opts, opt, link);
	free(opt->name, M_MOUNT);
	if (opt->value != NULL)
		free(opt->value, M_MOUNT);
	free(opt, M_MOUNT);
}

/* Release all resources related to the mount options.
*/ 193 void 194 vfs_freeopts(struct vfsoptlist *opts) 195 { 196 struct vfsopt *opt; 197 198 while (!TAILQ_EMPTY(opts)) { 199 opt = TAILQ_FIRST(opts); 200 vfs_freeopt(opts, opt); 201 } 202 free(opts, M_MOUNT); 203 } 204 205 void 206 vfs_deleteopt(struct vfsoptlist *opts, const char *name) 207 { 208 struct vfsopt *opt, *temp; 209 210 if (opts == NULL) 211 return; 212 TAILQ_FOREACH_SAFE(opt, opts, link, temp) { 213 if (strcmp(opt->name, name) == 0) 214 vfs_freeopt(opts, opt); 215 } 216 } 217 218 static int 219 vfs_isopt_ro(const char *opt) 220 { 221 222 if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 || 223 strcmp(opt, "norw") == 0) 224 return (1); 225 return (0); 226 } 227 228 static int 229 vfs_isopt_rw(const char *opt) 230 { 231 232 if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0) 233 return (1); 234 return (0); 235 } 236 237 /* 238 * Check if options are equal (with or without the "no" prefix). 239 */ 240 static int 241 vfs_equalopts(const char *opt1, const char *opt2) 242 { 243 char *p; 244 245 /* "opt" vs. "opt" or "noopt" vs. "noopt" */ 246 if (strcmp(opt1, opt2) == 0) 247 return (1); 248 /* "noopt" vs. "opt" */ 249 if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) 250 return (1); 251 /* "opt" vs. "noopt" */ 252 if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) 253 return (1); 254 while ((p = strchr(opt1, '.')) != NULL && 255 !strncmp(opt1, opt2, ++p - opt1)) { 256 opt2 += p - opt1; 257 opt1 = p; 258 /* "foo.noopt" vs. "foo.opt" */ 259 if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0) 260 return (1); 261 /* "foo.opt" vs. "foo.noopt" */ 262 if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0) 263 return (1); 264 } 265 /* "ro" / "rdonly" / "norw" / "rw" / "noro" */ 266 if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) && 267 (vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2))) 268 return (1); 269 return (0); 270 } 271 272 /* 273 * If a mount option is specified several times, 274 * (with or without the "no" prefix) only keep 275 * the last occurrence of it. 276 */ 277 static void 278 vfs_sanitizeopts(struct vfsoptlist *opts) 279 { 280 struct vfsopt *opt, *opt2, *tmp; 281 282 TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) { 283 opt2 = TAILQ_PREV(opt, vfsoptlist, link); 284 while (opt2 != NULL) { 285 if (vfs_equalopts(opt->name, opt2->name)) { 286 tmp = TAILQ_PREV(opt2, vfsoptlist, link); 287 vfs_freeopt(opts, opt2); 288 opt2 = tmp; 289 } else { 290 opt2 = TAILQ_PREV(opt2, vfsoptlist, link); 291 } 292 } 293 } 294 } 295 296 /* 297 * Build a linked list of mount options from a struct uio. 298 */ 299 int 300 vfs_buildopts(struct uio *auio, struct vfsoptlist **options) 301 { 302 struct vfsoptlist *opts; 303 struct vfsopt *opt; 304 size_t memused, namelen, optlen; 305 unsigned int i, iovcnt; 306 int error; 307 308 opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); 309 TAILQ_INIT(opts); 310 memused = 0; 311 iovcnt = auio->uio_iovcnt; 312 for (i = 0; i < iovcnt; i += 2) { 313 namelen = auio->uio_iov[i].iov_len; 314 optlen = auio->uio_iov[i + 1].iov_len; 315 memused += sizeof(struct vfsopt) + optlen + namelen; 316 /* 317 * Avoid consuming too much memory, and attempts to overflow 318 * memused. 
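		 * Each name/value length and the running total are all capped
		 * at VFS_MOUNTARG_SIZE_MAX (64 KB).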
319 */ 320 if (memused > VFS_MOUNTARG_SIZE_MAX || 321 optlen > VFS_MOUNTARG_SIZE_MAX || 322 namelen > VFS_MOUNTARG_SIZE_MAX) { 323 error = EINVAL; 324 goto bad; 325 } 326 327 opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); 328 opt->name = malloc(namelen, M_MOUNT, M_WAITOK); 329 opt->value = NULL; 330 opt->len = 0; 331 opt->pos = i / 2; 332 opt->seen = 0; 333 334 /* 335 * Do this early, so jumps to "bad" will free the current 336 * option. 337 */ 338 TAILQ_INSERT_TAIL(opts, opt, link); 339 340 if (auio->uio_segflg == UIO_SYSSPACE) { 341 bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); 342 } else { 343 error = copyin(auio->uio_iov[i].iov_base, opt->name, 344 namelen); 345 if (error) 346 goto bad; 347 } 348 /* Ensure names are null-terminated strings. */ 349 if (namelen == 0 || opt->name[namelen - 1] != '\0') { 350 error = EINVAL; 351 goto bad; 352 } 353 if (optlen != 0) { 354 opt->len = optlen; 355 opt->value = malloc(optlen, M_MOUNT, M_WAITOK); 356 if (auio->uio_segflg == UIO_SYSSPACE) { 357 bcopy(auio->uio_iov[i + 1].iov_base, opt->value, 358 optlen); 359 } else { 360 error = copyin(auio->uio_iov[i + 1].iov_base, 361 opt->value, optlen); 362 if (error) 363 goto bad; 364 } 365 } 366 } 367 vfs_sanitizeopts(opts); 368 *options = opts; 369 return (0); 370 bad: 371 vfs_freeopts(opts); 372 return (error); 373 } 374 375 /* 376 * Merge the old mount options with the new ones passed 377 * in the MNT_UPDATE case. 378 * 379 * XXX: This function will keep a "nofoo" option in the new 380 * options. E.g, if the option's canonical name is "foo", 381 * "nofoo" ends up in the mount point's active options. 382 */ 383 static void 384 vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts) 385 { 386 struct vfsopt *opt, *new; 387 388 TAILQ_FOREACH(opt, oldopts, link) { 389 new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); 390 new->name = strdup(opt->name, M_MOUNT); 391 if (opt->len != 0) { 392 new->value = malloc(opt->len, M_MOUNT, M_WAITOK); 393 bcopy(opt->value, new->value, opt->len); 394 } else 395 new->value = NULL; 396 new->len = opt->len; 397 new->seen = opt->seen; 398 TAILQ_INSERT_HEAD(toopts, new, link); 399 } 400 vfs_sanitizeopts(toopts); 401 } 402 403 /* 404 * Mount a filesystem. 405 */ 406 #ifndef _SYS_SYSPROTO_H_ 407 struct nmount_args { 408 struct iovec *iovp; 409 unsigned int iovcnt; 410 int flags; 411 }; 412 #endif 413 int 414 sys_nmount(struct thread *td, struct nmount_args *uap) 415 { 416 struct uio *auio; 417 int error; 418 u_int iovcnt; 419 uint64_t flags; 420 421 /* 422 * Mount flags are now 64-bits. On 32-bit archtectures only 423 * 32-bits are passed in, but from here on everything handles 424 * 64-bit flags correctly. 425 */ 426 flags = uap->flags; 427 428 AUDIT_ARG_FFLAGS(flags); 429 CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__, 430 uap->iovp, uap->iovcnt, flags); 431 432 /* 433 * Filter out MNT_ROOTFS. We do not want clients of nmount() in 434 * userspace to set this flag, but we must filter it out if we want 435 * MNT_UPDATE on the root file system to work. 436 * MNT_ROOTFS should only be set by the kernel when mounting its 437 * root file system. 438 */ 439 flags &= ~MNT_ROOTFS; 440 441 iovcnt = uap->iovcnt; 442 /* 443 * Check that we have an even number of iovec's 444 * and that we have at least two options. 
445 */ 446 if ((iovcnt & 1) || (iovcnt < 4)) { 447 CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__, 448 uap->iovcnt); 449 return (EINVAL); 450 } 451 452 error = copyinuio(uap->iovp, iovcnt, &auio); 453 if (error) { 454 CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno", 455 __func__, error); 456 return (error); 457 } 458 error = vfs_donmount(td, flags, auio); 459 460 free(auio, M_IOV); 461 return (error); 462 } 463 464 /* 465 * --------------------------------------------------------------------- 466 * Various utility functions 467 */ 468 469 /* 470 * Get a reference on a mount point from a vnode. 471 * 472 * The vnode is allowed to be passed unlocked and race against dooming. Note in 473 * such case there are no guarantees the referenced mount point will still be 474 * associated with it after the function returns. 475 */ 476 struct mount * 477 vfs_ref_from_vp(struct vnode *vp) 478 { 479 struct mount *mp; 480 struct mount_pcpu *mpcpu; 481 482 mp = atomic_load_ptr(&vp->v_mount); 483 if (__predict_false(mp == NULL)) { 484 return (mp); 485 } 486 if (vfs_op_thread_enter(mp, mpcpu)) { 487 if (__predict_true(mp == vp->v_mount)) { 488 vfs_mp_count_add_pcpu(mpcpu, ref, 1); 489 vfs_op_thread_exit(mp, mpcpu); 490 } else { 491 vfs_op_thread_exit(mp, mpcpu); 492 mp = NULL; 493 } 494 } else { 495 MNT_ILOCK(mp); 496 if (mp == vp->v_mount) { 497 MNT_REF(mp); 498 MNT_IUNLOCK(mp); 499 } else { 500 MNT_IUNLOCK(mp); 501 mp = NULL; 502 } 503 } 504 return (mp); 505 } 506 507 void 508 vfs_ref(struct mount *mp) 509 { 510 struct mount_pcpu *mpcpu; 511 512 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 513 if (vfs_op_thread_enter(mp, mpcpu)) { 514 vfs_mp_count_add_pcpu(mpcpu, ref, 1); 515 vfs_op_thread_exit(mp, mpcpu); 516 return; 517 } 518 519 MNT_ILOCK(mp); 520 MNT_REF(mp); 521 MNT_IUNLOCK(mp); 522 } 523 524 /* 525 * Register ump as an upper mount of the mount associated with 526 * vnode vp. This registration will be tracked through 527 * mount_upper_node upper, which should be allocated by the 528 * caller and stored in per-mount data associated with mp. 529 * 530 * If successful, this function will return the mount associated 531 * with vp, and will ensure that it cannot be unmounted until 532 * ump has been unregistered as one of its upper mounts. 533 * 534 * Upon failure this function will return NULL. 535 */ 536 struct mount * 537 vfs_register_upper_from_vp(struct vnode *vp, struct mount *ump, 538 struct mount_upper_node *upper) 539 { 540 struct mount *mp; 541 542 mp = atomic_load_ptr(&vp->v_mount); 543 if (mp == NULL) 544 return (NULL); 545 MNT_ILOCK(mp); 546 if (mp != vp->v_mount || 547 ((mp->mnt_kern_flag & (MNTK_UNMOUNT | MNTK_RECURSE)) != 0)) { 548 MNT_IUNLOCK(mp); 549 return (NULL); 550 } 551 KASSERT(ump != mp, ("upper and lower mounts are identical")); 552 upper->mp = ump; 553 MNT_REF(mp); 554 TAILQ_INSERT_TAIL(&mp->mnt_uppers, upper, mnt_upper_link); 555 MNT_IUNLOCK(mp); 556 return (mp); 557 } 558 559 /* 560 * Register upper mount ump to receive vnode unlink/reclaim 561 * notifications from lower mount mp. This registration will 562 * be tracked through mount_upper_node upper, which should be 563 * allocated by the caller and stored in per-mount data 564 * associated with mp. 565 * 566 * ump must already be registered as an upper mount of mp 567 * through a call to vfs_register_upper_from_vp(). 
568 */ 569 void 570 vfs_register_for_notification(struct mount *mp, struct mount *ump, 571 struct mount_upper_node *upper) 572 { 573 upper->mp = ump; 574 MNT_ILOCK(mp); 575 TAILQ_INSERT_TAIL(&mp->mnt_notify, upper, mnt_upper_link); 576 MNT_IUNLOCK(mp); 577 } 578 579 static void 580 vfs_drain_upper_locked(struct mount *mp) 581 { 582 mtx_assert(MNT_MTX(mp), MA_OWNED); 583 while (mp->mnt_upper_pending != 0) { 584 mp->mnt_kern_flag |= MNTK_UPPER_WAITER; 585 msleep(&mp->mnt_uppers, MNT_MTX(mp), 0, "mntupw", 0); 586 } 587 } 588 589 /* 590 * Undo a previous call to vfs_register_for_notification(). 591 * The mount represented by upper must be currently registered 592 * as an upper mount for mp. 593 */ 594 void 595 vfs_unregister_for_notification(struct mount *mp, 596 struct mount_upper_node *upper) 597 { 598 MNT_ILOCK(mp); 599 vfs_drain_upper_locked(mp); 600 TAILQ_REMOVE(&mp->mnt_notify, upper, mnt_upper_link); 601 MNT_IUNLOCK(mp); 602 } 603 604 /* 605 * Undo a previous call to vfs_register_upper_from_vp(). 606 * This must be done before mp can be unmounted. 607 */ 608 void 609 vfs_unregister_upper(struct mount *mp, struct mount_upper_node *upper) 610 { 611 MNT_ILOCK(mp); 612 KASSERT((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0, 613 ("registered upper with pending unmount")); 614 vfs_drain_upper_locked(mp); 615 TAILQ_REMOVE(&mp->mnt_uppers, upper, mnt_upper_link); 616 if ((mp->mnt_kern_flag & MNTK_TASKQUEUE_WAITER) != 0 && 617 TAILQ_EMPTY(&mp->mnt_uppers)) { 618 mp->mnt_kern_flag &= ~MNTK_TASKQUEUE_WAITER; 619 wakeup(&mp->mnt_taskqueue_link); 620 } 621 MNT_REL(mp); 622 MNT_IUNLOCK(mp); 623 } 624 625 void 626 vfs_rel(struct mount *mp) 627 { 628 struct mount_pcpu *mpcpu; 629 630 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 631 if (vfs_op_thread_enter(mp, mpcpu)) { 632 vfs_mp_count_sub_pcpu(mpcpu, ref, 1); 633 vfs_op_thread_exit(mp, mpcpu); 634 return; 635 } 636 637 MNT_ILOCK(mp); 638 MNT_REL(mp); 639 MNT_IUNLOCK(mp); 640 } 641 642 /* 643 * Allocate and initialize the mount point struct. 
644 */ 645 struct mount * 646 vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, 647 struct ucred *cred) 648 { 649 struct mount *mp; 650 651 mp = uma_zalloc(mount_zone, M_WAITOK); 652 bzero(&mp->mnt_startzero, 653 __rangeof(struct mount, mnt_startzero, mnt_endzero)); 654 mp->mnt_kern_flag = 0; 655 mp->mnt_flag = 0; 656 mp->mnt_rootvnode = NULL; 657 mp->mnt_vnodecovered = NULL; 658 mp->mnt_op = NULL; 659 mp->mnt_vfc = NULL; 660 TAILQ_INIT(&mp->mnt_nvnodelist); 661 mp->mnt_nvnodelistsize = 0; 662 TAILQ_INIT(&mp->mnt_lazyvnodelist); 663 mp->mnt_lazyvnodelistsize = 0; 664 if (mp->mnt_ref != 0 || mp->mnt_lockref != 0 || 665 mp->mnt_writeopcount != 0) 666 panic("%s: non-zero counters on new mp %p\n", __func__, mp); 667 if (mp->mnt_vfs_ops != 1) 668 panic("%s: vfs_ops should be 1 but %d found\n", __func__, 669 mp->mnt_vfs_ops); 670 (void) vfs_busy(mp, MBF_NOWAIT); 671 atomic_add_acq_int(&vfsp->vfc_refcount, 1); 672 mp->mnt_op = vfsp->vfc_vfsops; 673 mp->mnt_vfc = vfsp; 674 mp->mnt_stat.f_type = vfsp->vfc_typenum; 675 mp->mnt_gen++; 676 strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); 677 mp->mnt_vnodecovered = vp; 678 mp->mnt_cred = crdup(cred); 679 mp->mnt_stat.f_owner = cred->cr_uid; 680 strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); 681 mp->mnt_iosize_max = DFLTPHYS; 682 #ifdef MAC 683 mac_mount_init(mp); 684 mac_mount_create(cred, mp); 685 #endif 686 arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0); 687 mp->mnt_upper_pending = 0; 688 TAILQ_INIT(&mp->mnt_uppers); 689 TAILQ_INIT(&mp->mnt_notify); 690 mp->mnt_taskqueue_flags = 0; 691 return (mp); 692 } 693 694 /* 695 * Destroy the mount struct previously allocated by vfs_mount_alloc(). 696 */ 697 void 698 vfs_mount_destroy(struct mount *mp) 699 { 700 701 if (mp->mnt_vfs_ops == 0) 702 panic("%s: entered with zero vfs_ops\n", __func__); 703 704 vfs_assert_mount_counters(mp); 705 706 MNT_ILOCK(mp); 707 mp->mnt_kern_flag |= MNTK_REFEXPIRE; 708 if (mp->mnt_kern_flag & MNTK_MWAIT) { 709 mp->mnt_kern_flag &= ~MNTK_MWAIT; 710 wakeup(mp); 711 } 712 while (mp->mnt_ref) 713 msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0); 714 KASSERT(mp->mnt_ref == 0, 715 ("%s: invalid refcount in the drain path @ %s:%d", __func__, 716 __FILE__, __LINE__)); 717 if (mp->mnt_writeopcount != 0) 718 panic("vfs_mount_destroy: nonzero writeopcount"); 719 if (mp->mnt_secondary_writes != 0) 720 panic("vfs_mount_destroy: nonzero secondary_writes"); 721 atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1); 722 if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) { 723 struct vnode *vp; 724 725 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) 726 vn_printf(vp, "dangling vnode "); 727 panic("unmount: dangling vnode"); 728 } 729 KASSERT(mp->mnt_upper_pending == 0, ("mnt_upper_pending")); 730 KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers")); 731 KASSERT(TAILQ_EMPTY(&mp->mnt_notify), ("mnt_notify")); 732 if (mp->mnt_nvnodelistsize != 0) 733 panic("vfs_mount_destroy: nonzero nvnodelistsize"); 734 if (mp->mnt_lazyvnodelistsize != 0) 735 panic("vfs_mount_destroy: nonzero lazyvnodelistsize"); 736 if (mp->mnt_lockref != 0) 737 panic("vfs_mount_destroy: nonzero lock refcount"); 738 MNT_IUNLOCK(mp); 739 740 if (mp->mnt_vfs_ops != 1) 741 panic("%s: vfs_ops should be 1 but %d found\n", __func__, 742 mp->mnt_vfs_ops); 743 744 if (mp->mnt_rootvnode != NULL) 745 panic("%s: mount point still has a root vnode %p\n", __func__, 746 mp->mnt_rootvnode); 747 748 if (mp->mnt_vnodecovered != NULL) 749 vrele(mp->mnt_vnodecovered); 750 #ifdef MAC 751 
	mac_mount_destroy(mp);
#endif
	if (mp->mnt_opt != NULL)
		vfs_freeopts(mp->mnt_opt);
	crfree(mp->mnt_cred);
	uma_zfree(mount_zone, mp);
}

static bool
vfs_should_downgrade_to_ro_mount(uint64_t fsflags, int error)
{
	/* This is an upgrade of an existing mount. */
	if ((fsflags & MNT_UPDATE) != 0)
		return (false);
	/* This is already an R/O mount. */
	if ((fsflags & MNT_RDONLY) != 0)
		return (false);

	switch (error) {
	case ENODEV:	/* generic, geom, ... */
	case EACCES:	/* cam/scsi, ... */
	case EROFS:	/* md, mmcsd, ... */
		/*
		 * These errors can be returned by the storage layer to signal
		 * that the media is read-only.  No harm in the R/O mount
		 * attempt if the error was returned for some other reason.
		 */
		return (true);
	default:
		return (false);
	}
}

int
vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions)
{
	struct vfsoptlist *optlist;
	struct vfsopt *opt, *tmp_opt;
	char *fstype, *fspath, *errmsg;
	int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
	bool autoro;

	errmsg = fspath = NULL;
	errmsg_len = fspathlen = 0;
	errmsg_pos = -1;
	autoro = default_autoro;

	error = vfs_buildopts(fsoptions, &optlist);
	if (error)
		return (error);

	if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
		errmsg_pos = vfs_getopt_pos(optlist, "errmsg");

	/*
	 * We need these two options before the others,
	 * and they are mandatory for any filesystem.
	 * Ensure they are NUL terminated as well.
	 */
	fstypelen = 0;
	error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
	if (error || fstypelen <= 0 || fstype[fstypelen - 1] != '\0') {
		error = EINVAL;
		if (errmsg != NULL)
			strncpy(errmsg, "Invalid fstype", errmsg_len);
		goto bail;
	}
	fspathlen = 0;
	error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
	if (error || fspathlen <= 0 || fspath[fspathlen - 1] != '\0') {
		error = EINVAL;
		if (errmsg != NULL)
			strncpy(errmsg, "Invalid fspath", errmsg_len);
		goto bail;
	}

	/*
	 * We need to see if we have the "update" option
	 * before we call vfs_domount(), since vfs_domount() has special
	 * logic based on MNT_UPDATE.  This is very important
	 * when we want to update the root filesystem.
832 */ 833 TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) { 834 int do_freeopt = 0; 835 836 if (strcmp(opt->name, "update") == 0) { 837 fsflags |= MNT_UPDATE; 838 do_freeopt = 1; 839 } 840 else if (strcmp(opt->name, "async") == 0) 841 fsflags |= MNT_ASYNC; 842 else if (strcmp(opt->name, "force") == 0) { 843 fsflags |= MNT_FORCE; 844 do_freeopt = 1; 845 } 846 else if (strcmp(opt->name, "reload") == 0) { 847 fsflags |= MNT_RELOAD; 848 do_freeopt = 1; 849 } 850 else if (strcmp(opt->name, "multilabel") == 0) 851 fsflags |= MNT_MULTILABEL; 852 else if (strcmp(opt->name, "noasync") == 0) 853 fsflags &= ~MNT_ASYNC; 854 else if (strcmp(opt->name, "noatime") == 0) 855 fsflags |= MNT_NOATIME; 856 else if (strcmp(opt->name, "atime") == 0) { 857 free(opt->name, M_MOUNT); 858 opt->name = strdup("nonoatime", M_MOUNT); 859 } 860 else if (strcmp(opt->name, "noclusterr") == 0) 861 fsflags |= MNT_NOCLUSTERR; 862 else if (strcmp(opt->name, "clusterr") == 0) { 863 free(opt->name, M_MOUNT); 864 opt->name = strdup("nonoclusterr", M_MOUNT); 865 } 866 else if (strcmp(opt->name, "noclusterw") == 0) 867 fsflags |= MNT_NOCLUSTERW; 868 else if (strcmp(opt->name, "clusterw") == 0) { 869 free(opt->name, M_MOUNT); 870 opt->name = strdup("nonoclusterw", M_MOUNT); 871 } 872 else if (strcmp(opt->name, "noexec") == 0) 873 fsflags |= MNT_NOEXEC; 874 else if (strcmp(opt->name, "exec") == 0) { 875 free(opt->name, M_MOUNT); 876 opt->name = strdup("nonoexec", M_MOUNT); 877 } 878 else if (strcmp(opt->name, "nosuid") == 0) 879 fsflags |= MNT_NOSUID; 880 else if (strcmp(opt->name, "suid") == 0) { 881 free(opt->name, M_MOUNT); 882 opt->name = strdup("nonosuid", M_MOUNT); 883 } 884 else if (strcmp(opt->name, "nosymfollow") == 0) 885 fsflags |= MNT_NOSYMFOLLOW; 886 else if (strcmp(opt->name, "symfollow") == 0) { 887 free(opt->name, M_MOUNT); 888 opt->name = strdup("nonosymfollow", M_MOUNT); 889 } 890 else if (strcmp(opt->name, "noro") == 0) { 891 fsflags &= ~MNT_RDONLY; 892 autoro = false; 893 } 894 else if (strcmp(opt->name, "rw") == 0) { 895 fsflags &= ~MNT_RDONLY; 896 autoro = false; 897 } 898 else if (strcmp(opt->name, "ro") == 0) { 899 fsflags |= MNT_RDONLY; 900 autoro = false; 901 } 902 else if (strcmp(opt->name, "rdonly") == 0) { 903 free(opt->name, M_MOUNT); 904 opt->name = strdup("ro", M_MOUNT); 905 fsflags |= MNT_RDONLY; 906 autoro = false; 907 } 908 else if (strcmp(opt->name, "autoro") == 0) { 909 do_freeopt = 1; 910 autoro = true; 911 } 912 else if (strcmp(opt->name, "suiddir") == 0) 913 fsflags |= MNT_SUIDDIR; 914 else if (strcmp(opt->name, "sync") == 0) 915 fsflags |= MNT_SYNCHRONOUS; 916 else if (strcmp(opt->name, "union") == 0) 917 fsflags |= MNT_UNION; 918 else if (strcmp(opt->name, "automounted") == 0) { 919 fsflags |= MNT_AUTOMOUNTED; 920 do_freeopt = 1; 921 } else if (strcmp(opt->name, "nocover") == 0) { 922 fsflags |= MNT_NOCOVER; 923 do_freeopt = 1; 924 } else if (strcmp(opt->name, "cover") == 0) { 925 fsflags &= ~MNT_NOCOVER; 926 do_freeopt = 1; 927 } else if (strcmp(opt->name, "emptydir") == 0) { 928 fsflags |= MNT_EMPTYDIR; 929 do_freeopt = 1; 930 } else if (strcmp(opt->name, "noemptydir") == 0) { 931 fsflags &= ~MNT_EMPTYDIR; 932 do_freeopt = 1; 933 } 934 if (do_freeopt) 935 vfs_freeopt(optlist, opt); 936 } 937 938 /* 939 * Be ultra-paranoid about making sure the type and fspath 940 * variables will fit in our mp buffers, including the 941 * terminating NUL. 
942 */ 943 if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) { 944 error = ENAMETOOLONG; 945 goto bail; 946 } 947 948 error = vfs_domount(td, fstype, fspath, fsflags, &optlist); 949 950 /* 951 * See if we can mount in the read-only mode if the error code suggests 952 * that it could be possible and the mount options allow for that. 953 * Never try it if "[no]{ro|rw}" has been explicitly requested and not 954 * overridden by "autoro". 955 */ 956 if (autoro && vfs_should_downgrade_to_ro_mount(fsflags, error)) { 957 printf("%s: R/W mount failed, possibly R/O media," 958 " trying R/O mount\n", __func__); 959 fsflags |= MNT_RDONLY; 960 error = vfs_domount(td, fstype, fspath, fsflags, &optlist); 961 } 962 bail: 963 /* copyout the errmsg */ 964 if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt) 965 && errmsg_len > 0 && errmsg != NULL) { 966 if (fsoptions->uio_segflg == UIO_SYSSPACE) { 967 bcopy(errmsg, 968 fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, 969 fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); 970 } else { 971 copyout(errmsg, 972 fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base, 973 fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len); 974 } 975 } 976 977 if (optlist != NULL) 978 vfs_freeopts(optlist); 979 return (error); 980 } 981 982 /* 983 * Old mount API. 984 */ 985 #ifndef _SYS_SYSPROTO_H_ 986 struct mount_args { 987 char *type; 988 char *path; 989 int flags; 990 caddr_t data; 991 }; 992 #endif 993 /* ARGSUSED */ 994 int 995 sys_mount(struct thread *td, struct mount_args *uap) 996 { 997 char *fstype; 998 struct vfsconf *vfsp = NULL; 999 struct mntarg *ma = NULL; 1000 uint64_t flags; 1001 int error; 1002 1003 /* 1004 * Mount flags are now 64-bits. On 32-bit architectures only 1005 * 32-bits are passed in, but from here on everything handles 1006 * 64-bit flags correctly. 1007 */ 1008 flags = uap->flags; 1009 1010 AUDIT_ARG_FFLAGS(flags); 1011 1012 /* 1013 * Filter out MNT_ROOTFS. We do not want clients of mount() in 1014 * userspace to set this flag, but we must filter it out if we want 1015 * MNT_UPDATE on the root file system to work. 1016 * MNT_ROOTFS should only be set by the kernel when mounting its 1017 * root file system. 1018 */ 1019 flags &= ~MNT_ROOTFS; 1020 1021 fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK); 1022 error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL); 1023 if (error) { 1024 free(fstype, M_TEMP); 1025 return (error); 1026 } 1027 1028 AUDIT_ARG_TEXT(fstype); 1029 vfsp = vfs_byname_kld(fstype, td, &error); 1030 free(fstype, M_TEMP); 1031 if (vfsp == NULL) 1032 return (ENOENT); 1033 if (((vfsp->vfc_flags & VFCF_SBDRY) != 0 && 1034 vfsp->vfc_vfsops_sd->vfs_cmount == NULL) || 1035 ((vfsp->vfc_flags & VFCF_SBDRY) == 0 && 1036 vfsp->vfc_vfsops->vfs_cmount == NULL)) 1037 return (EOPNOTSUPP); 1038 1039 ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN); 1040 ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN); 1041 ma = mount_argb(ma, flags & MNT_RDONLY, "noro"); 1042 ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid"); 1043 ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec"); 1044 1045 if ((vfsp->vfc_flags & VFCF_SBDRY) != 0) 1046 return (vfsp->vfc_vfsops_sd->vfs_cmount(ma, uap->data, flags)); 1047 return (vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags)); 1048 } 1049 1050 /* 1051 * vfs_domount_first(): first file system mount (not update) 1052 */ 1053 static int 1054 vfs_domount_first( 1055 struct thread *td, /* Calling thread. */ 1056 struct vfsconf *vfsp, /* File system type. */ 1057 char *fspath, /* Mount path. 
*/ 1058 struct vnode *vp, /* Vnode to be covered. */ 1059 uint64_t fsflags, /* Flags common to all filesystems. */ 1060 struct vfsoptlist **optlist /* Options local to the filesystem. */ 1061 ) 1062 { 1063 struct vattr va; 1064 struct mount *mp; 1065 struct vnode *newdp, *rootvp; 1066 int error, error1; 1067 bool unmounted; 1068 1069 ASSERT_VOP_ELOCKED(vp, __func__); 1070 KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here")); 1071 1072 if ((fsflags & MNT_EMPTYDIR) != 0) { 1073 error = vfs_emptydir(vp); 1074 if (error != 0) { 1075 vput(vp); 1076 return (error); 1077 } 1078 } 1079 1080 /* 1081 * If the jail of the calling thread lacks permission for this type of 1082 * file system, or is trying to cover its own root, deny immediately. 1083 */ 1084 if (jailed(td->td_ucred) && (!prison_allow(td->td_ucred, 1085 vfsp->vfc_prison_flag) || vp == td->td_ucred->cr_prison->pr_root)) { 1086 vput(vp); 1087 return (EPERM); 1088 } 1089 1090 /* 1091 * If the user is not root, ensure that they own the directory 1092 * onto which we are attempting to mount. 1093 */ 1094 error = VOP_GETATTR(vp, &va, td->td_ucred); 1095 if (error == 0 && va.va_uid != td->td_ucred->cr_uid) 1096 error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN); 1097 if (error == 0) 1098 error = vinvalbuf(vp, V_SAVE, 0, 0); 1099 if (error == 0 && vp->v_type != VDIR) 1100 error = ENOTDIR; 1101 if (error == 0) { 1102 VI_LOCK(vp); 1103 if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL) 1104 vp->v_iflag |= VI_MOUNT; 1105 else 1106 error = EBUSY; 1107 VI_UNLOCK(vp); 1108 } 1109 if (error != 0) { 1110 vput(vp); 1111 return (error); 1112 } 1113 vn_seqc_write_begin(vp); 1114 VOP_UNLOCK(vp); 1115 1116 /* Allocate and initialize the filesystem. */ 1117 mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred); 1118 /* XXXMAC: pass to vfs_mount_alloc? */ 1119 mp->mnt_optnew = *optlist; 1120 /* Set the mount level flags. */ 1121 mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY)); 1122 1123 /* 1124 * Mount the filesystem. 1125 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they 1126 * get. No freeing of cn_pnbuf. 1127 */ 1128 error1 = 0; 1129 unmounted = true; 1130 if ((error = VFS_MOUNT(mp)) != 0 || 1131 (error1 = VFS_STATFS(mp, &mp->mnt_stat)) != 0 || 1132 (error1 = VFS_ROOT(mp, LK_EXCLUSIVE, &newdp)) != 0) { 1133 rootvp = NULL; 1134 if (error1 != 0) { 1135 MPASS(error == 0); 1136 rootvp = vfs_cache_root_clear(mp); 1137 if (rootvp != NULL) { 1138 vhold(rootvp); 1139 vrele(rootvp); 1140 } 1141 (void)vn_start_write(NULL, &mp, V_WAIT); 1142 MNT_ILOCK(mp); 1143 mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_UNMOUNTF; 1144 MNT_IUNLOCK(mp); 1145 VFS_PURGE(mp); 1146 error = VFS_UNMOUNT(mp, 0); 1147 vn_finished_write(mp); 1148 if (error != 0) { 1149 printf( 1150 "failed post-mount (%d): rollback unmount returned %d\n", 1151 error1, error); 1152 unmounted = false; 1153 } 1154 error = error1; 1155 } 1156 vfs_unbusy(mp); 1157 mp->mnt_vnodecovered = NULL; 1158 if (unmounted) { 1159 /* XXXKIB wait for mnt_lockref drain? 
*/ 1160 vfs_mount_destroy(mp); 1161 } 1162 VI_LOCK(vp); 1163 vp->v_iflag &= ~VI_MOUNT; 1164 VI_UNLOCK(vp); 1165 if (rootvp != NULL) { 1166 vn_seqc_write_end(rootvp); 1167 vdrop(rootvp); 1168 } 1169 vn_seqc_write_end(vp); 1170 vrele(vp); 1171 return (error); 1172 } 1173 vn_seqc_write_begin(newdp); 1174 VOP_UNLOCK(newdp); 1175 1176 if (mp->mnt_opt != NULL) 1177 vfs_freeopts(mp->mnt_opt); 1178 mp->mnt_opt = mp->mnt_optnew; 1179 *optlist = NULL; 1180 1181 /* 1182 * Prevent external consumers of mount options from reading mnt_optnew. 1183 */ 1184 mp->mnt_optnew = NULL; 1185 1186 MNT_ILOCK(mp); 1187 if ((mp->mnt_flag & MNT_ASYNC) != 0 && 1188 (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) 1189 mp->mnt_kern_flag |= MNTK_ASYNC; 1190 else 1191 mp->mnt_kern_flag &= ~MNTK_ASYNC; 1192 MNT_IUNLOCK(mp); 1193 1194 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1195 cache_purge(vp); 1196 VI_LOCK(vp); 1197 vp->v_iflag &= ~VI_MOUNT; 1198 vn_irflag_set_locked(vp, VIRF_MOUNTPOINT); 1199 vp->v_mountedhere = mp; 1200 VI_UNLOCK(vp); 1201 /* Place the new filesystem at the end of the mount list. */ 1202 mtx_lock(&mountlist_mtx); 1203 TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); 1204 mtx_unlock(&mountlist_mtx); 1205 vfs_event_signal(NULL, VQ_MOUNT, 0); 1206 vn_lock(newdp, LK_EXCLUSIVE | LK_RETRY); 1207 VOP_UNLOCK(vp); 1208 EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td); 1209 VOP_UNLOCK(newdp); 1210 mount_devctl_event("MOUNT", mp, false); 1211 mountcheckdirs(vp, newdp); 1212 vn_seqc_write_end(vp); 1213 vn_seqc_write_end(newdp); 1214 vrele(newdp); 1215 if ((mp->mnt_flag & MNT_RDONLY) == 0) 1216 vfs_allocate_syncvnode(mp); 1217 vfs_op_exit(mp); 1218 vfs_unbusy(mp); 1219 return (0); 1220 } 1221 1222 /* 1223 * vfs_domount_update(): update of mounted file system 1224 */ 1225 static int 1226 vfs_domount_update( 1227 struct thread *td, /* Calling thread. */ 1228 struct vnode *vp, /* Mount point vnode. */ 1229 uint64_t fsflags, /* Flags common to all filesystems. */ 1230 struct vfsoptlist **optlist /* Options local to the filesystem. */ 1231 ) 1232 { 1233 struct export_args export; 1234 struct o2export_args o2export; 1235 struct vnode *rootvp; 1236 void *bufp; 1237 struct mount *mp; 1238 int error, export_error, i, len; 1239 uint64_t flag; 1240 gid_t *grps; 1241 1242 ASSERT_VOP_ELOCKED(vp, __func__); 1243 KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here")); 1244 mp = vp->v_mount; 1245 1246 if ((vp->v_vflag & VV_ROOT) == 0) { 1247 if (vfs_copyopt(*optlist, "export", &export, sizeof(export)) 1248 == 0) 1249 error = EXDEV; 1250 else 1251 error = EINVAL; 1252 vput(vp); 1253 return (error); 1254 } 1255 1256 /* 1257 * We only allow the filesystem to be reloaded if it 1258 * is currently mounted read-only. 1259 */ 1260 flag = mp->mnt_flag; 1261 if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) { 1262 vput(vp); 1263 return (EOPNOTSUPP); /* Needs translation */ 1264 } 1265 /* 1266 * Only privileged root, or (if MNT_USER is set) the user that 1267 * did the original mount is permitted to update it. 
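	 * vfs_suser() encapsulates exactly this ownership/privilege check.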
1268 */ 1269 error = vfs_suser(mp, td); 1270 if (error != 0) { 1271 vput(vp); 1272 return (error); 1273 } 1274 if (vfs_busy(mp, MBF_NOWAIT)) { 1275 vput(vp); 1276 return (EBUSY); 1277 } 1278 VI_LOCK(vp); 1279 if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) { 1280 VI_UNLOCK(vp); 1281 vfs_unbusy(mp); 1282 vput(vp); 1283 return (EBUSY); 1284 } 1285 vp->v_iflag |= VI_MOUNT; 1286 VI_UNLOCK(vp); 1287 VOP_UNLOCK(vp); 1288 1289 vfs_op_enter(mp); 1290 vn_seqc_write_begin(vp); 1291 1292 rootvp = NULL; 1293 MNT_ILOCK(mp); 1294 if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { 1295 MNT_IUNLOCK(mp); 1296 error = EBUSY; 1297 goto end; 1298 } 1299 mp->mnt_flag &= ~MNT_UPDATEMASK; 1300 mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | 1301 MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY); 1302 if ((mp->mnt_flag & MNT_ASYNC) == 0) 1303 mp->mnt_kern_flag &= ~MNTK_ASYNC; 1304 rootvp = vfs_cache_root_clear(mp); 1305 MNT_IUNLOCK(mp); 1306 mp->mnt_optnew = *optlist; 1307 vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt); 1308 1309 /* 1310 * Mount the filesystem. 1311 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they 1312 * get. No freeing of cn_pnbuf. 1313 */ 1314 error = VFS_MOUNT(mp); 1315 1316 export_error = 0; 1317 /* Process the export option. */ 1318 if (error == 0 && vfs_getopt(mp->mnt_optnew, "export", &bufp, 1319 &len) == 0) { 1320 /* Assume that there is only 1 ABI for each length. */ 1321 switch (len) { 1322 case (sizeof(struct oexport_args)): 1323 bzero(&o2export, sizeof(o2export)); 1324 /* FALLTHROUGH */ 1325 case (sizeof(o2export)): 1326 bcopy(bufp, &o2export, len); 1327 export.ex_flags = (uint64_t)o2export.ex_flags; 1328 export.ex_root = o2export.ex_root; 1329 export.ex_uid = o2export.ex_anon.cr_uid; 1330 export.ex_groups = NULL; 1331 export.ex_ngroups = o2export.ex_anon.cr_ngroups; 1332 if (export.ex_ngroups > 0) { 1333 if (export.ex_ngroups <= XU_NGROUPS) { 1334 export.ex_groups = malloc( 1335 export.ex_ngroups * sizeof(gid_t), 1336 M_TEMP, M_WAITOK); 1337 for (i = 0; i < export.ex_ngroups; i++) 1338 export.ex_groups[i] = 1339 o2export.ex_anon.cr_groups[i]; 1340 } else 1341 export_error = EINVAL; 1342 } else if (export.ex_ngroups < 0) 1343 export_error = EINVAL; 1344 export.ex_addr = o2export.ex_addr; 1345 export.ex_addrlen = o2export.ex_addrlen; 1346 export.ex_mask = o2export.ex_mask; 1347 export.ex_masklen = o2export.ex_masklen; 1348 export.ex_indexfile = o2export.ex_indexfile; 1349 export.ex_numsecflavors = o2export.ex_numsecflavors; 1350 if (export.ex_numsecflavors < MAXSECFLAVORS) { 1351 for (i = 0; i < export.ex_numsecflavors; i++) 1352 export.ex_secflavors[i] = 1353 o2export.ex_secflavors[i]; 1354 } else 1355 export_error = EINVAL; 1356 if (export_error == 0) 1357 export_error = vfs_export(mp, &export); 1358 free(export.ex_groups, M_TEMP); 1359 break; 1360 case (sizeof(export)): 1361 bcopy(bufp, &export, len); 1362 grps = NULL; 1363 if (export.ex_ngroups > 0) { 1364 if (export.ex_ngroups <= NGROUPS_MAX) { 1365 grps = malloc(export.ex_ngroups * 1366 sizeof(gid_t), M_TEMP, M_WAITOK); 1367 export_error = copyin(export.ex_groups, 1368 grps, export.ex_ngroups * 1369 sizeof(gid_t)); 1370 if (export_error == 0) 1371 export.ex_groups = grps; 1372 } else 1373 export_error = EINVAL; 1374 } else if (export.ex_ngroups == 0) 1375 export.ex_groups = NULL; 1376 else 1377 export_error = EINVAL; 1378 if (export_error == 0) 1379 export_error = vfs_export(mp, &export); 1380 free(grps, M_TEMP); 1381 break; 1382 default: 1383 export_error = EINVAL; 1384 break; 
1385 } 1386 } 1387 1388 MNT_ILOCK(mp); 1389 if (error == 0) { 1390 mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE | 1391 MNT_SNAPSHOT); 1392 } else { 1393 /* 1394 * If we fail, restore old mount flags. MNT_QUOTA is special, 1395 * because it is not part of MNT_UPDATEMASK, but it could have 1396 * changed in the meantime if quotactl(2) was called. 1397 * All in all we want current value of MNT_QUOTA, not the old 1398 * one. 1399 */ 1400 mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); 1401 } 1402 if ((mp->mnt_flag & MNT_ASYNC) != 0 && 1403 (mp->mnt_kern_flag & MNTK_NOASYNC) == 0) 1404 mp->mnt_kern_flag |= MNTK_ASYNC; 1405 else 1406 mp->mnt_kern_flag &= ~MNTK_ASYNC; 1407 MNT_IUNLOCK(mp); 1408 1409 if (error != 0) 1410 goto end; 1411 1412 mount_devctl_event("REMOUNT", mp, true); 1413 if (mp->mnt_opt != NULL) 1414 vfs_freeopts(mp->mnt_opt); 1415 mp->mnt_opt = mp->mnt_optnew; 1416 *optlist = NULL; 1417 (void)VFS_STATFS(mp, &mp->mnt_stat); 1418 /* 1419 * Prevent external consumers of mount options from reading 1420 * mnt_optnew. 1421 */ 1422 mp->mnt_optnew = NULL; 1423 1424 if ((mp->mnt_flag & MNT_RDONLY) == 0) 1425 vfs_allocate_syncvnode(mp); 1426 else 1427 vfs_deallocate_syncvnode(mp); 1428 end: 1429 vfs_op_exit(mp); 1430 if (rootvp != NULL) { 1431 vn_seqc_write_end(rootvp); 1432 vrele(rootvp); 1433 } 1434 vn_seqc_write_end(vp); 1435 vfs_unbusy(mp); 1436 VI_LOCK(vp); 1437 vp->v_iflag &= ~VI_MOUNT; 1438 VI_UNLOCK(vp); 1439 vrele(vp); 1440 return (error != 0 ? error : export_error); 1441 } 1442 1443 /* 1444 * vfs_domount(): actually attempt a filesystem mount. 1445 */ 1446 static int 1447 vfs_domount( 1448 struct thread *td, /* Calling thread. */ 1449 const char *fstype, /* Filesystem type. */ 1450 char *fspath, /* Mount path. */ 1451 uint64_t fsflags, /* Flags common to all filesystems. */ 1452 struct vfsoptlist **optlist /* Options local to the filesystem. */ 1453 ) 1454 { 1455 struct vfsconf *vfsp; 1456 struct nameidata nd; 1457 struct vnode *vp; 1458 char *pathbuf; 1459 int error; 1460 1461 /* 1462 * Be ultra-paranoid about making sure the type and fspath 1463 * variables will fit in our mp buffers, including the 1464 * terminating NUL. 1465 */ 1466 if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) 1467 return (ENAMETOOLONG); 1468 1469 if (jailed(td->td_ucred) || usermount == 0) { 1470 if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0) 1471 return (error); 1472 } 1473 1474 /* 1475 * Do not allow NFS export or MNT_SUIDDIR by unprivileged users. 1476 */ 1477 if (fsflags & MNT_EXPORTED) { 1478 error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED); 1479 if (error) 1480 return (error); 1481 } 1482 if (fsflags & MNT_SUIDDIR) { 1483 error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR); 1484 if (error) 1485 return (error); 1486 } 1487 /* 1488 * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users. 1489 */ 1490 if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) { 1491 if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0) 1492 fsflags |= MNT_NOSUID | MNT_USER; 1493 } 1494 1495 /* Load KLDs before we lock the covered vnode to avoid reversals. */ 1496 vfsp = NULL; 1497 if ((fsflags & MNT_UPDATE) == 0) { 1498 /* Don't try to load KLDs if we're mounting the root. */ 1499 if (fsflags & MNT_ROOTFS) 1500 vfsp = vfs_byname(fstype); 1501 else 1502 vfsp = vfs_byname_kld(fstype, td, &error); 1503 if (vfsp == NULL) 1504 return (ENODEV); 1505 } 1506 1507 /* 1508 * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE. 
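	 * The lookup follows symlinks and returns the vnode locked
	 * (FOLLOW | LOCKLEAF).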
1509 */ 1510 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 1511 UIO_SYSSPACE, fspath, td); 1512 error = namei(&nd); 1513 if (error != 0) 1514 return (error); 1515 NDFREE(&nd, NDF_ONLY_PNBUF); 1516 vp = nd.ni_vp; 1517 if ((fsflags & MNT_UPDATE) == 0) { 1518 if ((vp->v_vflag & VV_ROOT) != 0 && 1519 (fsflags & MNT_NOCOVER) != 0) { 1520 vput(vp); 1521 return (EBUSY); 1522 } 1523 pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); 1524 strcpy(pathbuf, fspath); 1525 error = vn_path_to_global_path(td, vp, pathbuf, MNAMELEN); 1526 if (error == 0) { 1527 error = vfs_domount_first(td, vfsp, pathbuf, vp, 1528 fsflags, optlist); 1529 } 1530 free(pathbuf, M_TEMP); 1531 } else 1532 error = vfs_domount_update(td, vp, fsflags, optlist); 1533 1534 return (error); 1535 } 1536 1537 /* 1538 * Unmount a filesystem. 1539 * 1540 * Note: unmount takes a path to the vnode mounted on as argument, not 1541 * special file (as before). 1542 */ 1543 #ifndef _SYS_SYSPROTO_H_ 1544 struct unmount_args { 1545 char *path; 1546 int flags; 1547 }; 1548 #endif 1549 /* ARGSUSED */ 1550 int 1551 sys_unmount(struct thread *td, struct unmount_args *uap) 1552 { 1553 1554 return (kern_unmount(td, uap->path, uap->flags)); 1555 } 1556 1557 int 1558 kern_unmount(struct thread *td, const char *path, int flags) 1559 { 1560 struct nameidata nd; 1561 struct mount *mp; 1562 char *pathbuf; 1563 int error, id0, id1; 1564 1565 AUDIT_ARG_VALUE(flags); 1566 if (jailed(td->td_ucred) || usermount == 0) { 1567 error = priv_check(td, PRIV_VFS_UNMOUNT); 1568 if (error) 1569 return (error); 1570 } 1571 1572 pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK); 1573 error = copyinstr(path, pathbuf, MNAMELEN, NULL); 1574 if (error) { 1575 free(pathbuf, M_TEMP); 1576 return (error); 1577 } 1578 if (flags & MNT_BYFSID) { 1579 AUDIT_ARG_TEXT(pathbuf); 1580 /* Decode the filesystem ID. */ 1581 if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) { 1582 free(pathbuf, M_TEMP); 1583 return (EINVAL); 1584 } 1585 1586 mtx_lock(&mountlist_mtx); 1587 TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { 1588 if (mp->mnt_stat.f_fsid.val[0] == id0 && 1589 mp->mnt_stat.f_fsid.val[1] == id1) { 1590 vfs_ref(mp); 1591 break; 1592 } 1593 } 1594 mtx_unlock(&mountlist_mtx); 1595 } else { 1596 /* 1597 * Try to find global path for path argument. 1598 */ 1599 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, 1600 UIO_SYSSPACE, pathbuf, td); 1601 if (namei(&nd) == 0) { 1602 NDFREE(&nd, NDF_ONLY_PNBUF); 1603 error = vn_path_to_global_path(td, nd.ni_vp, pathbuf, 1604 MNAMELEN); 1605 if (error == 0) 1606 vput(nd.ni_vp); 1607 } 1608 mtx_lock(&mountlist_mtx); 1609 TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) { 1610 if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) { 1611 vfs_ref(mp); 1612 break; 1613 } 1614 } 1615 mtx_unlock(&mountlist_mtx); 1616 } 1617 free(pathbuf, M_TEMP); 1618 if (mp == NULL) { 1619 /* 1620 * Previously we returned ENOENT for a nonexistent path and 1621 * EINVAL for a non-mountpoint. We cannot tell these apart 1622 * now, so in the !MNT_BYFSID case return the more likely 1623 * EINVAL for compatibility. 1624 */ 1625 return ((flags & MNT_BYFSID) ? ENOENT : EINVAL); 1626 } 1627 1628 /* 1629 * Don't allow unmounting the root filesystem. 1630 */ 1631 if (mp->mnt_flag & MNT_ROOTFS) { 1632 vfs_rel(mp); 1633 return (EINVAL); 1634 } 1635 error = dounmount(mp, flags, td); 1636 return (error); 1637 } 1638 1639 /* 1640 * Return error if any of the vnodes, ignoring the root vnode 1641 * and the syncer vnode, have non-zero usecount. 
1642 * 1643 * This function is purely advisory - it can return false positives 1644 * and negatives. 1645 */ 1646 static int 1647 vfs_check_usecounts(struct mount *mp) 1648 { 1649 struct vnode *vp, *mvp; 1650 1651 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 1652 if ((vp->v_vflag & VV_ROOT) == 0 && vp->v_type != VNON && 1653 vp->v_usecount != 0) { 1654 VI_UNLOCK(vp); 1655 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 1656 return (EBUSY); 1657 } 1658 VI_UNLOCK(vp); 1659 } 1660 1661 return (0); 1662 } 1663 1664 static void 1665 dounmount_cleanup(struct mount *mp, struct vnode *coveredvp, int mntkflags) 1666 { 1667 1668 mtx_assert(MNT_MTX(mp), MA_OWNED); 1669 mp->mnt_kern_flag &= ~mntkflags; 1670 if ((mp->mnt_kern_flag & MNTK_MWAIT) != 0) { 1671 mp->mnt_kern_flag &= ~MNTK_MWAIT; 1672 wakeup(mp); 1673 } 1674 vfs_op_exit_locked(mp); 1675 MNT_IUNLOCK(mp); 1676 if (coveredvp != NULL) { 1677 VOP_UNLOCK(coveredvp); 1678 vdrop(coveredvp); 1679 } 1680 vn_finished_write(mp); 1681 } 1682 1683 /* 1684 * There are various reference counters associated with the mount point. 1685 * Normally it is permitted to modify them without taking the mnt ilock, 1686 * but this behavior can be temporarily disabled if stable value is needed 1687 * or callers are expected to block (e.g. to not allow new users during 1688 * forced unmount). 1689 */ 1690 void 1691 vfs_op_enter(struct mount *mp) 1692 { 1693 struct mount_pcpu *mpcpu; 1694 int cpu; 1695 1696 MNT_ILOCK(mp); 1697 mp->mnt_vfs_ops++; 1698 if (mp->mnt_vfs_ops > 1) { 1699 MNT_IUNLOCK(mp); 1700 return; 1701 } 1702 vfs_op_barrier_wait(mp); 1703 CPU_FOREACH(cpu) { 1704 mpcpu = vfs_mount_pcpu_remote(mp, cpu); 1705 1706 mp->mnt_ref += mpcpu->mntp_ref; 1707 mpcpu->mntp_ref = 0; 1708 1709 mp->mnt_lockref += mpcpu->mntp_lockref; 1710 mpcpu->mntp_lockref = 0; 1711 1712 mp->mnt_writeopcount += mpcpu->mntp_writeopcount; 1713 mpcpu->mntp_writeopcount = 0; 1714 } 1715 if (mp->mnt_ref <= 0 || mp->mnt_lockref < 0 || mp->mnt_writeopcount < 0) 1716 panic("%s: invalid count(s) on mp %p: ref %d lockref %d writeopcount %d\n", 1717 __func__, mp, mp->mnt_ref, mp->mnt_lockref, mp->mnt_writeopcount); 1718 MNT_IUNLOCK(mp); 1719 vfs_assert_mount_counters(mp); 1720 } 1721 1722 void 1723 vfs_op_exit_locked(struct mount *mp) 1724 { 1725 1726 mtx_assert(MNT_MTX(mp), MA_OWNED); 1727 1728 if (mp->mnt_vfs_ops <= 0) 1729 panic("%s: invalid vfs_ops count %d for mp %p\n", 1730 __func__, mp->mnt_vfs_ops, mp); 1731 mp->mnt_vfs_ops--; 1732 } 1733 1734 void 1735 vfs_op_exit(struct mount *mp) 1736 { 1737 1738 MNT_ILOCK(mp); 1739 vfs_op_exit_locked(mp); 1740 MNT_IUNLOCK(mp); 1741 } 1742 1743 struct vfs_op_barrier_ipi { 1744 struct mount *mp; 1745 struct smp_rendezvous_cpus_retry_arg srcra; 1746 }; 1747 1748 static void 1749 vfs_op_action_func(void *arg) 1750 { 1751 struct vfs_op_barrier_ipi *vfsopipi; 1752 struct mount *mp; 1753 1754 vfsopipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra); 1755 mp = vfsopipi->mp; 1756 1757 if (!vfs_op_thread_entered(mp)) 1758 smp_rendezvous_cpus_done(arg); 1759 } 1760 1761 static void 1762 vfs_op_wait_func(void *arg, int cpu) 1763 { 1764 struct vfs_op_barrier_ipi *vfsopipi; 1765 struct mount *mp; 1766 struct mount_pcpu *mpcpu; 1767 1768 vfsopipi = __containerof(arg, struct vfs_op_barrier_ipi, srcra); 1769 mp = vfsopipi->mp; 1770 1771 mpcpu = vfs_mount_pcpu_remote(mp, cpu); 1772 while (atomic_load_int(&mpcpu->mntp_thread_in_ops)) 1773 cpu_spinwait(); 1774 } 1775 1776 void 1777 vfs_op_barrier_wait(struct mount *mp) 1778 { 1779 struct vfs_op_barrier_ipi vfsopipi; 1780 1781 
vfsopipi.mp = mp; 1782 1783 smp_rendezvous_cpus_retry(all_cpus, 1784 smp_no_rendezvous_barrier, 1785 vfs_op_action_func, 1786 smp_no_rendezvous_barrier, 1787 vfs_op_wait_func, 1788 &vfsopipi.srcra); 1789 } 1790 1791 #ifdef DIAGNOSTIC 1792 void 1793 vfs_assert_mount_counters(struct mount *mp) 1794 { 1795 struct mount_pcpu *mpcpu; 1796 int cpu; 1797 1798 if (mp->mnt_vfs_ops == 0) 1799 return; 1800 1801 CPU_FOREACH(cpu) { 1802 mpcpu = vfs_mount_pcpu_remote(mp, cpu); 1803 if (mpcpu->mntp_ref != 0 || 1804 mpcpu->mntp_lockref != 0 || 1805 mpcpu->mntp_writeopcount != 0) 1806 vfs_dump_mount_counters(mp); 1807 } 1808 } 1809 1810 void 1811 vfs_dump_mount_counters(struct mount *mp) 1812 { 1813 struct mount_pcpu *mpcpu; 1814 int ref, lockref, writeopcount; 1815 int cpu; 1816 1817 printf("%s: mp %p vfs_ops %d\n", __func__, mp, mp->mnt_vfs_ops); 1818 1819 printf(" ref : "); 1820 ref = mp->mnt_ref; 1821 CPU_FOREACH(cpu) { 1822 mpcpu = vfs_mount_pcpu_remote(mp, cpu); 1823 printf("%d ", mpcpu->mntp_ref); 1824 ref += mpcpu->mntp_ref; 1825 } 1826 printf("\n"); 1827 printf(" lockref : "); 1828 lockref = mp->mnt_lockref; 1829 CPU_FOREACH(cpu) { 1830 mpcpu = vfs_mount_pcpu_remote(mp, cpu); 1831 printf("%d ", mpcpu->mntp_lockref); 1832 lockref += mpcpu->mntp_lockref; 1833 } 1834 printf("\n"); 1835 printf("writeopcount: "); 1836 writeopcount = mp->mnt_writeopcount; 1837 CPU_FOREACH(cpu) { 1838 mpcpu = vfs_mount_pcpu_remote(mp, cpu); 1839 printf("%d ", mpcpu->mntp_writeopcount); 1840 writeopcount += mpcpu->mntp_writeopcount; 1841 } 1842 printf("\n"); 1843 1844 printf("counter struct total\n"); 1845 printf("ref %-5d %-5d\n", mp->mnt_ref, ref); 1846 printf("lockref %-5d %-5d\n", mp->mnt_lockref, lockref); 1847 printf("writeopcount %-5d %-5d\n", mp->mnt_writeopcount, writeopcount); 1848 1849 panic("invalid counts on struct mount"); 1850 } 1851 #endif 1852 1853 int 1854 vfs_mount_fetch_counter(struct mount *mp, enum mount_counter which) 1855 { 1856 struct mount_pcpu *mpcpu; 1857 int cpu, sum; 1858 1859 switch (which) { 1860 case MNT_COUNT_REF: 1861 sum = mp->mnt_ref; 1862 break; 1863 case MNT_COUNT_LOCKREF: 1864 sum = mp->mnt_lockref; 1865 break; 1866 case MNT_COUNT_WRITEOPCOUNT: 1867 sum = mp->mnt_writeopcount; 1868 break; 1869 } 1870 1871 CPU_FOREACH(cpu) { 1872 mpcpu = vfs_mount_pcpu_remote(mp, cpu); 1873 switch (which) { 1874 case MNT_COUNT_REF: 1875 sum += mpcpu->mntp_ref; 1876 break; 1877 case MNT_COUNT_LOCKREF: 1878 sum += mpcpu->mntp_lockref; 1879 break; 1880 case MNT_COUNT_WRITEOPCOUNT: 1881 sum += mpcpu->mntp_writeopcount; 1882 break; 1883 } 1884 } 1885 return (sum); 1886 } 1887 1888 static bool 1889 deferred_unmount_enqueue(struct mount *mp, uint64_t flags, bool requeue) 1890 { 1891 bool enqueued; 1892 1893 enqueued = false; 1894 mtx_lock(&deferred_unmount_lock); 1895 if ((mp->mnt_taskqueue_flags & MNT_DEFERRED) == 0 || requeue) { 1896 mp->mnt_taskqueue_flags = flags | MNT_DEFERRED; 1897 STAILQ_INSERT_TAIL(&deferred_unmount_list, mp, 1898 mnt_taskqueue_link); 1899 enqueued = true; 1900 } 1901 mtx_unlock(&deferred_unmount_lock); 1902 1903 if (enqueued) { 1904 taskqueue_enqueue(taskqueue_deferred_unmount, 1905 &deferred_unmount_task); 1906 } 1907 1908 return (enqueued); 1909 } 1910 1911 /* 1912 * Taskqueue handler for processing async/recursive unmounts 1913 */ 1914 static void 1915 vfs_deferred_unmount(void *argi __unused, int pending __unused) 1916 { 1917 STAILQ_HEAD(, mount) local_unmounts; 1918 uint64_t flags; 1919 struct mount *mp, *tmp; 1920 bool unmounted; 1921 1922 STAILQ_INIT(&local_unmounts); 1923 
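	/*
	 * Grab the entire pending list under the lock so that new deferred
	 * requests can be queued while this batch is being processed.
	 */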
mtx_lock(&deferred_unmount_lock); 1924 STAILQ_CONCAT(&local_unmounts, &deferred_unmount_list); 1925 mtx_unlock(&deferred_unmount_lock); 1926 1927 STAILQ_FOREACH_SAFE(mp, &local_unmounts, mnt_taskqueue_link, tmp) { 1928 flags = mp->mnt_taskqueue_flags; 1929 KASSERT((flags & MNT_DEFERRED) != 0, 1930 ("taskqueue unmount without MNT_DEFERRED")); 1931 if (dounmount(mp, flags, curthread) != 0) { 1932 MNT_ILOCK(mp); 1933 unmounted = ((mp->mnt_kern_flag & MNTK_REFEXPIRE) != 0); 1934 MNT_IUNLOCK(mp); 1935 if (!unmounted) 1936 deferred_unmount_enqueue(mp, flags, true); 1937 else 1938 vfs_rel(mp); 1939 } 1940 } 1941 } 1942 1943 /* 1944 * Do the actual filesystem unmount. 1945 */ 1946 int 1947 dounmount(struct mount *mp, uint64_t flags, struct thread *td) 1948 { 1949 struct mount_upper_node *upper; 1950 struct vnode *coveredvp, *rootvp; 1951 int error; 1952 uint64_t async_flag; 1953 int mnt_gen_r; 1954 1955 KASSERT((flags & MNT_DEFERRED) == 0 || 1956 (flags & (MNT_RECURSE | MNT_FORCE)) == (MNT_RECURSE | MNT_FORCE), 1957 ("MNT_DEFERRED requires MNT_RECURSE | MNT_FORCE")); 1958 1959 /* 1960 * If the caller has explicitly requested the unmount to be handled by 1961 * the taskqueue and we're not already in taskqueue context, queue 1962 * up the unmount request and exit. This is done prior to any 1963 * credential checks; MNT_DEFERRED should be used only for kernel- 1964 * initiated unmounts and will therefore be processed with the 1965 * (kernel) credentials of the taskqueue thread. Still, callers 1966 * should be sure this is the behavior they want. 1967 */ 1968 if ((flags & MNT_DEFERRED) != 0 && 1969 taskqueue_member(taskqueue_deferred_unmount, curthread) == 0) { 1970 if (!deferred_unmount_enqueue(mp, flags, false)) 1971 vfs_rel(mp); 1972 return (EINPROGRESS); 1973 } 1974 1975 /* 1976 * Only privileged root, or (if MNT_USER is set) the user that did the 1977 * original mount is permitted to unmount this filesystem. 1978 * This check should be made prior to queueing up any recursive 1979 * unmounts of upper filesystems. Those unmounts will be executed 1980 * with kernel thread credentials and are expected to succeed, so 1981 * we must at least ensure the originating context has sufficient 1982 * privilege to unmount the base filesystem before proceeding with 1983 * the uppers. 1984 */ 1985 error = vfs_suser(mp, td); 1986 if (error != 0) { 1987 KASSERT((flags & MNT_DEFERRED) == 0, 1988 ("taskqueue unmount with insufficient privilege")); 1989 vfs_rel(mp); 1990 return (error); 1991 } 1992 1993 if (recursive_forced_unmount && ((flags & MNT_FORCE) != 0)) 1994 flags |= MNT_RECURSE; 1995 1996 if ((flags & MNT_RECURSE) != 0) { 1997 KASSERT((flags & MNT_FORCE) != 0, 1998 ("MNT_RECURSE requires MNT_FORCE")); 1999 2000 MNT_ILOCK(mp); 2001 /* 2002 * Set MNTK_RECURSE to prevent new upper mounts from being 2003 * added, and note that an operation on the uppers list is in 2004 * progress. This will ensure that unregistration from the 2005 * uppers list, and therefore any pending unmount of the upper 2006 * FS, can't complete until after we finish walking the list. 
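		 * (vfs_register_upper_from_vp() refuses new registrations once
		 * MNTK_RECURSE or MNTK_UNMOUNT is set.)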
		 */
		mp->mnt_kern_flag |= MNTK_RECURSE;
		mp->mnt_upper_pending++;
		TAILQ_FOREACH(upper, &mp->mnt_uppers, mnt_upper_link) {
			MNT_IUNLOCK(mp);
			vfs_ref(upper->mp);
			if (!deferred_unmount_enqueue(upper->mp, flags, false))
				vfs_rel(upper->mp);
			MNT_ILOCK(mp);
		}
		mp->mnt_upper_pending--;
		if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 &&
		    mp->mnt_upper_pending == 0) {
			mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER;
			wakeup(&mp->mnt_uppers);
		}
		/*
		 * If we're not on the taskqueue, wait until the uppers list
		 * is drained before proceeding with unmount.  Otherwise, if
		 * we are on the taskqueue and there are still pending uppers,
		 * just re-enqueue on the end of the taskqueue.
		 */
		if ((flags & MNT_DEFERRED) == 0) {
			while (!TAILQ_EMPTY(&mp->mnt_uppers)) {
				mp->mnt_kern_flag |= MNTK_TASKQUEUE_WAITER;
				msleep(&mp->mnt_taskqueue_link, MNT_MTX(mp), 0,
				    "umntqw", 0);
			}
		} else if (!TAILQ_EMPTY(&mp->mnt_uppers)) {
			MNT_IUNLOCK(mp);
			deferred_unmount_enqueue(mp, flags, true);
			return (0);
		}
		MNT_IUNLOCK(mp);
		KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers not empty"));
	}

	/* Allow the taskqueue to safely re-enqueue on failure */
	if ((flags & MNT_DEFERRED) != 0)
		vfs_ref(mp);

	if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
		mnt_gen_r = mp->mnt_gen;
		VI_LOCK(coveredvp);
		vholdl(coveredvp);
		vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
		/*
		 * Check for mp being unmounted while waiting for the
		 * covered vnode lock.
		 */
		if (coveredvp->v_mountedhere != mp ||
		    coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
			VOP_UNLOCK(coveredvp);
			vdrop(coveredvp);
			vfs_rel(mp);
			return (EBUSY);
		}
	}

	vfs_op_enter(mp);

	vn_start_write(NULL, &mp, V_WAIT | V_MNTREF);
	MNT_ILOCK(mp);
	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 ||
	    (mp->mnt_flag & MNT_UPDATE) != 0 ||
	    !TAILQ_EMPTY(&mp->mnt_uppers)) {
		dounmount_cleanup(mp, coveredvp, 0);
		return (EBUSY);
	}
	mp->mnt_kern_flag |= MNTK_UNMOUNT;
	rootvp = vfs_cache_root_clear(mp);
	if (coveredvp != NULL)
		vn_seqc_write_begin(coveredvp);
	if (flags & MNT_NONBUSY) {
		MNT_IUNLOCK(mp);
		error = vfs_check_usecounts(mp);
		MNT_ILOCK(mp);
		if (error != 0) {
			vn_seqc_write_end(coveredvp);
			dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT);
			if (rootvp != NULL) {
				vn_seqc_write_end(rootvp);
				vrele(rootvp);
			}
			return (error);
		}
	}
	/* Allow filesystems to detect that a forced unmount is in progress. */
	if (flags & MNT_FORCE) {
		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
		MNT_IUNLOCK(mp);
		/*
		 * Must be done after setting MNTK_UNMOUNTF and before
		 * waiting for mnt_lockref to become 0.
		 */
		VFS_PURGE(mp);
		MNT_ILOCK(mp);
	}
	error = 0;
	if (mp->mnt_lockref) {
		mp->mnt_kern_flag |= MNTK_DRAINING;
		error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS,
		    "mount drain", 0);
	}
	MNT_IUNLOCK(mp);
	KASSERT(mp->mnt_lockref == 0,
	    ("%s: invalid lock refcount in the drain path @ %s:%d",
	    __func__, __FILE__, __LINE__));
	KASSERT(error == 0,
	    ("%s: invalid return value for msleep in the drain path @ %s:%d",
	    __func__, __FILE__, __LINE__));

	/*
	 * We want to keep the vnode around so that we can vn_seqc_write_end
	 * after we are done with unmount.  Downgrade our reference to a mere
	 * hold count so that we don't interfere with anything.
	 */
	if (rootvp != NULL) {
		vhold(rootvp);
		vrele(rootvp);
	}

	if (mp->mnt_flag & MNT_EXPUBLIC)
		vfs_setpublicfs(NULL, NULL, NULL);

	vfs_periodic(mp, MNT_WAIT);
	MNT_ILOCK(mp);
	async_flag = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	mp->mnt_kern_flag &= ~MNTK_ASYNC;
	MNT_IUNLOCK(mp);
	vfs_deallocate_syncvnode(mp);
	error = VFS_UNMOUNT(mp, flags);
	vn_finished_write(mp);
	/*
	 * If we failed to flush the dirty blocks for this mount point,
	 * undo all the cdir/rdir and rootvnode changes we made above,
	 * unless we failed because the device reports that it no longer
	 * exists.
	 */
	if (error && error != ENXIO) {
		MNT_ILOCK(mp);
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			MNT_IUNLOCK(mp);
			vfs_allocate_syncvnode(mp);
			MNT_ILOCK(mp);
		}
		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
		mp->mnt_flag |= async_flag;
		if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
		    (mp->mnt_kern_flag & MNTK_NOASYNC) == 0)
			mp->mnt_kern_flag |= MNTK_ASYNC;
		if (mp->mnt_kern_flag & MNTK_MWAIT) {
			mp->mnt_kern_flag &= ~MNTK_MWAIT;
			wakeup(mp);
		}
		vfs_op_exit_locked(mp);
		MNT_IUNLOCK(mp);
		if (coveredvp) {
			vn_seqc_write_end(coveredvp);
			VOP_UNLOCK(coveredvp);
			vdrop(coveredvp);
		}
		if (rootvp != NULL) {
			vn_seqc_write_end(rootvp);
			vdrop(rootvp);
		}
		return (error);
	}

	mtx_lock(&mountlist_mtx);
	TAILQ_REMOVE(&mountlist, mp, mnt_list);
	mtx_unlock(&mountlist_mtx);
	EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td);
	if (coveredvp != NULL) {
		VI_LOCK(coveredvp);
		vn_irflag_unset_locked(coveredvp, VIRF_MOUNTPOINT);
		coveredvp->v_mountedhere = NULL;
		vn_seqc_write_end_locked(coveredvp);
		VI_UNLOCK(coveredvp);
		VOP_UNLOCK(coveredvp);
		vdrop(coveredvp);
	}
	mount_devctl_event("UNMOUNT", mp, false);
	if (rootvp != NULL) {
		vn_seqc_write_end(rootvp);
		vdrop(rootvp);
	}
	vfs_event_signal(NULL, VQ_UNMOUNT, 0);
	if (rootvnode != NULL && mp == rootvnode->v_mount) {
		vrele(rootvnode);
		rootvnode = NULL;
	}
	if (mp == rootdevmp)
		rootdevmp = NULL;
	if ((flags & MNT_DEFERRED) != 0)
		vfs_rel(mp);
	vfs_mount_destroy(mp);
	return (0);
}
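/*
 * Illustrative sketch (not part of the code above): a kernel-initiated,
 * forced unmount passes ownership of the caller's mount reference to
 * dounmount(), which either queues the request (returning EINPROGRESS)
 * or drops the reference itself on completion or failure:
 *
 *	vfs_ref(mp);
 *	error = dounmount(mp, MNT_FORCE | MNT_RECURSE | MNT_DEFERRED,
 *	    curthread);
 */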
/*
 * Report errors during filesystem mounting.
 */
void
vfs_mount_error(struct mount *mp, const char *fmt, ...)
{
	struct vfsoptlist *moptlist = mp->mnt_optnew;
	va_list ap;
	int error, len;
	char *errmsg;

	error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
	if (error || errmsg == NULL || len <= 0)
		return;

	va_start(ap, fmt);
	vsnprintf(errmsg, (size_t)len, fmt, ap);
	va_end(ap);
}

void
vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...)
{
	va_list ap;
	int error, len;
	char *errmsg;

	error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len);
	if (error || errmsg == NULL || len <= 0)
		return;

	va_start(ap, fmt);
	vsnprintf(errmsg, (size_t)len, fmt, ap);
	va_end(ap);
}

/*
 * ---------------------------------------------------------------------
 * Functions for querying mount options/arguments from filesystems.
 */

/*
 * Check that no unknown options are given.
 */
int
vfs_filteropt(struct vfsoptlist *opts, const char **legal)
{
	struct vfsopt *opt;
	char errmsg[255];
	const char **t, *p, *q;
	int ret = 0;

	TAILQ_FOREACH(opt, opts, link) {
		p = opt->name;
		q = NULL;
		if (p[0] == 'n' && p[1] == 'o')
			q = p + 2;
		for (t = global_opts; *t != NULL; t++) {
			if (strcmp(*t, p) == 0)
				break;
			if (q != NULL) {
				if (strcmp(*t, q) == 0)
					break;
			}
		}
		if (*t != NULL)
			continue;
		for (t = legal; *t != NULL; t++) {
			if (strcmp(*t, p) == 0)
				break;
			if (q != NULL) {
				if (strcmp(*t, q) == 0)
					break;
			}
		}
		if (*t != NULL)
			continue;
		snprintf(errmsg, sizeof(errmsg),
		    "mount option <%s> is unknown", p);
		ret = EINVAL;
	}
	if (ret != 0) {
		TAILQ_FOREACH(opt, opts, link) {
			if (strcmp(opt->name, "errmsg") == 0) {
				strncpy((char *)opt->value, errmsg, opt->len);
				break;
			}
		}
		if (opt == NULL)
			printf("%s\n", errmsg);
	}
	return (ret);
}
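/*
 * Illustrative sketch (not part of this file): a filesystem's mount
 * routine would typically list the option names it understands in a
 * NULL-terminated array and reject anything else.  The array name and
 * option set below are hypothetical:
 *
 *	static const char *myfs_opts[] = {
 *		"from", "export", "ro", NULL
 *	};
 *	...
 *	error = vfs_filteropt(mp->mnt_optnew, myfs_opts);
 *	if (error != 0)
 *		return (error);
 *
 * Global options and "no"-prefixed forms of listed options are accepted
 * without being named explicitly.
 */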
/*
 * Get a mount option by its name.
 *
 * Return 0 if the option was found, ENOENT otherwise.
 * If len is non-NULL it will be filled with the length
 * of the option.  If buf is non-NULL, it will be filled
 * with the address of the option.
 */
int
vfs_getopt(struct vfsoptlist *opts, const char *name, void **buf, int *len)
{
	struct vfsopt *opt;

	KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));

	TAILQ_FOREACH(opt, opts, link) {
		if (strcmp(name, opt->name) == 0) {
			opt->seen = 1;
			if (len != NULL)
				*len = opt->len;
			if (buf != NULL)
				*buf = opt->value;
			return (0);
		}
	}
	return (ENOENT);
}

int
vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
{
	struct vfsopt *opt;

	if (opts == NULL)
		return (-1);

	TAILQ_FOREACH(opt, opts, link) {
		if (strcmp(name, opt->name) == 0) {
			opt->seen = 1;
			return (opt->pos);
		}
	}
	return (-1);
}

int
vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value)
{
	char *opt_value, *vtp;
	quad_t iv;
	int error, opt_len;

	error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len);
	if (error != 0)
		return (error);
	if (opt_len == 0 || opt_value == NULL)
		return (EINVAL);
	if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0')
		return (EINVAL);
	iv = strtoq(opt_value, &vtp, 0);
	if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0'))
		return (EINVAL);
	if (iv < 0)
		return (EINVAL);
	switch (vtp[0]) {
	case 't': case 'T':
		iv *= 1024;
		/* FALLTHROUGH */
	case 'g': case 'G':
		iv *= 1024;
		/* FALLTHROUGH */
	case 'm': case 'M':
		iv *= 1024;
		/* FALLTHROUGH */
	case 'k': case 'K':
		iv *= 1024;
		/* FALLTHROUGH */
	case '\0':
		break;
	default:
		return (EINVAL);
	}
	*value = iv;

	return (0);
}

char *
vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
{
	struct vfsopt *opt;

	*error = 0;
	TAILQ_FOREACH(opt, opts, link) {
		if (strcmp(name, opt->name) != 0)
			continue;
		opt->seen = 1;
		if (opt->len == 0 ||
		    ((char *)opt->value)[opt->len - 1] != '\0') {
			*error = EINVAL;
			return (NULL);
		}
		return (opt->value);
	}
	*error = ENOENT;
	return (NULL);
}

int
vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w,
    uint64_t val)
{
	struct vfsopt *opt;

	TAILQ_FOREACH(opt, opts, link) {
		if (strcmp(name, opt->name) == 0) {
			opt->seen = 1;
			if (w != NULL)
				*w |= val;
			return (1);
		}
	}
	if (w != NULL)
		*w &= ~val;
	return (0);
}
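/*
 * Illustrative sketch (not part of this file): option values carrying a
 * size suffix are accepted by vfs_getopt_size() ("64m" yields 67108864),
 * and boolean options are commonly mapped onto flag bits with
 * vfs_flagopt().  The option names below are hypothetical:
 *
 *	off_t cache_size;
 *	uint64_t flags = 0;
 *
 *	if (vfs_getopt_size(mp->mnt_optnew, "cachesize", &cache_size) != 0)
 *		cache_size = 0;
 *	vfs_flagopt(mp->mnt_optnew, "ro", &flags, MNT_RDONLY);
 */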
int
vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...)
{
	va_list ap;
	struct vfsopt *opt;
	int ret;

	KASSERT(opts != NULL, ("vfs_scanopt: caller passed 'opts' as NULL"));

	TAILQ_FOREACH(opt, opts, link) {
		if (strcmp(name, opt->name) != 0)
			continue;
		opt->seen = 1;
		if (opt->len == 0 || opt->value == NULL)
			return (0);
		if (((char *)opt->value)[opt->len - 1] != '\0')
			return (0);
		va_start(ap, fmt);
		ret = vsscanf(opt->value, fmt, ap);
		va_end(ap);
		return (ret);
	}
	return (0);
}

int
vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len)
{
	struct vfsopt *opt;

	TAILQ_FOREACH(opt, opts, link) {
		if (strcmp(name, opt->name) != 0)
			continue;
		opt->seen = 1;
		if (opt->value == NULL)
			opt->len = len;
		else {
			if (opt->len != len)
				return (EINVAL);
			bcopy(value, opt->value, len);
		}
		return (0);
	}
	return (ENOENT);
}

int
vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len)
{
	struct vfsopt *opt;

	TAILQ_FOREACH(opt, opts, link) {
		if (strcmp(name, opt->name) != 0)
			continue;
		opt->seen = 1;
		if (opt->value == NULL)
			opt->len = len;
		else {
			if (opt->len < len)
				return (EINVAL);
			opt->len = len;
			bcopy(value, opt->value, len);
		}
		return (0);
	}
	return (ENOENT);
}

int
vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value)
{
	struct vfsopt *opt;

	TAILQ_FOREACH(opt, opts, link) {
		if (strcmp(name, opt->name) != 0)
			continue;
		opt->seen = 1;
		if (opt->value == NULL)
			opt->len = strlen(value) + 1;
		else if (strlcpy(opt->value, value, opt->len) >= opt->len)
			return (EINVAL);
		return (0);
	}
	return (ENOENT);
}

/*
 * Find and copy a mount option.
 *
 * The size of the buffer has to be specified
 * in len; if it is not the same length as the
 * mount option, EINVAL is returned.
 * Returns ENOENT if the option is not found.
 */
int
vfs_copyopt(struct vfsoptlist *opts, const char *name, void *dest, int len)
{
	struct vfsopt *opt;

	KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL"));

	TAILQ_FOREACH(opt, opts, link) {
		if (strcmp(name, opt->name) == 0) {
			opt->seen = 1;
			if (len != opt->len)
				return (EINVAL);
			bcopy(opt->value, dest, opt->len);
			return (0);
		}
	}
	return (ENOENT);
}
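/*
 * Illustrative sketch (not part of this file): vfs_scanopt() returns the
 * number of successful vsscanf() conversions, so a single numeric option
 * is usually read as follows (the option name and default are
 * hypothetical):
 *
 *	int timeout;
 *
 *	if (vfs_scanopt(mp->mnt_optnew, "timeout", "%d", &timeout) != 1)
 *		timeout = 30;
 */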
int
__vfs_statfs(struct mount *mp, struct statfs *sbp)
{

	/*
	 * Filesystems only fill in part of the structure for updates, so
	 * we have to read the entirety first to get all content.
	 */
	if (sbp != &mp->mnt_stat)
		memcpy(sbp, &mp->mnt_stat, sizeof(*sbp));

	/*
	 * Set these in case the underlying filesystem fails to do so.
	 */
	sbp->f_version = STATFS_VERSION;
	sbp->f_namemax = NAME_MAX;
	sbp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;

	return (mp->mnt_op->vfs_statfs(mp, sbp));
}

void
vfs_mountedfrom(struct mount *mp, const char *from)
{

	bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname);
	strlcpy(mp->mnt_stat.f_mntfromname, from,
	    sizeof mp->mnt_stat.f_mntfromname);
}

/*
 * ---------------------------------------------------------------------
 * This is the API for building mount args and mounting filesystems from
 * inside the kernel.
 *
 * The API works by accumulation of individual args.  The first error is
 * latched.
 *
 * XXX: should be documented in new manpage kernel_mount(9)
 */

/* A memory allocation which must be freed when we are done */
struct mntaarg {
	SLIST_ENTRY(mntaarg) next;
};

/* The header for the mount arguments */
struct mntarg {
	struct iovec *v;
	int len;
	int error;
	SLIST_HEAD(, mntaarg) list;
};

/*
 * Add a boolean argument.
 *
 * flag is the boolean value.
 * name must start with "no".
 */
struct mntarg *
mount_argb(struct mntarg *ma, int flag, const char *name)
{

	KASSERT(name[0] == 'n' && name[1] == 'o',
	    ("mount_argb(...,%s): name must start with 'no'", name));

	return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0));
}

/*
 * Add an argument printf style.
 */
struct mntarg *
mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...)
{
	va_list ap;
	struct mntaarg *maa;
	struct sbuf *sb;
	int len;

	if (ma == NULL) {
		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
		SLIST_INIT(&ma->list);
	}
	if (ma->error)
		return (ma);

	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
	    M_MOUNT, M_WAITOK);
	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
	ma->v[ma->len].iov_len = strlen(name) + 1;
	ma->len++;

	sb = sbuf_new_auto();
	va_start(ap, fmt);
	sbuf_vprintf(sb, fmt, ap);
	va_end(ap);
	sbuf_finish(sb);
	len = sbuf_len(sb) + 1;
	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
	SLIST_INSERT_HEAD(&ma->list, maa, next);
	bcopy(sbuf_data(sb), maa + 1, len);
	sbuf_delete(sb);

	ma->v[ma->len].iov_base = maa + 1;
	ma->v[ma->len].iov_len = len;
	ma->len++;

	return (ma);
}

/*
 * Add an argument which is a userland string.
 */
struct mntarg *
mount_argsu(struct mntarg *ma, const char *name, const void *val, int len)
{
	struct mntaarg *maa;
	char *tbuf;

	if (val == NULL)
		return (ma);
	if (ma == NULL) {
		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
		SLIST_INIT(&ma->list);
	}
	if (ma->error)
		return (ma);
	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
	SLIST_INSERT_HEAD(&ma->list, maa, next);
	tbuf = (void *)(maa + 1);
	ma->error = copyinstr(val, tbuf, len, NULL);
	return (mount_arg(ma, name, tbuf, -1));
}
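/*
 * Illustrative sketch (not part of this file): a compatibility mount
 * routine that still receives userland strings would typically copy them
 * in with mount_argsu(); the argument names and the uap structure are
 * hypothetical:
 *
 *	ma = mount_arg(ma, "fstype", "myfs", -1);
 *	ma = mount_argsu(ma, "fspath", uap->path, MAXPATHLEN);
 *	ma = mount_argsu(ma, "from", uap->spec, MAXPATHLEN);
 *	error = kernel_mount(ma, flags);
 */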
/*
 * Plain argument.
 *
 * If length is -1, treat value as a C string.
 */
struct mntarg *
mount_arg(struct mntarg *ma, const char *name, const void *val, int len)
{

	if (ma == NULL) {
		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
		SLIST_INIT(&ma->list);
	}
	if (ma->error)
		return (ma);

	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
	    M_MOUNT, M_WAITOK);
	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
	ma->v[ma->len].iov_len = strlen(name) + 1;
	ma->len++;

	ma->v[ma->len].iov_base = (void *)(uintptr_t)val;
	if (len < 0)
		ma->v[ma->len].iov_len = strlen(val) + 1;
	else
		ma->v[ma->len].iov_len = len;
	ma->len++;
	return (ma);
}

/*
 * Free a mntarg structure.
 */
static void
free_mntarg(struct mntarg *ma)
{
	struct mntaarg *maa;

	while (!SLIST_EMPTY(&ma->list)) {
		maa = SLIST_FIRST(&ma->list);
		SLIST_REMOVE_HEAD(&ma->list, next);
		free(maa, M_MOUNT);
	}
	free(ma->v, M_MOUNT);
	free(ma, M_MOUNT);
}

/*
 * Mount a filesystem.
 */
int
kernel_mount(struct mntarg *ma, uint64_t flags)
{
	struct uio auio;
	int error;

	KASSERT(ma != NULL, ("kernel_mount NULL ma"));
	KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v"));
	KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len));

	auio.uio_iov = ma->v;
	auio.uio_iovcnt = ma->len;
	auio.uio_segflg = UIO_SYSSPACE;

	error = ma->error;
	if (!error)
		error = vfs_donmount(curthread, flags, &auio);
	free_mntarg(ma);
	return (error);
}

/*
 * A printflike function to mount a filesystem.
 */
int
kernel_vmount(int flags, ...)
{
	struct mntarg *ma = NULL;
	va_list ap;
	const char *cp;
	const void *vp;
	int error;

	va_start(ap, flags);
	for (;;) {
		cp = va_arg(ap, const char *);
		if (cp == NULL)
			break;
		vp = va_arg(ap, const void *);
		ma = mount_arg(ma, cp, vp, (vp != NULL ? -1 : 0));
	}
	va_end(ap);

	error = kernel_mount(ma, flags);
	return (error);
}
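/*
 * Illustrative sketch (not part of this file): kernel code can mount a
 * filesystem by passing NULL-terminated name/value string pairs;
 * "fstype" and "fspath" are the options vfs_donmount() requires.  The
 * device and mount point below are hypothetical:
 *
 *	error = kernel_vmount(MNT_RDONLY,
 *	    "fstype", "cd9660",
 *	    "fspath", "/mnt",
 *	    "from", "/dev/cd0",
 *	    NULL);
 */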
/* Map from mount options to printable formats. */
static struct mntoptnames optnames[] = {
	MNTOPT_NAMES
};

#define	DEVCTL_LEN	1024
static void
mount_devctl_event(const char *type, struct mount *mp, bool donew)
{
	const uint8_t *cp;
	struct mntoptnames *fp;
	struct sbuf sb;
	struct statfs *sfp = &mp->mnt_stat;
	char *buf;

	buf = malloc(DEVCTL_LEN, M_MOUNT, M_NOWAIT);
	if (buf == NULL)
		return;
	sbuf_new(&sb, buf, DEVCTL_LEN, SBUF_FIXEDLEN);
	sbuf_cpy(&sb, "mount-point=\"");
	devctl_safe_quote_sb(&sb, sfp->f_mntonname);
	sbuf_cat(&sb, "\" mount-dev=\"");
	devctl_safe_quote_sb(&sb, sfp->f_mntfromname);
	sbuf_cat(&sb, "\" mount-type=\"");
	devctl_safe_quote_sb(&sb, sfp->f_fstypename);
	sbuf_cat(&sb, "\" fsid=0x");
	cp = (const uint8_t *)&sfp->f_fsid.val[0];
	for (int i = 0; i < sizeof(sfp->f_fsid); i++)
		sbuf_printf(&sb, "%02x", cp[i]);
	sbuf_printf(&sb, " owner=%u flags=\"", sfp->f_owner);
	for (fp = optnames; fp->o_opt != 0; fp++) {
		if ((mp->mnt_flag & fp->o_opt) != 0) {
			sbuf_cat(&sb, fp->o_name);
			sbuf_putc(&sb, ';');
		}
	}
	sbuf_putc(&sb, '"');
	sbuf_finish(&sb);

	/*
	 * Options are not published because the form of the options depends
	 * on the file system and may include binary data.  In addition, they
	 * don't necessarily provide enough useful information to be
	 * actionable when devd processes them.
	 */

	if (sbuf_error(&sb) == 0)
		devctl_notify("VFS", "FS", type, sbuf_data(&sb));
	sbuf_delete(&sb);
	free(buf, M_MOUNT);
}

/*
 * Suspend write operations on all local writeable filesystems.  Does a
 * full sync of them in the process.
 *
 * Iterate over the mount points in reverse order, suspending the most
 * recently mounted filesystems first.  This handles the case where a
 * filesystem mounted from a md(4) vnode-backed device should be
 * suspended before the filesystem that owns the vnode.
 */
void
suspend_all_fs(void)
{
	struct mount *mp;
	int error;

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
		error = vfs_busy(mp, MBF_MNTLSTLOCK | MBF_NOWAIT);
		if (error != 0)
			continue;
		if ((mp->mnt_flag & (MNT_RDONLY | MNT_LOCAL)) != MNT_LOCAL ||
		    (mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
			mtx_lock(&mountlist_mtx);
			vfs_unbusy(mp);
			continue;
		}
		error = vfs_write_suspend(mp, 0);
		if (error == 0) {
			MNT_ILOCK(mp);
			MPASS((mp->mnt_kern_flag & MNTK_SUSPEND_ALL) == 0);
			mp->mnt_kern_flag |= MNTK_SUSPEND_ALL;
			MNT_IUNLOCK(mp);
			mtx_lock(&mountlist_mtx);
		} else {
			printf("suspend of %s failed, error %d\n",
			    mp->mnt_stat.f_mntonname, error);
			mtx_lock(&mountlist_mtx);
			vfs_unbusy(mp);
		}
	}
	mtx_unlock(&mountlist_mtx);
}

void
resume_all_fs(void)
{
	struct mount *mp;

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if ((mp->mnt_kern_flag & MNTK_SUSPEND_ALL) == 0)
			continue;
		mtx_unlock(&mountlist_mtx);
		MNT_ILOCK(mp);
		MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) != 0);
		mp->mnt_kern_flag &= ~MNTK_SUSPEND_ALL;
		MNT_IUNLOCK(mp);
		vfs_write_resume(mp, 0);
		mtx_lock(&mountlist_mtx);
		vfs_unbusy(mp);
	}
	mtx_unlock(&mountlist_mtx);
}
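/*
 * Illustrative sketch (not part of this file): suspend_all_fs() and
 * resume_all_fs() are meant to bracket an operation that needs quiescent
 * local filesystems, such as a system suspend path.  The function in the
 * middle is hypothetical:
 *
 *	suspend_all_fs();
 *	error = hardware_sleep();
 *	resume_all_fs();
 */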