1 /* 2 * Copyright (c) 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 39 * $FreeBSD$ 40 */ 41 42 /* 43 * External virtual filesystem routines 44 */ 45 #include "opt_ddb.h" 46 #include "opt_ffs.h" 47 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/bio.h> 51 #include <sys/buf.h> 52 #include <sys/conf.h> 53 #include <sys/eventhandler.h> 54 #include <sys/fcntl.h> 55 #include <sys/kernel.h> 56 #include <sys/kthread.h> 57 #include <sys/malloc.h> 58 #include <sys/mount.h> 59 #include <sys/namei.h> 60 #include <sys/stat.h> 61 #include <sys/sysctl.h> 62 #include <sys/syslog.h> 63 #include <sys/vmmeter.h> 64 #include <sys/vnode.h> 65 66 #include <vm/vm.h> 67 #include <vm/vm_object.h> 68 #include <vm/vm_extern.h> 69 #include <vm/pmap.h> 70 #include <vm/vm_map.h> 71 #include <vm/vm_page.h> 72 #include <vm/uma.h> 73 74 static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); 75 76 static void addalias(struct vnode *vp, dev_t nvp_rdev); 77 static void insmntque(struct vnode *vp, struct mount *mp); 78 static void vclean(struct vnode *vp, int flags, struct thread *td); 79 static void vlruvp(struct vnode *vp); 80 81 /* 82 * Number of vnodes in existence. Increased whenever getnewvnode() 83 * allocates a new vnode, never decreased. 
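 *
 * A rough map of how the vnode counters declared below interact (the
 * expressions are taken from getnewvnode() and vnlru_proc() later in this
 * file):
 *
 *	numvnodes - freevnodes > desiredvnodes		getnewvnode() wakes
 *							the vnlru process
 *	numvnodes - freevnodes > desiredvnodes * 9 / 10	vnlru keeps reclaiming
 *	freevnodes >= wantfreevnodes &&
 *	    numvnodes >= minvnodes			getnewvnode() recycles
 *							a free vnode instead
 *							of allocating one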
84 */ 85 static unsigned long numvnodes; 86 87 SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); 88 89 /* 90 * Conversion tables for conversion from vnode types to inode formats 91 * and back. 92 */ 93 enum vtype iftovt_tab[16] = { 94 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 95 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, 96 }; 97 int vttoif_tab[9] = { 98 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 99 S_IFSOCK, S_IFIFO, S_IFMT, 100 }; 101 102 /* 103 * List of vnodes that are ready for recycling. 104 */ 105 static TAILQ_HEAD(freelst, vnode) vnode_free_list; 106 107 /* 108 * Minimum number of free vnodes. If there are fewer than this free vnodes, 109 * getnewvnode() will return a newly allocated vnode. 110 */ 111 static u_long wantfreevnodes = 25; 112 SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); 113 /* Number of vnodes in the free list. */ 114 static u_long freevnodes; 115 SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); 116 117 /* 118 * Various variables used for debugging the new implementation of 119 * reassignbuf(). 120 * XXX these are probably of (very) limited utility now. 121 */ 122 static int reassignbufcalls; 123 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); 124 static int reassignbufloops; 125 SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, ""); 126 static int reassignbufsortgood; 127 SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, ""); 128 static int reassignbufsortbad; 129 SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, ""); 130 /* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */ 131 static int reassignbufmethod = 1; 132 SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, ""); 133 static int nameileafonly; 134 SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, ""); 135 136 #ifdef ENABLE_VFS_IOOPT 137 /* See NOTES for a description of this setting. */ 138 int vfs_ioopt; 139 SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); 140 #endif 141 142 /* List of mounted filesystems. */ 143 struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); 144 145 /* For any iteration/modification of mountlist */ 146 struct mtx mountlist_mtx; 147 148 /* For any iteration/modification of mnt_vnodelist */ 149 struct mtx mntvnode_mtx; 150 151 /* 152 * Cache for the mount type id assigned to NFS. This is used for 153 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c. 154 */ 155 int nfs_mount_type = -1; 156 157 /* To keep more than one thread at a time from running vfs_getnewfsid */ 158 static struct mtx mntid_mtx; 159 160 /* For any iteration/modification of vnode_free_list */ 161 static struct mtx vnode_free_list_mtx; 162 163 /* 164 * For any iteration/modification of dev->si_hlist (linked through 165 * v_specnext) 166 */ 167 static struct mtx spechash_mtx; 168 169 /* Publicly exported FS */ 170 struct nfs_public nfs_pub; 171 172 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ 173 static uma_zone_t vnode_zone; 174 static uma_zone_t vnodepoll_zone; 175 176 /* Set to 1 to print out reclaim of active vnodes */ 177 int prtactive; 178 179 /* 180 * The workitem queue. 
181 * 182 * It is useful to delay writes of file data and filesystem metadata 183 * for tens of seconds so that quickly created and deleted files need 184 * not waste disk bandwidth being created and removed. To realize this, 185 * we append vnodes to a "workitem" queue. When running with a soft 186 * updates implementation, most pending metadata dependencies should 187 * not wait for more than a few seconds. Thus, mounted on block devices 188 * are delayed only about a half the time that file data is delayed. 189 * Similarly, directory updates are more critical, so are only delayed 190 * about a third the time that file data is delayed. Thus, there are 191 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of 192 * one each second (driven off the filesystem syncer process). The 193 * syncer_delayno variable indicates the next queue that is to be processed. 194 * Items that need to be processed soon are placed in this queue: 195 * 196 * syncer_workitem_pending[syncer_delayno] 197 * 198 * A delay of fifteen seconds is done by placing the request fifteen 199 * entries later in the queue: 200 * 201 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] 202 * 203 */ 204 static int syncer_delayno; 205 static long syncer_mask; 206 LIST_HEAD(synclist, vnode); 207 static struct synclist *syncer_workitem_pending; 208 209 #define SYNCER_MAXDELAY 32 210 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ 211 static int syncdelay = 30; /* max time to delay syncing data */ 212 static int filedelay = 30; /* time to delay syncing files */ 213 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); 214 static int dirdelay = 29; /* time to delay syncing directories */ 215 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); 216 static int metadelay = 28; /* time to delay syncing metadata */ 217 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); 218 static int rushjob; /* number of slots to run ASAP */ 219 static int stat_rush_requests; /* number of times I/O speeded up */ 220 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); 221 222 /* 223 * Number of vnodes we want to exist at any one time. This is mostly used 224 * to size hash tables in vnode-related code. It is normally not used in 225 * getnewvnode(), as wantfreevnodes is normally nonzero.) 226 * 227 * XXX desiredvnodes is historical cruft and should not exist. 228 */ 229 int desiredvnodes; 230 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, 231 &desiredvnodes, 0, "Maximum number of vnodes"); 232 static int minvnodes; 233 SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, 234 &minvnodes, 0, "Minimum number of vnodes"); 235 static int vnlru_nowhere; 236 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, 237 "Number of times the vnlru process ran without success"); 238 239 void 240 v_addpollinfo(struct vnode *vp) 241 { 242 vp->v_pollinfo = uma_zalloc(vnodepoll_zone, M_WAITOK); 243 mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); 244 } 245 246 /* 247 * Initialize the vnode management data structures. 
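 *
 * As an illustration of the workitem ring arithmetic described above, here
 * is a small standalone model (not kernel code; MODEL_MAXDELAY stands in
 * for the hashinit()-sized table created below, whose size is a power of
 * two with syncer_mask = size - 1):
 *
 *	#define MODEL_MAXDELAY	32
 *
 *	static int
 *	model_slot(int delayno, int delay)
 *	{
 *		int mask = MODEL_MAXDELAY - 1;
 *
 *		if (delay > MODEL_MAXDELAY - 2)
 *			delay = MODEL_MAXDELAY - 2;
 *		return ((delayno + delay) & mask);
 *	}
 *
 * The clamp mirrors vn_syncer_add_to_worklist(), which never queues further
 * than syncer_maxdelay - 2 slots ahead.  With delayno == 3 and delay == 15,
 * model_slot() returns 18: the vnode lands fifteen buckets after the one
 * the syncer will drain next, i.e. roughly fifteen seconds from now.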
248 */ 249 static void 250 vntblinit(void *dummy __unused) 251 { 252 253 desiredvnodes = maxproc + cnt.v_page_count / 4; 254 minvnodes = desiredvnodes / 4; 255 mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF); 256 mtx_init(&mntvnode_mtx, "mntvnode", NULL, MTX_DEF); 257 mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); 258 mtx_init(&spechash_mtx, "spechash", NULL, MTX_DEF); 259 TAILQ_INIT(&vnode_free_list); 260 mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); 261 vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, 262 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 263 vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), 264 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 265 /* 266 * Initialize the filesystem syncer. 267 */ 268 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, 269 &syncer_mask); 270 syncer_maxdelay = syncer_mask + 1; 271 } 272 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL) 273 274 275 /* 276 * Mark a mount point as busy. Used to synchronize access and to delay 277 * unmounting. Interlock is not released on failure. 278 */ 279 int 280 vfs_busy(mp, flags, interlkp, td) 281 struct mount *mp; 282 int flags; 283 struct mtx *interlkp; 284 struct thread *td; 285 { 286 int lkflags; 287 288 if (mp->mnt_kern_flag & MNTK_UNMOUNT) { 289 if (flags & LK_NOWAIT) 290 return (ENOENT); 291 mp->mnt_kern_flag |= MNTK_MWAIT; 292 /* 293 * Since all busy locks are shared except the exclusive 294 * lock granted when unmounting, the only place that a 295 * wakeup needs to be done is at the release of the 296 * exclusive lock at the end of dounmount. 297 */ 298 msleep((caddr_t)mp, interlkp, PVFS, "vfs_busy", 0); 299 return (ENOENT); 300 } 301 lkflags = LK_SHARED | LK_NOPAUSE; 302 if (interlkp) 303 lkflags |= LK_INTERLOCK; 304 if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td)) 305 panic("vfs_busy: unexpected lock failure"); 306 return (0); 307 } 308 309 /* 310 * Free a busy filesystem. 311 */ 312 void 313 vfs_unbusy(mp, td) 314 struct mount *mp; 315 struct thread *td; 316 { 317 318 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td); 319 } 320 321 /* 322 * Lookup a filesystem type, and if found allocate and initialize 323 * a mount structure for it. 324 * 325 * Devname is usually updated by mount(8) after booting. 
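 *
 * A hypothetical caller of vfs_rootmountalloc(), for illustration only (the
 * filesystem name and device string are invented).  The mount comes back
 * already busied via vfs_busy() and flagged MNT_RDONLY, so the root-mount
 * code is expected to finish the mount itself and vfs_unbusy() it when done:
 *
 *	struct mount *mp;
 *	int error;
 *
 *	error = vfs_rootmountalloc("ufs", "da0s1a", &mp);
 *	if (error)
 *		panic("root mount allocation failed: %d", error);
 *	... invoke the filesystem's own root mount routine on mp ...
 *	vfs_unbusy(mp, curthread);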
326 */ 327 int 328 vfs_rootmountalloc(fstypename, devname, mpp) 329 char *fstypename; 330 char *devname; 331 struct mount **mpp; 332 { 333 struct thread *td = curthread; /* XXX */ 334 struct vfsconf *vfsp; 335 struct mount *mp; 336 337 if (fstypename == NULL) 338 return (ENODEV); 339 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 340 if (!strcmp(vfsp->vfc_name, fstypename)) 341 break; 342 if (vfsp == NULL) 343 return (ENODEV); 344 mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); 345 lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); 346 (void)vfs_busy(mp, LK_NOWAIT, 0, td); 347 TAILQ_INIT(&mp->mnt_nvnodelist); 348 TAILQ_INIT(&mp->mnt_reservedvnlist); 349 mp->mnt_vfc = vfsp; 350 mp->mnt_op = vfsp->vfc_vfsops; 351 mp->mnt_flag = MNT_RDONLY; 352 mp->mnt_vnodecovered = NULLVP; 353 vfsp->vfc_refcount++; 354 mp->mnt_iosize_max = DFLTPHYS; 355 mp->mnt_stat.f_type = vfsp->vfc_typenum; 356 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; 357 strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); 358 mp->mnt_stat.f_mntonname[0] = '/'; 359 mp->mnt_stat.f_mntonname[1] = 0; 360 (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); 361 *mpp = mp; 362 return (0); 363 } 364 365 /* 366 * Find an appropriate filesystem to use for the root. If a filesystem 367 * has not been preselected, walk through the list of known filesystems 368 * trying those that have mountroot routines, and try them until one 369 * works or we have tried them all. 370 */ 371 #ifdef notdef /* XXX JH */ 372 int 373 lite2_vfs_mountroot() 374 { 375 struct vfsconf *vfsp; 376 extern int (*lite2_mountroot)(void); 377 int error; 378 379 if (lite2_mountroot != NULL) 380 return ((*lite2_mountroot)()); 381 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { 382 if (vfsp->vfc_mountroot == NULL) 383 continue; 384 if ((error = (*vfsp->vfc_mountroot)()) == 0) 385 return (0); 386 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); 387 } 388 return (ENODEV); 389 } 390 #endif 391 392 /* 393 * Lookup a mount point by filesystem identifier. 394 */ 395 struct mount * 396 vfs_getvfs(fsid) 397 fsid_t *fsid; 398 { 399 register struct mount *mp; 400 401 mtx_lock(&mountlist_mtx); 402 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 403 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && 404 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { 405 mtx_unlock(&mountlist_mtx); 406 return (mp); 407 } 408 } 409 mtx_unlock(&mountlist_mtx); 410 return ((struct mount *) 0); 411 } 412 413 /* 414 * Get a new unique fsid. Try to make its val[0] unique, since this value 415 * will be used to create fake device numbers for stat(). Also try (but 416 * not so hard) make its val[0] unique mod 2^16, since some emulators only 417 * support 16-bit device numbers. We end up with unique val[0]'s for the 418 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. 419 * 420 * Keep in mind that several mounts may be running in parallel. Starting 421 * the search one past where the previous search terminated is both a 422 * micro-optimization and a defense against returning the same fsid to 423 * different mounts. 
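 *
 * For illustration, the minor number handed to makeudev() by the loop below
 * has this layout (standalone sketch, not kernel code):
 *
 *	u_int32_t
 *	model_fsid_minor(int mtype, u_int16_t mntid_base)
 *	{
 *
 *		return (((mtype & 0xFF) << 24) |
 *		    ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
 *	}
 *
 * The filesystem type sits in bits 24-31, the counter's high byte in bits
 * 16-23 and its low byte in bits 0-7.  The whole value therefore repeats
 * only after 2^16 increments of mntid_base, while the low 16 bits carry
 * just the counter's low byte and repeat after 2^8 increments - which is
 * the property claimed above.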
424 */ 425 void 426 vfs_getnewfsid(mp) 427 struct mount *mp; 428 { 429 static u_int16_t mntid_base; 430 fsid_t tfsid; 431 int mtype; 432 433 mtx_lock(&mntid_mtx); 434 mtype = mp->mnt_vfc->vfc_typenum; 435 tfsid.val[1] = mtype; 436 mtype = (mtype & 0xFF) << 24; 437 for (;;) { 438 tfsid.val[0] = makeudev(255, 439 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); 440 mntid_base++; 441 if (vfs_getvfs(&tfsid) == NULL) 442 break; 443 } 444 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; 445 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; 446 mtx_unlock(&mntid_mtx); 447 } 448 449 /* 450 * Knob to control the precision of file timestamps: 451 * 452 * 0 = seconds only; nanoseconds zeroed. 453 * 1 = seconds and nanoseconds, accurate within 1/HZ. 454 * 2 = seconds and nanoseconds, truncated to microseconds. 455 * >=3 = seconds and nanoseconds, maximum precision. 456 */ 457 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; 458 459 static int timestamp_precision = TSP_SEC; 460 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, 461 ×tamp_precision, 0, ""); 462 463 /* 464 * Get a current timestamp. 465 */ 466 void 467 vfs_timestamp(tsp) 468 struct timespec *tsp; 469 { 470 struct timeval tv; 471 472 switch (timestamp_precision) { 473 case TSP_SEC: 474 tsp->tv_sec = time_second; 475 tsp->tv_nsec = 0; 476 break; 477 case TSP_HZ: 478 getnanotime(tsp); 479 break; 480 case TSP_USEC: 481 microtime(&tv); 482 TIMEVAL_TO_TIMESPEC(&tv, tsp); 483 break; 484 case TSP_NSEC: 485 default: 486 nanotime(tsp); 487 break; 488 } 489 } 490 491 /* 492 * Build a linked list of mount options from a struct uio. 493 */ 494 int 495 vfs_buildopts(struct uio *auio, struct vfsoptlist **options) 496 { 497 struct vfsoptlist *opts; 498 struct vfsopt *opt; 499 unsigned int i, iovcnt; 500 int error, namelen, optlen; 501 502 iovcnt = auio->uio_iovcnt; 503 opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK); 504 TAILQ_INIT(opts); 505 for (i = 0; i < iovcnt; i += 2) { 506 opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK); 507 namelen = auio->uio_iov[i].iov_len; 508 optlen = auio->uio_iov[i + 1].iov_len; 509 opt->name = malloc(namelen, M_MOUNT, M_WAITOK); 510 opt->value = malloc(optlen, M_MOUNT, M_WAITOK); 511 opt->len = optlen; 512 if (auio->uio_segflg == UIO_SYSSPACE) { 513 bcopy(auio->uio_iov[i].iov_base, opt->name, namelen); 514 bcopy(auio->uio_iov[i + 1].iov_base, opt->value, 515 optlen); 516 } else { 517 error = copyin(auio->uio_iov[i].iov_base, opt->name, 518 namelen); 519 if (!error) 520 error = copyin(auio->uio_iov[i + 1].iov_base, 521 opt->value, optlen); 522 if (error) 523 goto bad; 524 } 525 TAILQ_INSERT_TAIL(opts, opt, link); 526 } 527 *options = opts; 528 return (0); 529 bad: 530 vfs_freeopts(opts); 531 return (error); 532 } 533 534 /* 535 * Get a mount option by its name. 536 * 537 * Return 0 if the option was found, ENOENT otherwise. 538 * If len is non-NULL it will be filled with the length 539 * of the option. If buf is non-NULL, it will be filled 540 * with the address of the option. 541 */ 542 int 543 vfs_getopt(opts, name, buf, len) 544 struct vfsoptlist *opts; 545 const char *name; 546 void **buf; 547 int *len; 548 { 549 struct vfsopt *opt; 550 551 TAILQ_FOREACH(opt, opts, link) { 552 if (strcmp(name, opt->name) == 0) { 553 if (len != NULL) 554 *len = opt->len; 555 if (buf != NULL) 556 *buf = opt->value; 557 return (0); 558 } 559 } 560 return (ENOENT); 561 } 562 563 /* 564 * Find and copy a mount option. 
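 *
 * An illustrative fragment (variable and option names invented) showing how
 * a list built by vfs_buildopts() is typically consumed with vfs_getopt()
 * above and vfs_copyopt() below:
 *
 *	struct vfsoptlist *opts;
 *	void *value;
 *	int error, len, ro;
 *
 *	error = vfs_getopt(opts, "from", &value, &len);
 *	error = vfs_copyopt(opts, "ro", &ro, sizeof(ro));
 *
 * vfs_getopt() only lends out a pointer into the list (valid until
 * vfs_freeopts() releases it), while vfs_copyopt() fills the caller's own
 * buffer.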
565 * 566 * The size of the buffer has to be specified 567 * in len, if it is not the same length as the 568 * mount option, EINVAL is returned. 569 * Returns ENOENT if the option is not found. 570 */ 571 int 572 vfs_copyopt(opts, name, dest, len) 573 struct vfsoptlist *opts; 574 const char *name; 575 void *dest; 576 int len; 577 { 578 struct vfsopt *opt; 579 580 TAILQ_FOREACH(opt, opts, link) { 581 if (strcmp(name, opt->name) == 0) { 582 if (len != opt->len) 583 return (EINVAL); 584 bcopy(opt->value, dest, opt->len); 585 return (0); 586 } 587 } 588 return (ENOENT); 589 } 590 591 /* 592 * Set vnode attributes to VNOVAL 593 */ 594 void 595 vattr_null(vap) 596 register struct vattr *vap; 597 { 598 599 vap->va_type = VNON; 600 vap->va_size = VNOVAL; 601 vap->va_bytes = VNOVAL; 602 vap->va_mode = VNOVAL; 603 vap->va_nlink = VNOVAL; 604 vap->va_uid = VNOVAL; 605 vap->va_gid = VNOVAL; 606 vap->va_fsid = VNOVAL; 607 vap->va_fileid = VNOVAL; 608 vap->va_blocksize = VNOVAL; 609 vap->va_rdev = VNOVAL; 610 vap->va_atime.tv_sec = VNOVAL; 611 vap->va_atime.tv_nsec = VNOVAL; 612 vap->va_mtime.tv_sec = VNOVAL; 613 vap->va_mtime.tv_nsec = VNOVAL; 614 vap->va_ctime.tv_sec = VNOVAL; 615 vap->va_ctime.tv_nsec = VNOVAL; 616 vap->va_flags = VNOVAL; 617 vap->va_gen = VNOVAL; 618 vap->va_vaflags = 0; 619 } 620 621 /* 622 * This routine is called when we have too many vnodes. It attempts 623 * to free <count> vnodes and will potentially free vnodes that still 624 * have VM backing store (VM backing store is typically the cause 625 * of a vnode blowout so we want to do this). Therefore, this operation 626 * is not considered cheap. 627 * 628 * A number of conditions may prevent a vnode from being reclaimed. 629 * the buffer cache may have references on the vnode, a directory 630 * vnode may still have references due to the namei cache representing 631 * underlying files, or the vnode may be in active use. It is not 632 * desireable to reuse such vnodes. These conditions may cause the 633 * number of vnodes to reach some minimum value regardless of what 634 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. 635 */ 636 static int 637 vlrureclaim(struct mount *mp, int count) 638 { 639 struct vnode *vp; 640 int done; 641 int trigger; 642 int usevnodes; 643 644 /* 645 * Calculate the trigger point, don't allow user 646 * screwups to blow us up. This prevents us from 647 * recycling vnodes with lots of resident pages. We 648 * aren't trying to free memory, we are trying to 649 * free vnodes. 650 */ 651 usevnodes = desiredvnodes; 652 if (usevnodes <= 0) 653 usevnodes = 1; 654 trigger = cnt.v_page_count * 2 / usevnodes; 655 656 done = 0; 657 mtx_lock(&mntvnode_mtx); 658 while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) { 659 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 660 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 661 662 if (vp->v_type != VNON && 663 vp->v_type != VBAD && 664 VMIGHTFREE(vp) && /* critical path opt */ 665 (vp->v_object == NULL || vp->v_object->resident_page_count < trigger) && 666 mtx_trylock(&vp->v_interlock) 667 ) { 668 mtx_unlock(&mntvnode_mtx); 669 if (VMIGHTFREE(vp)) { 670 vgonel(vp, curthread); 671 done++; 672 } else { 673 mtx_unlock(&vp->v_interlock); 674 } 675 mtx_lock(&mntvnode_mtx); 676 } 677 --count; 678 } 679 mtx_unlock(&mntvnode_mtx); 680 return done; 681 } 682 683 /* 684 * Attempt to recycle vnodes in a context that is always safe to block. 
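 *
 * As a worked example for vlrureclaim() above (numbers invented): the
 * trigger is cnt.v_page_count * 2 / desiredvnodes resident pages, so with
 * 262144 pages (1GB of 4K pages) and desiredvnodes of 65536 it is 8, and
 * only vnodes caching fewer than 8 pages are candidates.  A blowout of
 * small, nearly pageless vnodes is thus trimmed without discarding files
 * that still have a useful amount of cached data.
 *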
685 * Calling vlrurecycle() from the bowels of filesystem code has some 686 * interesting deadlock problems. 687 */ 688 static struct proc *vnlruproc; 689 static int vnlruproc_sig; 690 691 static void 692 vnlru_proc(void) 693 { 694 struct mount *mp, *nmp; 695 int s; 696 int done; 697 struct proc *p = vnlruproc; 698 struct thread *td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */ 699 700 mtx_lock(&Giant); 701 702 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, 703 SHUTDOWN_PRI_FIRST); 704 705 s = splbio(); 706 for (;;) { 707 kthread_suspend_check(p); 708 if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) { 709 vnlruproc_sig = 0; 710 tsleep(vnlruproc, PVFS, "vlruwt", 0); 711 continue; 712 } 713 done = 0; 714 mtx_lock(&mountlist_mtx); 715 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 716 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { 717 nmp = TAILQ_NEXT(mp, mnt_list); 718 continue; 719 } 720 done += vlrureclaim(mp, 10); 721 mtx_lock(&mountlist_mtx); 722 nmp = TAILQ_NEXT(mp, mnt_list); 723 vfs_unbusy(mp, td); 724 } 725 mtx_unlock(&mountlist_mtx); 726 if (done == 0) { 727 #if 0 728 /* These messages are temporary debugging aids */ 729 if (vnlru_nowhere < 5) 730 printf("vnlru process getting nowhere..\n"); 731 else if (vnlru_nowhere == 5) 732 printf("vnlru process messages stopped.\n"); 733 #endif 734 vnlru_nowhere++; 735 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); 736 } 737 } 738 splx(s); 739 } 740 741 static struct kproc_desc vnlru_kp = { 742 "vnlru", 743 vnlru_proc, 744 &vnlruproc 745 }; 746 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp) 747 748 749 /* 750 * Routines having to do with the management of the vnode table. 751 */ 752 753 /* 754 * Return the next vnode from the free list. 755 */ 756 int 757 getnewvnode(tag, mp, vops, vpp) 758 enum vtagtype tag; 759 struct mount *mp; 760 vop_t **vops; 761 struct vnode **vpp; 762 { 763 int s; 764 struct thread *td = curthread; /* XXX */ 765 struct vnode *vp = NULL; 766 struct mount *vnmp; 767 vm_object_t object; 768 769 s = splbio(); 770 /* 771 * Try to reuse vnodes if we hit the max. This situation only 772 * occurs in certain large-memory (2G+) situations. We cannot 773 * attempt to directly reclaim vnodes due to nasty recursion 774 * problems. 775 */ 776 if (vnlruproc_sig == 0 && numvnodes - freevnodes > desiredvnodes) { 777 vnlruproc_sig = 1; /* avoid unnecessary wakeups */ 778 wakeup(vnlruproc); 779 } 780 781 /* 782 * Attempt to reuse a vnode already on the free list, allocating 783 * a new vnode if we can't find one or if we have not reached a 784 * good minimum for good LRU performance. 785 */ 786 787 mtx_lock(&vnode_free_list_mtx); 788 789 if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) { 790 int count; 791 792 for (count = 0; count < freevnodes; count++) { 793 vp = TAILQ_FIRST(&vnode_free_list); 794 if (vp == NULL || vp->v_usecount) 795 panic("getnewvnode: free vnode isn't"); 796 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 797 798 /* Don't recycle if we can't get the interlock */ 799 if (!mtx_trylock(&vp->v_interlock)) { 800 vp = NULL; 801 continue; 802 } 803 804 /* We should be able to immediately acquire this */ 805 if (vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td) != 0) 806 continue; 807 /* 808 * Don't recycle if we still have cached pages. 
809 */ 810 if (VOP_GETVOBJECT(vp, &object) == 0 && 811 (object->resident_page_count || 812 object->ref_count)) { 813 TAILQ_INSERT_TAIL(&vnode_free_list, vp, 814 v_freelist); 815 vp = NULL; 816 VOP_UNLOCK(vp, 0, td); 817 continue; 818 } 819 if (LIST_FIRST(&vp->v_cache_src)) { 820 /* 821 * note: nameileafonly sysctl is temporary, 822 * for debugging only, and will eventually be 823 * removed. 824 */ 825 if (nameileafonly > 0) { 826 /* 827 * Do not reuse namei-cached directory 828 * vnodes that have cached 829 * subdirectories. 830 */ 831 if (cache_leaf_test(vp) < 0) { 832 VOP_UNLOCK(vp, 0, td); 833 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 834 vp = NULL; 835 continue; 836 } 837 } else if (nameileafonly < 0 || 838 vmiodirenable == 0) { 839 /* 840 * Do not reuse namei-cached directory 841 * vnodes if nameileafonly is -1 or 842 * if VMIO backing for directories is 843 * turned off (otherwise we reuse them 844 * too quickly). 845 */ 846 VOP_UNLOCK(vp, 0, td); 847 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 848 vp = NULL; 849 continue; 850 } 851 } 852 /* 853 * Skip over it if its filesystem is being suspended. 854 */ 855 if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0) 856 break; 857 VOP_UNLOCK(vp, 0, td); 858 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 859 vp = NULL; 860 } 861 } 862 if (vp) { 863 vp->v_flag |= VDOOMED; 864 vp->v_flag &= ~VFREE; 865 freevnodes--; 866 mtx_unlock(&vnode_free_list_mtx); 867 cache_purge(vp); 868 if (vp->v_type != VBAD) { 869 VOP_UNLOCK(vp, 0, td); 870 vgone(vp); 871 } else { 872 VOP_UNLOCK(vp, 0, td); 873 } 874 vn_finished_write(vnmp); 875 876 #ifdef INVARIANTS 877 { 878 int s; 879 880 if (vp->v_data) 881 panic("cleaned vnode isn't"); 882 s = splbio(); 883 if (vp->v_numoutput) 884 panic("Clean vnode has pending I/O's"); 885 splx(s); 886 if (vp->v_writecount != 0) 887 panic("Non-zero write count"); 888 } 889 #endif 890 if (vp->v_pollinfo) { 891 mtx_destroy(&vp->v_pollinfo->vpi_lock); 892 uma_zfree(vnodepoll_zone, vp->v_pollinfo); 893 } 894 vp->v_pollinfo = NULL; 895 vp->v_flag = 0; 896 vp->v_lastw = 0; 897 vp->v_lasta = 0; 898 vp->v_cstart = 0; 899 vp->v_clen = 0; 900 vp->v_socket = 0; 901 } else { 902 mtx_unlock(&vnode_free_list_mtx); 903 vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK); 904 bzero((char *) vp, sizeof *vp); 905 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); 906 vp->v_dd = vp; 907 cache_purge(vp); 908 LIST_INIT(&vp->v_cache_src); 909 TAILQ_INIT(&vp->v_cache_dst); 910 numvnodes++; 911 } 912 913 TAILQ_INIT(&vp->v_cleanblkhd); 914 TAILQ_INIT(&vp->v_dirtyblkhd); 915 vp->v_type = VNON; 916 vp->v_tag = tag; 917 vp->v_op = vops; 918 lockinit(&vp->v_lock, PVFS, "vnlock", VLKTIMEOUT, LK_NOPAUSE); 919 insmntque(vp, mp); 920 *vpp = vp; 921 vp->v_usecount = 1; 922 vp->v_data = 0; 923 924 splx(s); 925 926 #if 0 927 vnodeallocs++; 928 if (vnodeallocs % vnoderecycleperiod == 0 && 929 freevnodes < vnoderecycleminfreevn && 930 vnoderecyclemintotalvn < numvnodes) { 931 /* Recycle vnodes. */ 932 cache_purgeleafdirs(vnoderecyclenumber); 933 } 934 #endif 935 936 return (0); 937 } 938 939 /* 940 * Move a vnode from one mount queue to another. 941 */ 942 static void 943 insmntque(vp, mp) 944 register struct vnode *vp; 945 register struct mount *mp; 946 { 947 948 mtx_lock(&mntvnode_mtx); 949 /* 950 * Delete from old mount point vnode list, if on one. 951 */ 952 if (vp->v_mount != NULL) 953 TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); 954 /* 955 * Insert into list of vnodes for the new mount point, if available. 
956 */ 957 if ((vp->v_mount = mp) == NULL) { 958 mtx_unlock(&mntvnode_mtx); 959 return; 960 } 961 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 962 mtx_unlock(&mntvnode_mtx); 963 } 964 965 /* 966 * Update outstanding I/O count and do wakeup if requested. 967 */ 968 void 969 vwakeup(bp) 970 register struct buf *bp; 971 { 972 register struct vnode *vp; 973 974 bp->b_flags &= ~B_WRITEINPROG; 975 if ((vp = bp->b_vp)) { 976 vp->v_numoutput--; 977 if (vp->v_numoutput < 0) 978 panic("vwakeup: neg numoutput"); 979 if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { 980 vp->v_flag &= ~VBWAIT; 981 wakeup((caddr_t) &vp->v_numoutput); 982 } 983 } 984 } 985 986 /* 987 * Flush out and invalidate all buffers associated with a vnode. 988 * Called with the underlying object locked. 989 */ 990 int 991 vinvalbuf(vp, flags, cred, td, slpflag, slptimeo) 992 register struct vnode *vp; 993 int flags; 994 struct ucred *cred; 995 struct thread *td; 996 int slpflag, slptimeo; 997 { 998 register struct buf *bp; 999 struct buf *nbp, *blist; 1000 int s, error; 1001 vm_object_t object; 1002 1003 GIANT_REQUIRED; 1004 1005 if (flags & V_SAVE) { 1006 s = splbio(); 1007 while (vp->v_numoutput) { 1008 vp->v_flag |= VBWAIT; 1009 error = tsleep((caddr_t)&vp->v_numoutput, 1010 slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo); 1011 if (error) { 1012 splx(s); 1013 return (error); 1014 } 1015 } 1016 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { 1017 splx(s); 1018 if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0) 1019 return (error); 1020 s = splbio(); 1021 if (vp->v_numoutput > 0 || 1022 !TAILQ_EMPTY(&vp->v_dirtyblkhd)) 1023 panic("vinvalbuf: dirty bufs"); 1024 } 1025 splx(s); 1026 } 1027 s = splbio(); 1028 for (;;) { 1029 blist = TAILQ_FIRST(&vp->v_cleanblkhd); 1030 if (!blist) 1031 blist = TAILQ_FIRST(&vp->v_dirtyblkhd); 1032 if (!blist) 1033 break; 1034 1035 for (bp = blist; bp; bp = nbp) { 1036 nbp = TAILQ_NEXT(bp, b_vnbufs); 1037 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 1038 error = BUF_TIMELOCK(bp, 1039 LK_EXCLUSIVE | LK_SLEEPFAIL, 1040 "vinvalbuf", slpflag, slptimeo); 1041 if (error == ENOLCK) 1042 break; 1043 splx(s); 1044 return (error); 1045 } 1046 /* 1047 * XXX Since there are no node locks for NFS, I 1048 * believe there is a slight chance that a delayed 1049 * write will occur while sleeping just above, so 1050 * check for it. Note that vfs_bio_awrite expects 1051 * buffers to reside on a queue, while BUF_WRITE and 1052 * brelse do not. 1053 */ 1054 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 1055 (flags & V_SAVE)) { 1056 1057 if (bp->b_vp == vp) { 1058 if (bp->b_flags & B_CLUSTEROK) { 1059 BUF_UNLOCK(bp); 1060 vfs_bio_awrite(bp); 1061 } else { 1062 bremfree(bp); 1063 bp->b_flags |= B_ASYNC; 1064 BUF_WRITE(bp); 1065 } 1066 } else { 1067 bremfree(bp); 1068 (void) BUF_WRITE(bp); 1069 } 1070 break; 1071 } 1072 bremfree(bp); 1073 bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF); 1074 bp->b_flags &= ~B_ASYNC; 1075 brelse(bp); 1076 } 1077 } 1078 1079 /* 1080 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 1081 * have write I/O in-progress but if there is a VM object then the 1082 * VM object can also have read-I/O in-progress. 
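 *
 * The loop below therefore alternates between the two wait channels: it
 * sleeps until v_numoutput drains, then until the VM object's
 * paging-in-progress count drains, and starts over if new writes snuck in
 * while it was sleeping on the object.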
1083 */ 1084 do { 1085 while (vp->v_numoutput > 0) { 1086 vp->v_flag |= VBWAIT; 1087 tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); 1088 } 1089 if (VOP_GETVOBJECT(vp, &object) == 0) { 1090 while (object->paging_in_progress) 1091 vm_object_pip_sleep(object, "vnvlbx"); 1092 } 1093 } while (vp->v_numoutput > 0); 1094 1095 splx(s); 1096 1097 /* 1098 * Destroy the copy in the VM cache, too. 1099 */ 1100 mtx_lock(&vp->v_interlock); 1101 if (VOP_GETVOBJECT(vp, &object) == 0) { 1102 vm_object_page_remove(object, 0, 0, 1103 (flags & V_SAVE) ? TRUE : FALSE); 1104 } 1105 mtx_unlock(&vp->v_interlock); 1106 1107 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd)) 1108 panic("vinvalbuf: flush failed"); 1109 return (0); 1110 } 1111 1112 /* 1113 * Truncate a file's buffer and pages to a specified length. This 1114 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 1115 * sync activity. 1116 */ 1117 int 1118 vtruncbuf(vp, cred, td, length, blksize) 1119 register struct vnode *vp; 1120 struct ucred *cred; 1121 struct thread *td; 1122 off_t length; 1123 int blksize; 1124 { 1125 register struct buf *bp; 1126 struct buf *nbp; 1127 int s, anyfreed; 1128 int trunclbn; 1129 1130 /* 1131 * Round up to the *next* lbn. 1132 */ 1133 trunclbn = (length + blksize - 1) / blksize; 1134 1135 s = splbio(); 1136 restart: 1137 anyfreed = 1; 1138 for (;anyfreed;) { 1139 anyfreed = 0; 1140 for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 1141 nbp = TAILQ_NEXT(bp, b_vnbufs); 1142 if (bp->b_lblkno >= trunclbn) { 1143 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 1144 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); 1145 goto restart; 1146 } else { 1147 bremfree(bp); 1148 bp->b_flags |= (B_INVAL | B_RELBUF); 1149 bp->b_flags &= ~B_ASYNC; 1150 brelse(bp); 1151 anyfreed = 1; 1152 } 1153 if (nbp && 1154 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 1155 (nbp->b_vp != vp) || 1156 (nbp->b_flags & B_DELWRI))) { 1157 goto restart; 1158 } 1159 } 1160 } 1161 1162 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 1163 nbp = TAILQ_NEXT(bp, b_vnbufs); 1164 if (bp->b_lblkno >= trunclbn) { 1165 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 1166 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); 1167 goto restart; 1168 } else { 1169 bremfree(bp); 1170 bp->b_flags |= (B_INVAL | B_RELBUF); 1171 bp->b_flags &= ~B_ASYNC; 1172 brelse(bp); 1173 anyfreed = 1; 1174 } 1175 if (nbp && 1176 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 1177 (nbp->b_vp != vp) || 1178 (nbp->b_flags & B_DELWRI) == 0)) { 1179 goto restart; 1180 } 1181 } 1182 } 1183 } 1184 1185 if (length > 0) { 1186 restartsync: 1187 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 1188 nbp = TAILQ_NEXT(bp, b_vnbufs); 1189 if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) { 1190 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 1191 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); 1192 goto restart; 1193 } else { 1194 bremfree(bp); 1195 if (bp->b_vp == vp) { 1196 bp->b_flags |= B_ASYNC; 1197 } else { 1198 bp->b_flags &= ~B_ASYNC; 1199 } 1200 BUF_WRITE(bp); 1201 } 1202 goto restartsync; 1203 } 1204 1205 } 1206 } 1207 1208 while (vp->v_numoutput > 0) { 1209 vp->v_flag |= VBWAIT; 1210 tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0); 1211 } 1212 1213 splx(s); 1214 1215 vnode_pager_setsize(vp, length); 1216 1217 return (0); 1218 } 1219 1220 /* 1221 * Associate a buffer with a vnode. 
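 *
 * A hypothetical sketch (not compiled; in the real kernel the B_DELWRI
 * transitions are made by the buffer cache, not by hand) of the life cycle
 * that bgetvp(), reassignbuf() and brelvp() maintain between a buffer and
 * its vnode:
 *
 *	bgetvp(vp, bp);			vhold(vp); bp goes on v_cleanblkhd
 *	bp->b_flags |= B_DELWRI;
 *	reassignbuf(bp, vp);		bp moves to v_dirtyblkhd and vp is
 *					placed on the syncer worklist
 *	bp->b_flags &= ~B_DELWRI;
 *	reassignbuf(bp, vp);		back to v_cleanblkhd; vp leaves the
 *					worklist once no dirty buffers remain
 *	brelvp(bp);			association dropped, vdrop(vp)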
1222 */ 1223 void 1224 bgetvp(vp, bp) 1225 register struct vnode *vp; 1226 register struct buf *bp; 1227 { 1228 int s; 1229 1230 KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); 1231 1232 vhold(vp); 1233 bp->b_vp = vp; 1234 bp->b_dev = vn_todev(vp); 1235 /* 1236 * Insert onto list for new vnode. 1237 */ 1238 s = splbio(); 1239 bp->b_xflags |= BX_VNCLEAN; 1240 bp->b_xflags &= ~BX_VNDIRTY; 1241 TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs); 1242 splx(s); 1243 } 1244 1245 /* 1246 * Disassociate a buffer from a vnode. 1247 */ 1248 void 1249 brelvp(bp) 1250 register struct buf *bp; 1251 { 1252 struct vnode *vp; 1253 struct buflists *listheadp; 1254 int s; 1255 1256 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 1257 1258 /* 1259 * Delete from old vnode list, if on one. 1260 */ 1261 vp = bp->b_vp; 1262 s = splbio(); 1263 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { 1264 if (bp->b_xflags & BX_VNDIRTY) 1265 listheadp = &vp->v_dirtyblkhd; 1266 else 1267 listheadp = &vp->v_cleanblkhd; 1268 TAILQ_REMOVE(listheadp, bp, b_vnbufs); 1269 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 1270 } 1271 if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) { 1272 vp->v_flag &= ~VONWORKLST; 1273 LIST_REMOVE(vp, v_synclist); 1274 } 1275 splx(s); 1276 bp->b_vp = (struct vnode *) 0; 1277 vdrop(vp); 1278 } 1279 1280 /* 1281 * Add an item to the syncer work queue. 1282 */ 1283 static void 1284 vn_syncer_add_to_worklist(struct vnode *vp, int delay) 1285 { 1286 int s, slot; 1287 1288 s = splbio(); 1289 1290 if (vp->v_flag & VONWORKLST) { 1291 LIST_REMOVE(vp, v_synclist); 1292 } 1293 1294 if (delay > syncer_maxdelay - 2) 1295 delay = syncer_maxdelay - 2; 1296 slot = (syncer_delayno + delay) & syncer_mask; 1297 1298 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist); 1299 vp->v_flag |= VONWORKLST; 1300 splx(s); 1301 } 1302 1303 struct proc *updateproc; 1304 static void sched_sync(void); 1305 static struct kproc_desc up_kp = { 1306 "syncer", 1307 sched_sync, 1308 &updateproc 1309 }; 1310 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) 1311 1312 /* 1313 * System filesystem synchronizer daemon. 1314 */ 1315 void 1316 sched_sync(void) 1317 { 1318 struct synclist *slp; 1319 struct vnode *vp; 1320 struct mount *mp; 1321 long starttime; 1322 int s; 1323 struct thread *td = FIRST_THREAD_IN_PROC(updateproc); /* XXXKSE */ 1324 1325 mtx_lock(&Giant); 1326 1327 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, td->td_proc, 1328 SHUTDOWN_PRI_LAST); 1329 1330 for (;;) { 1331 kthread_suspend_check(td->td_proc); 1332 1333 starttime = time_second; 1334 1335 /* 1336 * Push files whose dirty time has expired. Be careful 1337 * of interrupt race on slp queue. 1338 */ 1339 s = splbio(); 1340 slp = &syncer_workitem_pending[syncer_delayno]; 1341 syncer_delayno += 1; 1342 if (syncer_delayno == syncer_maxdelay) 1343 syncer_delayno = 0; 1344 splx(s); 1345 1346 while ((vp = LIST_FIRST(slp)) != NULL) { 1347 if (VOP_ISLOCKED(vp, NULL) == 0 && 1348 vn_start_write(vp, &mp, V_NOWAIT) == 0) { 1349 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 1350 (void) VOP_FSYNC(vp, td->td_ucred, MNT_LAZY, td); 1351 VOP_UNLOCK(vp, 0, td); 1352 vn_finished_write(mp); 1353 } 1354 s = splbio(); 1355 if (LIST_FIRST(slp) == vp) { 1356 /* 1357 * Note: v_tag VT_VFS vps can remain on the 1358 * worklist too with no dirty blocks, but 1359 * since sync_fsync() moves it to a different 1360 * slot we are safe. 
1361 */ 1362 if (TAILQ_EMPTY(&vp->v_dirtyblkhd) && 1363 !vn_isdisk(vp, NULL)) 1364 panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag); 1365 /* 1366 * Put us back on the worklist. The worklist 1367 * routine will remove us from our current 1368 * position and then add us back in at a later 1369 * position. 1370 */ 1371 vn_syncer_add_to_worklist(vp, syncdelay); 1372 } 1373 splx(s); 1374 } 1375 1376 /* 1377 * Do soft update processing. 1378 */ 1379 #ifdef SOFTUPDATES 1380 softdep_process_worklist(NULL); 1381 #endif 1382 1383 /* 1384 * The variable rushjob allows the kernel to speed up the 1385 * processing of the filesystem syncer process. A rushjob 1386 * value of N tells the filesystem syncer to process the next 1387 * N seconds worth of work on its queue ASAP. Currently rushjob 1388 * is used by the soft update code to speed up the filesystem 1389 * syncer process when the incore state is getting so far 1390 * ahead of the disk that the kernel memory pool is being 1391 * threatened with exhaustion. 1392 */ 1393 if (rushjob > 0) { 1394 rushjob -= 1; 1395 continue; 1396 } 1397 /* 1398 * If it has taken us less than a second to process the 1399 * current work, then wait. Otherwise start right over 1400 * again. We can still lose time if any single round 1401 * takes more than two seconds, but it does not really 1402 * matter as we are just trying to generally pace the 1403 * filesystem activity. 1404 */ 1405 if (time_second == starttime) 1406 tsleep(&lbolt, PPAUSE, "syncer", 0); 1407 } 1408 } 1409 1410 /* 1411 * Request the syncer daemon to speed up its work. 1412 * We never push it to speed up more than half of its 1413 * normal turn time, otherwise it could take over the cpu. 1414 * XXXKSE only one update? 1415 */ 1416 int 1417 speedup_syncer() 1418 { 1419 1420 mtx_lock_spin(&sched_lock); 1421 if (FIRST_THREAD_IN_PROC(updateproc)->td_wchan == &lbolt) /* XXXKSE */ 1422 setrunnable(FIRST_THREAD_IN_PROC(updateproc)); 1423 mtx_unlock_spin(&sched_lock); 1424 if (rushjob < syncdelay / 2) { 1425 rushjob += 1; 1426 stat_rush_requests += 1; 1427 return (1); 1428 } 1429 return(0); 1430 } 1431 1432 /* 1433 * Associate a p-buffer with a vnode. 1434 * 1435 * Also sets B_PAGING flag to indicate that vnode is not fully associated 1436 * with the buffer. i.e. the bp has not been linked into the vnode or 1437 * ref-counted. 1438 */ 1439 void 1440 pbgetvp(vp, bp) 1441 register struct vnode *vp; 1442 register struct buf *bp; 1443 { 1444 1445 KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); 1446 1447 bp->b_vp = vp; 1448 bp->b_flags |= B_PAGING; 1449 bp->b_dev = vn_todev(vp); 1450 } 1451 1452 /* 1453 * Disassociate a p-buffer from a vnode. 1454 */ 1455 void 1456 pbrelvp(bp) 1457 register struct buf *bp; 1458 { 1459 1460 KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); 1461 1462 /* XXX REMOVE ME */ 1463 if (TAILQ_NEXT(bp, b_vnbufs) != NULL) { 1464 panic( 1465 "relpbuf(): b_vp was probably reassignbuf()d %p %x", 1466 bp, 1467 (int)bp->b_flags 1468 ); 1469 } 1470 bp->b_vp = (struct vnode *) 0; 1471 bp->b_flags &= ~B_PAGING; 1472 } 1473 1474 /* 1475 * Reassign a buffer from one vnode to another. 1476 * Used to assign file specific control information 1477 * (indirect blocks) to the vnode to which they belong. 
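 *
 * When the buffer is dirty (B_DELWRI) and the destination vnode is not yet
 * on the syncer worklist, the routine below also queues the vnode with a
 * type-dependent delay.  With the default sysctl values declared near the
 * top of this file that works out to:
 *
 *	VDIR				dirdelay	29 seconds
 *	VCHR with si_mountpoint set	metadelay	28 seconds
 *	everything else			filedelay	30 seconds
 *
 * which matches the "directories sooner than file data, metadata sooner
 * still" policy sketched in the workitem queue comment above.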
1478 */ 1479 void 1480 reassignbuf(bp, newvp) 1481 register struct buf *bp; 1482 register struct vnode *newvp; 1483 { 1484 struct buflists *listheadp; 1485 int delay; 1486 int s; 1487 1488 if (newvp == NULL) { 1489 printf("reassignbuf: NULL"); 1490 return; 1491 } 1492 ++reassignbufcalls; 1493 1494 /* 1495 * B_PAGING flagged buffers cannot be reassigned because their vp 1496 * is not fully linked in. 1497 */ 1498 if (bp->b_flags & B_PAGING) 1499 panic("cannot reassign paging buffer"); 1500 1501 s = splbio(); 1502 /* 1503 * Delete from old vnode list, if on one. 1504 */ 1505 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { 1506 if (bp->b_xflags & BX_VNDIRTY) 1507 listheadp = &bp->b_vp->v_dirtyblkhd; 1508 else 1509 listheadp = &bp->b_vp->v_cleanblkhd; 1510 TAILQ_REMOVE(listheadp, bp, b_vnbufs); 1511 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 1512 if (bp->b_vp != newvp) { 1513 vdrop(bp->b_vp); 1514 bp->b_vp = NULL; /* for clarification */ 1515 } 1516 } 1517 /* 1518 * If dirty, put on list of dirty buffers; otherwise insert onto list 1519 * of clean buffers. 1520 */ 1521 if (bp->b_flags & B_DELWRI) { 1522 struct buf *tbp; 1523 1524 listheadp = &newvp->v_dirtyblkhd; 1525 if ((newvp->v_flag & VONWORKLST) == 0) { 1526 switch (newvp->v_type) { 1527 case VDIR: 1528 delay = dirdelay; 1529 break; 1530 case VCHR: 1531 if (newvp->v_rdev->si_mountpoint != NULL) { 1532 delay = metadelay; 1533 break; 1534 } 1535 /* fall through */ 1536 default: 1537 delay = filedelay; 1538 } 1539 vn_syncer_add_to_worklist(newvp, delay); 1540 } 1541 bp->b_xflags |= BX_VNDIRTY; 1542 tbp = TAILQ_FIRST(listheadp); 1543 if (tbp == NULL || 1544 bp->b_lblkno == 0 || 1545 (bp->b_lblkno > 0 && tbp->b_lblkno < 0) || 1546 (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) { 1547 TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); 1548 ++reassignbufsortgood; 1549 } else if (bp->b_lblkno < 0) { 1550 TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); 1551 ++reassignbufsortgood; 1552 } else if (reassignbufmethod == 1) { 1553 /* 1554 * New sorting algorithm, only handle sequential case, 1555 * otherwise append to end (but before metadata) 1556 */ 1557 if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL && 1558 (tbp->b_xflags & BX_VNDIRTY)) { 1559 /* 1560 * Found the best place to insert the buffer 1561 */ 1562 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); 1563 ++reassignbufsortgood; 1564 } else { 1565 /* 1566 * Missed, append to end, but before meta-data. 1567 * We know that the head buffer in the list is 1568 * not meta-data due to prior conditionals. 1569 * 1570 * Indirect effects: NFS second stage write 1571 * tends to wind up here, giving maximum 1572 * distance between the unstable write and the 1573 * commit rpc. 
1574 */ 1575 tbp = TAILQ_LAST(listheadp, buflists); 1576 while (tbp && tbp->b_lblkno < 0) 1577 tbp = TAILQ_PREV(tbp, buflists, b_vnbufs); 1578 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); 1579 ++reassignbufsortbad; 1580 } 1581 } else { 1582 /* 1583 * Old sorting algorithm, scan queue and insert 1584 */ 1585 struct buf *ttbp; 1586 while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && 1587 (ttbp->b_lblkno < bp->b_lblkno)) { 1588 ++reassignbufloops; 1589 tbp = ttbp; 1590 } 1591 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); 1592 } 1593 } else { 1594 bp->b_xflags |= BX_VNCLEAN; 1595 TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); 1596 if ((newvp->v_flag & VONWORKLST) && 1597 TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { 1598 newvp->v_flag &= ~VONWORKLST; 1599 LIST_REMOVE(newvp, v_synclist); 1600 } 1601 } 1602 if (bp->b_vp != newvp) { 1603 bp->b_vp = newvp; 1604 vhold(bp->b_vp); 1605 } 1606 splx(s); 1607 } 1608 1609 /* 1610 * Create a vnode for a device. 1611 * Used for mounting the root filesystem. 1612 */ 1613 int 1614 bdevvp(dev, vpp) 1615 dev_t dev; 1616 struct vnode **vpp; 1617 { 1618 register struct vnode *vp; 1619 struct vnode *nvp; 1620 int error; 1621 1622 if (dev == NODEV) { 1623 *vpp = NULLVP; 1624 return (ENXIO); 1625 } 1626 if (vfinddev(dev, VCHR, vpp)) 1627 return (0); 1628 error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp); 1629 if (error) { 1630 *vpp = NULLVP; 1631 return (error); 1632 } 1633 vp = nvp; 1634 vp->v_type = VCHR; 1635 addalias(vp, dev); 1636 *vpp = vp; 1637 return (0); 1638 } 1639 1640 /* 1641 * Add vnode to the alias list hung off the dev_t. 1642 * 1643 * The reason for this gunk is that multiple vnodes can reference 1644 * the same physical device, so checking vp->v_usecount to see 1645 * how many users there are is inadequate; the v_usecount for 1646 * the vnodes need to be accumulated. vcount() does that. 1647 */ 1648 struct vnode * 1649 addaliasu(nvp, nvp_rdev) 1650 struct vnode *nvp; 1651 udev_t nvp_rdev; 1652 { 1653 struct vnode *ovp; 1654 vop_t **ops; 1655 dev_t dev; 1656 1657 if (nvp->v_type == VBLK) 1658 return (nvp); 1659 if (nvp->v_type != VCHR) 1660 panic("addaliasu on non-special vnode"); 1661 dev = udev2dev(nvp_rdev, 0); 1662 /* 1663 * Check to see if we have a bdevvp vnode with no associated 1664 * filesystem. If so, we want to associate the filesystem of 1665 * the new newly instigated vnode with the bdevvp vnode and 1666 * discard the newly created vnode rather than leaving the 1667 * bdevvp vnode lying around with no associated filesystem. 1668 */ 1669 if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) { 1670 addalias(nvp, dev); 1671 return (nvp); 1672 } 1673 /* 1674 * Discard unneeded vnode, but save its node specific data. 1675 * Note that if there is a lock, it is carried over in the 1676 * node specific data to the replacement vnode. 
1677 */ 1678 vref(ovp); 1679 ovp->v_data = nvp->v_data; 1680 ovp->v_tag = nvp->v_tag; 1681 nvp->v_data = NULL; 1682 lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg, 1683 nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK); 1684 if (nvp->v_vnlock) 1685 ovp->v_vnlock = &ovp->v_lock; 1686 ops = ovp->v_op; 1687 ovp->v_op = nvp->v_op; 1688 if (VOP_ISLOCKED(nvp, curthread)) { 1689 VOP_UNLOCK(nvp, 0, curthread); 1690 vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curthread); 1691 } 1692 nvp->v_op = ops; 1693 insmntque(ovp, nvp->v_mount); 1694 vrele(nvp); 1695 vgone(nvp); 1696 return (ovp); 1697 } 1698 1699 /* This is a local helper function that do the same as addaliasu, but for a 1700 * dev_t instead of an udev_t. */ 1701 static void 1702 addalias(nvp, dev) 1703 struct vnode *nvp; 1704 dev_t dev; 1705 { 1706 1707 KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode")); 1708 nvp->v_rdev = dev; 1709 mtx_lock(&spechash_mtx); 1710 SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext); 1711 mtx_unlock(&spechash_mtx); 1712 } 1713 1714 /* 1715 * Grab a particular vnode from the free list, increment its 1716 * reference count and lock it. The vnode lock bit is set if the 1717 * vnode is being eliminated in vgone. The process is awakened 1718 * when the transition is completed, and an error returned to 1719 * indicate that the vnode is no longer usable (possibly having 1720 * been changed to a new filesystem type). 1721 */ 1722 int 1723 vget(vp, flags, td) 1724 register struct vnode *vp; 1725 int flags; 1726 struct thread *td; 1727 { 1728 int error; 1729 1730 /* 1731 * If the vnode is in the process of being cleaned out for 1732 * another use, we wait for the cleaning to finish and then 1733 * return failure. Cleaning is determined by checking that 1734 * the VXLOCK flag is set. 1735 */ 1736 if ((flags & LK_INTERLOCK) == 0) 1737 mtx_lock(&vp->v_interlock); 1738 if (vp->v_flag & VXLOCK) { 1739 if (vp->v_vxproc == curthread) { 1740 #if 0 1741 /* this can now occur in normal operation */ 1742 log(LOG_INFO, "VXLOCK interlock avoided\n"); 1743 #endif 1744 } else { 1745 vp->v_flag |= VXWANT; 1746 msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP, 1747 "vget", 0); 1748 return (ENOENT); 1749 } 1750 } 1751 1752 vp->v_usecount++; 1753 1754 if (VSHOULDBUSY(vp)) 1755 vbusy(vp); 1756 if (flags & LK_TYPE_MASK) { 1757 if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) { 1758 /* 1759 * must expand vrele here because we do not want 1760 * to call VOP_INACTIVE if the reference count 1761 * drops back to zero since it was never really 1762 * active. We must remove it from the free list 1763 * before sleeping so that multiple processes do 1764 * not try to recycle it. 1765 */ 1766 mtx_lock(&vp->v_interlock); 1767 vp->v_usecount--; 1768 if (VSHOULDFREE(vp)) 1769 vfree(vp); 1770 else 1771 vlruvp(vp); 1772 mtx_unlock(&vp->v_interlock); 1773 } 1774 return (error); 1775 } 1776 mtx_unlock(&vp->v_interlock); 1777 return (0); 1778 } 1779 1780 /* 1781 * Increase the reference count of a vnode. 1782 */ 1783 void 1784 vref(struct vnode *vp) 1785 { 1786 mtx_lock(&vp->v_interlock); 1787 vp->v_usecount++; 1788 mtx_unlock(&vp->v_interlock); 1789 } 1790 1791 /* 1792 * Vnode put/release. 1793 * If count drops to zero, call inactive routine and return to freelist. 
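 *
 * The usual pairings, as a hypothetical fragment with error handling
 * elided: vput() is the shorthand for callers holding the vnode lock,
 * vrele() for callers holding only a reference.
 *
 *	if (vget(vp, LK_EXCLUSIVE, td) == 0) {
 *		... operate on the locked, referenced vnode ...
 *		vput(vp);
 *	}
 *
 *	vref(vp);
 *	... use the vnode without holding its lock ...
 *	vrele(vp);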
1794 */ 1795 void 1796 vrele(vp) 1797 struct vnode *vp; 1798 { 1799 struct thread *td = curthread; /* XXX */ 1800 1801 KASSERT(vp != NULL, ("vrele: null vp")); 1802 1803 mtx_lock(&vp->v_interlock); 1804 1805 /* Skip this v_writecount check if we're going to panic below. */ 1806 KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, 1807 ("vrele: missed vn_close")); 1808 1809 if (vp->v_usecount > 1) { 1810 1811 vp->v_usecount--; 1812 mtx_unlock(&vp->v_interlock); 1813 1814 return; 1815 } 1816 1817 if (vp->v_usecount == 1) { 1818 vp->v_usecount--; 1819 /* 1820 * We must call VOP_INACTIVE with the node locked. 1821 * If we are doing a vput, the node is already locked, 1822 * but, in the case of vrele, we must explicitly lock 1823 * the vnode before calling VOP_INACTIVE. 1824 */ 1825 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) 1826 VOP_INACTIVE(vp, td); 1827 if (VSHOULDFREE(vp)) 1828 vfree(vp); 1829 else 1830 vlruvp(vp); 1831 1832 } else { 1833 #ifdef DIAGNOSTIC 1834 vprint("vrele: negative ref count", vp); 1835 mtx_unlock(&vp->v_interlock); 1836 #endif 1837 panic("vrele: negative ref cnt"); 1838 } 1839 } 1840 1841 /* 1842 * Release an already locked vnode. This give the same effects as 1843 * unlock+vrele(), but takes less time and avoids releasing and 1844 * re-aquiring the lock (as vrele() aquires the lock internally.) 1845 */ 1846 void 1847 vput(vp) 1848 struct vnode *vp; 1849 { 1850 struct thread *td = curthread; /* XXX */ 1851 1852 GIANT_REQUIRED; 1853 1854 KASSERT(vp != NULL, ("vput: null vp")); 1855 mtx_lock(&vp->v_interlock); 1856 /* Skip this v_writecount check if we're going to panic below. */ 1857 KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, 1858 ("vput: missed vn_close")); 1859 1860 if (vp->v_usecount > 1) { 1861 vp->v_usecount--; 1862 VOP_UNLOCK(vp, LK_INTERLOCK, td); 1863 return; 1864 } 1865 1866 if (vp->v_usecount == 1) { 1867 vp->v_usecount--; 1868 /* 1869 * We must call VOP_INACTIVE with the node locked. 1870 * If we are doing a vput, the node is already locked, 1871 * so we just need to release the vnode mutex. 1872 */ 1873 mtx_unlock(&vp->v_interlock); 1874 VOP_INACTIVE(vp, td); 1875 if (VSHOULDFREE(vp)) 1876 vfree(vp); 1877 else 1878 vlruvp(vp); 1879 1880 } else { 1881 #ifdef DIAGNOSTIC 1882 vprint("vput: negative ref count", vp); 1883 #endif 1884 panic("vput: negative ref cnt"); 1885 } 1886 } 1887 1888 /* 1889 * Somebody doesn't want the vnode recycled. 1890 */ 1891 void 1892 vhold(vp) 1893 register struct vnode *vp; 1894 { 1895 int s; 1896 1897 s = splbio(); 1898 vp->v_holdcnt++; 1899 if (VSHOULDBUSY(vp)) 1900 vbusy(vp); 1901 splx(s); 1902 } 1903 1904 /* 1905 * Note that there is one less who cares about this vnode. vdrop() is the 1906 * opposite of vhold(). 1907 */ 1908 void 1909 vdrop(vp) 1910 register struct vnode *vp; 1911 { 1912 int s; 1913 1914 s = splbio(); 1915 if (vp->v_holdcnt <= 0) 1916 panic("vdrop: holdcnt"); 1917 vp->v_holdcnt--; 1918 if (VSHOULDFREE(vp)) 1919 vfree(vp); 1920 else 1921 vlruvp(vp); 1922 splx(s); 1923 } 1924 1925 /* 1926 * Remove any vnodes in the vnode table belonging to mount point mp. 1927 * 1928 * If FORCECLOSE is not specified, there should not be any active ones, 1929 * return error if any are found (nb: this is a user error, not a 1930 * system error). If FORCECLOSE is specified, detach any active vnodes 1931 * that are found. 1932 * 1933 * If WRITECLOSE is set, only flush out regular file vnodes open for 1934 * writing. 1935 * 1936 * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped. 
1937 * 1938 * `rootrefs' specifies the base reference count for the root vnode 1939 * of this filesystem. The root vnode is considered busy if its 1940 * v_usecount exceeds this value. On a successful return, vflush() 1941 * will call vrele() on the root vnode exactly rootrefs times. 1942 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 1943 * be zero. 1944 */ 1945 #ifdef DIAGNOSTIC 1946 static int busyprt = 0; /* print out busy vnodes */ 1947 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); 1948 #endif 1949 1950 int 1951 vflush(mp, rootrefs, flags) 1952 struct mount *mp; 1953 int rootrefs; 1954 int flags; 1955 { 1956 struct thread *td = curthread; /* XXX */ 1957 struct vnode *vp, *nvp, *rootvp = NULL; 1958 struct vattr vattr; 1959 int busy = 0, error; 1960 1961 if (rootrefs > 0) { 1962 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 1963 ("vflush: bad args")); 1964 /* 1965 * Get the filesystem root vnode. We can vput() it 1966 * immediately, since with rootrefs > 0, it won't go away. 1967 */ 1968 if ((error = VFS_ROOT(mp, &rootvp)) != 0) 1969 return (error); 1970 vput(rootvp); 1971 } 1972 mtx_lock(&mntvnode_mtx); 1973 loop: 1974 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) { 1975 /* 1976 * Make sure this vnode wasn't reclaimed in getnewvnode(). 1977 * Start over if it has (it won't be on the list anymore). 1978 */ 1979 if (vp->v_mount != mp) 1980 goto loop; 1981 nvp = TAILQ_NEXT(vp, v_nmntvnodes); 1982 1983 mtx_unlock(&mntvnode_mtx); 1984 mtx_lock(&vp->v_interlock); 1985 /* 1986 * Skip over a vnodes marked VSYSTEM. 1987 */ 1988 if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { 1989 mtx_unlock(&vp->v_interlock); 1990 mtx_lock(&mntvnode_mtx); 1991 continue; 1992 } 1993 /* 1994 * If WRITECLOSE is set, flush out unlinked but still open 1995 * files (even if open only for reading) and regular file 1996 * vnodes open for writing. 1997 */ 1998 if ((flags & WRITECLOSE) && 1999 (vp->v_type == VNON || 2000 (VOP_GETATTR(vp, &vattr, td->td_ucred, td) == 0 && 2001 vattr.va_nlink > 0)) && 2002 (vp->v_writecount == 0 || vp->v_type != VREG)) { 2003 mtx_unlock(&vp->v_interlock); 2004 mtx_lock(&mntvnode_mtx); 2005 continue; 2006 } 2007 2008 /* 2009 * With v_usecount == 0, all we need to do is clear out the 2010 * vnode data structures and we are done. 2011 */ 2012 if (vp->v_usecount == 0) { 2013 vgonel(vp, td); 2014 mtx_lock(&mntvnode_mtx); 2015 continue; 2016 } 2017 2018 /* 2019 * If FORCECLOSE is set, forcibly close the vnode. For block 2020 * or character devices, revert to an anonymous device. For 2021 * all other files, just kill them. 2022 */ 2023 if (flags & FORCECLOSE) { 2024 if (vp->v_type != VCHR) { 2025 vgonel(vp, td); 2026 } else { 2027 vclean(vp, 0, td); 2028 vp->v_op = spec_vnodeop_p; 2029 insmntque(vp, (struct mount *) 0); 2030 } 2031 mtx_lock(&mntvnode_mtx); 2032 continue; 2033 } 2034 #ifdef DIAGNOSTIC 2035 if (busyprt) 2036 vprint("vflush: busy vnode", vp); 2037 #endif 2038 mtx_unlock(&vp->v_interlock); 2039 mtx_lock(&mntvnode_mtx); 2040 busy++; 2041 } 2042 mtx_unlock(&mntvnode_mtx); 2043 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 2044 /* 2045 * If just the root vnode is busy, and if its refcount 2046 * is equal to `rootrefs', then go ahead and kill it. 
2047 */ 2048 mtx_lock(&rootvp->v_interlock); 2049 KASSERT(busy > 0, ("vflush: not busy")); 2050 KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs")); 2051 if (busy == 1 && rootvp->v_usecount == rootrefs) { 2052 vgonel(rootvp, td); 2053 busy = 0; 2054 } else 2055 mtx_unlock(&rootvp->v_interlock); 2056 } 2057 if (busy) 2058 return (EBUSY); 2059 for (; rootrefs > 0; rootrefs--) 2060 vrele(rootvp); 2061 return (0); 2062 } 2063 2064 /* 2065 * This moves a now (likely recyclable) vnode to the end of the 2066 * mountlist. XXX However, it is temporarily disabled until we 2067 * can clean up ffs_sync() and friends, which have loop restart 2068 * conditions which this code causes to operate O(N^2). 2069 */ 2070 static void 2071 vlruvp(struct vnode *vp) 2072 { 2073 #if 0 2074 struct mount *mp; 2075 2076 if ((mp = vp->v_mount) != NULL) { 2077 mtx_lock(&mntvnode_mtx); 2078 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2079 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2080 mtx_unlock(&mntvnode_mtx); 2081 } 2082 #endif 2083 } 2084 2085 /* 2086 * Disassociate the underlying filesystem from a vnode. 2087 */ 2088 static void 2089 vclean(vp, flags, td) 2090 struct vnode *vp; 2091 int flags; 2092 struct thread *td; 2093 { 2094 int active; 2095 2096 /* 2097 * Check to see if the vnode is in use. If so we have to reference it 2098 * before we clean it out so that its count cannot fall to zero and 2099 * generate a race against ourselves to recycle it. 2100 */ 2101 if ((active = vp->v_usecount)) 2102 vp->v_usecount++; 2103 2104 /* 2105 * Prevent the vnode from being recycled or brought into use while we 2106 * clean it out. 2107 */ 2108 if (vp->v_flag & VXLOCK) 2109 panic("vclean: deadlock"); 2110 vp->v_flag |= VXLOCK; 2111 vp->v_vxproc = curthread; 2112 /* 2113 * Even if the count is zero, the VOP_INACTIVE routine may still 2114 * have the object locked while it cleans it out. The VOP_LOCK 2115 * ensures that the VOP_INACTIVE routine is done with its work. 2116 * For active vnodes, it ensures that no other activity can 2117 * occur while the underlying object is being cleaned out. 2118 */ 2119 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td); 2120 2121 /* 2122 * Clean out any buffers associated with the vnode. 2123 * If the flush fails, just toss the buffers. 2124 */ 2125 if (flags & DOCLOSE) { 2126 if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL) 2127 (void) vn_write_suspend_wait(vp, NULL, V_WAIT); 2128 if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0) 2129 vinvalbuf(vp, 0, NOCRED, td, 0, 0); 2130 } 2131 2132 VOP_DESTROYVOBJECT(vp); 2133 2134 /* 2135 * If purging an active vnode, it must be closed and 2136 * deactivated before being reclaimed. Note that the 2137 * VOP_INACTIVE will unlock the vnode. 2138 */ 2139 if (active) { 2140 if (flags & DOCLOSE) 2141 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 2142 VOP_INACTIVE(vp, td); 2143 } else { 2144 /* 2145 * Any other processes trying to obtain this lock must first 2146 * wait for VXLOCK to clear, then call the new lock operation. 2147 */ 2148 VOP_UNLOCK(vp, 0, td); 2149 } 2150 /* 2151 * Reclaim the vnode. 2152 */ 2153 if (VOP_RECLAIM(vp, td)) 2154 panic("vclean: cannot reclaim"); 2155 2156 if (active) { 2157 /* 2158 * Inline copy of vrele() since VOP_INACTIVE 2159 * has already been called. 
2160 */ 2161 mtx_lock(&vp->v_interlock); 2162 if (--vp->v_usecount <= 0) { 2163 #ifdef DIAGNOSTIC 2164 if (vp->v_usecount < 0 || vp->v_writecount != 0) { 2165 vprint("vclean: bad ref count", vp); 2166 panic("vclean: ref cnt"); 2167 } 2168 #endif 2169 vfree(vp); 2170 } 2171 mtx_unlock(&vp->v_interlock); 2172 } 2173 2174 cache_purge(vp); 2175 vp->v_vnlock = NULL; 2176 lockdestroy(&vp->v_lock); 2177 2178 if (VSHOULDFREE(vp)) 2179 vfree(vp); 2180 2181 /* 2182 * Done with purge, notify sleepers of the grim news. 2183 */ 2184 vp->v_op = dead_vnodeop_p; 2185 if (vp->v_pollinfo != NULL) 2186 vn_pollgone(vp); 2187 vp->v_tag = VT_NON; 2188 vp->v_flag &= ~VXLOCK; 2189 vp->v_vxproc = NULL; 2190 if (vp->v_flag & VXWANT) { 2191 vp->v_flag &= ~VXWANT; 2192 wakeup((caddr_t) vp); 2193 } 2194 } 2195 2196 /* 2197 * Eliminate all activity associated with the requested vnode 2198 * and with all vnodes aliased to the requested vnode. 2199 */ 2200 int 2201 vop_revoke(ap) 2202 struct vop_revoke_args /* { 2203 struct vnode *a_vp; 2204 int a_flags; 2205 } */ *ap; 2206 { 2207 struct vnode *vp, *vq; 2208 dev_t dev; 2209 2210 KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke")); 2211 2212 vp = ap->a_vp; 2213 /* 2214 * If a vgone (or vclean) is already in progress, 2215 * wait until it is done and return. 2216 */ 2217 if (vp->v_flag & VXLOCK) { 2218 vp->v_flag |= VXWANT; 2219 msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP, 2220 "vop_revokeall", 0); 2221 return (0); 2222 } 2223 dev = vp->v_rdev; 2224 for (;;) { 2225 mtx_lock(&spechash_mtx); 2226 vq = SLIST_FIRST(&dev->si_hlist); 2227 mtx_unlock(&spechash_mtx); 2228 if (!vq) 2229 break; 2230 vgone(vq); 2231 } 2232 return (0); 2233 } 2234 2235 /* 2236 * Recycle an unused vnode to the front of the free list. 2237 * Release the passed interlock if the vnode will be recycled. 2238 */ 2239 int 2240 vrecycle(vp, inter_lkp, td) 2241 struct vnode *vp; 2242 struct mtx *inter_lkp; 2243 struct thread *td; 2244 { 2245 2246 mtx_lock(&vp->v_interlock); 2247 if (vp->v_usecount == 0) { 2248 if (inter_lkp) { 2249 mtx_unlock(inter_lkp); 2250 } 2251 vgonel(vp, td); 2252 return (1); 2253 } 2254 mtx_unlock(&vp->v_interlock); 2255 return (0); 2256 } 2257 2258 /* 2259 * Eliminate all activity associated with a vnode 2260 * in preparation for reuse. 2261 */ 2262 void 2263 vgone(vp) 2264 register struct vnode *vp; 2265 { 2266 struct thread *td = curthread; /* XXX */ 2267 2268 mtx_lock(&vp->v_interlock); 2269 vgonel(vp, td); 2270 } 2271 2272 /* 2273 * vgone, with the vp interlock held. 2274 */ 2275 void 2276 vgonel(vp, td) 2277 struct vnode *vp; 2278 struct thread *td; 2279 { 2280 int s; 2281 2282 /* 2283 * If a vgone (or vclean) is already in progress, 2284 * wait until it is done and return. 2285 */ 2286 if (vp->v_flag & VXLOCK) { 2287 vp->v_flag |= VXWANT; 2288 msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP, 2289 "vgone", 0); 2290 return; 2291 } 2292 2293 /* 2294 * Clean out the filesystem specific data. 2295 */ 2296 vclean(vp, DOCLOSE, td); 2297 mtx_lock(&vp->v_interlock); 2298 2299 /* 2300 * Delete from old mount point vnode list, if on one. 2301 */ 2302 if (vp->v_mount != NULL) 2303 insmntque(vp, (struct mount *)0); 2304 /* 2305 * If special device, remove it from special device alias list 2306 * if it is on one. 
2307 */ 2308 if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) { 2309 mtx_lock(&spechash_mtx); 2310 SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext); 2311 freedev(vp->v_rdev); 2312 mtx_unlock(&spechash_mtx); 2313 vp->v_rdev = NULL; 2314 } 2315 2316 /* 2317 * If it is on the freelist and not already at the head, 2318 * move it to the head of the list. The test of the 2319 * VDOOMED flag and the reference count of zero is because 2320 * it will be removed from the free list by getnewvnode, 2321 * but will not have its reference count incremented until 2322 * after calling vgone. If the reference count were 2323 * incremented first, vgone would (incorrectly) try to 2324 * close the previous instance of the underlying object. 2325 */ 2326 if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) { 2327 s = splbio(); 2328 mtx_lock(&vnode_free_list_mtx); 2329 if (vp->v_flag & VFREE) 2330 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 2331 else 2332 freevnodes++; 2333 vp->v_flag |= VFREE; 2334 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 2335 mtx_unlock(&vnode_free_list_mtx); 2336 splx(s); 2337 } 2338 2339 vp->v_type = VBAD; 2340 mtx_unlock(&vp->v_interlock); 2341 } 2342 2343 /* 2344 * Lookup a vnode by device number. 2345 */ 2346 int 2347 vfinddev(dev, type, vpp) 2348 dev_t dev; 2349 enum vtype type; 2350 struct vnode **vpp; 2351 { 2352 struct vnode *vp; 2353 2354 mtx_lock(&spechash_mtx); 2355 SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) { 2356 if (type == vp->v_type) { 2357 *vpp = vp; 2358 mtx_unlock(&spechash_mtx); 2359 return (1); 2360 } 2361 } 2362 mtx_unlock(&spechash_mtx); 2363 return (0); 2364 } 2365 2366 /* 2367 * Calculate the total number of references to a special device. 2368 */ 2369 int 2370 vcount(vp) 2371 struct vnode *vp; 2372 { 2373 struct vnode *vq; 2374 int count; 2375 2376 count = 0; 2377 mtx_lock(&spechash_mtx); 2378 SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext) 2379 count += vq->v_usecount; 2380 mtx_unlock(&spechash_mtx); 2381 return (count); 2382 } 2383 2384 /* 2385 * Same as above, but using the dev_t as argument 2386 */ 2387 int 2388 count_dev(dev) 2389 dev_t dev; 2390 { 2391 struct vnode *vp; 2392 2393 vp = SLIST_FIRST(&dev->si_hlist); 2394 if (vp == NULL) 2395 return (0); 2396 return(vcount(vp)); 2397 } 2398 2399 /* 2400 * Print out a description of a vnode. 
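 *
 * With made-up values, a call such as vprint("vrele", vp) produces
 * output of the form
 *
 *	vrele: 0xc1234567: type VREG, usecount 1, writecount 0, refcount 2, flags (VOBJBUF)
 *
 * followed, when v_data is set, by an indented line from VOP_PRINT().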
2401 */ 2402 static char *typename[] = 2403 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; 2404 2405 void 2406 vprint(label, vp) 2407 char *label; 2408 struct vnode *vp; 2409 { 2410 char buf[96]; 2411 2412 if (label != NULL) 2413 printf("%s: %p: ", label, (void *)vp); 2414 else 2415 printf("%p: ", (void *)vp); 2416 printf("type %s, usecount %d, writecount %d, refcount %d,", 2417 typename[vp->v_type], vp->v_usecount, vp->v_writecount, 2418 vp->v_holdcnt); 2419 buf[0] = '\0'; 2420 if (vp->v_flag & VROOT) 2421 strcat(buf, "|VROOT"); 2422 if (vp->v_flag & VTEXT) 2423 strcat(buf, "|VTEXT"); 2424 if (vp->v_flag & VSYSTEM) 2425 strcat(buf, "|VSYSTEM"); 2426 if (vp->v_flag & VXLOCK) 2427 strcat(buf, "|VXLOCK"); 2428 if (vp->v_flag & VXWANT) 2429 strcat(buf, "|VXWANT"); 2430 if (vp->v_flag & VBWAIT) 2431 strcat(buf, "|VBWAIT"); 2432 if (vp->v_flag & VDOOMED) 2433 strcat(buf, "|VDOOMED"); 2434 if (vp->v_flag & VFREE) 2435 strcat(buf, "|VFREE"); 2436 if (vp->v_flag & VOBJBUF) 2437 strcat(buf, "|VOBJBUF"); 2438 if (buf[0] != '\0') 2439 printf(" flags (%s)", &buf[1]); 2440 if (vp->v_data == NULL) { 2441 printf("\n"); 2442 } else { 2443 printf("\n\t"); 2444 VOP_PRINT(vp); 2445 } 2446 } 2447 2448 #ifdef DDB 2449 #include <ddb/ddb.h> 2450 /* 2451 * List all of the locked vnodes in the system. 2452 * Called when debugging the kernel. 2453 */ 2454 DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) 2455 { 2456 struct thread *td = curthread; /* XXX */ 2457 struct mount *mp, *nmp; 2458 struct vnode *vp; 2459 2460 printf("Locked vnodes\n"); 2461 mtx_lock(&mountlist_mtx); 2462 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2463 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { 2464 nmp = TAILQ_NEXT(mp, mnt_list); 2465 continue; 2466 } 2467 mtx_lock(&mntvnode_mtx); 2468 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 2469 if (VOP_ISLOCKED(vp, NULL)) 2470 vprint((char *)0, vp); 2471 } 2472 mtx_unlock(&mntvnode_mtx); 2473 mtx_lock(&mountlist_mtx); 2474 nmp = TAILQ_NEXT(mp, mnt_list); 2475 vfs_unbusy(mp, td); 2476 } 2477 mtx_unlock(&mountlist_mtx); 2478 } 2479 #endif 2480 2481 /* 2482 * Top level filesystem related information gathering. 2483 */ 2484 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 2485 2486 static int 2487 vfs_sysctl(SYSCTL_HANDLER_ARGS) 2488 { 2489 int *name = (int *)arg1 - 1; /* XXX */ 2490 u_int namelen = arg2 + 1; /* XXX */ 2491 struct vfsconf *vfsp; 2492 2493 #if 1 || defined(COMPAT_PRELITE2) 2494 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ 2495 if (namelen == 1) 2496 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 2497 #endif 2498 2499 /* XXX the below code does not compile; vfs_sysctl does not exist. 
*/ 2500 #ifdef notyet 2501 /* all sysctl names at this level are at least name and field */ 2502 if (namelen < 2) 2503 return (ENOTDIR); /* overloaded */ 2504 if (name[0] != VFS_GENERIC) { 2505 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2506 if (vfsp->vfc_typenum == name[0]) 2507 break; 2508 if (vfsp == NULL) 2509 return (EOPNOTSUPP); 2510 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, 2511 oldp, oldlenp, newp, newlen, td)); 2512 } 2513 #endif 2514 switch (name[1]) { 2515 case VFS_MAXTYPENUM: 2516 if (namelen != 2) 2517 return (ENOTDIR); 2518 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 2519 case VFS_CONF: 2520 if (namelen != 3) 2521 return (ENOTDIR); /* overloaded */ 2522 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2523 if (vfsp->vfc_typenum == name[2]) 2524 break; 2525 if (vfsp == NULL) 2526 return (EOPNOTSUPP); 2527 return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); 2528 } 2529 return (EOPNOTSUPP); 2530 } 2531 2532 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, 2533 "Generic filesystem"); 2534 2535 #if 1 || defined(COMPAT_PRELITE2) 2536 2537 static int 2538 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 2539 { 2540 int error; 2541 struct vfsconf *vfsp; 2542 struct ovfsconf ovfs; 2543 2544 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { 2545 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 2546 strcpy(ovfs.vfc_name, vfsp->vfc_name); 2547 ovfs.vfc_index = vfsp->vfc_typenum; 2548 ovfs.vfc_refcount = vfsp->vfc_refcount; 2549 ovfs.vfc_flags = vfsp->vfc_flags; 2550 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 2551 if (error) 2552 return error; 2553 } 2554 return 0; 2555 } 2556 2557 #endif /* 1 || COMPAT_PRELITE2 */ 2558 2559 #if COMPILING_LINT 2560 #define KINFO_VNODESLOP 10 2561 /* 2562 * Dump vnode list (via sysctl). 2563 * Copyout address of vnode followed by vnode. 2564 */ 2565 /* ARGSUSED */ 2566 static int 2567 sysctl_vnode(SYSCTL_HANDLER_ARGS) 2568 { 2569 struct thread *td = curthread; /* XXX */ 2570 struct mount *mp, *nmp; 2571 struct vnode *nvp, *vp; 2572 int error; 2573 2574 #define VPTRSZ sizeof (struct vnode *) 2575 #define VNODESZ sizeof (struct vnode) 2576 2577 req->lock = 0; 2578 if (!req->oldptr) /* Make an estimate */ 2579 return (SYSCTL_OUT(req, 0, 2580 (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); 2581 2582 mtx_lock(&mountlist_mtx); 2583 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2584 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { 2585 nmp = TAILQ_NEXT(mp, mnt_list); 2586 continue; 2587 } 2588 mtx_lock(&mntvnode_mtx); 2589 again: 2590 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); 2591 vp != NULL; 2592 vp = nvp) { 2593 /* 2594 * Check that the vp is still associated with 2595 * this filesystem. RACE: could have been 2596 * recycled onto the same filesystem. 2597 */ 2598 if (vp->v_mount != mp) 2599 goto again; 2600 nvp = TAILQ_NEXT(vp, v_nmntvnodes); 2601 mtx_unlock(&mntvnode_mtx); 2602 if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || 2603 (error = SYSCTL_OUT(req, vp, VNODESZ))) 2604 return (error); 2605 mtx_lock(&mntvnode_mtx); 2606 } 2607 mtx_unlock(&mntvnode_mtx); 2608 mtx_lock(&mountlist_mtx); 2609 nmp = TAILQ_NEXT(mp, mnt_list); 2610 vfs_unbusy(mp, td); 2611 } 2612 mtx_unlock(&mountlist_mtx); 2613 2614 return (0); 2615 } 2616 2617 /* 2618 * XXX 2619 * Exporting the vnode list on large systems causes them to crash. 2620 * Exporting the vnode list on medium systems causes sysctl to coredump. 
2621 */ 2622 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 2623 0, 0, sysctl_vnode, "S,vnode", ""); 2624 #endif 2625 2626 /* 2627 * Check to see if a filesystem is mounted on a block device. 2628 */ 2629 int 2630 vfs_mountedon(vp) 2631 struct vnode *vp; 2632 { 2633 2634 if (vp->v_rdev->si_mountpoint != NULL) 2635 return (EBUSY); 2636 return (0); 2637 } 2638 2639 /* 2640 * Unmount all filesystems. The list is traversed in reverse order 2641 * of mounting to avoid dependencies. 2642 */ 2643 void 2644 vfs_unmountall() 2645 { 2646 struct mount *mp; 2647 struct thread *td; 2648 int error; 2649 2650 if (curthread != NULL) 2651 td = curthread; 2652 else 2653 td = FIRST_THREAD_IN_PROC(initproc); /* XXX XXX proc0? */ 2654 /* 2655 * Since this only runs when rebooting, it is not interlocked. 2656 */ 2657 while(!TAILQ_EMPTY(&mountlist)) { 2658 mp = TAILQ_LAST(&mountlist, mntlist); 2659 error = dounmount(mp, MNT_FORCE, td); 2660 if (error) { 2661 TAILQ_REMOVE(&mountlist, mp, mnt_list); 2662 printf("unmount of %s failed (", 2663 mp->mnt_stat.f_mntonname); 2664 if (error == EBUSY) 2665 printf("BUSY)\n"); 2666 else 2667 printf("%d)\n", error); 2668 } else { 2669 /* The unmount has removed mp from the mountlist */ 2670 } 2671 } 2672 } 2673 2674 /* 2675 * perform msync on all vnodes under a mount point 2676 * the mount point must be locked. 2677 */ 2678 void 2679 vfs_msync(struct mount *mp, int flags) 2680 { 2681 struct vnode *vp, *nvp; 2682 struct vm_object *obj; 2683 int tries; 2684 2685 GIANT_REQUIRED; 2686 2687 tries = 5; 2688 mtx_lock(&mntvnode_mtx); 2689 loop: 2690 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) { 2691 if (vp->v_mount != mp) { 2692 if (--tries > 0) 2693 goto loop; 2694 break; 2695 } 2696 nvp = TAILQ_NEXT(vp, v_nmntvnodes); 2697 2698 if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */ 2699 continue; 2700 2701 if (vp->v_flag & VNOSYNC) /* unlinked, skip it */ 2702 continue; 2703 2704 if ((vp->v_flag & VOBJDIRTY) && 2705 (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) { 2706 mtx_unlock(&mntvnode_mtx); 2707 if (!vget(vp, 2708 LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curthread)) { 2709 if (VOP_GETVOBJECT(vp, &obj) == 0) { 2710 vm_object_page_clean(obj, 0, 0, 2711 flags == MNT_WAIT ? 2712 OBJPC_SYNC : OBJPC_NOSYNC); 2713 } 2714 vput(vp); 2715 } 2716 mtx_lock(&mntvnode_mtx); 2717 if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) { 2718 if (--tries > 0) 2719 goto loop; 2720 break; 2721 } 2722 } 2723 } 2724 mtx_unlock(&mntvnode_mtx); 2725 } 2726 2727 /* 2728 * Create the VM object needed for VMIO and mmap support. This 2729 * is done for all VREG files in the system. Some filesystems might 2730 * afford the additional metadata buffering capability of the 2731 * VMIO code by making the device node be VMIO mode also. 2732 * 2733 * vp must be locked when vfs_object_create is called. 2734 */ 2735 int 2736 vfs_object_create(vp, td, cred) 2737 struct vnode *vp; 2738 struct thread *td; 2739 struct ucred *cred; 2740 { 2741 GIANT_REQUIRED; 2742 return (VOP_CREATEVOBJECT(vp, cred, td)); 2743 } 2744 2745 /* 2746 * Mark a vnode as free, putting it up for recycling. 
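 *
 * The usual pattern in this file is to guard the call with the
 * VSHOULDFREE() test, as vrele() and vdrop() do above:
 *
 *	if (VSHOULDFREE(vp))
 *		vfree(vp);
 *	else
 *		vlruvp(vp);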
2747 */ 2748 void 2749 vfree(vp) 2750 struct vnode *vp; 2751 { 2752 int s; 2753 2754 s = splbio(); 2755 mtx_lock(&vnode_free_list_mtx); 2756 KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free")); 2757 if (vp->v_flag & VAGE) { 2758 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 2759 } else { 2760 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 2761 } 2762 freevnodes++; 2763 mtx_unlock(&vnode_free_list_mtx); 2764 vp->v_flag &= ~VAGE; 2765 vp->v_flag |= VFREE; 2766 splx(s); 2767 } 2768 2769 /* 2770 * Opposite of vfree() - mark a vnode as in use. 2771 */ 2772 void 2773 vbusy(vp) 2774 struct vnode *vp; 2775 { 2776 int s; 2777 2778 s = splbio(); 2779 mtx_lock(&vnode_free_list_mtx); 2780 KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free")); 2781 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 2782 freevnodes--; 2783 mtx_unlock(&vnode_free_list_mtx); 2784 vp->v_flag &= ~(VFREE|VAGE); 2785 splx(s); 2786 } 2787 2788 /* 2789 * Record a process's interest in events which might happen to 2790 * a vnode. Because poll uses the historic select-style interface 2791 * internally, this routine serves as both the ``check for any 2792 * pending events'' and the ``record my interest in future events'' 2793 * functions. (These are done together, while the lock is held, 2794 * to avoid race conditions.) 2795 */ 2796 int 2797 vn_pollrecord(vp, td, events) 2798 struct vnode *vp; 2799 struct thread *td; 2800 short events; 2801 { 2802 2803 if (vp->v_pollinfo == NULL) 2804 v_addpollinfo(vp); 2805 mtx_lock(&vp->v_pollinfo->vpi_lock); 2806 if (vp->v_pollinfo->vpi_revents & events) { 2807 /* 2808 * This leaves events we are not interested 2809 * in available for the other process which 2810 * presumably had requested them 2811 * (otherwise they would never have been 2812 * recorded). 2813 */ 2814 events &= vp->v_pollinfo->vpi_revents; 2815 vp->v_pollinfo->vpi_revents &= ~events; 2816 2817 mtx_unlock(&vp->v_pollinfo->vpi_lock); 2818 return events; 2819 } 2820 vp->v_pollinfo->vpi_events |= events; 2821 selrecord(td, &vp->v_pollinfo->vpi_selinfo); 2822 mtx_unlock(&vp->v_pollinfo->vpi_lock); 2823 return 0; 2824 } 2825 2826 /* 2827 * Note the occurrence of an event. If the VN_POLLEVENT macro is used, 2828 * it is possible for us to miss an event due to race conditions, but 2829 * that condition is expected to be rare, so for the moment it is the 2830 * preferred interface. 2831 */ 2832 void 2833 vn_pollevent(vp, events) 2834 struct vnode *vp; 2835 short events; 2836 { 2837 2838 if (vp->v_pollinfo == NULL) 2839 v_addpollinfo(vp); 2840 mtx_lock(&vp->v_pollinfo->vpi_lock); 2841 if (vp->v_pollinfo->vpi_events & events) { 2842 /* 2843 * We clear vpi_events so that we don't 2844 * call selwakeup() twice if two events are 2845 * posted before the polling process(es) is 2846 * awakened. This also ensures that we take at 2847 * most one selwakeup() if the polling process 2848 * is no longer interested. However, it does 2849 * mean that only one event can be noticed at 2850 * a time. (Perhaps we should only clear those 2851 * event bits which we note?) XXX 2852 */ 2853 vp->v_pollinfo->vpi_events = 0; /* &= ~events ??? */ 2854 vp->v_pollinfo->vpi_revents |= events; 2855 selwakeup(&vp->v_pollinfo->vpi_selinfo); 2856 } 2857 mtx_unlock(&vp->v_pollinfo->vpi_lock); 2858 } 2859 2860 /* 2861 * Wake up anyone polling on vp because it is being revoked. 2862 * This depends on dead_poll() returning POLLHUP for correct 2863 * behavior.
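 *
 * vclean(), above, reaches this routine once a vnode has been
 * reclaimed:
 *
 *	if (vp->v_pollinfo != NULL)
 *		vn_pollgone(vp);
 *
 * so a thread sleeping in poll(2) on the vnode is woken up and
 * subsequently sees POLLHUP from the dead vnode operations.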
2864 */ 2865 void 2866 vn_pollgone(vp) 2867 struct vnode *vp; 2868 { 2869 2870 mtx_lock(&vp->v_pollinfo->vpi_lock); 2871 VN_KNOTE(vp, NOTE_REVOKE); 2872 if (vp->v_pollinfo->vpi_events) { 2873 vp->v_pollinfo->vpi_events = 0; 2874 selwakeup(&vp->v_pollinfo->vpi_selinfo); 2875 } 2876 mtx_unlock(&vp->v_pollinfo->vpi_lock); 2877 } 2878 2879 2880 2881 /* 2882 * Routine to create and manage a filesystem syncer vnode. 2883 */ 2884 #define sync_close ((int (*)(struct vop_close_args *))nullop) 2885 static int sync_fsync(struct vop_fsync_args *); 2886 static int sync_inactive(struct vop_inactive_args *); 2887 static int sync_reclaim(struct vop_reclaim_args *); 2888 #define sync_lock ((int (*)(struct vop_lock_args *))vop_nolock) 2889 #define sync_unlock ((int (*)(struct vop_unlock_args *))vop_nounlock) 2890 static int sync_print(struct vop_print_args *); 2891 #define sync_islocked ((int(*)(struct vop_islocked_args *))vop_noislocked) 2892 2893 static vop_t **sync_vnodeop_p; 2894 static struct vnodeopv_entry_desc sync_vnodeop_entries[] = { 2895 { &vop_default_desc, (vop_t *) vop_eopnotsupp }, 2896 { &vop_close_desc, (vop_t *) sync_close }, /* close */ 2897 { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */ 2898 { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */ 2899 { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */ 2900 { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */ 2901 { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */ 2902 { &vop_print_desc, (vop_t *) sync_print }, /* print */ 2903 { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */ 2904 { NULL, NULL } 2905 }; 2906 static struct vnodeopv_desc sync_vnodeop_opv_desc = 2907 { &sync_vnodeop_p, sync_vnodeop_entries }; 2908 2909 VNODEOP_SET(sync_vnodeop_opv_desc); 2910 2911 /* 2912 * Create a new filesystem syncer vnode for the specified mount point. 2913 */ 2914 int 2915 vfs_allocate_syncvnode(mp) 2916 struct mount *mp; 2917 { 2918 struct vnode *vp; 2919 static long start, incr, next; 2920 int error; 2921 2922 /* Allocate a new vnode */ 2923 if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { 2924 mp->mnt_syncer = NULL; 2925 return (error); 2926 } 2927 vp->v_type = VNON; 2928 /* 2929 * Place the vnode onto the syncer worklist. We attempt to 2930 * scatter them about on the list so that they will go off 2931 * at evenly distributed times even if all the filesystems 2932 * are mounted at once. 2933 */ 2934 next += incr; 2935 if (next == 0 || next > syncer_maxdelay) { 2936 start /= 2; 2937 incr /= 2; 2938 if (start == 0) { 2939 start = syncer_maxdelay / 2; 2940 incr = syncer_maxdelay; 2941 } 2942 next = start; 2943 } 2944 vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); 2945 mp->mnt_syncer = vp; 2946 return (0); 2947 } 2948 2949 /* 2950 * Do a lazy sync of the filesystem. 2951 */ 2952 static int 2953 sync_fsync(ap) 2954 struct vop_fsync_args /* { 2955 struct vnode *a_vp; 2956 struct ucred *a_cred; 2957 int a_waitfor; 2958 struct thread *a_td; 2959 } */ *ap; 2960 { 2961 struct vnode *syncvp = ap->a_vp; 2962 struct mount *mp = syncvp->v_mount; 2963 struct thread *td = ap->a_td; 2964 int asyncflag; 2965 2966 /* 2967 * We only need to do something if this is a lazy evaluation. 2968 */ 2969 if (ap->a_waitfor != MNT_LAZY) 2970 return (0); 2971 2972 /* 2973 * Move ourselves to the back of the sync list. 2974 */ 2975 vn_syncer_add_to_worklist(syncvp, syncdelay); 2976 2977 /* 2978 * Walk the list of vnodes pushing all that are dirty and 2979 * not already on the sync list. 
2980 */ 2981 mtx_lock(&mountlist_mtx); 2982 if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) { 2983 mtx_unlock(&mountlist_mtx); 2984 return (0); 2985 } 2986 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { 2987 vfs_unbusy(mp, td); 2988 return (0); 2989 } 2990 asyncflag = mp->mnt_flag & MNT_ASYNC; 2991 mp->mnt_flag &= ~MNT_ASYNC; 2992 vfs_msync(mp, MNT_NOWAIT); 2993 VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td); 2994 if (asyncflag) 2995 mp->mnt_flag |= MNT_ASYNC; 2996 vn_finished_write(mp); 2997 vfs_unbusy(mp, td); 2998 return (0); 2999 } 3000 3001 /* 3002 * The syncer vnode is no longer referenced. 3003 */ 3004 static int 3005 sync_inactive(ap) 3006 struct vop_inactive_args /* { 3007 struct vnode *a_vp; 3008 struct thread *a_td; 3009 } */ *ap; 3010 { 3011 3012 vgone(ap->a_vp); 3013 return (0); 3014 } 3015 3016 /* 3017 * The syncer vnode is no longer needed and is being decommissioned. 3018 * 3019 * Modifications to the worklist must be protected at splbio(). 3020 */ 3021 static int 3022 sync_reclaim(ap) 3023 struct vop_reclaim_args /* { 3024 struct vnode *a_vp; 3025 } */ *ap; 3026 { 3027 struct vnode *vp = ap->a_vp; 3028 int s; 3029 3030 s = splbio(); 3031 vp->v_mount->mnt_syncer = NULL; 3032 if (vp->v_flag & VONWORKLST) { 3033 LIST_REMOVE(vp, v_synclist); 3034 vp->v_flag &= ~VONWORKLST; 3035 } 3036 splx(s); 3037 3038 return (0); 3039 } 3040 3041 /* 3042 * Print out a syncer vnode. 3043 */ 3044 static int 3045 sync_print(ap) 3046 struct vop_print_args /* { 3047 struct vnode *a_vp; 3048 } */ *ap; 3049 { 3050 struct vnode *vp = ap->a_vp; 3051 3052 printf("syncer vnode"); 3053 if (vp->v_vnlock != NULL) 3054 lockmgr_printinfo(vp->v_vnlock); 3055 printf("\n"); 3056 return (0); 3057 } 3058 3059 /* 3060 * Extract the dev_t from a VCHR vnode. 3061 */ 3062 dev_t 3063 vn_todev(vp) 3064 struct vnode *vp; 3065 { 3066 if (vp->v_type != VCHR) 3067 return (NODEV); 3068 return (vp->v_rdev); 3069 } 3070 3071 /* 3072 * Check whether a vnode represents a disk device. 3073 */ 3074 int 3075 vn_isdisk(vp, errp) 3076 struct vnode *vp; 3077 int *errp; 3078 { 3079 struct cdevsw *cdevsw; 3080 3081 if (vp->v_type != VCHR) { 3082 if (errp != NULL) 3083 *errp = ENOTBLK; 3084 return (0); 3085 } 3086 if (vp->v_rdev == NULL) { 3087 if (errp != NULL) 3088 *errp = ENXIO; 3089 return (0); 3090 } 3091 cdevsw = devsw(vp->v_rdev); 3092 if (cdevsw == NULL) { 3093 if (errp != NULL) 3094 *errp = ENXIO; 3095 return (0); 3096 } 3097 if (!(cdevsw->d_flags & D_DISK)) { 3098 if (errp != NULL) 3099 *errp = ENOTBLK; 3100 return (0); 3101 } 3102 if (errp != NULL) 3103 *errp = 0; 3104 return (1); 3105 } 3106 3107 /* 3108 * Free data allocated by namei(); see namei(9) for details.
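 *
 * A minimal sketch of the usual calling sequence (the path, flags and
 * error handling here are illustrative, not taken from this file):
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, td);
 *	if ((error = namei(&nd)) != 0)
 *		return (error);
 *	NDFREE(&nd, NDF_ONLY_PNBUF);	/* free only the pathname buffer */
 *	...
 *	vput(nd.ni_vp);		/* LOCKLEAF returned ni_vp locked */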
3109 */ 3110 void 3111 NDFREE(ndp, flags) 3112 struct nameidata *ndp; 3113 const uint flags; 3114 { 3115 if (!(flags & NDF_NO_FREE_PNBUF) && 3116 (ndp->ni_cnd.cn_flags & HASBUF)) { 3117 uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); 3118 ndp->ni_cnd.cn_flags &= ~HASBUF; 3119 } 3120 if (!(flags & NDF_NO_DVP_UNLOCK) && 3121 (ndp->ni_cnd.cn_flags & LOCKPARENT) && 3122 ndp->ni_dvp != ndp->ni_vp) 3123 VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread); 3124 if (!(flags & NDF_NO_DVP_RELE) && 3125 (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { 3126 vrele(ndp->ni_dvp); 3127 ndp->ni_dvp = NULL; 3128 } 3129 if (!(flags & NDF_NO_VP_UNLOCK) && 3130 (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) 3131 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread); 3132 if (!(flags & NDF_NO_VP_RELE) && 3133 ndp->ni_vp) { 3134 vrele(ndp->ni_vp); 3135 ndp->ni_vp = NULL; 3136 } 3137 if (!(flags & NDF_NO_STARTDIR_RELE) && 3138 (ndp->ni_cnd.cn_flags & SAVESTART)) { 3139 vrele(ndp->ni_startdir); 3140 ndp->ni_startdir = NULL; 3141 } 3142 } 3143 3144 /* 3145 * Common filesystem object access control check routine. Accepts a 3146 * vnode's type, "mode", uid and gid, requested access mode, credentials, 3147 * and optional call-by-reference privused argument allowing vaccess() 3148 * to indicate to the caller whether privilege was used to satisfy the 3149 * request. Returns 0 on success, or an errno on failure. 3150 */ 3151 int 3152 vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused) 3153 enum vtype type; 3154 mode_t file_mode; 3155 uid_t file_uid; 3156 gid_t file_gid; 3157 mode_t acc_mode; 3158 struct ucred *cred; 3159 int *privused; 3160 { 3161 mode_t dac_granted; 3162 #ifdef CAPABILITIES 3163 mode_t cap_granted; 3164 #endif 3165 3166 /* 3167 * Look for a normal, non-privileged way to access the file/directory 3168 * as requested. If it exists, go with that. 3169 */ 3170 3171 if (privused != NULL) 3172 *privused = 0; 3173 3174 dac_granted = 0; 3175 3176 /* Check the owner. */ 3177 if (cred->cr_uid == file_uid) { 3178 dac_granted |= VADMIN; 3179 if (file_mode & S_IXUSR) 3180 dac_granted |= VEXEC; 3181 if (file_mode & S_IRUSR) 3182 dac_granted |= VREAD; 3183 if (file_mode & S_IWUSR) 3184 dac_granted |= VWRITE; 3185 3186 if ((acc_mode & dac_granted) == acc_mode) 3187 return (0); 3188 3189 goto privcheck; 3190 } 3191 3192 /* Otherwise, check the groups (first match) */ 3193 if (groupmember(file_gid, cred)) { 3194 if (file_mode & S_IXGRP) 3195 dac_granted |= VEXEC; 3196 if (file_mode & S_IRGRP) 3197 dac_granted |= VREAD; 3198 if (file_mode & S_IWGRP) 3199 dac_granted |= VWRITE; 3200 3201 if ((acc_mode & dac_granted) == acc_mode) 3202 return (0); 3203 3204 goto privcheck; 3205 } 3206 3207 /* Otherwise, check everyone else. */ 3208 if (file_mode & S_IXOTH) 3209 dac_granted |= VEXEC; 3210 if (file_mode & S_IROTH) 3211 dac_granted |= VREAD; 3212 if (file_mode & S_IWOTH) 3213 dac_granted |= VWRITE; 3214 if ((acc_mode & dac_granted) == acc_mode) 3215 return (0); 3216 3217 privcheck: 3218 if (!suser_cred(cred, PRISON_ROOT)) { 3219 /* XXX audit: privilege used */ 3220 if (privused != NULL) 3221 *privused = 1; 3222 return (0); 3223 } 3224 3225 #ifdef CAPABILITIES 3226 /* 3227 * Build a capability mask to determine if the set of capabilities 3228 * satisfies the requirements when combined with the granted mask 3229 * from above. 3230 * For each capability, if the capability is required, bitwise 3231 * or the request type onto the cap_granted mask. 
3232 */ 3233 cap_granted = 0; 3234 3235 if (type == VDIR) { 3236 /* 3237 * For directories, use CAP_DAC_READ_SEARCH to satisfy 3238 * VEXEC requests, instead of CAP_DAC_EXECUTE. 3239 */ 3240 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3241 !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) 3242 cap_granted |= VEXEC; 3243 } else { 3244 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3245 !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) 3246 cap_granted |= VEXEC; 3247 } 3248 3249 if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && 3250 !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) 3251 cap_granted |= VREAD; 3252 3253 if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && 3254 !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) 3255 cap_granted |= VWRITE; 3256 3257 if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && 3258 !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT)) 3259 cap_granted |= VADMIN; 3260 3261 if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) { 3262 /* XXX audit: privilege used */ 3263 if (privused != NULL) 3264 *privused = 1; 3265 return (0); 3266 } 3267 #endif 3268 3269 return ((acc_mode & VADMIN) ? EPERM : EACCES); 3270 } 3271
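
/*
 * Illustrative sketch (hypothetical filesystem code, not part of this
 * file): a filesystem's VOP_ACCESS implementation typically delegates
 * its permission check to vaccess() above, roughly as follows, where
 * `ip' stands for the filesystem's in-memory inode:
 *
 *	return (vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
 *	    ap->a_mode, ap->a_cred, NULL));
 */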