1 /* 2 * Copyright (c) 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 39 * $FreeBSD$ 40 */ 41 42 /* 43 * External virtual filesystem routines 44 */ 45 #include "opt_ddb.h" 46 #include "opt_ffs.h" 47 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/bio.h> 51 #include <sys/buf.h> 52 #include <sys/conf.h> 53 #include <sys/eventhandler.h> 54 #include <sys/fcntl.h> 55 #include <sys/kernel.h> 56 #include <sys/kthread.h> 57 #include <sys/malloc.h> 58 #include <sys/mount.h> 59 #include <sys/namei.h> 60 #include <sys/stat.h> 61 #include <sys/sysctl.h> 62 #include <sys/vmmeter.h> 63 #include <sys/vnode.h> 64 65 #include <vm/vm.h> 66 #include <vm/vm_object.h> 67 #include <vm/vm_extern.h> 68 #include <vm/pmap.h> 69 #include <vm/vm_map.h> 70 #include <vm/vm_page.h> 71 #include <vm/vm_zone.h> 72 73 static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); 74 75 static void addalias __P((struct vnode *vp, dev_t nvp_rdev)); 76 static void insmntque __P((struct vnode *vp, struct mount *mp)); 77 static void vclean __P((struct vnode *vp, int flags, struct thread *td)); 78 79 /* 80 * Number of vnodes in existence. Increased whenever getnewvnode() 81 * allocates a new vnode, never decreased. 
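 *
 * For reference, the counter is exported read-only by the sysctl declared
 * just below, so it can be watched from userland, e.g. (a sketch assuming
 * the stock sysctl(8) utility; the value shown is made up):
 *
 *      $ sysctl debug.numvnodes
 *      debug.numvnodes: 34182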
82 */ 83 static unsigned long numvnodes; 84 SYSCTL_LONG(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); 85 86 /* 87 * Conversion tables for conversion from vnode types to inode formats 88 * and back. 89 */ 90 enum vtype iftovt_tab[16] = { 91 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 92 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, 93 }; 94 int vttoif_tab[9] = { 95 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 96 S_IFSOCK, S_IFIFO, S_IFMT, 97 }; 98 99 /* 100 * List of vnodes that are ready for recycling. 101 */ 102 static TAILQ_HEAD(freelst, vnode) vnode_free_list; 103 104 /* 105 * Minimum number of free vnodes. If there are fewer than this free vnodes, 106 * getnewvnode() will return a newly allocated vnode. 107 */ 108 static u_long wantfreevnodes = 25; 109 SYSCTL_LONG(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); 110 /* Number of vnodes in the free list. */ 111 static u_long freevnodes = 0; 112 SYSCTL_LONG(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); 113 114 #if 0 115 /* Number of vnode allocation. */ 116 static u_long vnodeallocs = 0; 117 SYSCTL_LONG(_debug, OID_AUTO, vnodeallocs, CTLFLAG_RD, &vnodeallocs, 0, ""); 118 /* Period of vnode recycle from namecache in vnode allocation times. */ 119 static u_long vnoderecycleperiod = 1000; 120 SYSCTL_LONG(_debug, OID_AUTO, vnoderecycleperiod, CTLFLAG_RW, &vnoderecycleperiod, 0, ""); 121 /* Minimum number of total vnodes required to invoke vnode recycle from namecache. */ 122 static u_long vnoderecyclemintotalvn = 2000; 123 SYSCTL_LONG(_debug, OID_AUTO, vnoderecyclemintotalvn, CTLFLAG_RW, &vnoderecyclemintotalvn, 0, ""); 124 /* Minimum number of free vnodes required to invoke vnode recycle from namecache. */ 125 static u_long vnoderecycleminfreevn = 2000; 126 SYSCTL_LONG(_debug, OID_AUTO, vnoderecycleminfreevn, CTLFLAG_RW, &vnoderecycleminfreevn, 0, ""); 127 /* Number of vnodes attempted to recycle at a time. */ 128 static u_long vnoderecyclenumber = 3000; 129 SYSCTL_LONG(_debug, OID_AUTO, vnoderecyclenumber, CTLFLAG_RW, &vnoderecyclenumber, 0, ""); 130 #endif 131 132 /* 133 * Various variables used for debugging the new implementation of 134 * reassignbuf(). 135 * XXX these are probably of (very) limited utility now. 136 */ 137 static int reassignbufcalls; 138 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); 139 static int reassignbufloops; 140 SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, ""); 141 static int reassignbufsortgood; 142 SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, ""); 143 static int reassignbufsortbad; 144 SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, ""); 145 /* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */ 146 static int reassignbufmethod = 1; 147 SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, ""); 148 static int nameileafonly = 0; 149 SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, ""); 150 151 #ifdef ENABLE_VFS_IOOPT 152 /* See NOTES for a description of this setting. */ 153 int vfs_ioopt = 0; 154 SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); 155 #endif 156 157 /* List of mounted filesystems. 
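 *
 * Walks of this list must hold mountlist_mtx (declared below); the usual
 * idiom, mirroring vfs_getvfs() later in this file, is roughly:
 *
 *      mtx_lock(&mountlist_mtx);
 *      TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 *              ... examine mp ...
 *      }
 *      mtx_unlock(&mountlist_mtx);
 *
 * Traversals that work on each mount for longer instead vfs_busy() the
 * mount and drop mountlist_mtx, as the DDB lockedvnodes command near the
 * end of this file does.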
*/ 158 struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); 159 160 /* For any iteration/modification of mountlist */ 161 struct mtx mountlist_mtx; 162 163 /* For any iteration/modification of mnt_vnodelist */ 164 struct mtx mntvnode_mtx; 165 166 /* 167 * Cache for the mount type id assigned to NFS. This is used for 168 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c. 169 */ 170 int nfs_mount_type = -1; 171 172 /* To keep more than one thread at a time from running vfs_getnewfsid */ 173 static struct mtx mntid_mtx; 174 175 /* For any iteration/modification of vnode_free_list */ 176 static struct mtx vnode_free_list_mtx; 177 178 /* 179 * For any iteration/modification of dev->si_hlist (linked through 180 * v_specnext) 181 */ 182 static struct mtx spechash_mtx; 183 184 /* Publicly exported FS */ 185 struct nfs_public nfs_pub; 186 187 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ 188 static vm_zone_t vnode_zone; 189 190 /* Set to 1 to print out reclaim of active vnodes */ 191 int prtactive = 0; 192 193 /* 194 * The workitem queue. 195 * 196 * It is useful to delay writes of file data and filesystem metadata 197 * for tens of seconds so that quickly created and deleted files need 198 * not waste disk bandwidth being created and removed. To realize this, 199 * we append vnodes to a "workitem" queue. When running with a soft 200 * updates implementation, most pending metadata dependencies should 201 * not wait for more than a few seconds. Thus, mounted block devices 202 * are delayed only about half the time that file data is delayed. 203 * Similarly, directory updates are more critical, so are only delayed 204 * about a third the time that file data is delayed. Thus, there are 205 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of 206 * one each second (driven off the filesystem syncer process). The 207 * syncer_delayno variable indicates the next queue that is to be processed. 208 * Items that need to be processed soon are placed in this queue: 209 * 210 * syncer_workitem_pending[syncer_delayno] 211 * 212 * A delay of fifteen seconds is done by placing the request fifteen 213 * entries later in the queue: 214 * 215 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] 216 * 217 */ 218 static int syncer_delayno = 0; 219 static long syncer_mask; 220 LIST_HEAD(synclist, vnode); 221 static struct synclist *syncer_workitem_pending; 222 223 #define SYNCER_MAXDELAY 32 224 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ 225 static int syncdelay = 30; /* max time to delay syncing data */ 226 static int filedelay = 30; /* time to delay syncing files */ 227 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); 228 static int dirdelay = 29; /* time to delay syncing directories */ 229 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); 230 static int metadelay = 28; /* time to delay syncing metadata */ 231 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); 232 static int rushjob; /* number of slots to run ASAP */ 233 static int stat_rush_requests; /* number of times I/O speeded up */ 234 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); 235 236 /* 237 * Number of vnodes we want to exist at any one time. This is mostly used 238 * to size hash tables in vnode-related code. It is normally not used in 239 * getnewvnode(), as wantfreevnodes is normally nonzero.
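 *
 * The value is seeded in vntblinit() below as maxproc plus a quarter of
 * the physical page count, and is exported read-write as kern.maxvnodes,
 * so it can be inspected or retuned at run time, e.g. (a sketch assuming
 * the stock sysctl(8) utility; the figure is illustrative only):
 *
 *      $ sysctl kern.maxvnodes=65536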
240 * 241 * XXX desiredvnodes is historical cruft and should not exist. 242 */ 243 int desiredvnodes; 244 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, 245 &desiredvnodes, 0, "Maximum number of vnodes"); 246 static int minvnodes; 247 SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, 248 &minvnodes, 0, "Minimum number of vnodes"); 249 250 /* 251 * Initialize the vnode management data structures. 252 */ 253 static void 254 vntblinit(void *dummy __unused) 255 { 256 257 desiredvnodes = maxproc + cnt.v_page_count / 4; 258 minvnodes = desiredvnodes / 4; 259 mtx_init(&mountlist_mtx, "mountlist", MTX_DEF); 260 mtx_init(&mntvnode_mtx, "mntvnode", MTX_DEF); 261 mtx_init(&mntid_mtx, "mntid", MTX_DEF); 262 mtx_init(&spechash_mtx, "spechash", MTX_DEF); 263 TAILQ_INIT(&vnode_free_list); 264 mtx_init(&vnode_free_list_mtx, "vnode_free_list", MTX_DEF); 265 vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5); 266 /* 267 * Initialize the filesystem syncer. 268 */ 269 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, 270 &syncer_mask); 271 syncer_maxdelay = syncer_mask + 1; 272 } 273 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL) 274 275 276 /* 277 * Mark a mount point as busy. Used to synchronize access and to delay 278 * unmounting. Interlock is not released on failure. 279 */ 280 int 281 vfs_busy(mp, flags, interlkp, td) 282 struct mount *mp; 283 int flags; 284 struct mtx *interlkp; 285 struct thread *td; 286 { 287 int lkflags; 288 289 if (mp->mnt_kern_flag & MNTK_UNMOUNT) { 290 if (flags & LK_NOWAIT) 291 return (ENOENT); 292 mp->mnt_kern_flag |= MNTK_MWAIT; 293 /* 294 * Since all busy locks are shared except the exclusive 295 * lock granted when unmounting, the only place that a 296 * wakeup needs to be done is at the release of the 297 * exclusive lock at the end of dounmount. 298 */ 299 msleep((caddr_t)mp, interlkp, PVFS, "vfs_busy", 0); 300 return (ENOENT); 301 } 302 lkflags = LK_SHARED | LK_NOPAUSE; 303 if (interlkp) 304 lkflags |= LK_INTERLOCK; 305 if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td)) 306 panic("vfs_busy: unexpected lock failure"); 307 return (0); 308 } 309 310 /* 311 * Free a busy filesystem. 312 */ 313 void 314 vfs_unbusy(mp, td) 315 struct mount *mp; 316 struct thread *td; 317 { 318 319 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td); 320 } 321 322 /* 323 * Lookup a filesystem type, and if found allocate and initialize 324 * a mount structure for it. 325 * 326 * Devname is usually updated by mount(8) after booting. 
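 *
 * A hypothetical boot-time caller, mounting a UFS root read-only from a
 * device name, might look roughly like this (the filesystem and device
 * names are illustrative, not taken from this file):
 *
 *      struct mount *mp;
 *      int error;
 *
 *      error = vfs_rootmountalloc("ufs", "da0s1a", &mp);
 *      if (error == 0) {
 *              ... hand mp to the filesystem's VFS_MOUNT(), then
 *                  vfs_unbusy() it and insert it into mountlist ...
 *      }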
327 */ 328 int 329 vfs_rootmountalloc(fstypename, devname, mpp) 330 char *fstypename; 331 char *devname; 332 struct mount **mpp; 333 { 334 struct thread *td = curthread; /* XXX */ 335 struct vfsconf *vfsp; 336 struct mount *mp; 337 338 if (fstypename == NULL) 339 return (ENODEV); 340 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 341 if (!strcmp(vfsp->vfc_name, fstypename)) 342 break; 343 if (vfsp == NULL) 344 return (ENODEV); 345 mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); 346 lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); 347 (void)vfs_busy(mp, LK_NOWAIT, 0, td); 348 TAILQ_INIT(&mp->mnt_nvnodelist); 349 mp->mnt_vfc = vfsp; 350 mp->mnt_op = vfsp->vfc_vfsops; 351 mp->mnt_flag = MNT_RDONLY; 352 mp->mnt_vnodecovered = NULLVP; 353 vfsp->vfc_refcount++; 354 mp->mnt_iosize_max = DFLTPHYS; 355 mp->mnt_stat.f_type = vfsp->vfc_typenum; 356 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; 357 strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); 358 mp->mnt_stat.f_mntonname[0] = '/'; 359 mp->mnt_stat.f_mntonname[1] = 0; 360 (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); 361 *mpp = mp; 362 return (0); 363 } 364 365 /* 366 * Find an appropriate filesystem to use for the root. If a filesystem 367 * has not been preselected, walk through the list of known filesystems 368 * trying those that have mountroot routines, and try them until one 369 * works or we have tried them all. 370 */ 371 #ifdef notdef /* XXX JH */ 372 int 373 lite2_vfs_mountroot() 374 { 375 struct vfsconf *vfsp; 376 extern int (*lite2_mountroot) __P((void)); 377 int error; 378 379 if (lite2_mountroot != NULL) 380 return ((*lite2_mountroot)()); 381 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { 382 if (vfsp->vfc_mountroot == NULL) 383 continue; 384 if ((error = (*vfsp->vfc_mountroot)()) == 0) 385 return (0); 386 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); 387 } 388 return (ENODEV); 389 } 390 #endif 391 392 /* 393 * Lookup a mount point by filesystem identifier. 394 */ 395 struct mount * 396 vfs_getvfs(fsid) 397 fsid_t *fsid; 398 { 399 register struct mount *mp; 400 401 mtx_lock(&mountlist_mtx); 402 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 403 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && 404 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { 405 mtx_unlock(&mountlist_mtx); 406 return (mp); 407 } 408 } 409 mtx_unlock(&mountlist_mtx); 410 return ((struct mount *) 0); 411 } 412 413 /* 414 * Get a new unique fsid. Try to make its val[0] unique, since this value 415 * will be used to create fake device numbers for stat(). Also try (but 416 * not so hard) make its val[0] unique mod 2^16, since some emulators only 417 * support 16-bit device numbers. We end up with unique val[0]'s for the 418 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. 419 * 420 * Keep in mind that several mounts may be running in parallel. Starting 421 * the search one past where the previous search terminated is both a 422 * micro-optimization and a defense against returning the same fsid to 423 * different mounts. 
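 *
 * For reference, the loop below builds each candidate roughly as
 *
 *      tfsid.val[0] = makeudev(255,
 *          ((vfc_typenum & 0xFF) << 24) |
 *          ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
 *      tfsid.val[1] = vfc_typenum;
 *
 * so the filesystem type is folded into both halves of the fsid and only
 * the low byte of mntid_base lands in the low bits of the fake device
 * number, which is where the "first 2^8 calls" figure above comes from.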
424 */ 425 void 426 vfs_getnewfsid(mp) 427 struct mount *mp; 428 { 429 static u_int16_t mntid_base; 430 fsid_t tfsid; 431 int mtype; 432 433 mtx_lock(&mntid_mtx); 434 mtype = mp->mnt_vfc->vfc_typenum; 435 tfsid.val[1] = mtype; 436 mtype = (mtype & 0xFF) << 24; 437 for (;;) { 438 tfsid.val[0] = makeudev(255, 439 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); 440 mntid_base++; 441 if (vfs_getvfs(&tfsid) == NULL) 442 break; 443 } 444 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; 445 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; 446 mtx_unlock(&mntid_mtx); 447 } 448 449 /* 450 * Knob to control the precision of file timestamps: 451 * 452 * 0 = seconds only; nanoseconds zeroed. 453 * 1 = seconds and nanoseconds, accurate within 1/HZ. 454 * 2 = seconds and nanoseconds, truncated to microseconds. 455 * >=3 = seconds and nanoseconds, maximum precision. 456 */ 457 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; 458 459 static int timestamp_precision = TSP_SEC; 460 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, 461 &timestamp_precision, 0, ""); 462 463 /* 464 * Get a current timestamp. 465 */ 466 void 467 vfs_timestamp(tsp) 468 struct timespec *tsp; 469 { 470 struct timeval tv; 471 472 switch (timestamp_precision) { 473 case TSP_SEC: 474 tsp->tv_sec = time_second; 475 tsp->tv_nsec = 0; 476 break; 477 case TSP_HZ: 478 getnanotime(tsp); 479 break; 480 case TSP_USEC: 481 microtime(&tv); 482 TIMEVAL_TO_TIMESPEC(&tv, tsp); 483 break; 484 case TSP_NSEC: 485 default: 486 nanotime(tsp); 487 break; 488 } 489 } 490 491 /* 492 * Set vnode attributes to VNOVAL 493 */ 494 void 495 vattr_null(vap) 496 register struct vattr *vap; 497 { 498 499 vap->va_type = VNON; 500 vap->va_size = VNOVAL; 501 vap->va_bytes = VNOVAL; 502 vap->va_mode = VNOVAL; 503 vap->va_nlink = VNOVAL; 504 vap->va_uid = VNOVAL; 505 vap->va_gid = VNOVAL; 506 vap->va_fsid = VNOVAL; 507 vap->va_fileid = VNOVAL; 508 vap->va_blocksize = VNOVAL; 509 vap->va_rdev = VNOVAL; 510 vap->va_atime.tv_sec = VNOVAL; 511 vap->va_atime.tv_nsec = VNOVAL; 512 vap->va_mtime.tv_sec = VNOVAL; 513 vap->va_mtime.tv_nsec = VNOVAL; 514 vap->va_ctime.tv_sec = VNOVAL; 515 vap->va_ctime.tv_nsec = VNOVAL; 516 vap->va_flags = VNOVAL; 517 vap->va_gen = VNOVAL; 518 vap->va_vaflags = 0; 519 } 520 521 /* 522 * This routine is called when we have too many vnodes. It attempts 523 * to free <count> vnodes and will potentially free vnodes that still 524 * have VM backing store (VM backing store is typically the cause 525 * of a vnode blowout so we want to do this). Therefore, this operation 526 * is not considered cheap. 527 * 528 * A number of conditions may prevent a vnode from being reclaimed. 529 * The buffer cache may have references on the vnode, a directory 530 * vnode may still have references due to the namei cache representing 531 * underlying files, or the vnode may be in active use. It is not 532 * desirable to reuse such vnodes. These conditions may cause the 533 * number of vnodes to reach some minimum value regardless of what 534 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
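 *
 * The only caller in this file is getnewvnode(), which invokes it as
 *
 *      if (numvnodes - freevnodes > desiredvnodes)
 *              vlrureclaim(mp, 2);
 *
 * i.e. it trims only a couple of vnodes per allocation once the table has
 * grown past desiredvnodes, relying on later allocations to keep chipping
 * away rather than reclaiming in bulk.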
535 */ 536 static void 537 vlrureclaim(struct mount *mp, int count) 538 { 539 struct vnode *vp; 540 541 mtx_lock(&mntvnode_mtx); 542 while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) { 543 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 544 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 545 546 if (vp->v_type != VNON && 547 vp->v_type != VBAD && 548 VMIGHTFREE(vp) && /* critical path opt */ 549 mtx_trylock(&vp->v_interlock) 550 ) { 551 mtx_unlock(&mntvnode_mtx); 552 if (VMIGHTFREE(vp)) { 553 vgonel(vp, curthread); 554 } else { 555 mtx_unlock(&vp->v_interlock); 556 } 557 mtx_lock(&mntvnode_mtx); 558 } 559 --count; 560 } 561 mtx_unlock(&mntvnode_mtx); 562 } 563 564 /* 565 * Routines having to do with the management of the vnode table. 566 */ 567 568 /* 569 * Return the next vnode from the free list. 570 */ 571 int 572 getnewvnode(tag, mp, vops, vpp) 573 enum vtagtype tag; 574 struct mount *mp; 575 vop_t **vops; 576 struct vnode **vpp; 577 { 578 int s; 579 struct thread *td = curthread; /* XXX */ 580 struct vnode *vp = NULL; 581 struct mount *vnmp; 582 vm_object_t object; 583 584 s = splbio(); 585 /* 586 * Try to reuse vnodes if we hit the max. This situation only 587 * occurs in certain large-memory (2G+) situations. For the 588 * algorithm to be stable we have to try to reuse at least 2. 589 * No hysteresis should be necessary. 590 */ 591 if (numvnodes - freevnodes > desiredvnodes) 592 vlrureclaim(mp, 2); 593 594 /* 595 * Attempt to reuse a vnode already on the free list, allocating 596 * a new vnode if we can't find one or if we have not reached a 597 * good minimum for good LRU performance. 598 */ 599 600 mtx_lock(&vnode_free_list_mtx); 601 602 if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) { 603 int count; 604 605 for (count = 0; count < freevnodes; count++) { 606 vp = TAILQ_FIRST(&vnode_free_list); 607 if (vp == NULL || vp->v_usecount) 608 panic("getnewvnode: free vnode isn't"); 609 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 610 611 /* 612 * Don't recycle if we still have cached pages or if 613 * we cannot get the interlock. 614 */ 615 if ((VOP_GETVOBJECT(vp, &object) == 0 && 616 (object->resident_page_count || 617 object->ref_count)) || 618 !mtx_trylock(&vp->v_interlock)) { 619 TAILQ_INSERT_TAIL(&vnode_free_list, vp, 620 v_freelist); 621 vp = NULL; 622 continue; 623 } 624 if (LIST_FIRST(&vp->v_cache_src)) { 625 /* 626 * note: nameileafonly sysctl is temporary, 627 * for debugging only, and will eventually be 628 * removed. 629 */ 630 if (nameileafonly > 0) { 631 /* 632 * Do not reuse namei-cached directory 633 * vnodes that have cached 634 * subdirectories. 635 */ 636 if (cache_leaf_test(vp) < 0) { 637 mtx_unlock(&vp->v_interlock); 638 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 639 vp = NULL; 640 continue; 641 } 642 } else if (nameileafonly < 0 || 643 vmiodirenable == 0) { 644 /* 645 * Do not reuse namei-cached directory 646 * vnodes if nameileafonly is -1 or 647 * if VMIO backing for directories is 648 * turned off (otherwise we reuse them 649 * too quickly). 650 */ 651 mtx_unlock(&vp->v_interlock); 652 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 653 vp = NULL; 654 continue; 655 } 656 } 657 /* 658 * Skip over it if its filesystem is being suspended. 
659 */ 660 if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0) 661 break; 662 mtx_unlock(&vp->v_interlock); 663 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 664 vp = NULL; 665 } 666 } 667 if (vp) { 668 vp->v_flag |= VDOOMED; 669 vp->v_flag &= ~VFREE; 670 freevnodes--; 671 mtx_unlock(&vnode_free_list_mtx); 672 cache_purge(vp); 673 vp->v_lease = NULL; 674 if (vp->v_type != VBAD) { 675 vgonel(vp, td); 676 } else { 677 mtx_unlock(&vp->v_interlock); 678 } 679 vn_finished_write(vnmp); 680 681 #ifdef INVARIANTS 682 { 683 int s; 684 685 if (vp->v_data) 686 panic("cleaned vnode isn't"); 687 s = splbio(); 688 if (vp->v_numoutput) 689 panic("Clean vnode has pending I/O's"); 690 splx(s); 691 if (vp->v_writecount != 0) 692 panic("Non-zero write count"); 693 } 694 #endif 695 vp->v_flag = 0; 696 vp->v_lastw = 0; 697 vp->v_lasta = 0; 698 vp->v_cstart = 0; 699 vp->v_clen = 0; 700 vp->v_socket = 0; 701 } else { 702 mtx_unlock(&vnode_free_list_mtx); 703 vp = (struct vnode *) zalloc(vnode_zone); 704 bzero((char *) vp, sizeof *vp); 705 mtx_init(&vp->v_interlock, "vnode interlock", MTX_DEF); 706 vp->v_dd = vp; 707 mtx_init(&vp->v_pollinfo.vpi_lock, "vnode pollinfo", MTX_DEF); 708 cache_purge(vp); 709 LIST_INIT(&vp->v_cache_src); 710 TAILQ_INIT(&vp->v_cache_dst); 711 numvnodes++; 712 } 713 714 TAILQ_INIT(&vp->v_cleanblkhd); 715 TAILQ_INIT(&vp->v_dirtyblkhd); 716 vp->v_type = VNON; 717 vp->v_tag = tag; 718 vp->v_op = vops; 719 lockinit(&vp->v_lock, PVFS, "vnlock", 0, LK_NOPAUSE); 720 insmntque(vp, mp); 721 *vpp = vp; 722 vp->v_usecount = 1; 723 vp->v_data = 0; 724 725 splx(s); 726 727 vfs_object_create(vp, td, td->td_proc->p_ucred); 728 729 #if 0 730 vnodeallocs++; 731 if (vnodeallocs % vnoderecycleperiod == 0 && 732 freevnodes < vnoderecycleminfreevn && 733 vnoderecyclemintotalvn < numvnodes) { 734 /* Recycle vnodes. */ 735 cache_purgeleafdirs(vnoderecyclenumber); 736 } 737 #endif 738 739 return (0); 740 } 741 742 /* 743 * Move a vnode from one mount queue to another. 744 */ 745 static void 746 insmntque(vp, mp) 747 register struct vnode *vp; 748 register struct mount *mp; 749 { 750 751 mtx_lock(&mntvnode_mtx); 752 /* 753 * Delete from old mount point vnode list, if on one. 754 */ 755 if (vp->v_mount != NULL) 756 TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); 757 /* 758 * Insert into list of vnodes for the new mount point, if available. 759 */ 760 if ((vp->v_mount = mp) == NULL) { 761 mtx_unlock(&mntvnode_mtx); 762 return; 763 } 764 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 765 mtx_unlock(&mntvnode_mtx); 766 } 767 768 /* 769 * Update outstanding I/O count and do wakeup if requested. 770 */ 771 void 772 vwakeup(bp) 773 register struct buf *bp; 774 { 775 register struct vnode *vp; 776 777 bp->b_flags &= ~B_WRITEINPROG; 778 if ((vp = bp->b_vp)) { 779 vp->v_numoutput--; 780 if (vp->v_numoutput < 0) 781 panic("vwakeup: neg numoutput"); 782 if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { 783 vp->v_flag &= ~VBWAIT; 784 wakeup((caddr_t) &vp->v_numoutput); 785 } 786 } 787 } 788 789 /* 790 * Flush out and invalidate all buffers associated with a vnode. 791 * Called with the underlying object locked. 
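 *
 * A representative call, taken from vclean() later in this file, writes
 * everything back before reclaiming a vnode and falls back to discarding
 * the buffers if that fails:
 *
 *      if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0)
 *              vinvalbuf(vp, 0, NOCRED, td, 0, 0);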
792 */ 793 int 794 vinvalbuf(vp, flags, cred, td, slpflag, slptimeo) 795 register struct vnode *vp; 796 int flags; 797 struct ucred *cred; 798 struct thread *td; 799 int slpflag, slptimeo; 800 { 801 register struct buf *bp; 802 struct buf *nbp, *blist; 803 int s, error; 804 vm_object_t object; 805 806 GIANT_REQUIRED; 807 808 if (flags & V_SAVE) { 809 s = splbio(); 810 while (vp->v_numoutput) { 811 vp->v_flag |= VBWAIT; 812 error = tsleep((caddr_t)&vp->v_numoutput, 813 slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo); 814 if (error) { 815 splx(s); 816 return (error); 817 } 818 } 819 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { 820 splx(s); 821 if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0) 822 return (error); 823 s = splbio(); 824 if (vp->v_numoutput > 0 || 825 !TAILQ_EMPTY(&vp->v_dirtyblkhd)) 826 panic("vinvalbuf: dirty bufs"); 827 } 828 splx(s); 829 } 830 s = splbio(); 831 for (;;) { 832 blist = TAILQ_FIRST(&vp->v_cleanblkhd); 833 if (!blist) 834 blist = TAILQ_FIRST(&vp->v_dirtyblkhd); 835 if (!blist) 836 break; 837 838 for (bp = blist; bp; bp = nbp) { 839 nbp = TAILQ_NEXT(bp, b_vnbufs); 840 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 841 error = BUF_TIMELOCK(bp, 842 LK_EXCLUSIVE | LK_SLEEPFAIL, 843 "vinvalbuf", slpflag, slptimeo); 844 if (error == ENOLCK) 845 break; 846 splx(s); 847 return (error); 848 } 849 /* 850 * XXX Since there are no node locks for NFS, I 851 * believe there is a slight chance that a delayed 852 * write will occur while sleeping just above, so 853 * check for it. Note that vfs_bio_awrite expects 854 * buffers to reside on a queue, while BUF_WRITE and 855 * brelse do not. 856 */ 857 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 858 (flags & V_SAVE)) { 859 860 if (bp->b_vp == vp) { 861 if (bp->b_flags & B_CLUSTEROK) { 862 BUF_UNLOCK(bp); 863 vfs_bio_awrite(bp); 864 } else { 865 bremfree(bp); 866 bp->b_flags |= B_ASYNC; 867 BUF_WRITE(bp); 868 } 869 } else { 870 bremfree(bp); 871 (void) BUF_WRITE(bp); 872 } 873 break; 874 } 875 bremfree(bp); 876 bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF); 877 bp->b_flags &= ~B_ASYNC; 878 brelse(bp); 879 } 880 } 881 882 /* 883 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 884 * have write I/O in-progress but if there is a VM object then the 885 * VM object can also have read-I/O in-progress. 886 */ 887 do { 888 while (vp->v_numoutput > 0) { 889 vp->v_flag |= VBWAIT; 890 tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); 891 } 892 if (VOP_GETVOBJECT(vp, &object) == 0) { 893 while (object->paging_in_progress) 894 vm_object_pip_sleep(object, "vnvlbx"); 895 } 896 } while (vp->v_numoutput > 0); 897 898 splx(s); 899 900 /* 901 * Destroy the copy in the VM cache, too. 902 */ 903 mtx_lock(&vp->v_interlock); 904 if (VOP_GETVOBJECT(vp, &object) == 0) { 905 vm_object_page_remove(object, 0, 0, 906 (flags & V_SAVE) ? TRUE : FALSE); 907 } 908 mtx_unlock(&vp->v_interlock); 909 910 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd)) 911 panic("vinvalbuf: flush failed"); 912 return (0); 913 } 914 915 /* 916 * Truncate a file's buffer and pages to a specified length. This 917 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 918 * sync activity. 919 */ 920 int 921 vtruncbuf(vp, cred, td, length, blksize) 922 register struct vnode *vp; 923 struct ucred *cred; 924 struct thread *td; 925 off_t length; 926 int blksize; 927 { 928 register struct buf *bp; 929 struct buf *nbp; 930 int s, anyfreed; 931 int trunclbn; 932 933 /* 934 * Round up to the *next* lbn. 
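 * For example, with blksize 8192: length 0 gives trunclbn 0 (toss every
 * block), lengths 1 through 8192 give trunclbn 1 (keep only block 0), and
 * length 8193 gives trunclbn 2, matching (length + blksize - 1) / blksize.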
935 */ 936 trunclbn = (length + blksize - 1) / blksize; 937 938 s = splbio(); 939 restart: 940 anyfreed = 1; 941 for (;anyfreed;) { 942 anyfreed = 0; 943 for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 944 nbp = TAILQ_NEXT(bp, b_vnbufs); 945 if (bp->b_lblkno >= trunclbn) { 946 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 947 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); 948 goto restart; 949 } else { 950 bremfree(bp); 951 bp->b_flags |= (B_INVAL | B_RELBUF); 952 bp->b_flags &= ~B_ASYNC; 953 brelse(bp); 954 anyfreed = 1; 955 } 956 if (nbp && 957 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 958 (nbp->b_vp != vp) || 959 (nbp->b_flags & B_DELWRI))) { 960 goto restart; 961 } 962 } 963 } 964 965 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 966 nbp = TAILQ_NEXT(bp, b_vnbufs); 967 if (bp->b_lblkno >= trunclbn) { 968 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 969 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); 970 goto restart; 971 } else { 972 bremfree(bp); 973 bp->b_flags |= (B_INVAL | B_RELBUF); 974 bp->b_flags &= ~B_ASYNC; 975 brelse(bp); 976 anyfreed = 1; 977 } 978 if (nbp && 979 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 980 (nbp->b_vp != vp) || 981 (nbp->b_flags & B_DELWRI) == 0)) { 982 goto restart; 983 } 984 } 985 } 986 } 987 988 if (length > 0) { 989 restartsync: 990 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 991 nbp = TAILQ_NEXT(bp, b_vnbufs); 992 if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) { 993 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 994 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); 995 goto restart; 996 } else { 997 bremfree(bp); 998 if (bp->b_vp == vp) { 999 bp->b_flags |= B_ASYNC; 1000 } else { 1001 bp->b_flags &= ~B_ASYNC; 1002 } 1003 BUF_WRITE(bp); 1004 } 1005 goto restartsync; 1006 } 1007 1008 } 1009 } 1010 1011 while (vp->v_numoutput > 0) { 1012 vp->v_flag |= VBWAIT; 1013 tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0); 1014 } 1015 1016 splx(s); 1017 1018 vnode_pager_setsize(vp, length); 1019 1020 return (0); 1021 } 1022 1023 /* 1024 * Associate a buffer with a vnode. 1025 */ 1026 void 1027 bgetvp(vp, bp) 1028 register struct vnode *vp; 1029 register struct buf *bp; 1030 { 1031 int s; 1032 1033 KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); 1034 1035 vhold(vp); 1036 bp->b_vp = vp; 1037 bp->b_dev = vn_todev(vp); 1038 /* 1039 * Insert onto list for new vnode. 1040 */ 1041 s = splbio(); 1042 bp->b_xflags |= BX_VNCLEAN; 1043 bp->b_xflags &= ~BX_VNDIRTY; 1044 TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs); 1045 splx(s); 1046 } 1047 1048 /* 1049 * Disassociate a buffer from a vnode. 1050 */ 1051 void 1052 brelvp(bp) 1053 register struct buf *bp; 1054 { 1055 struct vnode *vp; 1056 struct buflists *listheadp; 1057 int s; 1058 1059 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 1060 1061 /* 1062 * Delete from old vnode list, if on one. 1063 */ 1064 vp = bp->b_vp; 1065 s = splbio(); 1066 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { 1067 if (bp->b_xflags & BX_VNDIRTY) 1068 listheadp = &vp->v_dirtyblkhd; 1069 else 1070 listheadp = &vp->v_cleanblkhd; 1071 TAILQ_REMOVE(listheadp, bp, b_vnbufs); 1072 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 1073 } 1074 if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) { 1075 vp->v_flag &= ~VONWORKLST; 1076 LIST_REMOVE(vp, v_synclist); 1077 } 1078 splx(s); 1079 bp->b_vp = (struct vnode *) 0; 1080 vdrop(vp); 1081 } 1082 1083 /* 1084 * Add an item to the syncer work queue. 
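 *
 * The vnode is hashed into one of the syncer_workitem_pending[] slots
 * described earlier; with the default SYNCER_MAXDELAY of 32, a call like
 * the one reassignbuf() ends up making for regular files,
 *
 *      vn_syncer_add_to_worklist(newvp, filedelay);    (filedelay is 30)
 *
 * lands the vnode 30 slots past syncer_delayno (modulo syncer_mask + 1),
 * so the syncer kthread reaches it roughly 30 seconds later. Delays
 * larger than syncer_maxdelay - 2 are clamped below.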
1085 */ 1086 static void 1087 vn_syncer_add_to_worklist(struct vnode *vp, int delay) 1088 { 1089 int s, slot; 1090 1091 s = splbio(); 1092 1093 if (vp->v_flag & VONWORKLST) { 1094 LIST_REMOVE(vp, v_synclist); 1095 } 1096 1097 if (delay > syncer_maxdelay - 2) 1098 delay = syncer_maxdelay - 2; 1099 slot = (syncer_delayno + delay) & syncer_mask; 1100 1101 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist); 1102 vp->v_flag |= VONWORKLST; 1103 splx(s); 1104 } 1105 1106 struct proc *updateproc; 1107 static void sched_sync __P((void)); 1108 static struct kproc_desc up_kp = { 1109 "syncer", 1110 sched_sync, 1111 &updateproc 1112 }; 1113 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) 1114 1115 /* 1116 * System filesystem synchronizer daemon. 1117 */ 1118 void 1119 sched_sync(void) 1120 { 1121 struct synclist *slp; 1122 struct vnode *vp; 1123 struct mount *mp; 1124 long starttime; 1125 int s; 1126 struct thread *td = &updateproc->p_thread; /* XXXKSE */ 1127 1128 mtx_lock(&Giant); 1129 1130 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, td->td_proc, 1131 SHUTDOWN_PRI_LAST); 1132 1133 for (;;) { 1134 kthread_suspend_check(td->td_proc); 1135 1136 starttime = time_second; 1137 1138 /* 1139 * Push files whose dirty time has expired. Be careful 1140 * of interrupt race on slp queue. 1141 */ 1142 s = splbio(); 1143 slp = &syncer_workitem_pending[syncer_delayno]; 1144 syncer_delayno += 1; 1145 if (syncer_delayno == syncer_maxdelay) 1146 syncer_delayno = 0; 1147 splx(s); 1148 1149 while ((vp = LIST_FIRST(slp)) != NULL) { 1150 if (VOP_ISLOCKED(vp, NULL) == 0 && 1151 vn_start_write(vp, &mp, V_NOWAIT) == 0) { 1152 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 1153 (void) VOP_FSYNC(vp, td->td_proc->p_ucred, MNT_LAZY, td); 1154 VOP_UNLOCK(vp, 0, td); 1155 vn_finished_write(mp); 1156 } 1157 s = splbio(); 1158 if (LIST_FIRST(slp) == vp) { 1159 /* 1160 * Note: v_tag VT_VFS vps can remain on the 1161 * worklist too with no dirty blocks, but 1162 * since sync_fsync() moves it to a different 1163 * slot we are safe. 1164 */ 1165 if (TAILQ_EMPTY(&vp->v_dirtyblkhd) && 1166 !vn_isdisk(vp, NULL)) 1167 panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag); 1168 /* 1169 * Put us back on the worklist. The worklist 1170 * routine will remove us from our current 1171 * position and then add us back in at a later 1172 * position. 1173 */ 1174 vn_syncer_add_to_worklist(vp, syncdelay); 1175 } 1176 splx(s); 1177 } 1178 1179 /* 1180 * Do soft update processing. 1181 */ 1182 #ifdef SOFTUPDATES 1183 softdep_process_worklist(NULL); 1184 #endif 1185 1186 /* 1187 * The variable rushjob allows the kernel to speed up the 1188 * processing of the filesystem syncer process. A rushjob 1189 * value of N tells the filesystem syncer to process the next 1190 * N seconds worth of work on its queue ASAP. Currently rushjob 1191 * is used by the soft update code to speed up the filesystem 1192 * syncer process when the incore state is getting so far 1193 * ahead of the disk that the kernel memory pool is being 1194 * threatened with exhaustion. 1195 */ 1196 if (rushjob > 0) { 1197 rushjob -= 1; 1198 continue; 1199 } 1200 /* 1201 * If it has taken us less than a second to process the 1202 * current work, then wait. Otherwise start right over 1203 * again. We can still lose time if any single round 1204 * takes more than two seconds, but it does not really 1205 * matter as we are just trying to generally pace the 1206 * filesystem activity. 
1207 */ 1208 if (time_second == starttime) 1209 tsleep(&lbolt, PPAUSE, "syncer", 0); 1210 } 1211 } 1212 1213 /* 1214 * Request the syncer daemon to speed up its work. 1215 * We never push it to speed up more than half of its 1216 * normal turn time, otherwise it could take over the cpu. 1217 * XXXKSE only one update? 1218 */ 1219 int 1220 speedup_syncer() 1221 { 1222 1223 mtx_lock_spin(&sched_lock); 1224 if (updateproc->p_thread.td_wchan == &lbolt) /* XXXKSE */ 1225 setrunnable(&updateproc->p_thread); 1226 mtx_unlock_spin(&sched_lock); 1227 if (rushjob < syncdelay / 2) { 1228 rushjob += 1; 1229 stat_rush_requests += 1; 1230 return (1); 1231 } 1232 return(0); 1233 } 1234 1235 /* 1236 * Associate a p-buffer with a vnode. 1237 * 1238 * Also sets B_PAGING flag to indicate that vnode is not fully associated 1239 * with the buffer. i.e. the bp has not been linked into the vnode or 1240 * ref-counted. 1241 */ 1242 void 1243 pbgetvp(vp, bp) 1244 register struct vnode *vp; 1245 register struct buf *bp; 1246 { 1247 1248 KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); 1249 1250 bp->b_vp = vp; 1251 bp->b_flags |= B_PAGING; 1252 bp->b_dev = vn_todev(vp); 1253 } 1254 1255 /* 1256 * Disassociate a p-buffer from a vnode. 1257 */ 1258 void 1259 pbrelvp(bp) 1260 register struct buf *bp; 1261 { 1262 1263 KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); 1264 1265 /* XXX REMOVE ME */ 1266 if (TAILQ_NEXT(bp, b_vnbufs) != NULL) { 1267 panic( 1268 "relpbuf(): b_vp was probably reassignbuf()d %p %x", 1269 bp, 1270 (int)bp->b_flags 1271 ); 1272 } 1273 bp->b_vp = (struct vnode *) 0; 1274 bp->b_flags &= ~B_PAGING; 1275 } 1276 1277 /* 1278 * Change the vnode a pager buffer is associated with. 1279 */ 1280 void 1281 pbreassignbuf(bp, newvp) 1282 struct buf *bp; 1283 struct vnode *newvp; 1284 { 1285 1286 KASSERT(bp->b_flags & B_PAGING, 1287 ("pbreassignbuf() on non phys bp %p", bp)); 1288 bp->b_vp = newvp; 1289 } 1290 1291 /* 1292 * Reassign a buffer from one vnode to another. 1293 * Used to assign file specific control information 1294 * (indirect blocks) to the vnode to which they belong. 1295 */ 1296 void 1297 reassignbuf(bp, newvp) 1298 register struct buf *bp; 1299 register struct vnode *newvp; 1300 { 1301 struct buflists *listheadp; 1302 int delay; 1303 int s; 1304 1305 if (newvp == NULL) { 1306 printf("reassignbuf: NULL"); 1307 return; 1308 } 1309 ++reassignbufcalls; 1310 1311 /* 1312 * B_PAGING flagged buffers cannot be reassigned because their vp 1313 * is not fully linked in. 1314 */ 1315 if (bp->b_flags & B_PAGING) 1316 panic("cannot reassign paging buffer"); 1317 1318 s = splbio(); 1319 /* 1320 * Delete from old vnode list, if on one. 1321 */ 1322 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { 1323 if (bp->b_xflags & BX_VNDIRTY) 1324 listheadp = &bp->b_vp->v_dirtyblkhd; 1325 else 1326 listheadp = &bp->b_vp->v_cleanblkhd; 1327 TAILQ_REMOVE(listheadp, bp, b_vnbufs); 1328 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 1329 if (bp->b_vp != newvp) { 1330 vdrop(bp->b_vp); 1331 bp->b_vp = NULL; /* for clarification */ 1332 } 1333 } 1334 /* 1335 * If dirty, put on list of dirty buffers; otherwise insert onto list 1336 * of clean buffers. 
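 *
 * The dirty list is kept loosely sorted: data buffers (b_lblkno >= 0)
 * roughly in ascending logical block order at the front, with indirect
 * and other negative-lblkno buffers gathered at the tail, so a list might
 * end up looking like
 *
 *      0, 1, 2, 5, 9, -1, -2
 *
 * which lets the syncer and the clustering code sweep the data blocks in
 * something close to disk order before the metadata that describes them.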
1337 */ 1338 if (bp->b_flags & B_DELWRI) { 1339 struct buf *tbp; 1340 1341 listheadp = &newvp->v_dirtyblkhd; 1342 if ((newvp->v_flag & VONWORKLST) == 0) { 1343 switch (newvp->v_type) { 1344 case VDIR: 1345 delay = dirdelay; 1346 break; 1347 case VCHR: 1348 if (newvp->v_rdev->si_mountpoint != NULL) { 1349 delay = metadelay; 1350 break; 1351 } 1352 /* fall through */ 1353 default: 1354 delay = filedelay; 1355 } 1356 vn_syncer_add_to_worklist(newvp, delay); 1357 } 1358 bp->b_xflags |= BX_VNDIRTY; 1359 tbp = TAILQ_FIRST(listheadp); 1360 if (tbp == NULL || 1361 bp->b_lblkno == 0 || 1362 (bp->b_lblkno > 0 && tbp->b_lblkno < 0) || 1363 (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) { 1364 TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); 1365 ++reassignbufsortgood; 1366 } else if (bp->b_lblkno < 0) { 1367 TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); 1368 ++reassignbufsortgood; 1369 } else if (reassignbufmethod == 1) { 1370 /* 1371 * New sorting algorithm, only handle sequential case, 1372 * otherwise append to end (but before metadata) 1373 */ 1374 if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL && 1375 (tbp->b_xflags & BX_VNDIRTY)) { 1376 /* 1377 * Found the best place to insert the buffer 1378 */ 1379 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); 1380 ++reassignbufsortgood; 1381 } else { 1382 /* 1383 * Missed, append to end, but before meta-data. 1384 * We know that the head buffer in the list is 1385 * not meta-data due to prior conditionals. 1386 * 1387 * Indirect effects: NFS second stage write 1388 * tends to wind up here, giving maximum 1389 * distance between the unstable write and the 1390 * commit rpc. 1391 */ 1392 tbp = TAILQ_LAST(listheadp, buflists); 1393 while (tbp && tbp->b_lblkno < 0) 1394 tbp = TAILQ_PREV(tbp, buflists, b_vnbufs); 1395 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); 1396 ++reassignbufsortbad; 1397 } 1398 } else { 1399 /* 1400 * Old sorting algorithm, scan queue and insert 1401 */ 1402 struct buf *ttbp; 1403 while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && 1404 (ttbp->b_lblkno < bp->b_lblkno)) { 1405 ++reassignbufloops; 1406 tbp = ttbp; 1407 } 1408 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); 1409 } 1410 } else { 1411 bp->b_xflags |= BX_VNCLEAN; 1412 TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); 1413 if ((newvp->v_flag & VONWORKLST) && 1414 TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { 1415 newvp->v_flag &= ~VONWORKLST; 1416 LIST_REMOVE(newvp, v_synclist); 1417 } 1418 } 1419 if (bp->b_vp != newvp) { 1420 bp->b_vp = newvp; 1421 vhold(bp->b_vp); 1422 } 1423 splx(s); 1424 } 1425 1426 /* 1427 * Create a vnode for a device. 1428 * Used for mounting the root file system. 1429 */ 1430 int 1431 bdevvp(dev, vpp) 1432 dev_t dev; 1433 struct vnode **vpp; 1434 { 1435 register struct vnode *vp; 1436 struct vnode *nvp; 1437 int error; 1438 1439 if (dev == NODEV) { 1440 *vpp = NULLVP; 1441 return (ENXIO); 1442 } 1443 if (vfinddev(dev, VCHR, vpp)) 1444 return (0); 1445 error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp); 1446 if (error) { 1447 *vpp = NULLVP; 1448 return (error); 1449 } 1450 vp = nvp; 1451 vp->v_type = VCHR; 1452 addalias(vp, dev); 1453 *vpp = vp; 1454 return (0); 1455 } 1456 1457 /* 1458 * Add vnode to the alias list hung off the dev_t. 1459 * 1460 * The reason for this gunk is that multiple vnodes can reference 1461 * the same physical device, so checking vp->v_usecount to see 1462 * how many users there are is inadequate; the v_usecount for 1463 * the vnodes need to be accumulated. vcount() does that. 
1464 */ 1465 struct vnode * 1466 addaliasu(nvp, nvp_rdev) 1467 struct vnode *nvp; 1468 udev_t nvp_rdev; 1469 { 1470 struct vnode *ovp; 1471 vop_t **ops; 1472 dev_t dev; 1473 1474 if (nvp->v_type == VBLK) 1475 return (nvp); 1476 if (nvp->v_type != VCHR) 1477 panic("addaliasu on non-special vnode"); 1478 dev = udev2dev(nvp_rdev, 0); 1479 /* 1480 * Check to see if we have a bdevvp vnode with no associated 1481 * filesystem. If so, we want to associate the filesystem of 1482 * the new vnode with the bdevvp vnode and 1483 * discard the newly created vnode rather than leaving the 1484 * bdevvp vnode lying around with no associated filesystem. 1485 */ 1486 if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) { 1487 addalias(nvp, dev); 1488 return (nvp); 1489 } 1490 /* 1491 * Discard unneeded vnode, but save its node specific data. 1492 * Note that if there is a lock, it is carried over in the 1493 * node specific data to the replacement vnode. 1494 */ 1495 vref(ovp); 1496 ovp->v_data = nvp->v_data; 1497 ovp->v_tag = nvp->v_tag; 1498 nvp->v_data = NULL; 1499 lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg, 1500 nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK); 1501 if (nvp->v_vnlock) 1502 ovp->v_vnlock = &ovp->v_lock; 1503 ops = ovp->v_op; 1504 ovp->v_op = nvp->v_op; 1505 if (VOP_ISLOCKED(nvp, curthread)) { 1506 VOP_UNLOCK(nvp, 0, curthread); 1507 vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curthread); 1508 } 1509 nvp->v_op = ops; 1510 insmntque(ovp, nvp->v_mount); 1511 vrele(nvp); 1512 vgone(nvp); 1513 return (ovp); 1514 } 1515 1516 /* This is a local helper function that does the same as addaliasu(), but for a 1517 * dev_t instead of a udev_t. */ 1518 static void 1519 addalias(nvp, dev) 1520 struct vnode *nvp; 1521 dev_t dev; 1522 { 1523 1524 KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode")); 1525 nvp->v_rdev = dev; 1526 mtx_lock(&spechash_mtx); 1527 SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext); 1528 mtx_unlock(&spechash_mtx); 1529 } 1530 1531 /* 1532 * Grab a particular vnode from the free list, increment its 1533 * reference count and lock it. The vnode lock bit is set if the 1534 * vnode is being eliminated in vgone. The process is awakened 1535 * when the transition is completed, and an error returned to 1536 * indicate that the vnode is no longer usable (possibly having 1537 * been changed to a new file system type). 1538 */ 1539 int 1540 vget(vp, flags, td) 1541 register struct vnode *vp; 1542 int flags; 1543 struct thread *td; 1544 { 1545 int error; 1546 1547 /* 1548 * If the vnode is in the process of being cleaned out for 1549 * another use, we wait for the cleaning to finish and then 1550 * return failure. Cleaning is determined by checking that 1551 * the VXLOCK flag is set. 1552 */ 1553 if ((flags & LK_INTERLOCK) == 0) 1554 mtx_lock(&vp->v_interlock); 1555 if (vp->v_flag & VXLOCK) { 1556 if (vp->v_vxproc == curthread) { 1557 printf("VXLOCK interlock avoided\n"); 1558 } else { 1559 vp->v_flag |= VXWANT; 1560 msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP, 1561 "vget", 0); 1562 return (ENOENT); 1563 } 1564 } 1565 1566 vp->v_usecount++; 1567 1568 if (VSHOULDBUSY(vp)) 1569 vbusy(vp); 1570 if (flags & LK_TYPE_MASK) { 1571 if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) { 1572 /* 1573 * must expand vrele here because we do not want 1574 * to call VOP_INACTIVE if the reference count 1575 * drops back to zero since it was never really 1576 * active.
We must remove it from the free list 1577 * before sleeping so that multiple processes do 1578 * not try to recycle it. 1579 */ 1580 mtx_lock(&vp->v_interlock); 1581 vp->v_usecount--; 1582 if (VSHOULDFREE(vp)) 1583 vfree(vp); 1584 mtx_unlock(&vp->v_interlock); 1585 } 1586 return (error); 1587 } 1588 mtx_unlock(&vp->v_interlock); 1589 return (0); 1590 } 1591 1592 /* 1593 * Increase the reference count of a vnode. 1594 */ 1595 void 1596 vref(struct vnode *vp) 1597 { 1598 mtx_lock(&vp->v_interlock); 1599 vp->v_usecount++; 1600 mtx_unlock(&vp->v_interlock); 1601 } 1602 1603 /* 1604 * Vnode put/release. 1605 * If count drops to zero, call inactive routine and return to freelist. 1606 */ 1607 void 1608 vrele(vp) 1609 struct vnode *vp; 1610 { 1611 struct thread *td = curthread; /* XXX */ 1612 1613 KASSERT(vp != NULL, ("vrele: null vp")); 1614 1615 mtx_lock(&vp->v_interlock); 1616 1617 /* Skip this v_writecount check if we're going to panic below. */ 1618 KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, 1619 ("vrele: missed vn_close")); 1620 1621 if (vp->v_usecount > 1) { 1622 1623 vp->v_usecount--; 1624 mtx_unlock(&vp->v_interlock); 1625 1626 return; 1627 } 1628 1629 if (vp->v_usecount == 1) { 1630 vp->v_usecount--; 1631 if (VSHOULDFREE(vp)) 1632 vfree(vp); 1633 /* 1634 * If we are doing a vput, the node is already locked, and we must 1635 * call VOP_INACTIVE with the node locked. So, in the case of 1636 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. 1637 */ 1638 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) { 1639 VOP_INACTIVE(vp, td); 1640 } 1641 1642 } else { 1643 #ifdef DIAGNOSTIC 1644 vprint("vrele: negative ref count", vp); 1645 mtx_unlock(&vp->v_interlock); 1646 #endif 1647 panic("vrele: negative ref cnt"); 1648 } 1649 } 1650 1651 /* 1652 * Release an already locked vnode. This gives the same effect as 1653 * unlock+vrele(), but takes less time and avoids releasing and 1654 * re-acquiring the lock (as vrele() acquires the lock internally). 1655 */ 1656 void 1657 vput(vp) 1658 struct vnode *vp; 1659 { 1660 struct thread *td = curthread; /* XXX */ 1661 1662 GIANT_REQUIRED; 1663 1664 KASSERT(vp != NULL, ("vput: null vp")); 1665 mtx_lock(&vp->v_interlock); 1666 /* Skip this v_writecount check if we're going to panic below. */ 1667 KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, 1668 ("vput: missed vn_close")); 1669 1670 if (vp->v_usecount > 1) { 1671 vp->v_usecount--; 1672 VOP_UNLOCK(vp, LK_INTERLOCK, td); 1673 return; 1674 } 1675 1676 if (vp->v_usecount == 1) { 1677 vp->v_usecount--; 1678 if (VSHOULDFREE(vp)) 1679 vfree(vp); 1680 /* 1681 * If we are doing a vput, the node is already locked, and we must 1682 * call VOP_INACTIVE with the node locked. So, in the case of 1683 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. 1684 */ 1685 mtx_unlock(&vp->v_interlock); 1686 VOP_INACTIVE(vp, td); 1687 1688 } else { 1689 #ifdef DIAGNOSTIC 1690 vprint("vput: negative ref count", vp); 1691 #endif 1692 panic("vput: negative ref cnt"); 1693 } 1694 } 1695 1696 /* 1697 * Somebody doesn't want the vnode recycled. 1698 */ 1699 void 1700 vhold(vp) 1701 register struct vnode *vp; 1702 { 1703 int s; 1704 1705 s = splbio(); 1706 vp->v_holdcnt++; 1707 if (VSHOULDBUSY(vp)) 1708 vbusy(vp); 1709 splx(s); 1710 } 1711 1712 /* 1713 * Note that there is one less who cares about this vnode. vdrop() is the 1714 * opposite of vhold().
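 *
 * The hold count (v_holdcnt) is separate from the use count: it only keeps
 * the vnode off the free list while some other subsystem still points at
 * it. The buffer code is the typical client; bgetvp() above does
 *
 *      vhold(vp);
 *      bp->b_vp = vp;
 *
 * and brelvp() undoes the association with vdrop() once the buffer is
 * released.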
1715 */ 1716 void 1717 vdrop(vp) 1718 register struct vnode *vp; 1719 { 1720 int s; 1721 1722 s = splbio(); 1723 if (vp->v_holdcnt <= 0) 1724 panic("vdrop: holdcnt"); 1725 vp->v_holdcnt--; 1726 if (VSHOULDFREE(vp)) 1727 vfree(vp); 1728 splx(s); 1729 } 1730 1731 /* 1732 * Remove any vnodes in the vnode table belonging to mount point mp. 1733 * 1734 * If FORCECLOSE is not specified, there should not be any active ones, 1735 * return error if any are found (nb: this is a user error, not a 1736 * system error). If FORCECLOSE is specified, detach any active vnodes 1737 * that are found. 1738 * 1739 * If WRITECLOSE is set, only flush out regular file vnodes open for 1740 * writing. 1741 * 1742 * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped. 1743 * 1744 * `rootrefs' specifies the base reference count for the root vnode 1745 * of this filesystem. The root vnode is considered busy if its 1746 * v_usecount exceeds this value. On a successful return, vflush() 1747 * will call vrele() on the root vnode exactly rootrefs times. 1748 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 1749 * be zero. 1750 */ 1751 #ifdef DIAGNOSTIC 1752 static int busyprt = 0; /* print out busy vnodes */ 1753 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); 1754 #endif 1755 1756 int 1757 vflush(mp, rootrefs, flags) 1758 struct mount *mp; 1759 int rootrefs; 1760 int flags; 1761 { 1762 struct thread *td = curthread; /* XXX */ 1763 struct vnode *vp, *nvp, *rootvp = NULL; 1764 int busy = 0, error; 1765 1766 if (rootrefs > 0) { 1767 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 1768 ("vflush: bad args")); 1769 /* 1770 * Get the filesystem root vnode. We can vput() it 1771 * immediately, since with rootrefs > 0, it won't go away. 1772 */ 1773 if ((error = VFS_ROOT(mp, &rootvp)) != 0) 1774 return (error); 1775 vput(rootvp); 1776 } 1777 mtx_lock(&mntvnode_mtx); 1778 loop: 1779 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) { 1780 /* 1781 * Make sure this vnode wasn't reclaimed in getnewvnode(). 1782 * Start over if it has (it won't be on the list anymore). 1783 */ 1784 if (vp->v_mount != mp) 1785 goto loop; 1786 nvp = TAILQ_NEXT(vp, v_nmntvnodes); 1787 1788 mtx_unlock(&mntvnode_mtx); 1789 mtx_lock(&vp->v_interlock); 1790 /* 1791 * Skip over a vnodes marked VSYSTEM. 1792 */ 1793 if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { 1794 mtx_unlock(&vp->v_interlock); 1795 mtx_lock(&mntvnode_mtx); 1796 continue; 1797 } 1798 /* 1799 * If WRITECLOSE is set, only flush out regular file vnodes 1800 * open for writing. 1801 */ 1802 if ((flags & WRITECLOSE) && 1803 (vp->v_writecount == 0 || vp->v_type != VREG)) { 1804 mtx_unlock(&vp->v_interlock); 1805 mtx_lock(&mntvnode_mtx); 1806 continue; 1807 } 1808 1809 /* 1810 * With v_usecount == 0, all we need to do is clear out the 1811 * vnode data structures and we are done. 1812 */ 1813 if (vp->v_usecount == 0) { 1814 vgonel(vp, td); 1815 mtx_lock(&mntvnode_mtx); 1816 continue; 1817 } 1818 1819 /* 1820 * If FORCECLOSE is set, forcibly close the vnode. For block 1821 * or character devices, revert to an anonymous device. For 1822 * all other files, just kill them. 
1823 */ 1824 if (flags & FORCECLOSE) { 1825 if (vp->v_type != VCHR) { 1826 vgonel(vp, td); 1827 } else { 1828 vclean(vp, 0, td); 1829 vp->v_op = spec_vnodeop_p; 1830 insmntque(vp, (struct mount *) 0); 1831 } 1832 mtx_lock(&mntvnode_mtx); 1833 continue; 1834 } 1835 #ifdef DIAGNOSTIC 1836 if (busyprt) 1837 vprint("vflush: busy vnode", vp); 1838 #endif 1839 mtx_unlock(&vp->v_interlock); 1840 mtx_lock(&mntvnode_mtx); 1841 busy++; 1842 } 1843 mtx_unlock(&mntvnode_mtx); 1844 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 1845 /* 1846 * If just the root vnode is busy, and if its refcount 1847 * is equal to `rootrefs', then go ahead and kill it. 1848 */ 1849 mtx_lock(&rootvp->v_interlock); 1850 KASSERT(busy > 0, ("vflush: not busy")); 1851 KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs")); 1852 if (busy == 1 && rootvp->v_usecount == rootrefs) { 1853 vgonel(rootvp, td); 1854 busy = 0; 1855 } else 1856 mtx_unlock(&rootvp->v_interlock); 1857 } 1858 if (busy) 1859 return (EBUSY); 1860 for (; rootrefs > 0; rootrefs--) 1861 vrele(rootvp); 1862 return (0); 1863 } 1864 1865 /* 1866 * Disassociate the underlying file system from a vnode. 1867 */ 1868 static void 1869 vclean(vp, flags, td) 1870 struct vnode *vp; 1871 int flags; 1872 struct thread *td; 1873 { 1874 int active; 1875 1876 /* 1877 * Check to see if the vnode is in use. If so we have to reference it 1878 * before we clean it out so that its count cannot fall to zero and 1879 * generate a race against ourselves to recycle it. 1880 */ 1881 if ((active = vp->v_usecount)) 1882 vp->v_usecount++; 1883 1884 /* 1885 * Prevent the vnode from being recycled or brought into use while we 1886 * clean it out. 1887 */ 1888 if (vp->v_flag & VXLOCK) 1889 panic("vclean: deadlock"); 1890 vp->v_flag |= VXLOCK; 1891 vp->v_vxproc = curthread; 1892 /* 1893 * Even if the count is zero, the VOP_INACTIVE routine may still 1894 * have the object locked while it cleans it out. The VOP_LOCK 1895 * ensures that the VOP_INACTIVE routine is done with its work. 1896 * For active vnodes, it ensures that no other activity can 1897 * occur while the underlying object is being cleaned out. 1898 */ 1899 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td); 1900 1901 /* 1902 * Clean out any buffers associated with the vnode. 1903 * If the flush fails, just toss the buffers. 1904 */ 1905 if (flags & DOCLOSE) { 1906 if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL) 1907 (void) vn_write_suspend_wait(vp, NULL, V_WAIT); 1908 if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0) 1909 vinvalbuf(vp, 0, NOCRED, td, 0, 0); 1910 } 1911 1912 VOP_DESTROYVOBJECT(vp); 1913 1914 /* 1915 * If purging an active vnode, it must be closed and 1916 * deactivated before being reclaimed. Note that the 1917 * VOP_INACTIVE will unlock the vnode. 1918 */ 1919 if (active) { 1920 if (flags & DOCLOSE) 1921 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 1922 VOP_INACTIVE(vp, td); 1923 } else { 1924 /* 1925 * Any other processes trying to obtain this lock must first 1926 * wait for VXLOCK to clear, then call the new lock operation. 1927 */ 1928 VOP_UNLOCK(vp, 0, td); 1929 } 1930 /* 1931 * Reclaim the vnode. 1932 */ 1933 if (VOP_RECLAIM(vp, td)) 1934 panic("vclean: cannot reclaim"); 1935 1936 if (active) { 1937 /* 1938 * Inline copy of vrele() since VOP_INACTIVE 1939 * has already been called. 
1940 */ 1941 mtx_lock(&vp->v_interlock); 1942 if (--vp->v_usecount <= 0) { 1943 #ifdef DIAGNOSTIC 1944 if (vp->v_usecount < 0 || vp->v_writecount != 0) { 1945 vprint("vclean: bad ref count", vp); 1946 panic("vclean: ref cnt"); 1947 } 1948 #endif 1949 vfree(vp); 1950 } 1951 mtx_unlock(&vp->v_interlock); 1952 } 1953 1954 cache_purge(vp); 1955 vp->v_vnlock = NULL; 1956 lockdestroy(&vp->v_lock); 1957 1958 if (VSHOULDFREE(vp)) 1959 vfree(vp); 1960 1961 /* 1962 * Done with purge, notify sleepers of the grim news. 1963 */ 1964 vp->v_op = dead_vnodeop_p; 1965 vn_pollgone(vp); 1966 vp->v_tag = VT_NON; 1967 vp->v_flag &= ~VXLOCK; 1968 vp->v_vxproc = NULL; 1969 if (vp->v_flag & VXWANT) { 1970 vp->v_flag &= ~VXWANT; 1971 wakeup((caddr_t) vp); 1972 } 1973 } 1974 1975 /* 1976 * Eliminate all activity associated with the requested vnode 1977 * and with all vnodes aliased to the requested vnode. 1978 */ 1979 int 1980 vop_revoke(ap) 1981 struct vop_revoke_args /* { 1982 struct vnode *a_vp; 1983 int a_flags; 1984 } */ *ap; 1985 { 1986 struct vnode *vp, *vq; 1987 dev_t dev; 1988 1989 KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke")); 1990 1991 vp = ap->a_vp; 1992 /* 1993 * If a vgone (or vclean) is already in progress, 1994 * wait until it is done and return. 1995 */ 1996 if (vp->v_flag & VXLOCK) { 1997 vp->v_flag |= VXWANT; 1998 msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP, 1999 "vop_revokeall", 0); 2000 return (0); 2001 } 2002 dev = vp->v_rdev; 2003 for (;;) { 2004 mtx_lock(&spechash_mtx); 2005 vq = SLIST_FIRST(&dev->si_hlist); 2006 mtx_unlock(&spechash_mtx); 2007 if (!vq) 2008 break; 2009 vgone(vq); 2010 } 2011 return (0); 2012 } 2013 2014 /* 2015 * Recycle an unused vnode to the front of the free list. 2016 * Release the passed interlock if the vnode will be recycled. 2017 */ 2018 int 2019 vrecycle(vp, inter_lkp, td) 2020 struct vnode *vp; 2021 struct mtx *inter_lkp; 2022 struct thread *td; 2023 { 2024 2025 mtx_lock(&vp->v_interlock); 2026 if (vp->v_usecount == 0) { 2027 if (inter_lkp) { 2028 mtx_unlock(inter_lkp); 2029 } 2030 vgonel(vp, td); 2031 return (1); 2032 } 2033 mtx_unlock(&vp->v_interlock); 2034 return (0); 2035 } 2036 2037 /* 2038 * Eliminate all activity associated with a vnode 2039 * in preparation for reuse. 2040 */ 2041 void 2042 vgone(vp) 2043 register struct vnode *vp; 2044 { 2045 struct thread *td = curthread; /* XXX */ 2046 2047 mtx_lock(&vp->v_interlock); 2048 vgonel(vp, td); 2049 } 2050 2051 /* 2052 * vgone, with the vp interlock held. 2053 */ 2054 void 2055 vgonel(vp, td) 2056 struct vnode *vp; 2057 struct thread *td; 2058 { 2059 int s; 2060 2061 /* 2062 * If a vgone (or vclean) is already in progress, 2063 * wait until it is done and return. 2064 */ 2065 if (vp->v_flag & VXLOCK) { 2066 vp->v_flag |= VXWANT; 2067 msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP, 2068 "vgone", 0); 2069 return; 2070 } 2071 2072 /* 2073 * Clean out the filesystem specific data. 2074 */ 2075 vclean(vp, DOCLOSE, td); 2076 mtx_lock(&vp->v_interlock); 2077 2078 /* 2079 * Delete from old mount point vnode list, if on one. 2080 */ 2081 if (vp->v_mount != NULL) 2082 insmntque(vp, (struct mount *)0); 2083 /* 2084 * If special device, remove it from special device alias list 2085 * if it is on one. 
2086 */ 2087 if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) { 2088 mtx_lock(&spechash_mtx); 2089 SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext); 2090 freedev(vp->v_rdev); 2091 mtx_unlock(&spechash_mtx); 2092 vp->v_rdev = NULL; 2093 } 2094 2095 /* 2096 * If it is on the freelist and not already at the head, 2097 * move it to the head of the list. The test of the 2098 * VDOOMED flag and the reference count of zero is because 2099 * it will be removed from the free list by getnewvnode, 2100 * but will not have its reference count incremented until 2101 * after calling vgone. If the reference count were 2102 * incremented first, vgone would (incorrectly) try to 2103 * close the previous instance of the underlying object. 2104 */ 2105 if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) { 2106 s = splbio(); 2107 mtx_lock(&vnode_free_list_mtx); 2108 if (vp->v_flag & VFREE) 2109 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 2110 else 2111 freevnodes++; 2112 vp->v_flag |= VFREE; 2113 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 2114 mtx_unlock(&vnode_free_list_mtx); 2115 splx(s); 2116 } 2117 2118 vp->v_type = VBAD; 2119 mtx_unlock(&vp->v_interlock); 2120 } 2121 2122 /* 2123 * Lookup a vnode by device number. 2124 */ 2125 int 2126 vfinddev(dev, type, vpp) 2127 dev_t dev; 2128 enum vtype type; 2129 struct vnode **vpp; 2130 { 2131 struct vnode *vp; 2132 2133 mtx_lock(&spechash_mtx); 2134 SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) { 2135 if (type == vp->v_type) { 2136 *vpp = vp; 2137 mtx_unlock(&spechash_mtx); 2138 return (1); 2139 } 2140 } 2141 mtx_unlock(&spechash_mtx); 2142 return (0); 2143 } 2144 2145 /* 2146 * Calculate the total number of references to a special device. 2147 */ 2148 int 2149 vcount(vp) 2150 struct vnode *vp; 2151 { 2152 struct vnode *vq; 2153 int count; 2154 2155 count = 0; 2156 mtx_lock(&spechash_mtx); 2157 SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext) 2158 count += vq->v_usecount; 2159 mtx_unlock(&spechash_mtx); 2160 return (count); 2161 } 2162 2163 /* 2164 * Same as above, but using the dev_t as argument 2165 */ 2166 int 2167 count_dev(dev) 2168 dev_t dev; 2169 { 2170 struct vnode *vp; 2171 2172 vp = SLIST_FIRST(&dev->si_hlist); 2173 if (vp == NULL) 2174 return (0); 2175 return(vcount(vp)); 2176 } 2177 2178 /* 2179 * Print out a description of a vnode. 
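 * Typical use is from DIAGNOSTIC or other debugging code, for example:
 *
 *	vprint("vflush: busy vnode", vp);
 *
 * A NULL label prints just the vnode itself.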
2180 */ 2181 static char *typename[] = 2182 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; 2183 2184 void 2185 vprint(label, vp) 2186 char *label; 2187 struct vnode *vp; 2188 { 2189 char buf[96]; 2190 2191 if (label != NULL) 2192 printf("%s: %p: ", label, (void *)vp); 2193 else 2194 printf("%p: ", (void *)vp); 2195 printf("type %s, usecount %d, writecount %d, refcount %d,", 2196 typename[vp->v_type], vp->v_usecount, vp->v_writecount, 2197 vp->v_holdcnt); 2198 buf[0] = '\0'; 2199 if (vp->v_flag & VROOT) 2200 strcat(buf, "|VROOT"); 2201 if (vp->v_flag & VTEXT) 2202 strcat(buf, "|VTEXT"); 2203 if (vp->v_flag & VSYSTEM) 2204 strcat(buf, "|VSYSTEM"); 2205 if (vp->v_flag & VXLOCK) 2206 strcat(buf, "|VXLOCK"); 2207 if (vp->v_flag & VXWANT) 2208 strcat(buf, "|VXWANT"); 2209 if (vp->v_flag & VBWAIT) 2210 strcat(buf, "|VBWAIT"); 2211 if (vp->v_flag & VDOOMED) 2212 strcat(buf, "|VDOOMED"); 2213 if (vp->v_flag & VFREE) 2214 strcat(buf, "|VFREE"); 2215 if (vp->v_flag & VOBJBUF) 2216 strcat(buf, "|VOBJBUF"); 2217 if (buf[0] != '\0') 2218 printf(" flags (%s)", &buf[1]); 2219 if (vp->v_data == NULL) { 2220 printf("\n"); 2221 } else { 2222 printf("\n\t"); 2223 VOP_PRINT(vp); 2224 } 2225 } 2226 2227 #ifdef DDB 2228 #include <ddb/ddb.h> 2229 /* 2230 * List all of the locked vnodes in the system. 2231 * Called when debugging the kernel. 2232 */ 2233 DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) 2234 { 2235 struct thread *td = curthread; /* XXX */ 2236 struct mount *mp, *nmp; 2237 struct vnode *vp; 2238 2239 printf("Locked vnodes\n"); 2240 mtx_lock(&mountlist_mtx); 2241 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2242 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { 2243 nmp = TAILQ_NEXT(mp, mnt_list); 2244 continue; 2245 } 2246 mtx_lock(&mntvnode_mtx); 2247 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 2248 if (VOP_ISLOCKED(vp, NULL)) 2249 vprint((char *)0, vp); 2250 } 2251 mtx_unlock(&mntvnode_mtx); 2252 mtx_lock(&mountlist_mtx); 2253 nmp = TAILQ_NEXT(mp, mnt_list); 2254 vfs_unbusy(mp, td); 2255 } 2256 mtx_unlock(&mountlist_mtx); 2257 } 2258 #endif 2259 2260 /* 2261 * Top level filesystem related information gathering. 2262 */ 2263 static int sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS)); 2264 2265 static int 2266 vfs_sysctl(SYSCTL_HANDLER_ARGS) 2267 { 2268 int *name = (int *)arg1 - 1; /* XXX */ 2269 u_int namelen = arg2 + 1; /* XXX */ 2270 struct vfsconf *vfsp; 2271 2272 #if 1 || defined(COMPAT_PRELITE2) 2273 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ 2274 if (namelen == 1) 2275 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 2276 #endif 2277 2278 /* XXX the below code does not compile; vfs_sysctl does not exist. 
*/ 2279 #ifdef notyet 2280 /* all sysctl names at this level are at least name and field */ 2281 if (namelen < 2) 2282 return (ENOTDIR); /* overloaded */ 2283 if (name[0] != VFS_GENERIC) { 2284 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2285 if (vfsp->vfc_typenum == name[0]) 2286 break; 2287 if (vfsp == NULL) 2288 return (EOPNOTSUPP); 2289 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, 2290 oldp, oldlenp, newp, newlen, td)); 2291 } 2292 #endif 2293 switch (name[1]) { 2294 case VFS_MAXTYPENUM: 2295 if (namelen != 2) 2296 return (ENOTDIR); 2297 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 2298 case VFS_CONF: 2299 if (namelen != 3) 2300 return (ENOTDIR); /* overloaded */ 2301 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2302 if (vfsp->vfc_typenum == name[2]) 2303 break; 2304 if (vfsp == NULL) 2305 return (EOPNOTSUPP); 2306 return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); 2307 } 2308 return (EOPNOTSUPP); 2309 } 2310 2311 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, 2312 "Generic filesystem"); 2313 2314 #if 1 || defined(COMPAT_PRELITE2) 2315 2316 static int 2317 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 2318 { 2319 int error; 2320 struct vfsconf *vfsp; 2321 struct ovfsconf ovfs; 2322 2323 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { 2324 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 2325 strcpy(ovfs.vfc_name, vfsp->vfc_name); 2326 ovfs.vfc_index = vfsp->vfc_typenum; 2327 ovfs.vfc_refcount = vfsp->vfc_refcount; 2328 ovfs.vfc_flags = vfsp->vfc_flags; 2329 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 2330 if (error) 2331 return error; 2332 } 2333 return 0; 2334 } 2335 2336 #endif /* 1 || COMPAT_PRELITE2 */ 2337 2338 #if COMPILING_LINT 2339 #define KINFO_VNODESLOP 10 2340 /* 2341 * Dump vnode list (via sysctl). 2342 * Copyout address of vnode followed by vnode. 2343 */ 2344 /* ARGSUSED */ 2345 static int 2346 sysctl_vnode(SYSCTL_HANDLER_ARGS) 2347 { 2348 struct thread *td = curthread; /* XXX */ 2349 struct mount *mp, *nmp; 2350 struct vnode *nvp, *vp; 2351 int error; 2352 2353 #define VPTRSZ sizeof (struct vnode *) 2354 #define VNODESZ sizeof (struct vnode) 2355 2356 req->lock = 0; 2357 if (!req->oldptr) /* Make an estimate */ 2358 return (SYSCTL_OUT(req, 0, 2359 (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); 2360 2361 mtx_lock(&mountlist_mtx); 2362 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2363 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { 2364 nmp = TAILQ_NEXT(mp, mnt_list); 2365 continue; 2366 } 2367 mtx_lock(&mntvnode_mtx); 2368 again: 2369 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); 2370 vp != NULL; 2371 vp = nvp) { 2372 /* 2373 * Check that the vp is still associated with 2374 * this filesystem. RACE: could have been 2375 * recycled onto the same filesystem. 2376 */ 2377 if (vp->v_mount != mp) 2378 goto again; 2379 nvp = TAILQ_NEXT(vp, v_nmntvnodes); 2380 mtx_unlock(&mntvnode_mtx); 2381 if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || 2382 (error = SYSCTL_OUT(req, vp, VNODESZ))) 2383 return (error); 2384 mtx_lock(&mntvnode_mtx); 2385 } 2386 mtx_unlock(&mntvnode_mtx); 2387 mtx_lock(&mountlist_mtx); 2388 nmp = TAILQ_NEXT(mp, mnt_list); 2389 vfs_unbusy(mp, td); 2390 } 2391 mtx_unlock(&mountlist_mtx); 2392 2393 return (0); 2394 } 2395 2396 /* 2397 * XXX 2398 * Exporting the vnode list on large systems causes them to crash. 2399 * Exporting the vnode list on medium systems causes sysctl to coredump. 
 */
SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
	0, 0, sysctl_vnode, "S,vnode", "");
#endif

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(vp)
	struct vnode *vp;
{

	if (vp->v_rdev->si_mountpoint != NULL)
		return (EBUSY);
	return (0);
}

/*
 * Unmount all filesystems. The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */
void
vfs_unmountall()
{
	struct mount *mp;
	struct thread *td;
	int error;

	if (curthread != NULL)
		td = curthread;
	else
		td = &initproc->p_thread;	/* XXX XXX should this be proc0? */
	/*
	 * Since this only runs when rebooting, it is not interlocked.
	 */
	while(!TAILQ_EMPTY(&mountlist)) {
		mp = TAILQ_LAST(&mountlist, mntlist);
		error = dounmount(mp, MNT_FORCE, td);
		if (error) {
			TAILQ_REMOVE(&mountlist, mp, mnt_list);
			printf("unmount of %s failed (",
			    mp->mnt_stat.f_mntonname);
			if (error == EBUSY)
				printf("BUSY)\n");
			else
				printf("%d)\n", error);
		} else {
			/* The unmount has removed mp from the mountlist */
		}
	}
}

/*
 * Perform msync on all vnodes under a mount point.
 * The mount point must be locked.
 */
void
vfs_msync(struct mount *mp, int flags)
{
	struct vnode *vp, *nvp;
	struct vm_object *obj;
	int tries;

	GIANT_REQUIRED;

	tries = 5;
	mtx_lock(&mntvnode_mtx);
loop:
	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
		if (vp->v_mount != mp) {
			if (--tries > 0)
				goto loop;
			break;
		}
		nvp = TAILQ_NEXT(vp, v_nmntvnodes);

		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
			continue;

		if (vp->v_flag & VNOSYNC)	/* unlinked, skip it */
			continue;

		if ((vp->v_flag & VOBJDIRTY) &&
		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
			mtx_unlock(&mntvnode_mtx);
			if (!vget(vp,
			    LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curthread)) {
				if (VOP_GETVOBJECT(vp, &obj) == 0) {
					vm_object_page_clean(obj, 0, 0,
					    flags == MNT_WAIT ?
					    OBJPC_SYNC : OBJPC_NOSYNC);
				}
				vput(vp);
			}
			mtx_lock(&mntvnode_mtx);
			if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
				if (--tries > 0)
					goto loop;
				break;
			}
		}
	}
	mtx_unlock(&mntvnode_mtx);
}

/*
 * Create the VM object needed for VMIO and mmap support.  This
 * is done for all VREG files in the system.  Some filesystems may
 * also want the additional metadata buffering that the VMIO code
 * provides, which they can get by putting the device node into VMIO
 * mode as well.
 *
 * vp must be locked when vfs_object_create is called.
 */
int
vfs_object_create(vp, td, cred)
	struct vnode *vp;
	struct thread *td;
	struct ucred *cred;
{
	GIANT_REQUIRED;
	return (VOP_CREATEVOBJECT(vp, cred, td));
}

/*
 * Mark a vnode as free, putting it up for recycling.
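 * A vnode with VAGE set is queued at the head of the free list so that
 * it is recycled sooner; otherwise it goes to the tail.  In either case
 * freevnodes is bumped and the VFREE flag is set.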
 */
void
vfree(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	mtx_lock(&vnode_free_list_mtx);
	KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
	if (vp->v_flag & VAGE) {
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	} else {
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	}
	freevnodes++;
	mtx_unlock(&vnode_free_list_mtx);
	vp->v_flag &= ~VAGE;
	vp->v_flag |= VFREE;
	splx(s);
}

/*
 * Opposite of vfree() - mark a vnode as in use.
 */
void
vbusy(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	mtx_lock(&vnode_free_list_mtx);
	KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free"));
	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	freevnodes--;
	mtx_unlock(&vnode_free_list_mtx);
	vp->v_flag &= ~(VFREE|VAGE);
	splx(s);
}

/*
 * Record a process's interest in events which might happen to
 * a vnode.  Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(vp, td, events)
	struct vnode *vp;
	struct thread *td;
	short events;
{
	mtx_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo.vpi_revents;
		vp->v_pollinfo.vpi_revents &= ~events;

		mtx_unlock(&vp->v_pollinfo.vpi_lock);
		return events;
	}
	vp->v_pollinfo.vpi_events |= events;
	selrecord(td, &vp->v_pollinfo.vpi_selinfo);
	mtx_unlock(&vp->v_pollinfo.vpi_lock);
	return 0;
}

/*
 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
 * it is possible for us to miss an event due to race conditions, but
 * that condition is expected to be rare, so for the moment it is the
 * preferred interface.
 */
void
vn_pollevent(vp, events)
	struct vnode *vp;
	short events;
{
	mtx_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events & events) {
		/*
		 * We clear vpi_events so that we don't
		 * call selwakeup() twice if two events are
		 * posted before the polling process(es) is
		 * awakened.  This also ensures that we take at
		 * most one selwakeup() if the polling process
		 * is no longer interested.  However, it does
		 * mean that only one event can be noticed at
		 * a time.  (Perhaps we should only clear those
		 * event bits which we note?) XXX
		 */
		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
		vp->v_pollinfo.vpi_revents |= events;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	mtx_unlock(&vp->v_pollinfo.vpi_lock);
}

#define VN_KNOTE(vp, b) \
	KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b))

/*
 * Wake up anyone polling on vp because it is being revoked.
 * This depends on dead_poll() returning POLLHUP for correct
 * behavior.
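 * vclean() switches vp->v_op to dead_vnodeop_p just before calling
 * this routine, so a poller that wakes up and polls again will reach
 * dead_poll().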
2640 */ 2641 void 2642 vn_pollgone(vp) 2643 struct vnode *vp; 2644 { 2645 mtx_lock(&vp->v_pollinfo.vpi_lock); 2646 VN_KNOTE(vp, NOTE_REVOKE); 2647 if (vp->v_pollinfo.vpi_events) { 2648 vp->v_pollinfo.vpi_events = 0; 2649 selwakeup(&vp->v_pollinfo.vpi_selinfo); 2650 } 2651 mtx_unlock(&vp->v_pollinfo.vpi_lock); 2652 } 2653 2654 2655 2656 /* 2657 * Routine to create and manage a filesystem syncer vnode. 2658 */ 2659 #define sync_close ((int (*) __P((struct vop_close_args *)))nullop) 2660 static int sync_fsync __P((struct vop_fsync_args *)); 2661 static int sync_inactive __P((struct vop_inactive_args *)); 2662 static int sync_reclaim __P((struct vop_reclaim_args *)); 2663 #define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock) 2664 #define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock) 2665 static int sync_print __P((struct vop_print_args *)); 2666 #define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked) 2667 2668 static vop_t **sync_vnodeop_p; 2669 static struct vnodeopv_entry_desc sync_vnodeop_entries[] = { 2670 { &vop_default_desc, (vop_t *) vop_eopnotsupp }, 2671 { &vop_close_desc, (vop_t *) sync_close }, /* close */ 2672 { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */ 2673 { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */ 2674 { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */ 2675 { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */ 2676 { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */ 2677 { &vop_print_desc, (vop_t *) sync_print }, /* print */ 2678 { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */ 2679 { NULL, NULL } 2680 }; 2681 static struct vnodeopv_desc sync_vnodeop_opv_desc = 2682 { &sync_vnodeop_p, sync_vnodeop_entries }; 2683 2684 VNODEOP_SET(sync_vnodeop_opv_desc); 2685 2686 /* 2687 * Create a new filesystem syncer vnode for the specified mount point. 2688 */ 2689 int 2690 vfs_allocate_syncvnode(mp) 2691 struct mount *mp; 2692 { 2693 struct vnode *vp; 2694 static long start, incr, next; 2695 int error; 2696 2697 /* Allocate a new vnode */ 2698 if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { 2699 mp->mnt_syncer = NULL; 2700 return (error); 2701 } 2702 vp->v_type = VNON; 2703 /* 2704 * Place the vnode onto the syncer worklist. We attempt to 2705 * scatter them about on the list so that they will go off 2706 * at evenly distributed times even if all the filesystems 2707 * are mounted at once. 2708 */ 2709 next += incr; 2710 if (next == 0 || next > syncer_maxdelay) { 2711 start /= 2; 2712 incr /= 2; 2713 if (start == 0) { 2714 start = syncer_maxdelay / 2; 2715 incr = syncer_maxdelay; 2716 } 2717 next = start; 2718 } 2719 vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); 2720 mp->mnt_syncer = vp; 2721 return (0); 2722 } 2723 2724 /* 2725 * Do a lazy sync of the filesystem. 2726 */ 2727 static int 2728 sync_fsync(ap) 2729 struct vop_fsync_args /* { 2730 struct vnode *a_vp; 2731 struct ucred *a_cred; 2732 int a_waitfor; 2733 struct thread *a_td; 2734 } */ *ap; 2735 { 2736 struct vnode *syncvp = ap->a_vp; 2737 struct mount *mp = syncvp->v_mount; 2738 struct thread *td = ap->a_td; 2739 int asyncflag; 2740 2741 /* 2742 * We only need to do something if this is a lazy evaluation. 2743 */ 2744 if (ap->a_waitfor != MNT_LAZY) 2745 return (0); 2746 2747 /* 2748 * Move ourselves to the back of the sync list. 
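	 * That is, reschedule this syncer vnode a full syncdelay into the
	 * future so that the syncer does not revisit it immediately.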
 */
	vn_syncer_add_to_worklist(syncvp, syncdelay);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 */
	mtx_lock(&mountlist_mtx);
	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
		mtx_unlock(&mountlist_mtx);
		return (0);
	}
	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
		vfs_unbusy(mp, td);
		return (0);
	}
	asyncflag = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	vfs_msync(mp, MNT_NOWAIT);
	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td);
	if (asyncflag)
		mp->mnt_flag |= MNT_ASYNC;
	vn_finished_write(mp);
	vfs_unbusy(mp, td);
	return (0);
}

/*
 * The syncer vnode is no longer referenced.
 */
static int
sync_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{

	vgone(ap->a_vp);
	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 *
 * Modifications to the worklist must be protected at splbio().
 */
static int
sync_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	int s;

	s = splbio();
	vp->v_mount->mnt_syncer = NULL;
	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
		vp->v_flag &= ~VONWORKLST;
	}
	splx(s);

	return (0);
}

/*
 * Print out a syncer vnode.
 */
static int
sync_print(ap)
	struct vop_print_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;

	printf("syncer vnode");
	if (vp->v_vnlock != NULL)
		lockmgr_printinfo(vp->v_vnlock);
	printf("\n");
	return (0);
}

/*
 * Extract the dev_t from a VCHR vnode.
 */
dev_t
vn_todev(vp)
	struct vnode *vp;
{
	if (vp->v_type != VCHR)
		return (NODEV);
	return (vp->v_rdev);
}

/*
 * Check whether a vnode represents a disk device.
 */
int
vn_isdisk(vp, errp)
	struct vnode *vp;
	int *errp;
{
	struct cdevsw *cdevsw;

	if (vp->v_type != VCHR) {
		if (errp != NULL)
			*errp = ENOTBLK;
		return (0);
	}
	if (vp->v_rdev == NULL) {
		if (errp != NULL)
			*errp = ENXIO;
		return (0);
	}
	cdevsw = devsw(vp->v_rdev);
	if (cdevsw == NULL) {
		if (errp != NULL)
			*errp = ENXIO;
		return (0);
	}
	if (!(cdevsw->d_flags & D_DISK)) {
		if (errp != NULL)
			*errp = ENOTBLK;
		return (0);
	}
	if (errp != NULL)
		*errp = 0;
	return (1);
}

/*
 * Free data allocated by namei(); see namei(9) for details.
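 * A typical caller looks something like this (sketch only; `path' and
 * `td' stand for the caller's pathname argument and thread, and error
 * handling is omitted):
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, path, td);
 *	if ((error = namei(&nd)) != 0)
 *		return (error);
 *	NDFREE(&nd, NDF_ONLY_PNBUF);
 *	...
 *	vput(nd.ni_vp);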
2884 */ 2885 void 2886 NDFREE(ndp, flags) 2887 struct nameidata *ndp; 2888 const uint flags; 2889 { 2890 if (!(flags & NDF_NO_FREE_PNBUF) && 2891 (ndp->ni_cnd.cn_flags & HASBUF)) { 2892 zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); 2893 ndp->ni_cnd.cn_flags &= ~HASBUF; 2894 } 2895 if (!(flags & NDF_NO_DVP_UNLOCK) && 2896 (ndp->ni_cnd.cn_flags & LOCKPARENT) && 2897 ndp->ni_dvp != ndp->ni_vp) 2898 VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread); 2899 if (!(flags & NDF_NO_DVP_RELE) && 2900 (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { 2901 vrele(ndp->ni_dvp); 2902 ndp->ni_dvp = NULL; 2903 } 2904 if (!(flags & NDF_NO_VP_UNLOCK) && 2905 (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) 2906 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread); 2907 if (!(flags & NDF_NO_VP_RELE) && 2908 ndp->ni_vp) { 2909 vrele(ndp->ni_vp); 2910 ndp->ni_vp = NULL; 2911 } 2912 if (!(flags & NDF_NO_STARTDIR_RELE) && 2913 (ndp->ni_cnd.cn_flags & SAVESTART)) { 2914 vrele(ndp->ni_startdir); 2915 ndp->ni_startdir = NULL; 2916 } 2917 } 2918 2919 /* 2920 * Common file system object access control check routine. Accepts a 2921 * vnode's type, "mode", uid and gid, requested access mode, credentials, 2922 * and optional call-by-reference privused argument allowing vaccess() 2923 * to indicate to the caller whether privilege was used to satisfy the 2924 * request. Returns 0 on success, or an errno on failure. 2925 */ 2926 int 2927 vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused) 2928 enum vtype type; 2929 mode_t file_mode; 2930 uid_t file_uid; 2931 gid_t file_gid; 2932 mode_t acc_mode; 2933 struct ucred *cred; 2934 int *privused; 2935 { 2936 mode_t dac_granted; 2937 #ifdef CAPABILITIES 2938 mode_t cap_granted; 2939 #endif 2940 2941 /* 2942 * Look for a normal, non-privileged way to access the file/directory 2943 * as requested. If it exists, go with that. 2944 */ 2945 2946 if (privused != NULL) 2947 *privused = 0; 2948 2949 dac_granted = 0; 2950 2951 /* Check the owner. */ 2952 if (cred->cr_uid == file_uid) { 2953 dac_granted |= VADMIN; 2954 if (file_mode & S_IXUSR) 2955 dac_granted |= VEXEC; 2956 if (file_mode & S_IRUSR) 2957 dac_granted |= VREAD; 2958 if (file_mode & S_IWUSR) 2959 dac_granted |= VWRITE; 2960 2961 if ((acc_mode & dac_granted) == acc_mode) 2962 return (0); 2963 2964 goto privcheck; 2965 } 2966 2967 /* Otherwise, check the groups (first match) */ 2968 if (groupmember(file_gid, cred)) { 2969 if (file_mode & S_IXGRP) 2970 dac_granted |= VEXEC; 2971 if (file_mode & S_IRGRP) 2972 dac_granted |= VREAD; 2973 if (file_mode & S_IWGRP) 2974 dac_granted |= VWRITE; 2975 2976 if ((acc_mode & dac_granted) == acc_mode) 2977 return (0); 2978 2979 goto privcheck; 2980 } 2981 2982 /* Otherwise, check everyone else. */ 2983 if (file_mode & S_IXOTH) 2984 dac_granted |= VEXEC; 2985 if (file_mode & S_IROTH) 2986 dac_granted |= VREAD; 2987 if (file_mode & S_IWOTH) 2988 dac_granted |= VWRITE; 2989 if ((acc_mode & dac_granted) == acc_mode) 2990 return (0); 2991 2992 privcheck: 2993 if (!suser_xxx(cred, NULL, PRISON_ROOT)) { 2994 /* XXX audit: privilege used */ 2995 if (privused != NULL) 2996 *privused = 1; 2997 return (0); 2998 } 2999 3000 #ifdef CAPABILITIES 3001 /* 3002 * Build a capability mask to determine if the set of capabilities 3003 * satisfies the requirements when combined with the granted mask 3004 * from above. 3005 * For each capability, if the capability is required, bitwise 3006 * or the request type onto the cap_granted mask. 
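	 * For example, a VREAD request that was not granted by the file
	 * mode bits above is satisfied here only if the credential passes
	 * the CAP_DAC_READ_SEARCH check.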
3007 */ 3008 cap_granted = 0; 3009 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3010 !cap_check_xxx(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) 3011 cap_granted |= VEXEC; 3012 3013 if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && 3014 !cap_check_xxx(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) 3015 cap_granted |= VREAD; 3016 3017 if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && 3018 !cap_check_xxx(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) 3019 cap_granted |= VWRITE; 3020 3021 if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && 3022 !cap_check_xxx(cred, NULL, CAP_FOWNER, PRISON_ROOT)) 3023 cap_granted |= VADMIN; 3024 3025 if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) { 3026 /* XXX audit: privilege used */ 3027 if (privused != NULL) 3028 *privused = 1; 3029 return (0); 3030 } 3031 #endif 3032 3033 return ((acc_mode & VADMIN) ? EPERM : EACCES); 3034 } 3035 3036
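
/*
 * Illustrative sketch only, kept out of the build with #if 0: one way a
 * filesystem's VOP_ACCESS routine might lean on vaccess() above for its
 * permission check.  "struct examplenode", VTOEXAMPLE() and the e_*
 * fields are hypothetical stand-ins for a filesystem's private per-vnode
 * data; they do not exist in the tree.
 */
#if 0
static int
example_access(ap)
	struct vop_access_args /* {
		struct vnode *a_vp;
		int a_mode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct examplenode *enp = VTOEXAMPLE(vp);

	/* Defer the mode/uid/gid check to the common routine. */
	return (vaccess(vp->v_type, enp->e_mode, enp->e_uid, enp->e_gid,
	    ap->a_mode, ap->a_cred, NULL));
}
#endif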