1 /* 2 * Copyright (c) 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 39 * $FreeBSD$ 40 */ 41 42 /* 43 * External virtual filesystem routines 44 */ 45 #include "opt_ddb.h" 46 #include "opt_ffs.h" 47 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/bio.h> 51 #include <sys/buf.h> 52 #include <sys/conf.h> 53 #include <sys/eventhandler.h> 54 #include <sys/fcntl.h> 55 #include <sys/kernel.h> 56 #include <sys/kthread.h> 57 #include <sys/malloc.h> 58 #include <sys/mount.h> 59 #include <sys/namei.h> 60 #include <sys/stat.h> 61 #include <sys/sysctl.h> 62 #include <sys/syslog.h> 63 #include <sys/vmmeter.h> 64 #include <sys/vnode.h> 65 66 #include <vm/vm.h> 67 #include <vm/vm_object.h> 68 #include <vm/vm_extern.h> 69 #include <vm/pmap.h> 70 #include <vm/vm_map.h> 71 #include <vm/vm_page.h> 72 #include <vm/uma.h> 73 74 static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); 75 76 static void addalias(struct vnode *vp, dev_t nvp_rdev); 77 static void insmntque(struct vnode *vp, struct mount *mp); 78 static void vclean(struct vnode *vp, int flags, struct thread *td); 79 static void vlruvp(struct vnode *vp); 80 81 /* 82 * Number of vnodes in existence. Increased whenever getnewvnode() 83 * allocates a new vnode, never decreased. 
 */
static unsigned long numvnodes;

SYSCTL_LONG(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

/*
 * Conversion tables between vnode types and inode formats.
 */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

/*
 * List of vnodes that are ready for recycling.
 */
static TAILQ_HEAD(freelst, vnode) vnode_free_list;

/*
 * Minimum number of free vnodes.  If there are fewer free vnodes than this,
 * getnewvnode() will return a newly allocated vnode.
 */
static u_long wantfreevnodes = 25;
SYSCTL_LONG(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
/* Number of vnodes in the free list. */
static u_long freevnodes;
SYSCTL_LONG(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");

#if 0
/* Number of vnode allocations. */
static u_long vnodeallocs;
SYSCTL_LONG(_debug, OID_AUTO, vnodeallocs, CTLFLAG_RD, &vnodeallocs, 0, "");
/* Period, in vnode allocations, of vnode recycling from the namecache. */
static u_long vnoderecycleperiod = 1000;
SYSCTL_LONG(_debug, OID_AUTO, vnoderecycleperiod, CTLFLAG_RW, &vnoderecycleperiod, 0, "");
/* Minimum number of total vnodes required to invoke vnode recycling from the namecache. */
static u_long vnoderecyclemintotalvn = 2000;
SYSCTL_LONG(_debug, OID_AUTO, vnoderecyclemintotalvn, CTLFLAG_RW, &vnoderecyclemintotalvn, 0, "");
/* Minimum number of free vnodes required to invoke vnode recycling from the namecache. */
static u_long vnoderecycleminfreevn = 2000;
SYSCTL_LONG(_debug, OID_AUTO, vnoderecycleminfreevn, CTLFLAG_RW, &vnoderecycleminfreevn, 0, "");
/* Number of vnodes to attempt to recycle at a time. */
static u_long vnoderecyclenumber = 3000;
SYSCTL_LONG(_debug, OID_AUTO, vnoderecyclenumber, CTLFLAG_RW, &vnoderecyclenumber, 0, "");
#endif

/*
 * Various variables used for debugging the new implementation of
 * reassignbuf().
 * XXX these are probably of (very) limited utility now.
 */
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
/* Set to 0 for the old insertion-sort based reassignbuf, 1 for the modern method. */
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
static int nameileafonly;
SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");

#ifdef ENABLE_VFS_IOOPT
/* See NOTES for a description of this setting. */
int vfs_ioopt;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif
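/*
 * Illustrative sketch (not part of the original code): how the conversion
 * tables above are typically consumed.  In the real tree the IFTOVT() and
 * VTTOIF() macros in <sys/vnode.h> wrap these lookups; the helpers below
 * are hypothetical and only spell out the indexing.
 */
#if 0
static enum vtype
example_mode_to_vtype(mode_t mode)
{

	/* The file-type bits live in the top four bits of the 16-bit mode. */
	return (iftovt_tab[(mode & S_IFMT) >> 12]);
}

static int
example_vtype_to_mode(enum vtype type)
{

	/* Map a vnode type back to its S_IF* file-type bits. */
	return (vttoif_tab[(int)type]);
}
#endif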
/* List of mounted filesystems. */
struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);

/* For any iteration/modification of mountlist */
struct mtx mountlist_mtx;

/* For any iteration/modification of mnt_vnodelist */
struct mtx mntvnode_mtx;

/*
 * Cache for the mount type id assigned to NFS.  This is used for
 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
 */
int nfs_mount_type = -1;

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/* For any iteration/modification of vnode_free_list */
static struct mtx vnode_free_list_mtx;

/*
 * For any iteration/modification of dev->si_hlist (linked through
 * v_specnext)
 */
static struct mtx spechash_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
static uma_zone_t vnodepoll_zone;

/* Set to 1 to print out reclaim of active vnodes */
int prtactive;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, the vnodes of devices that
 * filesystems are mounted on (i.e., filesystem metadata) are delayed only
 * about half the time that file data is delayed.  Similarly, directory
 * updates are more critical, so they are delayed only about a third of the
 * time that file data is delayed.  Thus, there are SYNCER_MAXDELAY queues
 * that are processed round-robin at a rate of one each second (driven off
 * the filesystem syncer process).  The syncer_delayno variable indicates
 * the next queue that is to be processed.  Items that need to be processed
 * soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
static int syncer_delayno;
static long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;

#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
static int syncdelay = 30;		/* max time to delay syncing data */
static int filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
static int dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
static int metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O was sped up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");

/*
 * Number of vnodes we want to exist at any one time.  This is mostly used
 * to size hash tables in vnode-related code.  It is normally not used in
 * getnewvnode(), as wantfreevnodes is normally nonzero.
244 * 245 * XXX desiredvnodes is historical cruft and should not exist. 246 */ 247 int desiredvnodes; 248 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, 249 &desiredvnodes, 0, "Maximum number of vnodes"); 250 static int minvnodes; 251 SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, 252 &minvnodes, 0, "Minimum number of vnodes"); 253 static int vnlru_nowhere; 254 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, 255 "Number of times the vnlru process ran without success"); 256 257 void 258 v_addpollinfo(struct vnode *vp) 259 { 260 vp->v_pollinfo = uma_zalloc(vnodepoll_zone, M_WAITOK); 261 mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", MTX_DEF); 262 } 263 264 /* 265 * Initialize the vnode management data structures. 266 */ 267 static void 268 vntblinit(void *dummy __unused) 269 { 270 271 desiredvnodes = maxproc + cnt.v_page_count / 4; 272 minvnodes = desiredvnodes / 4; 273 mtx_init(&mountlist_mtx, "mountlist", MTX_DEF); 274 mtx_init(&mntvnode_mtx, "mntvnode", MTX_DEF); 275 mtx_init(&mntid_mtx, "mntid", MTX_DEF); 276 mtx_init(&spechash_mtx, "spechash", MTX_DEF); 277 TAILQ_INIT(&vnode_free_list); 278 mtx_init(&vnode_free_list_mtx, "vnode_free_list", MTX_DEF); 279 vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, 280 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 281 vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), 282 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); 283 /* 284 * Initialize the filesystem syncer. 285 */ 286 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, 287 &syncer_mask); 288 syncer_maxdelay = syncer_mask + 1; 289 } 290 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL) 291 292 293 /* 294 * Mark a mount point as busy. Used to synchronize access and to delay 295 * unmounting. Interlock is not released on failure. 296 */ 297 int 298 vfs_busy(mp, flags, interlkp, td) 299 struct mount *mp; 300 int flags; 301 struct mtx *interlkp; 302 struct thread *td; 303 { 304 int lkflags; 305 306 if (mp->mnt_kern_flag & MNTK_UNMOUNT) { 307 if (flags & LK_NOWAIT) 308 return (ENOENT); 309 mp->mnt_kern_flag |= MNTK_MWAIT; 310 /* 311 * Since all busy locks are shared except the exclusive 312 * lock granted when unmounting, the only place that a 313 * wakeup needs to be done is at the release of the 314 * exclusive lock at the end of dounmount. 315 */ 316 msleep((caddr_t)mp, interlkp, PVFS, "vfs_busy", 0); 317 return (ENOENT); 318 } 319 lkflags = LK_SHARED | LK_NOPAUSE; 320 if (interlkp) 321 lkflags |= LK_INTERLOCK; 322 if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td)) 323 panic("vfs_busy: unexpected lock failure"); 324 return (0); 325 } 326 327 /* 328 * Free a busy filesystem. 329 */ 330 void 331 vfs_unbusy(mp, td) 332 struct mount *mp; 333 struct thread *td; 334 { 335 336 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td); 337 } 338 339 /* 340 * Lookup a filesystem type, and if found allocate and initialize 341 * a mount structure for it. 342 * 343 * Devname is usually updated by mount(8) after booting. 
344 */ 345 int 346 vfs_rootmountalloc(fstypename, devname, mpp) 347 char *fstypename; 348 char *devname; 349 struct mount **mpp; 350 { 351 struct thread *td = curthread; /* XXX */ 352 struct vfsconf *vfsp; 353 struct mount *mp; 354 355 if (fstypename == NULL) 356 return (ENODEV); 357 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 358 if (!strcmp(vfsp->vfc_name, fstypename)) 359 break; 360 if (vfsp == NULL) 361 return (ENODEV); 362 mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); 363 lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); 364 (void)vfs_busy(mp, LK_NOWAIT, 0, td); 365 TAILQ_INIT(&mp->mnt_nvnodelist); 366 TAILQ_INIT(&mp->mnt_reservedvnlist); 367 mp->mnt_vfc = vfsp; 368 mp->mnt_op = vfsp->vfc_vfsops; 369 mp->mnt_flag = MNT_RDONLY; 370 mp->mnt_vnodecovered = NULLVP; 371 vfsp->vfc_refcount++; 372 mp->mnt_iosize_max = DFLTPHYS; 373 mp->mnt_stat.f_type = vfsp->vfc_typenum; 374 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; 375 strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); 376 mp->mnt_stat.f_mntonname[0] = '/'; 377 mp->mnt_stat.f_mntonname[1] = 0; 378 (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); 379 *mpp = mp; 380 return (0); 381 } 382 383 /* 384 * Find an appropriate filesystem to use for the root. If a filesystem 385 * has not been preselected, walk through the list of known filesystems 386 * trying those that have mountroot routines, and try them until one 387 * works or we have tried them all. 388 */ 389 #ifdef notdef /* XXX JH */ 390 int 391 lite2_vfs_mountroot() 392 { 393 struct vfsconf *vfsp; 394 extern int (*lite2_mountroot)(void); 395 int error; 396 397 if (lite2_mountroot != NULL) 398 return ((*lite2_mountroot)()); 399 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { 400 if (vfsp->vfc_mountroot == NULL) 401 continue; 402 if ((error = (*vfsp->vfc_mountroot)()) == 0) 403 return (0); 404 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); 405 } 406 return (ENODEV); 407 } 408 #endif 409 410 /* 411 * Lookup a mount point by filesystem identifier. 412 */ 413 struct mount * 414 vfs_getvfs(fsid) 415 fsid_t *fsid; 416 { 417 register struct mount *mp; 418 419 mtx_lock(&mountlist_mtx); 420 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 421 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && 422 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { 423 mtx_unlock(&mountlist_mtx); 424 return (mp); 425 } 426 } 427 mtx_unlock(&mountlist_mtx); 428 return ((struct mount *) 0); 429 } 430 431 /* 432 * Get a new unique fsid. Try to make its val[0] unique, since this value 433 * will be used to create fake device numbers for stat(). Also try (but 434 * not so hard) make its val[0] unique mod 2^16, since some emulators only 435 * support 16-bit device numbers. We end up with unique val[0]'s for the 436 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. 437 * 438 * Keep in mind that several mounts may be running in parallel. Starting 439 * the search one past where the previous search terminated is both a 440 * micro-optimization and a defense against returning the same fsid to 441 * different mounts. 
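 *
 * For example (hypothetical values): a filesystem type number of 5 and a
 * mntid_base of 0x1234 produce a minor argument of 0x05120034 below -- the
 * type in the top byte, the high byte of the counter in the next byte and
 * its low byte at the bottom -- so the low byte changes on every call and
 * the higher byte every 256 calls, which yields the uniqueness properties
 * described above.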
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_int16_t mntid_base;
	fsid_t tfsid;
	int mtype;

	mtx_lock(&mntid_mtx);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makeudev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if (vfs_getvfs(&tfsid) == NULL)
			break;
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	mtx_unlock(&mntid_mtx);
}

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(tsp)
	struct timespec *tsp;
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL.
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * This routine is called when we have too many vnodes.  It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).  Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * The buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 */
static int
vlrureclaim(struct mount *mp, int count)
{
	struct vnode *vp;
	int done;
	int trigger;
	int usevnodes;

	/*
	 * Calculate the trigger point; don't allow user
	 * screwups to blow us up.  This prevents us from
	 * recycling vnodes with lots of resident pages.  We
	 * aren't trying to free memory, we are trying to
	 * free vnodes.
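	 * For example (hypothetical numbers): with about 131072 physical
	 * pages (512 MB) and desiredvnodes at 32768, the trigger works out
	 * to 131072 * 2 / 32768 = 8, so vnodes caching 8 or more resident
	 * pages are skipped by this pass.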
568 */ 569 usevnodes = desiredvnodes; 570 if (usevnodes <= 0) 571 usevnodes = 1; 572 trigger = cnt.v_page_count * 2 / usevnodes; 573 574 done = 0; 575 mtx_lock(&mntvnode_mtx); 576 while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) { 577 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 578 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 579 580 if (vp->v_type != VNON && 581 vp->v_type != VBAD && 582 VMIGHTFREE(vp) && /* critical path opt */ 583 (vp->v_object == NULL || vp->v_object->resident_page_count < trigger) && 584 mtx_trylock(&vp->v_interlock) 585 ) { 586 mtx_unlock(&mntvnode_mtx); 587 if (VMIGHTFREE(vp)) { 588 vgonel(vp, curthread); 589 done++; 590 } else { 591 mtx_unlock(&vp->v_interlock); 592 } 593 mtx_lock(&mntvnode_mtx); 594 } 595 --count; 596 } 597 mtx_unlock(&mntvnode_mtx); 598 return done; 599 } 600 601 /* 602 * Attempt to recycle vnodes in a context that is always safe to block. 603 * Calling vlrurecycle() from the bowels of file system code has some 604 * interesting deadlock problems. 605 */ 606 static struct proc *vnlruproc; 607 static int vnlruproc_sig; 608 609 static void 610 vnlru_proc(void) 611 { 612 struct mount *mp, *nmp; 613 int s; 614 int done; 615 struct proc *p = vnlruproc; 616 struct thread *td = FIRST_THREAD_IN_PROC(p); /* XXXKSE */ 617 618 mtx_lock(&Giant); 619 620 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, 621 SHUTDOWN_PRI_FIRST); 622 623 s = splbio(); 624 for (;;) { 625 kthread_suspend_check(p); 626 if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) { 627 vnlruproc_sig = 0; 628 tsleep(vnlruproc, PVFS, "vlruwt", 0); 629 continue; 630 } 631 done = 0; 632 mtx_lock(&mountlist_mtx); 633 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 634 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { 635 nmp = TAILQ_NEXT(mp, mnt_list); 636 continue; 637 } 638 done += vlrureclaim(mp, 10); 639 mtx_lock(&mountlist_mtx); 640 nmp = TAILQ_NEXT(mp, mnt_list); 641 vfs_unbusy(mp, td); 642 } 643 mtx_unlock(&mountlist_mtx); 644 if (done == 0) { 645 #if 0 646 /* These messages are temporary debugging aids */ 647 if (vnlru_nowhere < 5) 648 printf("vnlru process getting nowhere..\n"); 649 else if (vnlru_nowhere == 5) 650 printf("vnlru process messages stopped.\n"); 651 #endif 652 vnlru_nowhere++; 653 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); 654 } 655 } 656 splx(s); 657 } 658 659 static struct kproc_desc vnlru_kp = { 660 "vnlru", 661 vnlru_proc, 662 &vnlruproc 663 }; 664 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp) 665 666 667 /* 668 * Routines having to do with the management of the vnode table. 669 */ 670 671 /* 672 * Return the next vnode from the free list. 673 */ 674 int 675 getnewvnode(tag, mp, vops, vpp) 676 enum vtagtype tag; 677 struct mount *mp; 678 vop_t **vops; 679 struct vnode **vpp; 680 { 681 int s; 682 struct thread *td = curthread; /* XXX */ 683 struct vnode *vp = NULL; 684 struct mount *vnmp; 685 vm_object_t object; 686 687 s = splbio(); 688 /* 689 * Try to reuse vnodes if we hit the max. This situation only 690 * occurs in certain large-memory (2G+) situations. We cannot 691 * attempt to directly reclaim vnodes due to nasty recursion 692 * problems. 
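	 * Instead we wake up the vnlru kernel process below; it calls
	 * vlrureclaim() from its own thread context, where blocking is
	 * always safe.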
693 */ 694 if (vnlruproc_sig == 0 && numvnodes - freevnodes > desiredvnodes) { 695 vnlruproc_sig = 1; /* avoid unnecessary wakeups */ 696 wakeup(vnlruproc); 697 } 698 699 /* 700 * Attempt to reuse a vnode already on the free list, allocating 701 * a new vnode if we can't find one or if we have not reached a 702 * good minimum for good LRU performance. 703 */ 704 705 mtx_lock(&vnode_free_list_mtx); 706 707 if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) { 708 int count; 709 710 for (count = 0; count < freevnodes; count++) { 711 vp = TAILQ_FIRST(&vnode_free_list); 712 if (vp == NULL || vp->v_usecount) 713 panic("getnewvnode: free vnode isn't"); 714 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 715 716 /* 717 * Don't recycle if we still have cached pages or if 718 * we cannot get the interlock. 719 */ 720 if ((VOP_GETVOBJECT(vp, &object) == 0 && 721 (object->resident_page_count || 722 object->ref_count)) || 723 !mtx_trylock(&vp->v_interlock)) { 724 TAILQ_INSERT_TAIL(&vnode_free_list, vp, 725 v_freelist); 726 vp = NULL; 727 continue; 728 } 729 if (LIST_FIRST(&vp->v_cache_src)) { 730 /* 731 * note: nameileafonly sysctl is temporary, 732 * for debugging only, and will eventually be 733 * removed. 734 */ 735 if (nameileafonly > 0) { 736 /* 737 * Do not reuse namei-cached directory 738 * vnodes that have cached 739 * subdirectories. 740 */ 741 if (cache_leaf_test(vp) < 0) { 742 mtx_unlock(&vp->v_interlock); 743 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 744 vp = NULL; 745 continue; 746 } 747 } else if (nameileafonly < 0 || 748 vmiodirenable == 0) { 749 /* 750 * Do not reuse namei-cached directory 751 * vnodes if nameileafonly is -1 or 752 * if VMIO backing for directories is 753 * turned off (otherwise we reuse them 754 * too quickly). 755 */ 756 mtx_unlock(&vp->v_interlock); 757 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 758 vp = NULL; 759 continue; 760 } 761 } 762 /* 763 * Skip over it if its filesystem is being suspended. 
764 */ 765 if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0) 766 break; 767 mtx_unlock(&vp->v_interlock); 768 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 769 vp = NULL; 770 } 771 } 772 if (vp) { 773 vp->v_flag |= VDOOMED; 774 vp->v_flag &= ~VFREE; 775 freevnodes--; 776 mtx_unlock(&vnode_free_list_mtx); 777 cache_purge(vp); 778 if (vp->v_type != VBAD) { 779 vgonel(vp, td); 780 } else { 781 mtx_unlock(&vp->v_interlock); 782 } 783 vn_finished_write(vnmp); 784 785 #ifdef INVARIANTS 786 { 787 int s; 788 789 if (vp->v_data) 790 panic("cleaned vnode isn't"); 791 s = splbio(); 792 if (vp->v_numoutput) 793 panic("Clean vnode has pending I/O's"); 794 splx(s); 795 if (vp->v_writecount != 0) 796 panic("Non-zero write count"); 797 } 798 #endif 799 if (vp->v_pollinfo) { 800 mtx_destroy(&vp->v_pollinfo->vpi_lock); 801 uma_zfree(vnodepoll_zone, vp->v_pollinfo); 802 } 803 vp->v_pollinfo = NULL; 804 vp->v_flag = 0; 805 vp->v_lastw = 0; 806 vp->v_lasta = 0; 807 vp->v_cstart = 0; 808 vp->v_clen = 0; 809 vp->v_socket = 0; 810 } else { 811 mtx_unlock(&vnode_free_list_mtx); 812 vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK); 813 bzero((char *) vp, sizeof *vp); 814 mtx_init(&vp->v_interlock, "vnode interlock", MTX_DEF); 815 vp->v_dd = vp; 816 cache_purge(vp); 817 LIST_INIT(&vp->v_cache_src); 818 TAILQ_INIT(&vp->v_cache_dst); 819 numvnodes++; 820 } 821 822 TAILQ_INIT(&vp->v_cleanblkhd); 823 TAILQ_INIT(&vp->v_dirtyblkhd); 824 vp->v_type = VNON; 825 vp->v_tag = tag; 826 vp->v_op = vops; 827 lockinit(&vp->v_lock, PVFS, "vnlock", VLKTIMEOUT, LK_NOPAUSE); 828 insmntque(vp, mp); 829 *vpp = vp; 830 vp->v_usecount = 1; 831 vp->v_data = 0; 832 833 splx(s); 834 835 vfs_object_create(vp, td, td->td_ucred); 836 837 #if 0 838 vnodeallocs++; 839 if (vnodeallocs % vnoderecycleperiod == 0 && 840 freevnodes < vnoderecycleminfreevn && 841 vnoderecyclemintotalvn < numvnodes) { 842 /* Recycle vnodes. */ 843 cache_purgeleafdirs(vnoderecyclenumber); 844 } 845 #endif 846 847 return (0); 848 } 849 850 /* 851 * Move a vnode from one mount queue to another. 852 */ 853 static void 854 insmntque(vp, mp) 855 register struct vnode *vp; 856 register struct mount *mp; 857 { 858 859 mtx_lock(&mntvnode_mtx); 860 /* 861 * Delete from old mount point vnode list, if on one. 862 */ 863 if (vp->v_mount != NULL) 864 TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); 865 /* 866 * Insert into list of vnodes for the new mount point, if available. 867 */ 868 if ((vp->v_mount = mp) == NULL) { 869 mtx_unlock(&mntvnode_mtx); 870 return; 871 } 872 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 873 mtx_unlock(&mntvnode_mtx); 874 } 875 876 /* 877 * Update outstanding I/O count and do wakeup if requested. 878 */ 879 void 880 vwakeup(bp) 881 register struct buf *bp; 882 { 883 register struct vnode *vp; 884 885 bp->b_flags &= ~B_WRITEINPROG; 886 if ((vp = bp->b_vp)) { 887 vp->v_numoutput--; 888 if (vp->v_numoutput < 0) 889 panic("vwakeup: neg numoutput"); 890 if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { 891 vp->v_flag &= ~VBWAIT; 892 wakeup((caddr_t) &vp->v_numoutput); 893 } 894 } 895 } 896 897 /* 898 * Flush out and invalidate all buffers associated with a vnode. 899 * Called with the underlying object locked. 
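 * If V_SAVE is set in `flags', delayed-write buffers are written out (and
 * their completion awaited) before the purge; otherwise dirty buffers are
 * simply invalidated and discarded.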
900 */ 901 int 902 vinvalbuf(vp, flags, cred, td, slpflag, slptimeo) 903 register struct vnode *vp; 904 int flags; 905 struct ucred *cred; 906 struct thread *td; 907 int slpflag, slptimeo; 908 { 909 register struct buf *bp; 910 struct buf *nbp, *blist; 911 int s, error; 912 vm_object_t object; 913 914 GIANT_REQUIRED; 915 916 if (flags & V_SAVE) { 917 s = splbio(); 918 while (vp->v_numoutput) { 919 vp->v_flag |= VBWAIT; 920 error = tsleep((caddr_t)&vp->v_numoutput, 921 slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo); 922 if (error) { 923 splx(s); 924 return (error); 925 } 926 } 927 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { 928 splx(s); 929 if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0) 930 return (error); 931 s = splbio(); 932 if (vp->v_numoutput > 0 || 933 !TAILQ_EMPTY(&vp->v_dirtyblkhd)) 934 panic("vinvalbuf: dirty bufs"); 935 } 936 splx(s); 937 } 938 s = splbio(); 939 for (;;) { 940 blist = TAILQ_FIRST(&vp->v_cleanblkhd); 941 if (!blist) 942 blist = TAILQ_FIRST(&vp->v_dirtyblkhd); 943 if (!blist) 944 break; 945 946 for (bp = blist; bp; bp = nbp) { 947 nbp = TAILQ_NEXT(bp, b_vnbufs); 948 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 949 error = BUF_TIMELOCK(bp, 950 LK_EXCLUSIVE | LK_SLEEPFAIL, 951 "vinvalbuf", slpflag, slptimeo); 952 if (error == ENOLCK) 953 break; 954 splx(s); 955 return (error); 956 } 957 /* 958 * XXX Since there are no node locks for NFS, I 959 * believe there is a slight chance that a delayed 960 * write will occur while sleeping just above, so 961 * check for it. Note that vfs_bio_awrite expects 962 * buffers to reside on a queue, while BUF_WRITE and 963 * brelse do not. 964 */ 965 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 966 (flags & V_SAVE)) { 967 968 if (bp->b_vp == vp) { 969 if (bp->b_flags & B_CLUSTEROK) { 970 BUF_UNLOCK(bp); 971 vfs_bio_awrite(bp); 972 } else { 973 bremfree(bp); 974 bp->b_flags |= B_ASYNC; 975 BUF_WRITE(bp); 976 } 977 } else { 978 bremfree(bp); 979 (void) BUF_WRITE(bp); 980 } 981 break; 982 } 983 bremfree(bp); 984 bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF); 985 bp->b_flags &= ~B_ASYNC; 986 brelse(bp); 987 } 988 } 989 990 /* 991 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 992 * have write I/O in-progress but if there is a VM object then the 993 * VM object can also have read-I/O in-progress. 994 */ 995 do { 996 while (vp->v_numoutput > 0) { 997 vp->v_flag |= VBWAIT; 998 tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); 999 } 1000 if (VOP_GETVOBJECT(vp, &object) == 0) { 1001 while (object->paging_in_progress) 1002 vm_object_pip_sleep(object, "vnvlbx"); 1003 } 1004 } while (vp->v_numoutput > 0); 1005 1006 splx(s); 1007 1008 /* 1009 * Destroy the copy in the VM cache, too. 1010 */ 1011 mtx_lock(&vp->v_interlock); 1012 if (VOP_GETVOBJECT(vp, &object) == 0) { 1013 vm_object_page_remove(object, 0, 0, 1014 (flags & V_SAVE) ? TRUE : FALSE); 1015 } 1016 mtx_unlock(&vp->v_interlock); 1017 1018 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd)) 1019 panic("vinvalbuf: flush failed"); 1020 return (0); 1021 } 1022 1023 /* 1024 * Truncate a file's buffer and pages to a specified length. This 1025 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 1026 * sync activity. 
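 * The cut-off is rounded up to the next logical block: for example
 * (hypothetical numbers), truncating to length 20000 with a 16384-byte
 * block size gives trunclbn = 2, so buffers for logical blocks 2 and
 * beyond are invalidated while blocks 0 and 1 are kept.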
1027 */ 1028 int 1029 vtruncbuf(vp, cred, td, length, blksize) 1030 register struct vnode *vp; 1031 struct ucred *cred; 1032 struct thread *td; 1033 off_t length; 1034 int blksize; 1035 { 1036 register struct buf *bp; 1037 struct buf *nbp; 1038 int s, anyfreed; 1039 int trunclbn; 1040 1041 /* 1042 * Round up to the *next* lbn. 1043 */ 1044 trunclbn = (length + blksize - 1) / blksize; 1045 1046 s = splbio(); 1047 restart: 1048 anyfreed = 1; 1049 for (;anyfreed;) { 1050 anyfreed = 0; 1051 for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 1052 nbp = TAILQ_NEXT(bp, b_vnbufs); 1053 if (bp->b_lblkno >= trunclbn) { 1054 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 1055 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); 1056 goto restart; 1057 } else { 1058 bremfree(bp); 1059 bp->b_flags |= (B_INVAL | B_RELBUF); 1060 bp->b_flags &= ~B_ASYNC; 1061 brelse(bp); 1062 anyfreed = 1; 1063 } 1064 if (nbp && 1065 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 1066 (nbp->b_vp != vp) || 1067 (nbp->b_flags & B_DELWRI))) { 1068 goto restart; 1069 } 1070 } 1071 } 1072 1073 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 1074 nbp = TAILQ_NEXT(bp, b_vnbufs); 1075 if (bp->b_lblkno >= trunclbn) { 1076 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 1077 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); 1078 goto restart; 1079 } else { 1080 bremfree(bp); 1081 bp->b_flags |= (B_INVAL | B_RELBUF); 1082 bp->b_flags &= ~B_ASYNC; 1083 brelse(bp); 1084 anyfreed = 1; 1085 } 1086 if (nbp && 1087 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 1088 (nbp->b_vp != vp) || 1089 (nbp->b_flags & B_DELWRI) == 0)) { 1090 goto restart; 1091 } 1092 } 1093 } 1094 } 1095 1096 if (length > 0) { 1097 restartsync: 1098 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 1099 nbp = TAILQ_NEXT(bp, b_vnbufs); 1100 if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) { 1101 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 1102 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); 1103 goto restart; 1104 } else { 1105 bremfree(bp); 1106 if (bp->b_vp == vp) { 1107 bp->b_flags |= B_ASYNC; 1108 } else { 1109 bp->b_flags &= ~B_ASYNC; 1110 } 1111 BUF_WRITE(bp); 1112 } 1113 goto restartsync; 1114 } 1115 1116 } 1117 } 1118 1119 while (vp->v_numoutput > 0) { 1120 vp->v_flag |= VBWAIT; 1121 tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0); 1122 } 1123 1124 splx(s); 1125 1126 vnode_pager_setsize(vp, length); 1127 1128 return (0); 1129 } 1130 1131 /* 1132 * Associate a buffer with a vnode. 1133 */ 1134 void 1135 bgetvp(vp, bp) 1136 register struct vnode *vp; 1137 register struct buf *bp; 1138 { 1139 int s; 1140 1141 KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); 1142 1143 vhold(vp); 1144 bp->b_vp = vp; 1145 bp->b_dev = vn_todev(vp); 1146 /* 1147 * Insert onto list for new vnode. 1148 */ 1149 s = splbio(); 1150 bp->b_xflags |= BX_VNCLEAN; 1151 bp->b_xflags &= ~BX_VNDIRTY; 1152 TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs); 1153 splx(s); 1154 } 1155 1156 /* 1157 * Disassociate a buffer from a vnode. 1158 */ 1159 void 1160 brelvp(bp) 1161 register struct buf *bp; 1162 { 1163 struct vnode *vp; 1164 struct buflists *listheadp; 1165 int s; 1166 1167 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 1168 1169 /* 1170 * Delete from old vnode list, if on one. 
1171 */ 1172 vp = bp->b_vp; 1173 s = splbio(); 1174 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { 1175 if (bp->b_xflags & BX_VNDIRTY) 1176 listheadp = &vp->v_dirtyblkhd; 1177 else 1178 listheadp = &vp->v_cleanblkhd; 1179 TAILQ_REMOVE(listheadp, bp, b_vnbufs); 1180 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 1181 } 1182 if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) { 1183 vp->v_flag &= ~VONWORKLST; 1184 LIST_REMOVE(vp, v_synclist); 1185 } 1186 splx(s); 1187 bp->b_vp = (struct vnode *) 0; 1188 vdrop(vp); 1189 } 1190 1191 /* 1192 * Add an item to the syncer work queue. 1193 */ 1194 static void 1195 vn_syncer_add_to_worklist(struct vnode *vp, int delay) 1196 { 1197 int s, slot; 1198 1199 s = splbio(); 1200 1201 if (vp->v_flag & VONWORKLST) { 1202 LIST_REMOVE(vp, v_synclist); 1203 } 1204 1205 if (delay > syncer_maxdelay - 2) 1206 delay = syncer_maxdelay - 2; 1207 slot = (syncer_delayno + delay) & syncer_mask; 1208 1209 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist); 1210 vp->v_flag |= VONWORKLST; 1211 splx(s); 1212 } 1213 1214 struct proc *updateproc; 1215 static void sched_sync(void); 1216 static struct kproc_desc up_kp = { 1217 "syncer", 1218 sched_sync, 1219 &updateproc 1220 }; 1221 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) 1222 1223 /* 1224 * System filesystem synchronizer daemon. 1225 */ 1226 void 1227 sched_sync(void) 1228 { 1229 struct synclist *slp; 1230 struct vnode *vp; 1231 struct mount *mp; 1232 long starttime; 1233 int s; 1234 struct thread *td = FIRST_THREAD_IN_PROC(updateproc); /* XXXKSE */ 1235 1236 mtx_lock(&Giant); 1237 1238 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, td->td_proc, 1239 SHUTDOWN_PRI_LAST); 1240 1241 for (;;) { 1242 kthread_suspend_check(td->td_proc); 1243 1244 starttime = time_second; 1245 1246 /* 1247 * Push files whose dirty time has expired. Be careful 1248 * of interrupt race on slp queue. 1249 */ 1250 s = splbio(); 1251 slp = &syncer_workitem_pending[syncer_delayno]; 1252 syncer_delayno += 1; 1253 if (syncer_delayno == syncer_maxdelay) 1254 syncer_delayno = 0; 1255 splx(s); 1256 1257 while ((vp = LIST_FIRST(slp)) != NULL) { 1258 if (VOP_ISLOCKED(vp, NULL) == 0 && 1259 vn_start_write(vp, &mp, V_NOWAIT) == 0) { 1260 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 1261 (void) VOP_FSYNC(vp, td->td_ucred, MNT_LAZY, td); 1262 VOP_UNLOCK(vp, 0, td); 1263 vn_finished_write(mp); 1264 } 1265 s = splbio(); 1266 if (LIST_FIRST(slp) == vp) { 1267 /* 1268 * Note: v_tag VT_VFS vps can remain on the 1269 * worklist too with no dirty blocks, but 1270 * since sync_fsync() moves it to a different 1271 * slot we are safe. 1272 */ 1273 if (TAILQ_EMPTY(&vp->v_dirtyblkhd) && 1274 !vn_isdisk(vp, NULL)) 1275 panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag); 1276 /* 1277 * Put us back on the worklist. The worklist 1278 * routine will remove us from our current 1279 * position and then add us back in at a later 1280 * position. 1281 */ 1282 vn_syncer_add_to_worklist(vp, syncdelay); 1283 } 1284 splx(s); 1285 } 1286 1287 /* 1288 * Do soft update processing. 1289 */ 1290 #ifdef SOFTUPDATES 1291 softdep_process_worklist(NULL); 1292 #endif 1293 1294 /* 1295 * The variable rushjob allows the kernel to speed up the 1296 * processing of the filesystem syncer process. A rushjob 1297 * value of N tells the filesystem syncer to process the next 1298 * N seconds worth of work on its queue ASAP. 
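		 * (For example, a rushjob of three makes the syncer run
		 * three extra buckets back to back, skipping the one-second
		 * sleep in between.)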
Currently rushjob 1299 * is used by the soft update code to speed up the filesystem 1300 * syncer process when the incore state is getting so far 1301 * ahead of the disk that the kernel memory pool is being 1302 * threatened with exhaustion. 1303 */ 1304 if (rushjob > 0) { 1305 rushjob -= 1; 1306 continue; 1307 } 1308 /* 1309 * If it has taken us less than a second to process the 1310 * current work, then wait. Otherwise start right over 1311 * again. We can still lose time if any single round 1312 * takes more than two seconds, but it does not really 1313 * matter as we are just trying to generally pace the 1314 * filesystem activity. 1315 */ 1316 if (time_second == starttime) 1317 tsleep(&lbolt, PPAUSE, "syncer", 0); 1318 } 1319 } 1320 1321 /* 1322 * Request the syncer daemon to speed up its work. 1323 * We never push it to speed up more than half of its 1324 * normal turn time, otherwise it could take over the cpu. 1325 * XXXKSE only one update? 1326 */ 1327 int 1328 speedup_syncer() 1329 { 1330 1331 mtx_lock_spin(&sched_lock); 1332 if (FIRST_THREAD_IN_PROC(updateproc)->td_wchan == &lbolt) /* XXXKSE */ 1333 setrunnable(FIRST_THREAD_IN_PROC(updateproc)); 1334 mtx_unlock_spin(&sched_lock); 1335 if (rushjob < syncdelay / 2) { 1336 rushjob += 1; 1337 stat_rush_requests += 1; 1338 return (1); 1339 } 1340 return(0); 1341 } 1342 1343 /* 1344 * Associate a p-buffer with a vnode. 1345 * 1346 * Also sets B_PAGING flag to indicate that vnode is not fully associated 1347 * with the buffer. i.e. the bp has not been linked into the vnode or 1348 * ref-counted. 1349 */ 1350 void 1351 pbgetvp(vp, bp) 1352 register struct vnode *vp; 1353 register struct buf *bp; 1354 { 1355 1356 KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); 1357 1358 bp->b_vp = vp; 1359 bp->b_flags |= B_PAGING; 1360 bp->b_dev = vn_todev(vp); 1361 } 1362 1363 /* 1364 * Disassociate a p-buffer from a vnode. 1365 */ 1366 void 1367 pbrelvp(bp) 1368 register struct buf *bp; 1369 { 1370 1371 KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); 1372 1373 /* XXX REMOVE ME */ 1374 if (TAILQ_NEXT(bp, b_vnbufs) != NULL) { 1375 panic( 1376 "relpbuf(): b_vp was probably reassignbuf()d %p %x", 1377 bp, 1378 (int)bp->b_flags 1379 ); 1380 } 1381 bp->b_vp = (struct vnode *) 0; 1382 bp->b_flags &= ~B_PAGING; 1383 } 1384 1385 /* 1386 * Change the vnode a pager buffer is associated with. 1387 */ 1388 void 1389 pbreassignbuf(bp, newvp) 1390 struct buf *bp; 1391 struct vnode *newvp; 1392 { 1393 1394 KASSERT(bp->b_flags & B_PAGING, 1395 ("pbreassignbuf() on non phys bp %p", bp)); 1396 bp->b_vp = newvp; 1397 } 1398 1399 /* 1400 * Reassign a buffer from one vnode to another. 1401 * Used to assign file specific control information 1402 * (indirect blocks) to the vnode to which they belong. 1403 */ 1404 void 1405 reassignbuf(bp, newvp) 1406 register struct buf *bp; 1407 register struct vnode *newvp; 1408 { 1409 struct buflists *listheadp; 1410 int delay; 1411 int s; 1412 1413 if (newvp == NULL) { 1414 printf("reassignbuf: NULL"); 1415 return; 1416 } 1417 ++reassignbufcalls; 1418 1419 /* 1420 * B_PAGING flagged buffers cannot be reassigned because their vp 1421 * is not fully linked in. 1422 */ 1423 if (bp->b_flags & B_PAGING) 1424 panic("cannot reassign paging buffer"); 1425 1426 s = splbio(); 1427 /* 1428 * Delete from old vnode list, if on one. 
1429 */ 1430 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { 1431 if (bp->b_xflags & BX_VNDIRTY) 1432 listheadp = &bp->b_vp->v_dirtyblkhd; 1433 else 1434 listheadp = &bp->b_vp->v_cleanblkhd; 1435 TAILQ_REMOVE(listheadp, bp, b_vnbufs); 1436 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 1437 if (bp->b_vp != newvp) { 1438 vdrop(bp->b_vp); 1439 bp->b_vp = NULL; /* for clarification */ 1440 } 1441 } 1442 /* 1443 * If dirty, put on list of dirty buffers; otherwise insert onto list 1444 * of clean buffers. 1445 */ 1446 if (bp->b_flags & B_DELWRI) { 1447 struct buf *tbp; 1448 1449 listheadp = &newvp->v_dirtyblkhd; 1450 if ((newvp->v_flag & VONWORKLST) == 0) { 1451 switch (newvp->v_type) { 1452 case VDIR: 1453 delay = dirdelay; 1454 break; 1455 case VCHR: 1456 if (newvp->v_rdev->si_mountpoint != NULL) { 1457 delay = metadelay; 1458 break; 1459 } 1460 /* fall through */ 1461 default: 1462 delay = filedelay; 1463 } 1464 vn_syncer_add_to_worklist(newvp, delay); 1465 } 1466 bp->b_xflags |= BX_VNDIRTY; 1467 tbp = TAILQ_FIRST(listheadp); 1468 if (tbp == NULL || 1469 bp->b_lblkno == 0 || 1470 (bp->b_lblkno > 0 && tbp->b_lblkno < 0) || 1471 (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) { 1472 TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); 1473 ++reassignbufsortgood; 1474 } else if (bp->b_lblkno < 0) { 1475 TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); 1476 ++reassignbufsortgood; 1477 } else if (reassignbufmethod == 1) { 1478 /* 1479 * New sorting algorithm, only handle sequential case, 1480 * otherwise append to end (but before metadata) 1481 */ 1482 if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL && 1483 (tbp->b_xflags & BX_VNDIRTY)) { 1484 /* 1485 * Found the best place to insert the buffer 1486 */ 1487 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); 1488 ++reassignbufsortgood; 1489 } else { 1490 /* 1491 * Missed, append to end, but before meta-data. 1492 * We know that the head buffer in the list is 1493 * not meta-data due to prior conditionals. 1494 * 1495 * Indirect effects: NFS second stage write 1496 * tends to wind up here, giving maximum 1497 * distance between the unstable write and the 1498 * commit rpc. 1499 */ 1500 tbp = TAILQ_LAST(listheadp, buflists); 1501 while (tbp && tbp->b_lblkno < 0) 1502 tbp = TAILQ_PREV(tbp, buflists, b_vnbufs); 1503 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); 1504 ++reassignbufsortbad; 1505 } 1506 } else { 1507 /* 1508 * Old sorting algorithm, scan queue and insert 1509 */ 1510 struct buf *ttbp; 1511 while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && 1512 (ttbp->b_lblkno < bp->b_lblkno)) { 1513 ++reassignbufloops; 1514 tbp = ttbp; 1515 } 1516 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); 1517 } 1518 } else { 1519 bp->b_xflags |= BX_VNCLEAN; 1520 TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); 1521 if ((newvp->v_flag & VONWORKLST) && 1522 TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { 1523 newvp->v_flag &= ~VONWORKLST; 1524 LIST_REMOVE(newvp, v_synclist); 1525 } 1526 } 1527 if (bp->b_vp != newvp) { 1528 bp->b_vp = newvp; 1529 vhold(bp->b_vp); 1530 } 1531 splx(s); 1532 } 1533 1534 /* 1535 * Create a vnode for a device. 1536 * Used for mounting the root file system. 
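 * On success the new vnode comes back from getnewvnode() with a single
 * reference, is given type VCHR and is entered on the device's alias list
 * via addalias().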
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	if (vfinddev(dev, VCHR, vpp))
		return (0);
	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VCHR;
	addalias(vp, dev);
	*vpp = vp;
	return (0);
}

/*
 * Add vnode to the alias list hung off the dev_t.
 *
 * The reason for this gunk is that multiple vnodes can reference
 * the same physical device, so checking vp->v_usecount to see
 * how many users there are is inadequate; the v_usecounts of all
 * the vnodes need to be accumulated.  vcount() does that.
 */
struct vnode *
addaliasu(nvp, nvp_rdev)
	struct vnode *nvp;
	udev_t nvp_rdev;
{
	struct vnode *ovp;
	vop_t **ops;
	dev_t dev;

	if (nvp->v_type == VBLK)
		return (nvp);
	if (nvp->v_type != VCHR)
		panic("addaliasu on non-special vnode");
	dev = udev2dev(nvp_rdev, 0);
	/*
	 * Check to see if we have a bdevvp vnode with no associated
	 * filesystem.  If so, we want to associate the filesystem of
	 * the newly created vnode with the bdevvp vnode and discard
	 * the new vnode rather than leaving the bdevvp vnode lying
	 * around with no associated filesystem.
	 */
	if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
		addalias(nvp, dev);
		return (nvp);
	}
	/*
	 * Discard unneeded vnode, but save its node specific data.
	 * Note that if there is a lock, it is carried over in the
	 * node specific data to the replacement vnode.
	 */
	vref(ovp);
	ovp->v_data = nvp->v_data;
	ovp->v_tag = nvp->v_tag;
	nvp->v_data = NULL;
	lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg,
	    nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK);
	if (nvp->v_vnlock)
		ovp->v_vnlock = &ovp->v_lock;
	ops = ovp->v_op;
	ovp->v_op = nvp->v_op;
	if (VOP_ISLOCKED(nvp, curthread)) {
		VOP_UNLOCK(nvp, 0, curthread);
		vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curthread);
	}
	nvp->v_op = ops;
	insmntque(ovp, nvp->v_mount);
	vrele(nvp);
	vgone(nvp);
	return (ovp);
}

/*
 * This is a local helper function that does the same as addaliasu(), but
 * for a dev_t instead of a udev_t.
 */
static void
addalias(nvp, dev)
	struct vnode *nvp;
	dev_t dev;
{

	KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
	nvp->v_rdev = dev;
	mtx_lock(&spechash_mtx);
	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
	mtx_unlock(&spechash_mtx);
}
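/*
 * Illustrative sketch (not part of the original code) of the reference and
 * lock discipline implemented by vget()/vput() below: take a locked
 * reference, operate on the vnode, then drop the lock and reference in one
 * call.  The function name and attribute variable are hypothetical.
 */
#if 0
static int
example_getattr(struct vnode *vp, struct vattr *vap, struct thread *td)
{
	int error;

	/* Obtain a reference and an exclusive vnode lock in one step. */
	error = vget(vp, LK_EXCLUSIVE, td);
	if (error)
		return (error);
	error = VOP_GETATTR(vp, vap, td->td_ucred, td);
	/* vput() unlocks the vnode and releases the reference. */
	vput(vp);
	return (error);
}
#endif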
/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it.  The vnode lock bit is set if the
 * vnode is being eliminated in vgone.  The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, td)
	register struct vnode *vp;
	int flags;
	struct thread *td;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0)
		mtx_lock(&vp->v_interlock);
	if (vp->v_flag & VXLOCK) {
		if (vp->v_vxproc == curthread) {
#if 0
			/* this can now occur in normal operation */
			log(LOG_INFO, "VXLOCK interlock avoided\n");
#endif
		} else {
			vp->v_flag |= VXWANT;
			msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
			    "vget", 0);
			return (ENOENT);
		}
	}

	vp->v_usecount++;

	if (VSHOULDBUSY(vp))
		vbusy(vp);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active.  We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			mtx_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (VSHOULDFREE(vp))
				vfree(vp);
			else
				vlruvp(vp);
			mtx_unlock(&vp->v_interlock);
		}
		return (error);
	}
	mtx_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Increase the reference count of a vnode.
 */
void
vref(struct vnode *vp)
{
	mtx_lock(&vp->v_interlock);
	vp->v_usecount++;
	mtx_unlock(&vp->v_interlock);
}

/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct thread *td = curthread;	/* XXX */

	KASSERT(vp != NULL, ("vrele: null vp"));

	mtx_lock(&vp->v_interlock);

	/* Skip this v_writecount check if we're going to panic below. */
	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
	    ("vrele: missed vn_close"));

	if (vp->v_usecount > 1) {
		vp->v_usecount--;
		mtx_unlock(&vp->v_interlock);
		return;
	}

	if (vp->v_usecount == 1) {
		vp->v_usecount--;
		/*
		 * We must call VOP_INACTIVE with the node locked.
		 * If we are doing a vput, the node is already locked,
		 * but, in the case of vrele, we must explicitly lock
		 * the vnode before calling VOP_INACTIVE.
		 */
		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0)
			VOP_INACTIVE(vp, td);
		if (VSHOULDFREE(vp))
			vfree(vp);
		else
			vlruvp(vp);

	} else {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
		mtx_unlock(&vp->v_interlock);
#endif
		panic("vrele: negative ref cnt");
	}
}

/*
 * Release an already locked vnode.  This gives the same effect as
 * unlock+vrele(), but takes less time and avoids releasing and
 * re-acquiring the lock (as vrele() acquires the lock internally).
 */
void
vput(vp)
	struct vnode *vp;
{
	struct thread *td = curthread;	/* XXX */

	GIANT_REQUIRED;

	KASSERT(vp != NULL, ("vput: null vp"));
	mtx_lock(&vp->v_interlock);
	/* Skip this v_writecount check if we're going to panic below.
*/ 1782 KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, 1783 ("vput: missed vn_close")); 1784 1785 if (vp->v_usecount > 1) { 1786 vp->v_usecount--; 1787 VOP_UNLOCK(vp, LK_INTERLOCK, td); 1788 return; 1789 } 1790 1791 if (vp->v_usecount == 1) { 1792 vp->v_usecount--; 1793 /* 1794 * We must call VOP_INACTIVE with the node locked. 1795 * If we are doing a vput, the node is already locked, 1796 * so we just need to release the vnode mutex. 1797 */ 1798 mtx_unlock(&vp->v_interlock); 1799 VOP_INACTIVE(vp, td); 1800 if (VSHOULDFREE(vp)) 1801 vfree(vp); 1802 else 1803 vlruvp(vp); 1804 1805 } else { 1806 #ifdef DIAGNOSTIC 1807 vprint("vput: negative ref count", vp); 1808 #endif 1809 panic("vput: negative ref cnt"); 1810 } 1811 } 1812 1813 /* 1814 * Somebody doesn't want the vnode recycled. 1815 */ 1816 void 1817 vhold(vp) 1818 register struct vnode *vp; 1819 { 1820 int s; 1821 1822 s = splbio(); 1823 vp->v_holdcnt++; 1824 if (VSHOULDBUSY(vp)) 1825 vbusy(vp); 1826 splx(s); 1827 } 1828 1829 /* 1830 * Note that there is one less who cares about this vnode. vdrop() is the 1831 * opposite of vhold(). 1832 */ 1833 void 1834 vdrop(vp) 1835 register struct vnode *vp; 1836 { 1837 int s; 1838 1839 s = splbio(); 1840 if (vp->v_holdcnt <= 0) 1841 panic("vdrop: holdcnt"); 1842 vp->v_holdcnt--; 1843 if (VSHOULDFREE(vp)) 1844 vfree(vp); 1845 else 1846 vlruvp(vp); 1847 splx(s); 1848 } 1849 1850 /* 1851 * Remove any vnodes in the vnode table belonging to mount point mp. 1852 * 1853 * If FORCECLOSE is not specified, there should not be any active ones, 1854 * return error if any are found (nb: this is a user error, not a 1855 * system error). If FORCECLOSE is specified, detach any active vnodes 1856 * that are found. 1857 * 1858 * If WRITECLOSE is set, only flush out regular file vnodes open for 1859 * writing. 1860 * 1861 * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped. 1862 * 1863 * `rootrefs' specifies the base reference count for the root vnode 1864 * of this filesystem. The root vnode is considered busy if its 1865 * v_usecount exceeds this value. On a successful return, vflush() 1866 * will call vrele() on the root vnode exactly rootrefs times. 1867 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 1868 * be zero. 1869 */ 1870 #ifdef DIAGNOSTIC 1871 static int busyprt = 0; /* print out busy vnodes */ 1872 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); 1873 #endif 1874 1875 int 1876 vflush(mp, rootrefs, flags) 1877 struct mount *mp; 1878 int rootrefs; 1879 int flags; 1880 { 1881 struct thread *td = curthread; /* XXX */ 1882 struct vnode *vp, *nvp, *rootvp = NULL; 1883 struct vattr vattr; 1884 int busy = 0, error; 1885 1886 if (rootrefs > 0) { 1887 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 1888 ("vflush: bad args")); 1889 /* 1890 * Get the filesystem root vnode. We can vput() it 1891 * immediately, since with rootrefs > 0, it won't go away. 1892 */ 1893 if ((error = VFS_ROOT(mp, &rootvp)) != 0) 1894 return (error); 1895 vput(rootvp); 1896 } 1897 mtx_lock(&mntvnode_mtx); 1898 loop: 1899 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) { 1900 /* 1901 * Make sure this vnode wasn't reclaimed in getnewvnode(). 1902 * Start over if it has (it won't be on the list anymore). 1903 */ 1904 if (vp->v_mount != mp) 1905 goto loop; 1906 nvp = TAILQ_NEXT(vp, v_nmntvnodes); 1907 1908 mtx_unlock(&mntvnode_mtx); 1909 mtx_lock(&vp->v_interlock); 1910 /* 1911 * Skip over a vnodes marked VSYSTEM. 
1912 */ 1913 if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { 1914 mtx_unlock(&vp->v_interlock); 1915 mtx_lock(&mntvnode_mtx); 1916 continue; 1917 } 1918 /* 1919 * If WRITECLOSE is set, flush out unlinked but still open 1920 * files (even if open only for reading) and regular file 1921 * vnodes open for writing. 1922 */ 1923 if ((flags & WRITECLOSE) && 1924 (vp->v_type == VNON || 1925 (VOP_GETATTR(vp, &vattr, td->td_ucred, td) == 0 && 1926 vattr.va_nlink > 0)) && 1927 (vp->v_writecount == 0 || vp->v_type != VREG)) { 1928 mtx_unlock(&vp->v_interlock); 1929 mtx_lock(&mntvnode_mtx); 1930 continue; 1931 } 1932 1933 /* 1934 * With v_usecount == 0, all we need to do is clear out the 1935 * vnode data structures and we are done. 1936 */ 1937 if (vp->v_usecount == 0) { 1938 vgonel(vp, td); 1939 mtx_lock(&mntvnode_mtx); 1940 continue; 1941 } 1942 1943 /* 1944 * If FORCECLOSE is set, forcibly close the vnode. For block 1945 * or character devices, revert to an anonymous device. For 1946 * all other files, just kill them. 1947 */ 1948 if (flags & FORCECLOSE) { 1949 if (vp->v_type != VCHR) { 1950 vgonel(vp, td); 1951 } else { 1952 vclean(vp, 0, td); 1953 vp->v_op = spec_vnodeop_p; 1954 insmntque(vp, (struct mount *) 0); 1955 } 1956 mtx_lock(&mntvnode_mtx); 1957 continue; 1958 } 1959 #ifdef DIAGNOSTIC 1960 if (busyprt) 1961 vprint("vflush: busy vnode", vp); 1962 #endif 1963 mtx_unlock(&vp->v_interlock); 1964 mtx_lock(&mntvnode_mtx); 1965 busy++; 1966 } 1967 mtx_unlock(&mntvnode_mtx); 1968 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 1969 /* 1970 * If just the root vnode is busy, and if its refcount 1971 * is equal to `rootrefs', then go ahead and kill it. 1972 */ 1973 mtx_lock(&rootvp->v_interlock); 1974 KASSERT(busy > 0, ("vflush: not busy")); 1975 KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs")); 1976 if (busy == 1 && rootvp->v_usecount == rootrefs) { 1977 vgonel(rootvp, td); 1978 busy = 0; 1979 } else 1980 mtx_unlock(&rootvp->v_interlock); 1981 } 1982 if (busy) 1983 return (EBUSY); 1984 for (; rootrefs > 0; rootrefs--) 1985 vrele(rootvp); 1986 return (0); 1987 } 1988 1989 /* 1990 * This moves a now (likely recyclable) vnode to the end of the 1991 * mountlist. XXX However, it is temporarily disabled until we 1992 * can clean up ffs_sync() and friends, which have loop restart 1993 * conditions which this code causes to operate O(N^2). 1994 */ 1995 static void 1996 vlruvp(struct vnode *vp) 1997 { 1998 #if 0 1999 struct mount *mp; 2000 2001 if ((mp = vp->v_mount) != NULL) { 2002 mtx_lock(&mntvnode_mtx); 2003 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2004 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2005 mtx_unlock(&mntvnode_mtx); 2006 } 2007 #endif 2008 } 2009 2010 /* 2011 * Disassociate the underlying file system from a vnode. 2012 */ 2013 static void 2014 vclean(vp, flags, td) 2015 struct vnode *vp; 2016 int flags; 2017 struct thread *td; 2018 { 2019 int active; 2020 2021 /* 2022 * Check to see if the vnode is in use. If so we have to reference it 2023 * before we clean it out so that its count cannot fall to zero and 2024 * generate a race against ourselves to recycle it. 2025 */ 2026 if ((active = vp->v_usecount)) 2027 vp->v_usecount++; 2028 2029 /* 2030 * Prevent the vnode from being recycled or brought into use while we 2031 * clean it out. 
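	 * This is done by setting VXLOCK and recording the cleaning thread
	 * in v_vxproc; vget() and vgonel() honor the flag and sleep on
	 * VXWANT until it is cleared at the end of this routine.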
2032 */ 2033 if (vp->v_flag & VXLOCK) 2034 panic("vclean: deadlock"); 2035 vp->v_flag |= VXLOCK; 2036 vp->v_vxproc = curthread; 2037 /* 2038 * Even if the count is zero, the VOP_INACTIVE routine may still 2039 * have the object locked while it cleans it out. The VOP_LOCK 2040 * ensures that the VOP_INACTIVE routine is done with its work. 2041 * For active vnodes, it ensures that no other activity can 2042 * occur while the underlying object is being cleaned out. 2043 */ 2044 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td); 2045 2046 /* 2047 * Clean out any buffers associated with the vnode. 2048 * If the flush fails, just toss the buffers. 2049 */ 2050 if (flags & DOCLOSE) { 2051 if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL) 2052 (void) vn_write_suspend_wait(vp, NULL, V_WAIT); 2053 if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0) 2054 vinvalbuf(vp, 0, NOCRED, td, 0, 0); 2055 } 2056 2057 VOP_DESTROYVOBJECT(vp); 2058 2059 /* 2060 * If purging an active vnode, it must be closed and 2061 * deactivated before being reclaimed. Note that the 2062 * VOP_INACTIVE will unlock the vnode. 2063 */ 2064 if (active) { 2065 if (flags & DOCLOSE) 2066 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 2067 VOP_INACTIVE(vp, td); 2068 } else { 2069 /* 2070 * Any other processes trying to obtain this lock must first 2071 * wait for VXLOCK to clear, then call the new lock operation. 2072 */ 2073 VOP_UNLOCK(vp, 0, td); 2074 } 2075 /* 2076 * Reclaim the vnode. 2077 */ 2078 if (VOP_RECLAIM(vp, td)) 2079 panic("vclean: cannot reclaim"); 2080 2081 if (active) { 2082 /* 2083 * Inline copy of vrele() since VOP_INACTIVE 2084 * has already been called. 2085 */ 2086 mtx_lock(&vp->v_interlock); 2087 if (--vp->v_usecount <= 0) { 2088 #ifdef DIAGNOSTIC 2089 if (vp->v_usecount < 0 || vp->v_writecount != 0) { 2090 vprint("vclean: bad ref count", vp); 2091 panic("vclean: ref cnt"); 2092 } 2093 #endif 2094 vfree(vp); 2095 } 2096 mtx_unlock(&vp->v_interlock); 2097 } 2098 2099 cache_purge(vp); 2100 vp->v_vnlock = NULL; 2101 lockdestroy(&vp->v_lock); 2102 2103 if (VSHOULDFREE(vp)) 2104 vfree(vp); 2105 2106 /* 2107 * Done with purge, notify sleepers of the grim news. 2108 */ 2109 vp->v_op = dead_vnodeop_p; 2110 if (vp->v_pollinfo != NULL) 2111 vn_pollgone(vp); 2112 vp->v_tag = VT_NON; 2113 vp->v_flag &= ~VXLOCK; 2114 vp->v_vxproc = NULL; 2115 if (vp->v_flag & VXWANT) { 2116 vp->v_flag &= ~VXWANT; 2117 wakeup((caddr_t) vp); 2118 } 2119 } 2120 2121 /* 2122 * Eliminate all activity associated with the requested vnode 2123 * and with all vnodes aliased to the requested vnode. 2124 */ 2125 int 2126 vop_revoke(ap) 2127 struct vop_revoke_args /* { 2128 struct vnode *a_vp; 2129 int a_flags; 2130 } */ *ap; 2131 { 2132 struct vnode *vp, *vq; 2133 dev_t dev; 2134 2135 KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke")); 2136 2137 vp = ap->a_vp; 2138 /* 2139 * If a vgone (or vclean) is already in progress, 2140 * wait until it is done and return. 2141 */ 2142 if (vp->v_flag & VXLOCK) { 2143 vp->v_flag |= VXWANT; 2144 msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP, 2145 "vop_revokeall", 0); 2146 return (0); 2147 } 2148 dev = vp->v_rdev; 2149 for (;;) { 2150 mtx_lock(&spechash_mtx); 2151 vq = SLIST_FIRST(&dev->si_hlist); 2152 mtx_unlock(&spechash_mtx); 2153 if (!vq) 2154 break; 2155 vgone(vq); 2156 } 2157 return (0); 2158 } 2159 2160 /* 2161 * Recycle an unused vnode to the front of the free list. 2162 * Release the passed interlock if the vnode will be recycled. 
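 *
 * A minimal usage sketch (the caller is assumed, not taken from a real
 * filesystem): a VOP_INACTIVE routine that finds the underlying file is
 * dead can hand the vnode back right away with
 *
 *         (void) vrecycle(vp, (struct mtx *)0, td);
 *
 * which only calls vgonel() if no new reference has shown up meanwhile.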
2163 */ 2164 int 2165 vrecycle(vp, inter_lkp, td) 2166 struct vnode *vp; 2167 struct mtx *inter_lkp; 2168 struct thread *td; 2169 { 2170 2171 mtx_lock(&vp->v_interlock); 2172 if (vp->v_usecount == 0) { 2173 if (inter_lkp) { 2174 mtx_unlock(inter_lkp); 2175 } 2176 vgonel(vp, td); 2177 return (1); 2178 } 2179 mtx_unlock(&vp->v_interlock); 2180 return (0); 2181 } 2182 2183 /* 2184 * Eliminate all activity associated with a vnode 2185 * in preparation for reuse. 2186 */ 2187 void 2188 vgone(vp) 2189 register struct vnode *vp; 2190 { 2191 struct thread *td = curthread; /* XXX */ 2192 2193 mtx_lock(&vp->v_interlock); 2194 vgonel(vp, td); 2195 } 2196 2197 /* 2198 * vgone, with the vp interlock held. 2199 */ 2200 void 2201 vgonel(vp, td) 2202 struct vnode *vp; 2203 struct thread *td; 2204 { 2205 int s; 2206 2207 /* 2208 * If a vgone (or vclean) is already in progress, 2209 * wait until it is done and return. 2210 */ 2211 if (vp->v_flag & VXLOCK) { 2212 vp->v_flag |= VXWANT; 2213 msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP, 2214 "vgone", 0); 2215 return; 2216 } 2217 2218 /* 2219 * Clean out the filesystem specific data. 2220 */ 2221 vclean(vp, DOCLOSE, td); 2222 mtx_lock(&vp->v_interlock); 2223 2224 /* 2225 * Delete from old mount point vnode list, if on one. 2226 */ 2227 if (vp->v_mount != NULL) 2228 insmntque(vp, (struct mount *)0); 2229 /* 2230 * If special device, remove it from special device alias list 2231 * if it is on one. 2232 */ 2233 if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) { 2234 mtx_lock(&spechash_mtx); 2235 SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext); 2236 freedev(vp->v_rdev); 2237 mtx_unlock(&spechash_mtx); 2238 vp->v_rdev = NULL; 2239 } 2240 2241 /* 2242 * If it is on the freelist and not already at the head, 2243 * move it to the head of the list. The test of the 2244 * VDOOMED flag and the reference count of zero is because 2245 * it will be removed from the free list by getnewvnode, 2246 * but will not have its reference count incremented until 2247 * after calling vgone. If the reference count were 2248 * incremented first, vgone would (incorrectly) try to 2249 * close the previous instance of the underlying object. 2250 */ 2251 if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) { 2252 s = splbio(); 2253 mtx_lock(&vnode_free_list_mtx); 2254 if (vp->v_flag & VFREE) 2255 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 2256 else 2257 freevnodes++; 2258 vp->v_flag |= VFREE; 2259 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 2260 mtx_unlock(&vnode_free_list_mtx); 2261 splx(s); 2262 } 2263 2264 vp->v_type = VBAD; 2265 mtx_unlock(&vp->v_interlock); 2266 } 2267 2268 /* 2269 * Lookup a vnode by device number. 2270 */ 2271 int 2272 vfinddev(dev, type, vpp) 2273 dev_t dev; 2274 enum vtype type; 2275 struct vnode **vpp; 2276 { 2277 struct vnode *vp; 2278 2279 mtx_lock(&spechash_mtx); 2280 SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) { 2281 if (type == vp->v_type) { 2282 *vpp = vp; 2283 mtx_unlock(&spechash_mtx); 2284 return (1); 2285 } 2286 } 2287 mtx_unlock(&spechash_mtx); 2288 return (0); 2289 } 2290 2291 /* 2292 * Calculate the total number of references to a special device. 
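 *
 * Illustrative use (a sketch; real callers differ in detail): code that
 * should only tear down per-device state on the last close of any alias
 * of the device can check
 *
 *         if (vcount(vp) > 1)
 *                 return (0);
 *
 * before doing so.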
2293 */ 2294 int 2295 vcount(vp) 2296 struct vnode *vp; 2297 { 2298 struct vnode *vq; 2299 int count; 2300 2301 count = 0; 2302 mtx_lock(&spechash_mtx); 2303 SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext) 2304 count += vq->v_usecount; 2305 mtx_unlock(&spechash_mtx); 2306 return (count); 2307 } 2308 2309 /* 2310 * Same as above, but using the dev_t as argument 2311 */ 2312 int 2313 count_dev(dev) 2314 dev_t dev; 2315 { 2316 struct vnode *vp; 2317 2318 vp = SLIST_FIRST(&dev->si_hlist); 2319 if (vp == NULL) 2320 return (0); 2321 return(vcount(vp)); 2322 } 2323 2324 /* 2325 * Print out a description of a vnode. 2326 */ 2327 static char *typename[] = 2328 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; 2329 2330 void 2331 vprint(label, vp) 2332 char *label; 2333 struct vnode *vp; 2334 { 2335 char buf[96]; 2336 2337 if (label != NULL) 2338 printf("%s: %p: ", label, (void *)vp); 2339 else 2340 printf("%p: ", (void *)vp); 2341 printf("type %s, usecount %d, writecount %d, refcount %d,", 2342 typename[vp->v_type], vp->v_usecount, vp->v_writecount, 2343 vp->v_holdcnt); 2344 buf[0] = '\0'; 2345 if (vp->v_flag & VROOT) 2346 strcat(buf, "|VROOT"); 2347 if (vp->v_flag & VTEXT) 2348 strcat(buf, "|VTEXT"); 2349 if (vp->v_flag & VSYSTEM) 2350 strcat(buf, "|VSYSTEM"); 2351 if (vp->v_flag & VXLOCK) 2352 strcat(buf, "|VXLOCK"); 2353 if (vp->v_flag & VXWANT) 2354 strcat(buf, "|VXWANT"); 2355 if (vp->v_flag & VBWAIT) 2356 strcat(buf, "|VBWAIT"); 2357 if (vp->v_flag & VDOOMED) 2358 strcat(buf, "|VDOOMED"); 2359 if (vp->v_flag & VFREE) 2360 strcat(buf, "|VFREE"); 2361 if (vp->v_flag & VOBJBUF) 2362 strcat(buf, "|VOBJBUF"); 2363 if (buf[0] != '\0') 2364 printf(" flags (%s)", &buf[1]); 2365 if (vp->v_data == NULL) { 2366 printf("\n"); 2367 } else { 2368 printf("\n\t"); 2369 VOP_PRINT(vp); 2370 } 2371 } 2372 2373 #ifdef DDB 2374 #include <ddb/ddb.h> 2375 /* 2376 * List all of the locked vnodes in the system. 2377 * Called when debugging the kernel. 2378 */ 2379 DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) 2380 { 2381 struct thread *td = curthread; /* XXX */ 2382 struct mount *mp, *nmp; 2383 struct vnode *vp; 2384 2385 printf("Locked vnodes\n"); 2386 mtx_lock(&mountlist_mtx); 2387 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2388 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { 2389 nmp = TAILQ_NEXT(mp, mnt_list); 2390 continue; 2391 } 2392 mtx_lock(&mntvnode_mtx); 2393 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 2394 if (VOP_ISLOCKED(vp, NULL)) 2395 vprint((char *)0, vp); 2396 } 2397 mtx_unlock(&mntvnode_mtx); 2398 mtx_lock(&mountlist_mtx); 2399 nmp = TAILQ_NEXT(mp, mnt_list); 2400 vfs_unbusy(mp, td); 2401 } 2402 mtx_unlock(&mountlist_mtx); 2403 } 2404 #endif 2405 2406 /* 2407 * Top level filesystem related information gathering. 2408 */ 2409 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 2410 2411 static int 2412 vfs_sysctl(SYSCTL_HANDLER_ARGS) 2413 { 2414 int *name = (int *)arg1 - 1; /* XXX */ 2415 u_int namelen = arg2 + 1; /* XXX */ 2416 struct vfsconf *vfsp; 2417 2418 #if 1 || defined(COMPAT_PRELITE2) 2419 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ 2420 if (namelen == 1) 2421 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 2422 #endif 2423 2424 /* XXX the below code does not compile; vfs_sysctl does not exist. 
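 *
 * For orientation (a sketch, not a documented interface): the switch
 * below is normally reached from userland with a MIB of roughly the form
 *
 *         int mib[4] = { CTL_VFS, VFS_GENERIC, VFS_CONF, vfc_typenum };
 *         sysctl(mib, 4, &vfc, &len, NULL, 0);
 *
 * (vfc being a struct vfsconf and len its size), while VFS_MAXTYPENUM
 * takes a three-element name instead.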
*/ 2425 #ifdef notyet 2426 /* all sysctl names at this level are at least name and field */ 2427 if (namelen < 2) 2428 return (ENOTDIR); /* overloaded */ 2429 if (name[0] != VFS_GENERIC) { 2430 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2431 if (vfsp->vfc_typenum == name[0]) 2432 break; 2433 if (vfsp == NULL) 2434 return (EOPNOTSUPP); 2435 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, 2436 oldp, oldlenp, newp, newlen, td)); 2437 } 2438 #endif 2439 switch (name[1]) { 2440 case VFS_MAXTYPENUM: 2441 if (namelen != 2) 2442 return (ENOTDIR); 2443 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 2444 case VFS_CONF: 2445 if (namelen != 3) 2446 return (ENOTDIR); /* overloaded */ 2447 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2448 if (vfsp->vfc_typenum == name[2]) 2449 break; 2450 if (vfsp == NULL) 2451 return (EOPNOTSUPP); 2452 return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); 2453 } 2454 return (EOPNOTSUPP); 2455 } 2456 2457 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, 2458 "Generic filesystem"); 2459 2460 #if 1 || defined(COMPAT_PRELITE2) 2461 2462 static int 2463 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 2464 { 2465 int error; 2466 struct vfsconf *vfsp; 2467 struct ovfsconf ovfs; 2468 2469 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { 2470 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 2471 strcpy(ovfs.vfc_name, vfsp->vfc_name); 2472 ovfs.vfc_index = vfsp->vfc_typenum; 2473 ovfs.vfc_refcount = vfsp->vfc_refcount; 2474 ovfs.vfc_flags = vfsp->vfc_flags; 2475 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 2476 if (error) 2477 return error; 2478 } 2479 return 0; 2480 } 2481 2482 #endif /* 1 || COMPAT_PRELITE2 */ 2483 2484 #if COMPILING_LINT 2485 #define KINFO_VNODESLOP 10 2486 /* 2487 * Dump vnode list (via sysctl). 2488 * Copyout address of vnode followed by vnode. 2489 */ 2490 /* ARGSUSED */ 2491 static int 2492 sysctl_vnode(SYSCTL_HANDLER_ARGS) 2493 { 2494 struct thread *td = curthread; /* XXX */ 2495 struct mount *mp, *nmp; 2496 struct vnode *nvp, *vp; 2497 int error; 2498 2499 #define VPTRSZ sizeof (struct vnode *) 2500 #define VNODESZ sizeof (struct vnode) 2501 2502 req->lock = 0; 2503 if (!req->oldptr) /* Make an estimate */ 2504 return (SYSCTL_OUT(req, 0, 2505 (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); 2506 2507 mtx_lock(&mountlist_mtx); 2508 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2509 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { 2510 nmp = TAILQ_NEXT(mp, mnt_list); 2511 continue; 2512 } 2513 mtx_lock(&mntvnode_mtx); 2514 again: 2515 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); 2516 vp != NULL; 2517 vp = nvp) { 2518 /* 2519 * Check that the vp is still associated with 2520 * this filesystem. RACE: could have been 2521 * recycled onto the same filesystem. 2522 */ 2523 if (vp->v_mount != mp) 2524 goto again; 2525 nvp = TAILQ_NEXT(vp, v_nmntvnodes); 2526 mtx_unlock(&mntvnode_mtx); 2527 if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || 2528 (error = SYSCTL_OUT(req, vp, VNODESZ))) 2529 return (error); 2530 mtx_lock(&mntvnode_mtx); 2531 } 2532 mtx_unlock(&mntvnode_mtx); 2533 mtx_lock(&mountlist_mtx); 2534 nmp = TAILQ_NEXT(mp, mnt_list); 2535 vfs_unbusy(mp, td); 2536 } 2537 mtx_unlock(&mountlist_mtx); 2538 2539 return (0); 2540 } 2541 2542 /* 2543 * XXX 2544 * Exporting the vnode list on large systems causes them to crash. 2545 * Exporting the vnode list on medium systems causes sysctl to coredump. 
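 *
 * For what it is worth, each record copied out by sysctl_vnode() above
 * is a vnode pointer immediately followed by a snapshot of the vnode
 * itself, so a consumer steps through the returned buffer in increments
 * of
 *
 *         VPTRSZ + VNODESZ
 *
 * bytes, taking the kernel address from the first part and the contents
 * from the second.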
2546 */ 2547 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 2548 0, 0, sysctl_vnode, "S,vnode", ""); 2549 #endif 2550 2551 /* 2552 * Check to see if a filesystem is mounted on a block device. 2553 */ 2554 int 2555 vfs_mountedon(vp) 2556 struct vnode *vp; 2557 { 2558 2559 if (vp->v_rdev->si_mountpoint != NULL) 2560 return (EBUSY); 2561 return (0); 2562 } 2563 2564 /* 2565 * Unmount all filesystems. The list is traversed in reverse order 2566 * of mounting to avoid dependencies. 2567 */ 2568 void 2569 vfs_unmountall() 2570 { 2571 struct mount *mp; 2572 struct thread *td; 2573 int error; 2574 2575 if (curthread != NULL) 2576 td = curthread; 2577 else 2578 td = FIRST_THREAD_IN_PROC(initproc); /* XXX XXX proc0? */ 2579 /* 2580 * Since this only runs when rebooting, it is not interlocked. 2581 */ 2582 while(!TAILQ_EMPTY(&mountlist)) { 2583 mp = TAILQ_LAST(&mountlist, mntlist); 2584 error = dounmount(mp, MNT_FORCE, td); 2585 if (error) { 2586 TAILQ_REMOVE(&mountlist, mp, mnt_list); 2587 printf("unmount of %s failed (", 2588 mp->mnt_stat.f_mntonname); 2589 if (error == EBUSY) 2590 printf("BUSY)\n"); 2591 else 2592 printf("%d)\n", error); 2593 } else { 2594 /* The unmount has removed mp from the mountlist */ 2595 } 2596 } 2597 } 2598 2599 /* 2600 * perform msync on all vnodes under a mount point 2601 * the mount point must be locked. 2602 */ 2603 void 2604 vfs_msync(struct mount *mp, int flags) 2605 { 2606 struct vnode *vp, *nvp; 2607 struct vm_object *obj; 2608 int tries; 2609 2610 GIANT_REQUIRED; 2611 2612 tries = 5; 2613 mtx_lock(&mntvnode_mtx); 2614 loop: 2615 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) { 2616 if (vp->v_mount != mp) { 2617 if (--tries > 0) 2618 goto loop; 2619 break; 2620 } 2621 nvp = TAILQ_NEXT(vp, v_nmntvnodes); 2622 2623 if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */ 2624 continue; 2625 2626 if (vp->v_flag & VNOSYNC) /* unlinked, skip it */ 2627 continue; 2628 2629 if ((vp->v_flag & VOBJDIRTY) && 2630 (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) { 2631 mtx_unlock(&mntvnode_mtx); 2632 if (!vget(vp, 2633 LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curthread)) { 2634 if (VOP_GETVOBJECT(vp, &obj) == 0) { 2635 vm_object_page_clean(obj, 0, 0, 2636 flags == MNT_WAIT ? 2637 OBJPC_SYNC : OBJPC_NOSYNC); 2638 } 2639 vput(vp); 2640 } 2641 mtx_lock(&mntvnode_mtx); 2642 if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) { 2643 if (--tries > 0) 2644 goto loop; 2645 break; 2646 } 2647 } 2648 } 2649 mtx_unlock(&mntvnode_mtx); 2650 } 2651 2652 /* 2653 * Create the VM object needed for VMIO and mmap support. This 2654 * is done for all VREG files in the system. Some filesystems might 2655 * afford the additional metadata buffering capability of the 2656 * VMIO code by making the device node be VMIO mode also. 2657 * 2658 * vp must be locked when vfs_object_create is called. 2659 */ 2660 int 2661 vfs_object_create(vp, td, cred) 2662 struct vnode *vp; 2663 struct thread *td; 2664 struct ucred *cred; 2665 { 2666 GIANT_REQUIRED; 2667 return (VOP_CREATEVOBJECT(vp, cred, td)); 2668 } 2669 2670 /* 2671 * Mark a vnode as free, putting it up for recycling. 
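 *
 * Callers normally guard this with the check already used by vput() and
 * vdrop() above, i.e. (sketch):
 *
 *         if (VSHOULDFREE(vp))
 *                 vfree(vp);
 *         else
 *                 vlruvp(vp);
 *
 * so a vnode is only queued for recycling when its use and hold counts
 * permit it.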
*/ 2673 void 2674 vfree(vp) 2675 struct vnode *vp; 2676 { 2677 int s; 2678 2679 s = splbio(); 2680 mtx_lock(&vnode_free_list_mtx); 2681 KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free")); 2682 if (vp->v_flag & VAGE) { 2683 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 2684 } else { 2685 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 2686 } 2687 freevnodes++; 2688 mtx_unlock(&vnode_free_list_mtx); 2689 vp->v_flag &= ~VAGE; 2690 vp->v_flag |= VFREE; 2691 splx(s); 2692 } 2693 2694 /* 2695 * Opposite of vfree() - mark a vnode as in use. 2696 */ 2697 void 2698 vbusy(vp) 2699 struct vnode *vp; 2700 { 2701 int s; 2702 2703 s = splbio(); 2704 mtx_lock(&vnode_free_list_mtx); 2705 KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free")); 2706 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 2707 freevnodes--; 2708 mtx_unlock(&vnode_free_list_mtx); 2709 vp->v_flag &= ~(VFREE|VAGE); 2710 splx(s); 2711 } 2712 2713 /* 2714 * Record a process's interest in events which might happen to 2715 * a vnode. Because poll uses the historic select-style interface 2716 * internally, this routine serves as both the ``check for any 2717 * pending events'' and the ``record my interest in future events'' 2718 * functions. (These are done together, while the lock is held, 2719 * to avoid race conditions.) 2720 */ 2721 int 2722 vn_pollrecord(vp, td, events) 2723 struct vnode *vp; 2724 struct thread *td; 2725 short events; 2726 { 2727 2728 if (vp->v_pollinfo == NULL) 2729 v_addpollinfo(vp); 2730 mtx_lock(&vp->v_pollinfo->vpi_lock); 2731 if (vp->v_pollinfo->vpi_revents & events) { 2732 /* 2733 * This leaves events we are not interested 2734 * in available for the other process which 2735 * presumably had requested them 2736 * (otherwise they would never have been 2737 * recorded). 2738 */ 2739 events &= vp->v_pollinfo->vpi_revents; 2740 vp->v_pollinfo->vpi_revents &= ~events; 2741 2742 mtx_unlock(&vp->v_pollinfo->vpi_lock); 2743 return events; 2744 } 2745 vp->v_pollinfo->vpi_events |= events; 2746 selrecord(td, &vp->v_pollinfo->vpi_selinfo); 2747 mtx_unlock(&vp->v_pollinfo->vpi_lock); 2748 return 0; 2749 } 2750 2751 /* 2752 * Note the occurrence of an event. If the VN_POLLEVENT macro is used, 2753 * it is possible for us to miss an event due to race conditions, but 2754 * that condition is expected to be rare, so for the moment it is the 2755 * preferred interface. 2756 */ 2757 void 2758 vn_pollevent(vp, events) 2759 struct vnode *vp; 2760 short events; 2761 { 2762 2763 if (vp->v_pollinfo == NULL) 2764 v_addpollinfo(vp); 2765 mtx_lock(&vp->v_pollinfo->vpi_lock); 2766 if (vp->v_pollinfo->vpi_events & events) { 2767 /* 2768 * We clear vpi_events so that we don't 2769 * call selwakeup() twice if two events are 2770 * posted before the polling process(es) is 2771 * awakened. This also ensures that we take at 2772 * most one selwakeup() if the polling process 2773 * is no longer interested. However, it does 2774 * mean that only one event can be noticed at 2775 * a time. (Perhaps we should only clear those 2776 * event bits which we note?) XXX 2777 */ 2778 vp->v_pollinfo->vpi_events = 0; /* &= ~events ??? */ 2779 vp->v_pollinfo->vpi_revents |= events; 2780 selwakeup(&vp->v_pollinfo->vpi_selinfo); 2781 } 2782 mtx_unlock(&vp->v_pollinfo->vpi_lock); 2783 } 2784 2785 /* 2786 * Wake up anyone polling on vp because it is being revoked. 2787 * This depends on dead_poll() returning POLLHUP for correct 2788 * behavior. 2789
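 *
 * A rough sketch of how the pieces above fit together (the calling
 * filesystem is assumed): a VOP_POLL implementation records interest
 * with
 *
 *         return (vn_pollrecord(vp, ap->a_td, ap->a_events));
 *
 * code that later changes the file posts
 *
 *         VN_POLLEVENT(vp, POLLIN | POLLRDNORM);
 *
 * and revocation arrives here by way of vclean().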
2789 */ 2790 void 2791 vn_pollgone(vp) 2792 struct vnode *vp; 2793 { 2794 2795 mtx_lock(&vp->v_pollinfo->vpi_lock); 2796 VN_KNOTE(vp, NOTE_REVOKE); 2797 if (vp->v_pollinfo->vpi_events) { 2798 vp->v_pollinfo->vpi_events = 0; 2799 selwakeup(&vp->v_pollinfo->vpi_selinfo); 2800 } 2801 mtx_unlock(&vp->v_pollinfo->vpi_lock); 2802 } 2803 2804 2805 2806 /* 2807 * Routine to create and manage a filesystem syncer vnode. 2808 */ 2809 #define sync_close ((int (*)(struct vop_close_args *))nullop) 2810 static int sync_fsync(struct vop_fsync_args *); 2811 static int sync_inactive(struct vop_inactive_args *); 2812 static int sync_reclaim(struct vop_reclaim_args *); 2813 #define sync_lock ((int (*)(struct vop_lock_args *))vop_nolock) 2814 #define sync_unlock ((int (*)(struct vop_unlock_args *))vop_nounlock) 2815 static int sync_print(struct vop_print_args *); 2816 #define sync_islocked ((int(*)(struct vop_islocked_args *))vop_noislocked) 2817 2818 static vop_t **sync_vnodeop_p; 2819 static struct vnodeopv_entry_desc sync_vnodeop_entries[] = { 2820 { &vop_default_desc, (vop_t *) vop_eopnotsupp }, 2821 { &vop_close_desc, (vop_t *) sync_close }, /* close */ 2822 { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */ 2823 { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */ 2824 { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */ 2825 { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */ 2826 { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */ 2827 { &vop_print_desc, (vop_t *) sync_print }, /* print */ 2828 { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */ 2829 { NULL, NULL } 2830 }; 2831 static struct vnodeopv_desc sync_vnodeop_opv_desc = 2832 { &sync_vnodeop_p, sync_vnodeop_entries }; 2833 2834 VNODEOP_SET(sync_vnodeop_opv_desc); 2835 2836 /* 2837 * Create a new filesystem syncer vnode for the specified mount point. 2838 */ 2839 int 2840 vfs_allocate_syncvnode(mp) 2841 struct mount *mp; 2842 { 2843 struct vnode *vp; 2844 static long start, incr, next; 2845 int error; 2846 2847 /* Allocate a new vnode */ 2848 if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { 2849 mp->mnt_syncer = NULL; 2850 return (error); 2851 } 2852 vp->v_type = VNON; 2853 /* 2854 * Place the vnode onto the syncer worklist. We attempt to 2855 * scatter them about on the list so that they will go off 2856 * at evenly distributed times even if all the filesystems 2857 * are mounted at once. 2858 */ 2859 next += incr; 2860 if (next == 0 || next > syncer_maxdelay) { 2861 start /= 2; 2862 incr /= 2; 2863 if (start == 0) { 2864 start = syncer_maxdelay / 2; 2865 incr = syncer_maxdelay; 2866 } 2867 next = start; 2868 } 2869 vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); 2870 mp->mnt_syncer = vp; 2871 return (0); 2872 } 2873 2874 /* 2875 * Do a lazy sync of the filesystem. 2876 */ 2877 static int 2878 sync_fsync(ap) 2879 struct vop_fsync_args /* { 2880 struct vnode *a_vp; 2881 struct ucred *a_cred; 2882 int a_waitfor; 2883 struct thread *a_td; 2884 } */ *ap; 2885 { 2886 struct vnode *syncvp = ap->a_vp; 2887 struct mount *mp = syncvp->v_mount; 2888 struct thread *td = ap->a_td; 2889 int asyncflag; 2890 2891 /* 2892 * We only need to do something if this is a lazy evaluation. 2893 */ 2894 if (ap->a_waitfor != MNT_LAZY) 2895 return (0); 2896 2897 /* 2898 * Move ourselves to the back of the sync list. 2899 */ 2900 vn_syncer_add_to_worklist(syncvp, syncdelay); 2901 2902 /* 2903 * Walk the list of vnodes pushing all that are dirty and 2904 * not already on the sync list. 
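 *
 * (For context: the syncer thread reaches this point by fsync'ing the
 * per-mount syncer vnode, conceptually something like
 *
 *         (void) VOP_FSYNC(syncvp, td->td_ucred, MNT_LAZY, td);
 *
 * which is what turns a lazy sync request into the vfs_msync() and
 * VFS_SYNC() calls below.)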
*/ 2906 mtx_lock(&mountlist_mtx); 2907 if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) { 2908 mtx_unlock(&mountlist_mtx); 2909 return (0); 2910 } 2911 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { 2912 vfs_unbusy(mp, td); 2913 return (0); 2914 } 2915 asyncflag = mp->mnt_flag & MNT_ASYNC; 2916 mp->mnt_flag &= ~MNT_ASYNC; 2917 vfs_msync(mp, MNT_NOWAIT); 2918 VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td); 2919 if (asyncflag) 2920 mp->mnt_flag |= MNT_ASYNC; 2921 vn_finished_write(mp); 2922 vfs_unbusy(mp, td); 2923 return (0); 2924 } 2925 2926 /* 2927 * The syncer vnode is no longer referenced. 2928 */ 2929 static int 2930 sync_inactive(ap) 2931 struct vop_inactive_args /* { 2932 struct vnode *a_vp; 2933 struct thread *a_td; 2934 } */ *ap; 2935 { 2936 2937 vgone(ap->a_vp); 2938 return (0); 2939 } 2940 2941 /* 2942 * The syncer vnode is no longer needed and is being decommissioned. 2943 * 2944 * Modifications to the worklist must be protected at splbio(). 2945 */ 2946 static int 2947 sync_reclaim(ap) 2948 struct vop_reclaim_args /* { 2949 struct vnode *a_vp; 2950 } */ *ap; 2951 { 2952 struct vnode *vp = ap->a_vp; 2953 int s; 2954 2955 s = splbio(); 2956 vp->v_mount->mnt_syncer = NULL; 2957 if (vp->v_flag & VONWORKLST) { 2958 LIST_REMOVE(vp, v_synclist); 2959 vp->v_flag &= ~VONWORKLST; 2960 } 2961 splx(s); 2962 2963 return (0); 2964 } 2965 2966 /* 2967 * Print out a syncer vnode. 2968 */ 2969 static int 2970 sync_print(ap) 2971 struct vop_print_args /* { 2972 struct vnode *a_vp; 2973 } */ *ap; 2974 { 2975 struct vnode *vp = ap->a_vp; 2976 2977 printf("syncer vnode"); 2978 if (vp->v_vnlock != NULL) 2979 lockmgr_printinfo(vp->v_vnlock); 2980 printf("\n"); 2981 return (0); 2982 } 2983 2984 /* 2985 * extract the dev_t from a VCHR 2986 */ 2987 dev_t 2988 vn_todev(vp) 2989 struct vnode *vp; 2990 { 2991 if (vp->v_type != VCHR) 2992 return (NODEV); 2993 return (vp->v_rdev); 2994 } 2995 2996 /* 2997 * Check if vnode represents a disk device 2998 */ 2999 int 3000 vn_isdisk(vp, errp) 3001 struct vnode *vp; 3002 int *errp; 3003 { 3004 struct cdevsw *cdevsw; 3005 3006 if (vp->v_type != VCHR) { 3007 if (errp != NULL) 3008 *errp = ENOTBLK; 3009 return (0); 3010 } 3011 if (vp->v_rdev == NULL) { 3012 if (errp != NULL) 3013 *errp = ENXIO; 3014 return (0); 3015 } 3016 cdevsw = devsw(vp->v_rdev); 3017 if (cdevsw == NULL) { 3018 if (errp != NULL) 3019 *errp = ENXIO; 3020 return (0); 3021 } 3022 if (!(cdevsw->d_flags & D_DISK)) { 3023 if (errp != NULL) 3024 *errp = ENOTBLK; 3025 return (0); 3026 } 3027 if (errp != NULL) 3028 *errp = 0; 3029 return (1); 3030 } 3031 3032 /* 3033 * Free data allocated by namei(); see namei(9) for details. 3034
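 *
 * The most common caller pattern, shown as a sketch (`path' and `td'
 * belong to the caller), releases just the pathname buffer once the
 * lookup result has been consumed:
 *
 *         NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_USERSPACE, path, td);
 *         if ((error = namei(&nd)) != 0)
 *                 return (error);
 *         NDFREE(&nd, NDF_ONLY_PNBUF);
 *         vp = nd.ni_vp;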
3034 */ 3035 void 3036 NDFREE(ndp, flags) 3037 struct nameidata *ndp; 3038 const uint flags; 3039 { 3040 if (!(flags & NDF_NO_FREE_PNBUF) && 3041 (ndp->ni_cnd.cn_flags & HASBUF)) { 3042 uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); 3043 ndp->ni_cnd.cn_flags &= ~HASBUF; 3044 } 3045 if (!(flags & NDF_NO_DVP_UNLOCK) && 3046 (ndp->ni_cnd.cn_flags & LOCKPARENT) && 3047 ndp->ni_dvp != ndp->ni_vp) 3048 VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread); 3049 if (!(flags & NDF_NO_DVP_RELE) && 3050 (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { 3051 vrele(ndp->ni_dvp); 3052 ndp->ni_dvp = NULL; 3053 } 3054 if (!(flags & NDF_NO_VP_UNLOCK) && 3055 (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) 3056 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread); 3057 if (!(flags & NDF_NO_VP_RELE) && 3058 ndp->ni_vp) { 3059 vrele(ndp->ni_vp); 3060 ndp->ni_vp = NULL; 3061 } 3062 if (!(flags & NDF_NO_STARTDIR_RELE) && 3063 (ndp->ni_cnd.cn_flags & SAVESTART)) { 3064 vrele(ndp->ni_startdir); 3065 ndp->ni_startdir = NULL; 3066 } 3067 } 3068 3069 /* 3070 * Common file system object access control check routine. Accepts a 3071 * vnode's type, "mode", uid and gid, requested access mode, credentials, 3072 * and optional call-by-reference privused argument allowing vaccess() 3073 * to indicate to the caller whether privilege was used to satisfy the 3074 * request. Returns 0 on success, or an errno on failure. 3075 */ 3076 int 3077 vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused) 3078 enum vtype type; 3079 mode_t file_mode; 3080 uid_t file_uid; 3081 gid_t file_gid; 3082 mode_t acc_mode; 3083 struct ucred *cred; 3084 int *privused; 3085 { 3086 mode_t dac_granted; 3087 #ifdef CAPABILITIES 3088 mode_t cap_granted; 3089 #endif 3090 3091 /* 3092 * Look for a normal, non-privileged way to access the file/directory 3093 * as requested. If it exists, go with that. 3094 */ 3095 3096 if (privused != NULL) 3097 *privused = 0; 3098 3099 dac_granted = 0; 3100 3101 /* Check the owner. */ 3102 if (cred->cr_uid == file_uid) { 3103 dac_granted |= VADMIN; 3104 if (file_mode & S_IXUSR) 3105 dac_granted |= VEXEC; 3106 if (file_mode & S_IRUSR) 3107 dac_granted |= VREAD; 3108 if (file_mode & S_IWUSR) 3109 dac_granted |= VWRITE; 3110 3111 if ((acc_mode & dac_granted) == acc_mode) 3112 return (0); 3113 3114 goto privcheck; 3115 } 3116 3117 /* Otherwise, check the groups (first match) */ 3118 if (groupmember(file_gid, cred)) { 3119 if (file_mode & S_IXGRP) 3120 dac_granted |= VEXEC; 3121 if (file_mode & S_IRGRP) 3122 dac_granted |= VREAD; 3123 if (file_mode & S_IWGRP) 3124 dac_granted |= VWRITE; 3125 3126 if ((acc_mode & dac_granted) == acc_mode) 3127 return (0); 3128 3129 goto privcheck; 3130 } 3131 3132 /* Otherwise, check everyone else. */ 3133 if (file_mode & S_IXOTH) 3134 dac_granted |= VEXEC; 3135 if (file_mode & S_IROTH) 3136 dac_granted |= VREAD; 3137 if (file_mode & S_IWOTH) 3138 dac_granted |= VWRITE; 3139 if ((acc_mode & dac_granted) == acc_mode) 3140 return (0); 3141 3142 privcheck: 3143 if (!suser_xxx(cred, NULL, PRISON_ROOT)) { 3144 /* XXX audit: privilege used */ 3145 if (privused != NULL) 3146 *privused = 1; 3147 return (0); 3148 } 3149 3150 #ifdef CAPABILITIES 3151 /* 3152 * Build a capability mask to determine if the set of capabilities 3153 * satisfies the requirements when combined with the granted mask 3154 * from above. 3155 * For each capability, if the capability is required, bitwise 3156 * or the request type onto the cap_granted mask. 
3157 */ 3158 cap_granted = 0; 3159 3160 if (type == VDIR) { 3161 /* 3162 * For directories, use CAP_DAC_READ_SEARCH to satisfy 3163 * VEXEC requests, instead of CAP_DAC_EXECUTE. 3164 */ 3165 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3166 !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) 3167 cap_granted |= VEXEC; 3168 } else { 3169 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3170 !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) 3171 cap_granted |= VEXEC; 3172 } 3173 3174 if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && 3175 !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) 3176 cap_granted |= VREAD; 3177 3178 if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && 3179 !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) 3180 cap_granted |= VWRITE; 3181 3182 if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && 3183 !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT)) 3184 cap_granted |= VADMIN; 3185 3186 if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) { 3187 /* XXX audit: privilege used */ 3188 if (privused != NULL) 3189 *privused = 1; 3190 return (0); 3191 } 3192 #endif 3193 3194 return ((acc_mode & VADMIN) ? EPERM : EACCES); 3195 } 3196 3197
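/*
 * Illustrative sketch only (kept under "#if 0" like the other disabled
 * fragments in this file): a typical VOP_ACCESS implementation simply
 * hands its on-disk ownership and permission bits to vaccess() above.
 * The filesystem, its inode structure and the VTOMYFS() accessor are
 * assumed for the sake of the example and do not exist in the tree.
 */
#if 0
struct myfs_inode {
	mode_t	mi_mode;	/* permission bits */
	uid_t	mi_uid;		/* owning user */
	gid_t	mi_gid;		/* owning group */
};

static int
myfs_access(ap)
	struct vop_access_args /* {
		struct vnode *a_vp;
		int a_mode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct myfs_inode *ip = VTOMYFS(vp);

	/* Delegate the mode/uid/gid check (and the privilege fallback). */
	return (vaccess(vp->v_type, ip->mi_mode, ip->mi_uid, ip->mi_gid,
	    ap->a_mode, ap->a_cred, NULL));
}
#endif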