/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 * $FreeBSD$
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"
#include "opt_ffs.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_zone.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

static void	addalias __P((struct vnode *vp, dev_t nvp_rdev));
static void	insmntque __P((struct vnode *vp, struct mount *mp));
static void	vclean __P((struct vnode *vp, int flags, struct proc *p));

/*
 * Number of vnodes in existence.  Increased whenever getnewvnode()
 * allocates a new vnode, never decreased.
 */
static unsigned long	numvnodes;
SYSCTL_LONG(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};
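/*
 * Illustrative sketch (not part of the original code): how the conversion
 * tables above are normally consulted.  The IFTOVT()/VTTOIF() macros in
 * <sys/vnode.h> are the usual consumers of these tables; the helper names
 * below are hypothetical and shown for exposition only.
 */
#if 0
static enum vtype
example_mode_to_vtype(mode_t mode)
{

	/* The file-type bits occupy the top nibble of the mode word. */
	return (iftovt_tab[(mode & S_IFMT) >> 12]);
}

static int
example_vtype_to_mode(enum vtype type)
{

	/* The reverse mapping: a vnode type back to its S_IF* bits. */
	return (vttoif_tab[(int)type]);
}
#endif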
/*
 * List of vnodes that are ready for recycling.
 */
static TAILQ_HEAD(freelst, vnode) vnode_free_list;

/*
 * Minimum number of free vnodes.  If there are fewer than this many free
 * vnodes, getnewvnode() will return a newly allocated vnode.
 */
static u_long wantfreevnodes = 25;
SYSCTL_LONG(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
/* Number of vnode allocations. */
static u_long vnodeallocs = 0;
SYSCTL_LONG(_debug, OID_AUTO, vnodeallocs, CTLFLAG_RD, &vnodeallocs, 0, "");
/* Number of vnodes in the free list. */
static u_long freevnodes = 0;
SYSCTL_LONG(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
/* Period of vnode recycle from namecache in vnode allocation times. */
static u_long vnoderecycleperiod = 1000;
SYSCTL_LONG(_debug, OID_AUTO, vnoderecycleperiod, CTLFLAG_RW, &vnoderecycleperiod, 0, "");
/* Minimum number of total vnodes required to invoke vnode recycle from namecache. */
static u_long vnoderecyclemintotalvn = 2000;
SYSCTL_LONG(_debug, OID_AUTO, vnoderecyclemintotalvn, CTLFLAG_RW, &vnoderecyclemintotalvn, 0, "");
/* Minimum number of free vnodes required to invoke vnode recycle from namecache. */
static u_long vnoderecycleminfreevn = 2000;
SYSCTL_LONG(_debug, OID_AUTO, vnoderecycleminfreevn, CTLFLAG_RW, &vnoderecycleminfreevn, 0, "");
/* Number of vnodes attempted to recycle at a time. */
static u_long vnoderecyclenumber = 3000;
SYSCTL_LONG(_debug, OID_AUTO, vnoderecyclenumber, CTLFLAG_RW, &vnoderecyclenumber, 0, "");

/*
 * Various variables used for debugging the new implementation of
 * reassignbuf().
 * XXX these are probably of (very) limited utility now.
 */
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
/* Set to 0 for the old insertion-sort based reassignbuf, 1 for the modern method. */
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");

#ifdef ENABLE_VFS_IOOPT
/* See NOTES for a description of this setting. */
int vfs_ioopt = 0;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif

/* List of mounted filesystems. */
struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);

/* For any iteration/modification of mountlist */
struct mtx mountlist_mtx;

/* For any iteration/modification of mnt_vnodelist */
struct mtx mntvnode_mtx;

/*
 * Cache for the mount type id assigned to NFS.  This is used for
 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
 */
int	nfs_mount_type = -1;

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/* For any iteration/modification of vnode_free_list */
static struct mtx vnode_free_list_mtx;

/*
 * For any iteration/modification of dev->si_hlist (linked through
 * v_specnext)
 */
static struct mtx spechash_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static vm_zone_t vnode_zone;

/* Set to 1 to print out reclaim of active vnodes */
int	prtactive = 0;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, writes to mounted block
 * devices (metadata) are delayed only about half the time that file
 * data is delayed.  Similarly, directory updates are more critical, so
 * they are only delayed about a third the time that file data is delayed.
 * Thus, there are SYNCER_MAXDELAY queues that are processed round-robin
 * at a rate of one each second (driven off the filesystem syncer process).
 * The syncer_delayno variable indicates the next queue that is to be
 * processed.  Items that need to be processed soon are placed in this
 * queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
static int syncer_delayno = 0;
static long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;

#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
time_t syncdelay = 30;		/* max time to delay syncing data */
time_t filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
time_t dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
time_t metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
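/*
 * Illustrative sketch (not part of the original code): how a delay maps
 * onto a slot of the workitem queue described above.  A delay of fifteen
 * seconds simply selects the list fifteen entries ahead of syncer_delayno,
 * modulo the table size.  Hypothetical helper, for exposition only;
 * vn_syncer_add_to_worklist() later in this file does the real work.
 */
#if 0
static struct synclist *
example_syncer_slot(int delay)
{

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	return (&syncer_workitem_pending[(syncer_delayno + delay) &
	    syncer_mask]);
}
#endif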
/*
 * Number of vnodes we want to exist at any one time.  This is mostly used
 * to size hash tables in vnode-related code.  It is normally not used in
 * getnewvnode(), as wantfreevnodes is normally nonzero.
 *
 * XXX desiredvnodes is historical cruft and should not exist.
 */
int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");

/*
 * Initialize the vnode management data structures.
 */
static void
vntblinit(void *dummy __unused)
{

	desiredvnodes = maxproc + cnt.v_page_count / 4;
	mtx_init(&mountlist_mtx, "mountlist", MTX_DEF);
	mtx_init(&mntvnode_mtx, "mntvnode", MTX_DEF);
	mtx_init(&mntid_mtx, "mntid", MTX_DEF);
	mtx_init(&spechash_mtx, "spechash", MTX_DEF);
	TAILQ_INIT(&vnode_free_list);
	mtx_init(&vnode_free_list_mtx, "vnode_free_list", MTX_DEF);
	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)


/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp, p)
	struct mount *mp;
	int flags;
	struct mtx *interlkp;
	struct proc *p;
{
	int lkflags;

	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		msleep((caddr_t)mp, interlkp, PVFS, "vfs_busy", 0);
		return (ENOENT);
	}
	lkflags = LK_SHARED | LK_NOPAUSE;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mp, p)
	struct mount *mp;
	struct proc *p;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
}
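/*
 * Illustrative sketch (not part of the original code): the usual way the
 * mount list is walked with vfs_busy()/vfs_unbusy(), as done later in this
 * file by the DDB "lockedvnodes" command and by sysctl_vnode().  Mounts
 * that are being unmounted are simply skipped.  The function name is
 * hypothetical.
 */
#if 0
static void
example_foreach_mount(struct proc *p)
{
	struct mount *mp, *nmp;

	mtx_lock(&mountlist_mtx);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) {
			/* Being unmounted; the interlock is still held. */
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		/* ... examine mp here, without mountlist_mtx ... */
		mtx_lock(&mountlist_mtx);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, p);
	}
	mtx_unlock(&mountlist_mtx);
}
#endif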
/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char *fstypename;
	char *devname;
	struct mount **mpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	if (fstypename == NULL)
		return (ENODEV);
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO);
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_iosize_max = DFLTPHYS;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = 0;
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Find an appropriate filesystem to use for the root.  If a filesystem
 * has not been preselected, walk through the list of known filesystems,
 * trying those that have mountroot routines until one works or we have
 * tried them all.
 */
#ifdef notdef	/* XXX JH */
int
lite2_vfs_mountroot()
{
	struct vfsconf *vfsp;
	extern int (*lite2_mountroot) __P((void));
	int error;

	if (lite2_mountroot != NULL)
		return ((*lite2_mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}
#endif

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			mtx_unlock(&mountlist_mtx);
			return (mp);
		}
	}
	mtx_unlock(&mountlist_mtx);
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators
 * only support 16-bit device numbers.  We end up with unique val[0]'s for
 * the first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_int16_t mntid_base;
	fsid_t tfsid;
	int mtype;

	mtx_lock(&mntid_mtx);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makeudev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if (vfs_getvfs(&tfsid) == NULL)
			break;
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	mtx_unlock(&mntid_mtx);
}
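/*
 * Illustrative worked example (not part of the original code) of the fsid
 * composition performed above.  val[1] is simply the filesystem type
 * number.  For val[0], the low byte of the type number is placed in bits
 * 24-31 of the minor number, while the 16-bit mntid is split: its low byte
 * lands in bits 0-7 and its high byte in bits 16-23, leaving bits 8-15
 * clear.  So for a (hypothetical) type number of 1, the first call
 * (mntid_base == 0) yields makeudev(255, 0x01000000), and a later call
 * with mntid_base == 0x1234 would yield makeudev(255, 0x01120034).  The
 * concrete numbers are only an illustration.
 */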
/*
 * Knob to control the precision of file timestamps:
 *
 *   0 = seconds only; nanoseconds zeroed.
 *   1 = seconds and nanoseconds, accurate within 1/HZ.
 *   2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(tsp)
	struct timespec *tsp;
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}
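/*
 * Illustrative sketch (not part of the original code): the conventional
 * use of vattr_null(), normally via the VATTR_NULL() macro, when a caller
 * wants to change only a few attributes.  Everything left at VNOVAL is
 * ignored by the filesystem's VOP_SETATTR().  Locking is omitted and the
 * function name is hypothetical.
 */
#if 0
static int
example_truncate(struct vnode *vp, struct ucred *cred, struct proc *p)
{
	struct vattr va;

	VATTR_NULL(&va);	/* mark every field "unspecified" */
	va.va_size = 0;		/* ...then set just the one we want */
	return (VOP_SETATTR(vp, &va, cred, p));
}
#endif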
/*
 * Routines having to do with the management of the vnode table.
 */

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	int s, count;
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp = NULL;
	struct mount *vnmp;
	vm_object_t object;

	/*
	 * We take the least recently used vnode from the freelist
	 * if we can get it and it has no cached pages, and no
	 * namecache entries are relative to it.
	 * Otherwise we allocate a new vnode
	 */

	s = splbio();
	mtx_lock(&vnode_free_list_mtx);

	if (wantfreevnodes && freevnodes < wantfreevnodes) {
		vp = NULL;
	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
		/*
		 * XXX: this is only here to be backwards compatible
		 */
		vp = NULL;
	} else for (count = 0; count < freevnodes; count++) {
		vp = TAILQ_FIRST(&vnode_free_list);
		if (vp == NULL || vp->v_usecount)
			panic("getnewvnode: free vnode isn't");
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);

		/*
		 * Don't recycle if active in the namecache or
		 * if it still has cached pages or we cannot get
		 * its interlock.
		 */
		if (LIST_FIRST(&vp->v_cache_src) != NULL ||
		    (VOP_GETVOBJECT(vp, &object) == 0 &&
		    (object->resident_page_count || object->ref_count)) ||
		    !mtx_trylock(&vp->v_interlock)) {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			vp = NULL;
			continue;
		}
		/*
		 * Skip over it if its filesystem is being suspended.
		 */
		if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0)
			break;
		mtx_unlock(&vp->v_interlock);
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		vp = NULL;
	}
	if (vp) {
		vp->v_flag |= VDOOMED;
		vp->v_flag &= ~VFREE;
		freevnodes--;
		mtx_unlock(&vnode_free_list_mtx);
		cache_purge(vp);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD) {
			vgonel(vp, p);
		} else {
			mtx_unlock(&vp->v_interlock);
		}
		vn_finished_write(vnmp);

#ifdef INVARIANTS
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
			if (vp->v_writecount != 0)
				panic("Non-zero write count");
		}
#endif
		vp->v_flag = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
	} else {
		mtx_unlock(&vnode_free_list_mtx);
		vp = (struct vnode *) zalloc(vnode_zone);
		bzero((char *) vp, sizeof *vp);
		mtx_init(&vp->v_interlock, "vnode interlock", MTX_DEF);
		vp->v_dd = vp;
		mtx_init(&vp->v_pollinfo.vpi_lock, "vnode pollinfo", MTX_DEF);
		cache_purge(vp);
		LIST_INIT(&vp->v_cache_src);
		TAILQ_INIT(&vp->v_cache_dst);
		numvnodes++;
	}

	TAILQ_INIT(&vp->v_cleanblkhd);
	TAILQ_INIT(&vp->v_dirtyblkhd);
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	lockinit(&vp->v_lock, PVFS, "vnlock", 0, LK_NOPAUSE);
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;

	splx(s);

	vfs_object_create(vp, p, p->p_ucred);

	vnodeallocs++;
	if (vnodeallocs % vnoderecycleperiod == 0 &&
	    freevnodes < vnoderecycleminfreevn &&
	    vnoderecyclemintotalvn < numvnodes) {
		/* Recycle vnodes. */
		cache_purgeleafdirs(vnoderecyclenumber);
	}

	return (0);
}

/*
 * Move a vnode from one mount queue to another.
 */
static void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	mtx_lock(&mntvnode_mtx);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		mtx_unlock(&mntvnode_mtx);
		return;
	}
	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	mtx_unlock(&mntvnode_mtx);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t) &vp->v_numoutput);
		}
	}
}
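/*
 * Illustrative sketch (not part of the original code): the sleeping side
 * of the v_numoutput/VBWAIT handshake that vwakeup() completes.  The same
 * pattern appears in vinvalbuf() and vtruncbuf() below; it is shown here
 * only to make the pairing explicit.
 */
#if 0
	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "examplewait", 0);
	}
#endif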
/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;

	mtx_assert(&vm_mtx, MA_NOTOWNED);

	if (flags & V_SAVE) {
		s = splbio();
		while (vp->v_numoutput) {
			vp->v_flag |= VBWAIT;
			error = tsleep((caddr_t)&vp->v_numoutput,
			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
			if (error) {
				splx(s);
				return (error);
			}
		}
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			splx(s);
			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
				return (error);
			s = splbio();
			if (vp->v_numoutput > 0 ||
			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
				panic("vinvalbuf: dirty bufs");
		}
		splx(s);
	}
	s = splbio();
	for (;;) {
		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
		if (!blist)
			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
				error = BUF_TIMELOCK(bp,
				    LK_EXCLUSIVE | LK_SLEEPFAIL,
				    "vinvalbuf", slpflag, slptimeo);
				if (error == ENOLCK)
					break;
				splx(s);
				return (error);
			}
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it.  Note that vfs_bio_awrite expects
			 * buffers to reside on a queue, while BUF_WRITE and
			 * brelse do not.
			 */
			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
			    (flags & V_SAVE)) {

				if (bp->b_vp == vp) {
					if (bp->b_flags & B_CLUSTEROK) {
						BUF_UNLOCK(bp);
						vfs_bio_awrite(bp);
					} else {
						bremfree(bp);
						bp->b_flags |= B_ASYNC;
						BUF_WRITE(bp);
					}
				} else {
					bremfree(bp);
					(void) BUF_WRITE(bp);
				}
				break;
			}
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
	}

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	mtx_lock(&vp->v_interlock);
	if (VOP_GETVOBJECT(vp, &object) == 0) {
		mtx_lock(&vm_mtx);
		vm_object_page_remove(object, 0, 0,
		    (flags & V_SAVE) ? TRUE : FALSE);
		mtx_unlock(&vm_mtx);
	}
	mtx_unlock(&vp->v_interlock);

	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
		panic("vinvalbuf: flush failed");
	return (0);
}
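/*
 * Illustrative sketch (not part of the original code): the two common ways
 * vinvalbuf() is called.  With V_SAVE dirty data is written back before
 * the buffers are invalidated; with a flags value of 0 the buffers are
 * simply tossed.  vclean() below uses exactly this fallback sequence.
 */
#if 0
	if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
		vinvalbuf(vp, 0, NOCRED, p, 0, 0);
#endif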
/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 */
int
vtruncbuf(vp, cred, p, length, blksize)
	register struct vnode *vp;
	struct ucred *cred;
	struct proc *p;
	off_t length;
	int blksize;
{
	register struct buf *bp;
	struct buf *nbp;
	int s, anyfreed;
	int trunclbn;

	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	s = splbio();
restart:
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
				    (nbp->b_vp != vp) ||
				    (nbp->b_flags & B_DELWRI))) {
					goto restart;
				}
			}
		}

		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
				    (nbp->b_vp != vp) ||
				    (nbp->b_flags & B_DELWRI) == 0)) {
					goto restart;
				}
			}
		}
	}

	if (length > 0) {
restartsync:
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					if (bp->b_vp == vp) {
						bp->b_flags |= B_ASYNC;
					} else {
						bp->b_flags &= ~B_ASYNC;
					}
					BUF_WRITE(bp);
				}
				goto restartsync;
			}

		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
	}

	splx(s);

	vnode_pager_setsize(vp, length);

	return (0);
}
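/*
 * Illustrative worked example (not part of the original code) of the
 * "round up to the next lbn" computation in vtruncbuf() above.  With a
 * block size of 8192: a length of 0 gives trunclbn 0 (every buffer lies
 * past the end and is discarded), a length of 1 gives trunclbn 1 (block 0
 * is kept), and a length of 8192 also gives trunclbn 1, since the file
 * ends exactly on the block boundary.
 */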
/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	int s;

	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));

	vhold(vp);
	bp->b_vp = vp;
	bp->b_dev = vn_todev(vp);
	/*
	 * Insert onto list for new vnode.
	 */
	s = splbio();
	bp->b_xflags |= BX_VNCLEAN;
	bp->b_xflags &= ~BX_VNDIRTY;
	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
	splx(s);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	struct buflists *listheadp;
	int s;

	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	s = splbio();
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &vp->v_dirtyblkhd;
		else
			listheadp = &vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
	}
	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
	splx(s);
	bp->b_vp = (struct vnode *) 0;
	vdrop(vp);
}

/*
 * Add an item to the syncer work queue.
 */
static void
vn_syncer_add_to_worklist(struct vnode *vp, int delay)
{
	int s, slot;

	s = splbio();

	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
	}

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
	vp->v_flag |= VONWORKLST;
	splx(s);
}
struct proc *updateproc;
static void sched_sync __P((void));
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

/*
 * System filesystem synchronizer daemon.
 */
void
sched_sync(void)
{
	struct synclist *slp;
	struct vnode *vp;
	struct mount *mp;
	long starttime;
	int s;
	struct proc *p = updateproc;

	mtx_lock(&Giant);

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
	    SHUTDOWN_PRI_LAST);

	for (;;) {
		kthread_suspend_check(p);

		starttime = time_second;

		/*
		 * Push files whose dirty time has expired.  Be careful
		 * of interrupt race on slp queue.
		 */
		s = splbio();
		slp = &syncer_workitem_pending[syncer_delayno];
		syncer_delayno += 1;
		if (syncer_delayno == syncer_maxdelay)
			syncer_delayno = 0;
		splx(s);

		while ((vp = LIST_FIRST(slp)) != NULL) {
			if (VOP_ISLOCKED(vp, NULL) == 0 &&
			    vn_start_write(vp, &mp, V_NOWAIT) == 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
				(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
				VOP_UNLOCK(vp, 0, p);
				vn_finished_write(mp);
			}
			s = splbio();
			if (LIST_FIRST(slp) == vp) {
				/*
				 * Note: v_tag VT_VFS vps can remain on the
				 * worklist too with no dirty blocks, but
				 * since sync_fsync() moves it to a different
				 * slot we are safe.
				 */
				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
				    !vn_isdisk(vp, NULL))
					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
				/*
				 * Put us back on the worklist.  The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 */
				vn_syncer_add_to_worklist(vp, syncdelay);
			}
			splx(s);
		}

		/*
		 * Do soft update processing.
		 */
#ifdef SOFTUPDATES
		softdep_process_worklist(NULL);
#endif

		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process.  A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP.  Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * If it has taken us less than a second to process the
		 * current work, then wait.  Otherwise start right over
		 * again.  We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (time_second == starttime)
			tsleep(&lbolt, PPAUSE, "syncer", 0);
	}
}

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 */
int
speedup_syncer()
{

	mtx_lock_spin(&sched_lock);
	if (updateproc->p_wchan == &lbolt)
		setrunnable(updateproc);
	mtx_unlock_spin(&sched_lock);
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		stat_rush_requests += 1;
		return (1);
	}
	return (0);
}
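/*
 * Illustrative sketch (not part of the original code): how a subsystem
 * that is falling behind on disk writes might nudge the syncer using the
 * interface above.  The soft updates code is the real consumer of rushjob;
 * this fragment is hypothetical and only shows the calling convention.
 */
#if 0
	int i;

	/*
	 * Request a few seconds worth of immediate syncer work; each call
	 * adds at most one slot and returns 0 once the cap (syncdelay / 2)
	 * has been reached.
	 */
	for (i = 0; i < 3; i++)
		if (speedup_syncer() == 0)
			break;
#endif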
/*
 * Associate a p-buffer with a vnode.
 *
 * Also sets B_PAGING flag to indicate that vnode is not fully associated
 * with the buffer.  i.e. the bp has not been linked into the vnode or
 * ref-counted.
 *
 * Doesn't block, only vnode seems to need a lock.
 */
void
pbgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{

	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));

	bp->b_vp = vp;
	bp->b_flags |= B_PAGING;
	bp->b_dev = vn_todev(vp);
}

/*
 * Disassociate a p-buffer from a vnode.
 */
void
pbrelvp(bp)
	register struct buf *bp;
{

	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));

	/* XXX REMOVE ME */
	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
		panic(
		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
		    bp,
		    (int)bp->b_flags
		);
	}
	bp->b_vp = (struct vnode *) 0;
	bp->b_flags &= ~B_PAGING;
}

/*
 * Change the vnode a pager buffer is associated with.
 */
void
pbreassignbuf(bp, newvp)
	struct buf *bp;
	struct vnode *newvp;
{

	KASSERT(bp->b_flags & B_PAGING,
	    ("pbreassignbuf() on non phys bp %p", bp));
	bp->b_vp = newvp;
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	struct buflists *listheadp;
	int delay;
	int s;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}
	++reassignbufcalls;

	/*
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	 */
	if (bp->b_flags & B_PAGING)
		panic("cannot reassign paging buffer");

	s = splbio();
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &bp->b_vp->v_dirtyblkhd;
		else
			listheadp = &bp->b_vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
		if (bp->b_vp != newvp) {
			vdrop(bp->b_vp);
			bp->b_vp = NULL;	/* for clarification */
		}
	}
	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI) {
		struct buf *tbp;

		listheadp = &newvp->v_dirtyblkhd;
		if ((newvp->v_flag & VONWORKLST) == 0) {
			switch (newvp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VCHR:
				if (newvp->v_rdev->si_mountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
			}
			vn_syncer_add_to_worklist(newvp, delay);
		}
		bp->b_xflags |= BX_VNDIRTY;
		tbp = TAILQ_FIRST(listheadp);
		if (tbp == NULL ||
		    bp->b_lblkno == 0 ||
		    (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (bp->b_lblkno < 0) {
			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (reassignbufmethod == 1) {
			/*
			 * New sorting algorithm, only handle sequential case,
			 * otherwise append to end (but before metadata)
			 */
			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
			    (tbp->b_xflags & BX_VNDIRTY)) {
				/*
				 * Found the best place to insert the buffer
				 */
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortgood;
			} else {
				/*
				 * Missed, append to end, but before meta-data.
				 * We know that the head buffer in the list is
				 * not meta-data due to prior conditionals.
				 *
				 * Indirect effects: NFS second stage write
				 * tends to wind up here, giving maximum
				 * distance between the unstable write and the
				 * commit rpc.
				 */
				tbp = TAILQ_LAST(listheadp, buflists);
				while (tbp && tbp->b_lblkno < 0)
					tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortbad;
			}
		} else {
			/*
			 * Old sorting algorithm, scan queue and insert
			 */
			struct buf *ttbp;
			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
			    (ttbp->b_lblkno < bp->b_lblkno)) {
				++reassignbufloops;
				tbp = ttbp;
			}
			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
		}
	} else {
		bp->b_xflags |= BX_VNCLEAN;
		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
		if ((newvp->v_flag & VONWORKLST) &&
		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	}
	if (bp->b_vp != newvp) {
		bp->b_vp = newvp;
		vhold(bp->b_vp);
	}
	splx(s);
}
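/*
 * Illustrative worked example (not part of the original code) of the
 * dirty-list ordering maintained by reassignbuf() above.  Buffers with a
 * negative b_lblkno (indirect blocks and other metadata) collect at the
 * tail; data buffers are kept roughly sorted by logical block number at
 * the front.  So a dirty list might look like
 *
 *	lblkno:  0, 1, 2, 7, 8, ..., -1, -2
 *
 * and a newly dirtied block 3 would be inserted right after block 2 when
 * the sequential (reassignbufmethod == 1) heuristic finds block 2 in
 * core, or appended just before the metadata otherwise.
 */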
/*
 * Create a vnode for a device.
 * Used for mounting the root file system.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	if (vfinddev(dev, VCHR, vpp))
		return (0);
	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VCHR;
	addalias(vp, dev);
	*vpp = vp;
	return (0);
}

/*
 * Add vnode to the alias list hung off the dev_t.
 *
 * The reason for this gunk is that multiple vnodes can reference
 * the same physical device, so checking vp->v_usecount to see
 * how many users there are is inadequate; the v_usecount for
 * the vnodes needs to be accumulated.  vcount() does that.
 */
struct vnode *
addaliasu(nvp, nvp_rdev)
	struct vnode *nvp;
	udev_t nvp_rdev;
{
	struct vnode *ovp;
	vop_t **ops;
	dev_t dev;

	if (nvp->v_type == VBLK)
		return (nvp);
	if (nvp->v_type != VCHR)
		panic("addaliasu on non-special vnode");
	dev = udev2dev(nvp_rdev, 0);
	/*
	 * Check to see if we have a bdevvp vnode with no associated
	 * filesystem.  If so, we want to associate the filesystem of
	 * the newly instigated vnode with the bdevvp vnode and
	 * discard the newly created vnode rather than leaving the
	 * bdevvp vnode lying around with no associated filesystem.
	 */
	if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
		addalias(nvp, dev);
		return (nvp);
	}
	/*
	 * Discard unneeded vnode, but save its node specific data.
	 * Note that if there is a lock, it is carried over in the
	 * node specific data to the replacement vnode.
	 */
	vref(ovp);
	ovp->v_data = nvp->v_data;
	ovp->v_tag = nvp->v_tag;
	nvp->v_data = NULL;
	lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg,
	    nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK);
	if (nvp->v_vnlock)
		ovp->v_vnlock = &ovp->v_lock;
	ops = ovp->v_op;
	ovp->v_op = nvp->v_op;
	if (VOP_ISLOCKED(nvp, curproc)) {
		VOP_UNLOCK(nvp, 0, curproc);
		vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curproc);
	}
	nvp->v_op = ops;
	insmntque(ovp, nvp->v_mount);
	vrele(nvp);
	vgone(nvp);
	return (ovp);
}

/*
 * This is a local helper function that does the same as addaliasu, but
 * for a dev_t instead of a udev_t.
 */
static void
addalias(nvp, dev)
	struct vnode *nvp;
	dev_t dev;
{

	KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
	nvp->v_rdev = dev;
	mtx_lock(&spechash_mtx);
	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
	mtx_unlock(&spechash_mtx);
}
/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it.  The vnode lock bit is set if the
 * vnode is being eliminated in vgone.  The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0)
		mtx_lock(&vp->v_interlock);
	if (vp->v_flag & VXLOCK) {
		if (vp->v_vxproc == curproc) {
			printf("VXLOCK interlock avoided\n");
		} else {
			vp->v_flag |= VXWANT;
			msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
			    "vget", 0);
			return (ENOENT);
		}
	}

	vp->v_usecount++;

	if (VSHOULDBUSY(vp))
		vbusy(vp);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active.  We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			mtx_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (VSHOULDFREE(vp))
				vfree(vp);
			mtx_unlock(&vp->v_interlock);
		}
		return (error);
	}
	mtx_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Increase the reference count of a vnode.
 */
void
vref(struct vnode *vp)
{
	mtx_lock(&vp->v_interlock);
	vp->v_usecount++;
	mtx_unlock(&vp->v_interlock);
}

/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vrele: null vp"));

	mtx_lock(&vp->v_interlock);

	/* Skip this v_writecount check if we're going to panic below. */
	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
	    ("vrele: missed vn_close"));

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		mtx_unlock(&vp->v_interlock);

		return;
	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
		/*
		 * If we are doing a vput, the node is already locked, and we must
		 * call VOP_INACTIVE with the node locked.  So, in the case of
		 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
		 */
		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
			VOP_INACTIVE(vp, p);
		}

	} else {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
		mtx_unlock(&vp->v_interlock);
#endif
		panic("vrele: negative ref cnt");
	}
}
/*
 * Release an already locked vnode.  This gives the same effect as
 * unlock+vrele(), but takes less time and avoids releasing and
 * re-acquiring the lock (as vrele() acquires the lock internally.)
 */
void
vput(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	mtx_assert(&Giant, MA_OWNED);
	KASSERT(vp != NULL, ("vput: null vp"));
	mtx_lock(&vp->v_interlock);
	/* Skip this v_writecount check if we're going to panic below. */
	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
	    ("vput: missed vn_close"));

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		VOP_UNLOCK(vp, LK_INTERLOCK, p);
		return;

	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
		/*
		 * If we are doing a vput, the node is already locked, and we must
		 * call VOP_INACTIVE with the node locked.  So, in the case of
		 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
		 */
		mtx_unlock(&vp->v_interlock);
		VOP_INACTIVE(vp, p);

	} else {
#ifdef DIAGNOSTIC
		vprint("vput: negative ref count", vp);
#endif
		panic("vput: negative ref cnt");
	}
}

/*
 * Somebody doesn't want the vnode recycled.
 */
void
vhold(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	vp->v_holdcnt++;
	if (VSHOULDBUSY(vp))
		vbusy(vp);
	splx(s);
}

/*
 * Note that there is one less who cares about this vnode.  vdrop() is the
 * opposite of vhold().
 */
void
vdrop(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	if (vp->v_holdcnt <= 0)
		panic("vdrop: holdcnt");
	vp->v_holdcnt--;
	if (VSHOULDFREE(vp))
		vfree(vp);
	splx(s);
}
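/*
 * Illustrative sketch (not part of the original code): the usual pairing
 * of the reference/lock primitives above.  A caller that wants to operate
 * on a vnode it found on some list typically gets a locked reference with
 * vget() and drops both with vput(); vrele() is the unlock-less
 * counterpart.  Function name and error handling are hypothetical.
 */
#if 0
static int
example_use_vnode(struct vnode *vp, struct proc *p)
{
	int error;

	if ((error = vget(vp, LK_EXCLUSIVE, p)) != 0)
		return (error);	/* e.g. the vnode was being recycled */
	/* ... operate on the locked, referenced vnode ... */
	vput(vp);		/* unlock and release in one step */
	return (0);
}
#endif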
/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error).  If FORCECLOSE is specified, detach any active vnodes
 * that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped.
 *
 * `rootrefs' specifies the base reference count for the root vnode
 * of this filesystem.  The root vnode is considered busy if its
 * v_usecount exceeds this value.  On a successful return, vflush()
 * will call vrele() on the root vnode exactly rootrefs times.
 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
 * be zero.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

int
vflush(mp, rootrefs, flags)
	struct mount *mp;
	int rootrefs;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp, *rootvp = NULL;
	int busy = 0, error;

	if (rootrefs > 0) {
		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
		    ("vflush: bad args"));
		/*
		 * Get the filesystem root vnode.  We can vput() it
		 * immediately, since with rootrefs > 0, it won't go away.
		 */
		if ((error = VFS_ROOT(mp, &rootvp)) != 0)
			return (error);
		vput(rootvp);
	}
	mtx_lock(&mntvnode_mtx);
loop:
	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = LIST_NEXT(vp, v_mntvnodes);

		mtx_unlock(&mntvnode_mtx);
		mtx_lock(&vp->v_interlock);
		/*
		 * Skip over vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			mtx_unlock(&vp->v_interlock);
			mtx_lock(&mntvnode_mtx);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file vnodes
		 * open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			mtx_unlock(&vp->v_interlock);
			mtx_lock(&mntvnode_mtx);
			continue;
		}

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			vgonel(vp, p);
			mtx_lock(&mntvnode_mtx);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode.  For block
		 * or character devices, revert to an anonymous device.  For
		 * all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			if (vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			mtx_lock(&mntvnode_mtx);
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		mtx_unlock(&vp->v_interlock);
		mtx_lock(&mntvnode_mtx);
		busy++;
	}
	mtx_unlock(&mntvnode_mtx);
	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
		/*
		 * If just the root vnode is busy, and if its refcount
		 * is equal to `rootrefs', then go ahead and kill it.
		 */
		mtx_lock(&rootvp->v_interlock);
		KASSERT(busy > 0, ("vflush: not busy"));
		KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
		if (busy == 1 && rootvp->v_usecount == rootrefs) {
			vgonel(rootvp, p);
			busy = 0;
		} else
			mtx_unlock(&rootvp->v_interlock);
	}
	if (busy)
		return (EBUSY);
	for (; rootrefs > 0; rootrefs--)
		vrele(rootvp);
	return (0);
}
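/*
 * Illustrative sketch (not part of the original code): how an unmount
 * path might use vflush() as described above.  A non-forced unmount asks
 * for no busy vnodes at all and backs off on EBUSY; a forced unmount
 * passes FORCECLOSE so that active vnodes are disassociated.  The
 * rootrefs value depends on how many root-vnode references the caller
 * itself holds, so the 0 here is only a placeholder, and mp, mntflags and
 * error are assumed to come from the (hypothetical) caller.
 */
#if 0
	error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0);
	if (error)
		return (error);
#endif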
/*
 * Disassociate the underlying file system from a vnode.
 */
static void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	int active;

	/*
	 * Check to see if the vnode is in use.  If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		vp->v_usecount++;

	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	vp->v_vxproc = curproc;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out.  The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

	/*
	 * Clean out any buffers associated with the vnode.
	 * If the flush fails, just toss the buffers.
	 */
	if (flags & DOCLOSE) {
		if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
			(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
		if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
			vinvalbuf(vp, 0, NOCRED, p, 0, 0);
	}

	VOP_DESTROYVOBJECT(vp);

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");

	if (active) {
		/*
		 * Inline copy of vrele() since VOP_INACTIVE
		 * has already been called.
		 */
		mtx_lock(&vp->v_interlock);
		if (--vp->v_usecount <= 0) {
#ifdef DIAGNOSTIC
			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
				vprint("vclean: bad ref count", vp);
				panic("vclean: ref cnt");
			}
#endif
			vfree(vp);
		}
		mtx_unlock(&vp->v_interlock);
	}

	cache_purge(vp);
	vp->v_vnlock = NULL;
	lockdestroy(&vp->v_lock);

	if (VSHOULDFREE(vp))
		vfree(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vn_pollgone(vp);
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	vp->v_vxproc = NULL;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t) vp);
	}
}
/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
vop_revoke(ap)
	struct vop_revoke_args /* {
		struct vnode *a_vp;
		int a_flags;
	} */ *ap;
{
	struct vnode *vp, *vq;
	dev_t dev;

	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));

	vp = ap->a_vp;
	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
		    "vop_revokeall", 0);
		return (0);
	}
	dev = vp->v_rdev;
	for (;;) {
		mtx_lock(&spechash_mtx);
		vq = SLIST_FIRST(&dev->si_hlist);
		mtx_unlock(&spechash_mtx);
		if (!vq)
			break;
		vgone(vq);
	}
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct mtx *inter_lkp;
	struct proc *p;
{

	mtx_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp) {
			mtx_unlock(inter_lkp);
		}
		vgonel(vp, p);
		return (1);
	}
	mtx_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	mtx_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int s;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
		    "vgone", 0);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	mtx_lock(&vp->v_interlock);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) {
		mtx_lock(&spechash_mtx);
		SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
		freedev(vp->v_rdev);
		mtx_unlock(&spechash_mtx);
		vp->v_rdev = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list.  The test of the
	 * VDOOMED flag and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone.  If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 */
	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
		s = splbio();
		mtx_lock(&vnode_free_list_mtx);
		if (vp->v_flag & VFREE)
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		else
			freevnodes++;
		vp->v_flag |= VFREE;
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		mtx_unlock(&vnode_free_list_mtx);
		splx(s);
	}

	vp->v_type = VBAD;
	mtx_unlock(&vp->v_interlock);
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	struct vnode *vp;

	mtx_lock(&spechash_mtx);
	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
		if (type == vp->v_type) {
			*vpp = vp;
			mtx_unlock(&spechash_mtx);
			return (1);
		}
	}
	mtx_unlock(&spechash_mtx);
	return (0);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
	struct vnode *vp;
{
	struct vnode *vq;
	int count;

	count = 0;
	mtx_lock(&spechash_mtx);
	SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext)
		count += vq->v_usecount;
	mtx_unlock(&spechash_mtx);
	return (count);
}

/*
 * Same as above, but using the dev_t as argument
 */
int
count_dev(dev)
	dev_t dev;
{
	struct vnode *vp;

	vp = SLIST_FIRST(&dev->si_hlist);
	if (vp == NULL)
		return (0);
	return (vcount(vp));
}
2083 */ 2084 static char *typename[] = 2085 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; 2086 2087 void 2088 vprint(label, vp) 2089 char *label; 2090 struct vnode *vp; 2091 { 2092 char buf[96]; 2093 2094 if (label != NULL) 2095 printf("%s: %p: ", label, (void *)vp); 2096 else 2097 printf("%p: ", (void *)vp); 2098 printf("type %s, usecount %d, writecount %d, refcount %d,", 2099 typename[vp->v_type], vp->v_usecount, vp->v_writecount, 2100 vp->v_holdcnt); 2101 buf[0] = '\0'; 2102 if (vp->v_flag & VROOT) 2103 strcat(buf, "|VROOT"); 2104 if (vp->v_flag & VTEXT) 2105 strcat(buf, "|VTEXT"); 2106 if (vp->v_flag & VSYSTEM) 2107 strcat(buf, "|VSYSTEM"); 2108 if (vp->v_flag & VXLOCK) 2109 strcat(buf, "|VXLOCK"); 2110 if (vp->v_flag & VXWANT) 2111 strcat(buf, "|VXWANT"); 2112 if (vp->v_flag & VBWAIT) 2113 strcat(buf, "|VBWAIT"); 2114 if (vp->v_flag & VDOOMED) 2115 strcat(buf, "|VDOOMED"); 2116 if (vp->v_flag & VFREE) 2117 strcat(buf, "|VFREE"); 2118 if (vp->v_flag & VOBJBUF) 2119 strcat(buf, "|VOBJBUF"); 2120 if (buf[0] != '\0') 2121 printf(" flags (%s)", &buf[1]); 2122 if (vp->v_data == NULL) { 2123 printf("\n"); 2124 } else { 2125 printf("\n\t"); 2126 VOP_PRINT(vp); 2127 } 2128 } 2129 2130 #ifdef DDB 2131 #include <ddb/ddb.h> 2132 /* 2133 * List all of the locked vnodes in the system. 2134 * Called when debugging the kernel. 2135 */ 2136 DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) 2137 { 2138 struct proc *p = curproc; /* XXX */ 2139 struct mount *mp, *nmp; 2140 struct vnode *vp; 2141 2142 printf("Locked vnodes\n"); 2143 mtx_lock(&mountlist_mtx); 2144 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2145 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) { 2146 nmp = TAILQ_NEXT(mp, mnt_list); 2147 continue; 2148 } 2149 mtx_lock(&mntvnode_mtx); 2150 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 2151 if (VOP_ISLOCKED(vp, NULL)) 2152 vprint((char *)0, vp); 2153 } 2154 mtx_unlock(&mntvnode_mtx); 2155 mtx_lock(&mountlist_mtx); 2156 nmp = TAILQ_NEXT(mp, mnt_list); 2157 vfs_unbusy(mp, p); 2158 } 2159 mtx_unlock(&mountlist_mtx); 2160 } 2161 #endif 2162 2163 /* 2164 * Top level filesystem related information gathering. 2165 */ 2166 static int sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS)); 2167 2168 static int 2169 vfs_sysctl(SYSCTL_HANDLER_ARGS) 2170 { 2171 int *name = (int *)arg1 - 1; /* XXX */ 2172 u_int namelen = arg2 + 1; /* XXX */ 2173 struct vfsconf *vfsp; 2174 2175 #if 1 || defined(COMPAT_PRELITE2) 2176 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ 2177 if (namelen == 1) 2178 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 2179 #endif 2180 2181 /* XXX the below code does not compile; vfs_sysctl does not exist. 
*/ 2182 #ifdef notyet 2183 /* all sysctl names at this level are at least name and field */ 2184 if (namelen < 2) 2185 return (ENOTDIR); /* overloaded */ 2186 if (name[0] != VFS_GENERIC) { 2187 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2188 if (vfsp->vfc_typenum == name[0]) 2189 break; 2190 if (vfsp == NULL) 2191 return (EOPNOTSUPP); 2192 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, 2193 oldp, oldlenp, newp, newlen, p)); 2194 } 2195 #endif 2196 switch (name[1]) { 2197 case VFS_MAXTYPENUM: 2198 if (namelen != 2) 2199 return (ENOTDIR); 2200 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 2201 case VFS_CONF: 2202 if (namelen != 3) 2203 return (ENOTDIR); /* overloaded */ 2204 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2205 if (vfsp->vfc_typenum == name[2]) 2206 break; 2207 if (vfsp == NULL) 2208 return (EOPNOTSUPP); 2209 return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); 2210 } 2211 return (EOPNOTSUPP); 2212 } 2213 2214 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, 2215 "Generic filesystem"); 2216 2217 #if 1 || defined(COMPAT_PRELITE2) 2218 2219 static int 2220 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 2221 { 2222 int error; 2223 struct vfsconf *vfsp; 2224 struct ovfsconf ovfs; 2225 2226 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { 2227 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 2228 strcpy(ovfs.vfc_name, vfsp->vfc_name); 2229 ovfs.vfc_index = vfsp->vfc_typenum; 2230 ovfs.vfc_refcount = vfsp->vfc_refcount; 2231 ovfs.vfc_flags = vfsp->vfc_flags; 2232 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 2233 if (error) 2234 return error; 2235 } 2236 return 0; 2237 } 2238 2239 #endif /* 1 || COMPAT_PRELITE2 */ 2240 2241 #if COMPILING_LINT 2242 #define KINFO_VNODESLOP 10 2243 /* 2244 * Dump vnode list (via sysctl). 2245 * Copyout address of vnode followed by vnode. 2246 */ 2247 /* ARGSUSED */ 2248 static int 2249 sysctl_vnode(SYSCTL_HANDLER_ARGS) 2250 { 2251 struct proc *p = curproc; /* XXX */ 2252 struct mount *mp, *nmp; 2253 struct vnode *nvp, *vp; 2254 int error; 2255 2256 #define VPTRSZ sizeof (struct vnode *) 2257 #define VNODESZ sizeof (struct vnode) 2258 2259 req->lock = 0; 2260 if (!req->oldptr) /* Make an estimate */ 2261 return (SYSCTL_OUT(req, 0, 2262 (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); 2263 2264 mtx_lock(&mountlist_mtx); 2265 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2266 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) { 2267 nmp = TAILQ_NEXT(mp, mnt_list); 2268 continue; 2269 } 2270 mtx_lock(&mntvnode_mtx); 2271 again: 2272 for (vp = LIST_FIRST(&mp->mnt_vnodelist); 2273 vp != NULL; 2274 vp = nvp) { 2275 /* 2276 * Check that the vp is still associated with 2277 * this filesystem. RACE: could have been 2278 * recycled onto the same filesystem. 2279 */ 2280 if (vp->v_mount != mp) 2281 goto again; 2282 nvp = LIST_NEXT(vp, v_mntvnodes); 2283 mtx_unlock(&mntvnode_mtx); 2284 if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || 2285 (error = SYSCTL_OUT(req, vp, VNODESZ))) 2286 return (error); 2287 mtx_lock(&mntvnode_mtx); 2288 } 2289 mtx_unlock(&mntvnode_mtx); 2290 mtx_lock(&mountlist_mtx); 2291 nmp = TAILQ_NEXT(mp, mnt_list); 2292 vfs_unbusy(mp, p); 2293 } 2294 mtx_unlock(&mountlist_mtx); 2295 2296 return (0); 2297 } 2298 2299 /* 2300 * XXX 2301 * Exporting the vnode list on large systems causes them to crash. 2302 * Exporting the vnode list on medium systems causes sysctl to coredump. 
2303 */ 2304 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 2305 0, 0, sysctl_vnode, "S,vnode", ""); 2306 #endif 2307 2308 /* 2309 * Check to see if a filesystem is mounted on a block device. 2310 */ 2311 int 2312 vfs_mountedon(vp) 2313 struct vnode *vp; 2314 { 2315 2316 if (vp->v_rdev->si_mountpoint != NULL) 2317 return (EBUSY); 2318 return (0); 2319 } 2320 2321 /* 2322 * Unmount all filesystems. The list is traversed in reverse order 2323 * of mounting to avoid dependencies. 2324 */ 2325 void 2326 vfs_unmountall() 2327 { 2328 struct mount *mp; 2329 struct proc *p; 2330 int error; 2331 2332 if (curproc != NULL) 2333 p = curproc; 2334 else 2335 p = initproc; /* XXX XXX should this be proc0? */ 2336 /* 2337 * Since this only runs when rebooting, it is not interlocked. 2338 */ 2339 while(!TAILQ_EMPTY(&mountlist)) { 2340 mp = TAILQ_LAST(&mountlist, mntlist); 2341 error = dounmount(mp, MNT_FORCE, p); 2342 if (error) { 2343 TAILQ_REMOVE(&mountlist, mp, mnt_list); 2344 printf("unmount of %s failed (", 2345 mp->mnt_stat.f_mntonname); 2346 if (error == EBUSY) 2347 printf("BUSY)\n"); 2348 else 2349 printf("%d)\n", error); 2350 } else { 2351 /* The unmount has removed mp from the mountlist */ 2352 } 2353 } 2354 } 2355 2356 /* 2357 * perform msync on all vnodes under a mount point 2358 * the mount point must be locked. 2359 */ 2360 void 2361 vfs_msync(struct mount *mp, int flags) { 2362 struct vnode *vp, *nvp; 2363 struct vm_object *obj; 2364 int anyio, tries; 2365 2366 tries = 5; 2367 loop: 2368 anyio = 0; 2369 mtx_lock(&mntvnode_mtx); 2370 for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) { 2371 2372 nvp = LIST_NEXT(vp, v_mntvnodes); 2373 2374 if (vp->v_mount != mp) { 2375 mtx_unlock(&mntvnode_mtx); 2376 goto loop; 2377 } 2378 2379 if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */ 2380 continue; 2381 2382 if (flags != MNT_WAIT) { 2383 if (VOP_GETVOBJECT(vp, &obj) != 0 || 2384 (obj->flags & OBJ_MIGHTBEDIRTY) == 0) 2385 continue; 2386 if (VOP_ISLOCKED(vp, NULL)) 2387 continue; 2388 } 2389 2390 mtx_unlock(&mntvnode_mtx); 2391 mtx_lock(&vp->v_interlock); 2392 if (VOP_GETVOBJECT(vp, &obj) == 0 && 2393 (obj->flags & OBJ_MIGHTBEDIRTY)) { 2394 if (!vget(vp, 2395 LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) { 2396 if (VOP_GETVOBJECT(vp, &obj) == 0) { 2397 mtx_lock(&vm_mtx); 2398 vm_object_page_clean(obj, 0, 0, 2399 flags == MNT_WAIT ? 2400 OBJPC_SYNC : OBJPC_NOSYNC); 2401 mtx_unlock(&vm_mtx); 2402 anyio = 1; 2403 } 2404 vput(vp); 2405 } 2406 } else { 2407 mtx_unlock(&vp->v_interlock); 2408 } 2409 mtx_lock(&mntvnode_mtx); 2410 } 2411 mtx_unlock(&mntvnode_mtx); 2412 if (anyio && (--tries > 0)) 2413 goto loop; 2414 } 2415 2416 /* 2417 * Create the VM object needed for VMIO and mmap support. This 2418 * is done for all VREG files in the system. Some filesystems might 2419 * afford the additional metadata buffering capability of the 2420 * VMIO code by making the device node be VMIO mode also. 2421 * 2422 * vp must be locked when vfs_object_create is called. 2423 */ 2424 int 2425 vfs_object_create(vp, p, cred) 2426 struct vnode *vp; 2427 struct proc *p; 2428 struct ucred *cred; 2429 { 2430 2431 mtx_assert(&vm_mtx, MA_NOTOWNED); 2432 return (VOP_CREATEVOBJECT(vp, cred, p)); 2433 } 2434 2435 /* 2436 * Mark a vnode as free, putting it up for recycling. 
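 *
 * (Typically reached from vrele()/vput() via the VSHOULDFREE() check once
 * the last reference has been dropped; getnewvnode() later takes the vnode
 * back off the free list for reuse.)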
2437 */ 2438 void 2439 vfree(vp) 2440 struct vnode *vp; 2441 { 2442 int s; 2443 2444 s = splbio(); 2445 mtx_lock(&vnode_free_list_mtx); 2446 KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free")); 2447 if (vp->v_flag & VAGE) { 2448 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 2449 } else { 2450 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 2451 } 2452 freevnodes++; 2453 mtx_unlock(&vnode_free_list_mtx); 2454 vp->v_flag &= ~VAGE; 2455 vp->v_flag |= VFREE; 2456 splx(s); 2457 } 2458 2459 /* 2460 * Opposite of vfree() - mark a vnode as in use. 2461 */ 2462 void 2463 vbusy(vp) 2464 struct vnode *vp; 2465 { 2466 int s; 2467 2468 s = splbio(); 2469 mtx_lock(&vnode_free_list_mtx); 2470 KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free")); 2471 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 2472 freevnodes--; 2473 mtx_unlock(&vnode_free_list_mtx); 2474 vp->v_flag &= ~(VFREE|VAGE); 2475 splx(s); 2476 } 2477 2478 /* 2479 * Record a process's interest in events which might happen to 2480 * a vnode. Because poll uses the historic select-style interface 2481 * internally, this routine serves as both the ``check for any 2482 * pending events'' and the ``record my interest in future events'' 2483 * functions. (These are done together, while the lock is held, 2484 * to avoid race conditions.) 2485 */ 2486 int 2487 vn_pollrecord(vp, p, events) 2488 struct vnode *vp; 2489 struct proc *p; 2490 short events; 2491 { 2492 mtx_lock(&vp->v_pollinfo.vpi_lock); 2493 if (vp->v_pollinfo.vpi_revents & events) { 2494 /* 2495 * This leaves events we are not interested 2496 * in available for the other process which 2497 * presumably had requested them 2498 * (otherwise they would never have been 2499 * recorded). 2500 */ 2501 events &= vp->v_pollinfo.vpi_revents; 2502 vp->v_pollinfo.vpi_revents &= ~events; 2503 2504 mtx_unlock(&vp->v_pollinfo.vpi_lock); 2505 return events; 2506 } 2507 vp->v_pollinfo.vpi_events |= events; 2508 selrecord(p, &vp->v_pollinfo.vpi_selinfo); 2509 mtx_unlock(&vp->v_pollinfo.vpi_lock); 2510 return 0; 2511 } 2512 2513 /* 2514 * Note the occurrence of an event. If the VN_POLLEVENT macro is used, 2515 * it is possible for us to miss an event due to race conditions, but 2516 * that condition is expected to be rare, so for the moment it is the 2517 * preferred interface. 2518 */ 2519 void 2520 vn_pollevent(vp, events) 2521 struct vnode *vp; 2522 short events; 2523 { 2524 mtx_lock(&vp->v_pollinfo.vpi_lock); 2525 if (vp->v_pollinfo.vpi_events & events) { 2526 /* 2527 * We clear vpi_events so that we don't 2528 * call selwakeup() twice if two events are 2529 * posted before the polling process(es) is 2530 * awakened. This also ensures that we take at 2531 * most one selwakeup() if the polling process 2532 * is no longer interested. However, it does 2533 * mean that only one event can be noticed at 2534 * a time. (Perhaps we should only clear those 2535 * event bits which we note?) XXX 2536 */ 2537 vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */ 2538 vp->v_pollinfo.vpi_revents |= events; 2539 selwakeup(&vp->v_pollinfo.vpi_selinfo); 2540 } 2541 mtx_unlock(&vp->v_pollinfo.vpi_lock); 2542 } 2543 2544 #define VN_KNOTE(vp, b) \ 2545 KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b)) 2546 2547 /* 2548 * Wake up anyone polling on vp because it is being revoked. 2549 * This depends on dead_poll() returning POLLHUP for correct 2550 * behavior.
2551 */ 2552 void 2553 vn_pollgone(vp) 2554 struct vnode *vp; 2555 { 2556 mtx_lock(&vp->v_pollinfo.vpi_lock); 2557 VN_KNOTE(vp, NOTE_REVOKE); 2558 if (vp->v_pollinfo.vpi_events) { 2559 vp->v_pollinfo.vpi_events = 0; 2560 selwakeup(&vp->v_pollinfo.vpi_selinfo); 2561 } 2562 mtx_unlock(&vp->v_pollinfo.vpi_lock); 2563 } 2564 2565 2566 2567 /* 2568 * Routine to create and manage a filesystem syncer vnode. 2569 */ 2570 #define sync_close ((int (*) __P((struct vop_close_args *)))nullop) 2571 static int sync_fsync __P((struct vop_fsync_args *)); 2572 static int sync_inactive __P((struct vop_inactive_args *)); 2573 static int sync_reclaim __P((struct vop_reclaim_args *)); 2574 #define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock) 2575 #define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock) 2576 static int sync_print __P((struct vop_print_args *)); 2577 #define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked) 2578 2579 static vop_t **sync_vnodeop_p; 2580 static struct vnodeopv_entry_desc sync_vnodeop_entries[] = { 2581 { &vop_default_desc, (vop_t *) vop_eopnotsupp }, 2582 { &vop_close_desc, (vop_t *) sync_close }, /* close */ 2583 { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */ 2584 { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */ 2585 { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */ 2586 { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */ 2587 { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */ 2588 { &vop_print_desc, (vop_t *) sync_print }, /* print */ 2589 { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */ 2590 { NULL, NULL } 2591 }; 2592 static struct vnodeopv_desc sync_vnodeop_opv_desc = 2593 { &sync_vnodeop_p, sync_vnodeop_entries }; 2594 2595 VNODEOP_SET(sync_vnodeop_opv_desc); 2596 2597 /* 2598 * Create a new filesystem syncer vnode for the specified mount point. 2599 */ 2600 int 2601 vfs_allocate_syncvnode(mp) 2602 struct mount *mp; 2603 { 2604 struct vnode *vp; 2605 static long start, incr, next; 2606 int error; 2607 2608 /* Allocate a new vnode */ 2609 if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { 2610 mp->mnt_syncer = NULL; 2611 return (error); 2612 } 2613 vp->v_type = VNON; 2614 /* 2615 * Place the vnode onto the syncer worklist. We attempt to 2616 * scatter them about on the list so that they will go off 2617 * at evenly distributed times even if all the filesystems 2618 * are mounted at once. 2619 */ 2620 next += incr; 2621 if (next == 0 || next > syncer_maxdelay) { 2622 start /= 2; 2623 incr /= 2; 2624 if (start == 0) { 2625 start = syncer_maxdelay / 2; 2626 incr = syncer_maxdelay; 2627 } 2628 next = start; 2629 } 2630 vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); 2631 mp->mnt_syncer = vp; 2632 return (0); 2633 } 2634 2635 /* 2636 * Do a lazy sync of the filesystem. 2637 */ 2638 static int 2639 sync_fsync(ap) 2640 struct vop_fsync_args /* { 2641 struct vnode *a_vp; 2642 struct ucred *a_cred; 2643 int a_waitfor; 2644 struct proc *a_p; 2645 } */ *ap; 2646 { 2647 struct vnode *syncvp = ap->a_vp; 2648 struct mount *mp = syncvp->v_mount; 2649 struct proc *p = ap->a_p; 2650 int asyncflag; 2651 2652 /* 2653 * We only need to do something if this is a lazy evaluation. 2654 */ 2655 if (ap->a_waitfor != MNT_LAZY) 2656 return (0); 2657 2658 /* 2659 * Move ourselves to the back of the sync list. 
2660 */ 2661 vn_syncer_add_to_worklist(syncvp, syncdelay); 2662 2663 /* 2664 * Walk the list of vnodes pushing all that are dirty and 2665 * not already on the sync list. 2666 */ 2667 mtx_lock(&mountlist_mtx); 2668 if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, p) != 0) { 2669 mtx_unlock(&mountlist_mtx); 2670 return (0); 2671 } 2672 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { 2673 vfs_unbusy(mp, p); 2674 return (0); 2675 } 2676 asyncflag = mp->mnt_flag & MNT_ASYNC; 2677 mp->mnt_flag &= ~MNT_ASYNC; 2678 vfs_msync(mp, MNT_NOWAIT); 2679 VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p); 2680 if (asyncflag) 2681 mp->mnt_flag |= MNT_ASYNC; 2682 vn_finished_write(mp); 2683 vfs_unbusy(mp, p); 2684 return (0); 2685 } 2686 2687 /* 2688 * The syncer vnode is no longer referenced. 2689 */ 2690 static int 2691 sync_inactive(ap) 2692 struct vop_inactive_args /* { 2693 struct vnode *a_vp; 2694 struct proc *a_p; 2695 } */ *ap; 2696 { 2697 2698 vgone(ap->a_vp); 2699 return (0); 2700 } 2701 2702 /* 2703 * The syncer vnode is no longer needed and is being decommissioned. 2704 * 2705 * Modifications to the worklist must be protected at splbio(). 2706 */ 2707 static int 2708 sync_reclaim(ap) 2709 struct vop_reclaim_args /* { 2710 struct vnode *a_vp; 2711 } */ *ap; 2712 { 2713 struct vnode *vp = ap->a_vp; 2714 int s; 2715 2716 s = splbio(); 2717 vp->v_mount->mnt_syncer = NULL; 2718 if (vp->v_flag & VONWORKLST) { 2719 LIST_REMOVE(vp, v_synclist); 2720 vp->v_flag &= ~VONWORKLST; 2721 } 2722 splx(s); 2723 2724 return (0); 2725 } 2726 2727 /* 2728 * Print out a syncer vnode. 2729 */ 2730 static int 2731 sync_print(ap) 2732 struct vop_print_args /* { 2733 struct vnode *a_vp; 2734 } */ *ap; 2735 { 2736 struct vnode *vp = ap->a_vp; 2737 2738 printf("syncer vnode"); 2739 if (vp->v_vnlock != NULL) 2740 lockmgr_printinfo(vp->v_vnlock); 2741 printf("\n"); 2742 return (0); 2743 } 2744 2745 /* 2746 * extract the dev_t from a VCHR 2747 */ 2748 dev_t 2749 vn_todev(vp) 2750 struct vnode *vp; 2751 { 2752 if (vp->v_type != VCHR) 2753 return (NODEV); 2754 return (vp->v_rdev); 2755 } 2756 2757 /* 2758 * Check if vnode represents a disk device 2759 */ 2760 int 2761 vn_isdisk(vp, errp) 2762 struct vnode *vp; 2763 int *errp; 2764 { 2765 struct cdevsw *cdevsw; 2766 2767 if (vp->v_type != VCHR) { 2768 if (errp != NULL) 2769 *errp = ENOTBLK; 2770 return (0); 2771 } 2772 if (vp->v_rdev == NULL) { 2773 if (errp != NULL) 2774 *errp = ENXIO; 2775 return (0); 2776 } 2777 cdevsw = devsw(vp->v_rdev); 2778 if (cdevsw == NULL) { 2779 if (errp != NULL) 2780 *errp = ENXIO; 2781 return (0); 2782 } 2783 if (!(cdevsw->d_flags & D_DISK)) { 2784 if (errp != NULL) 2785 *errp = ENOTBLK; 2786 return (0); 2787 } 2788 if (errp != NULL) 2789 *errp = 0; 2790 return (1); 2791 } 2792 2793 /* 2794 * Free data allocated by namei(); see namei(9) for details.
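 *
 * A rough usage sketch (the exact lookup flags depend on the caller):
 *
 *	NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, path, p);
 *	if ((error = namei(&nd)) != 0)
 *		return (error);
 *	...
 *	NDFREE(&nd, NDF_ONLY_PNBUF);
 *	vput(nd.ni_vp);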
2795 */ 2796 void 2797 NDFREE(ndp, flags) 2798 struct nameidata *ndp; 2799 const uint flags; 2800 { 2801 if (!(flags & NDF_NO_FREE_PNBUF) && 2802 (ndp->ni_cnd.cn_flags & HASBUF)) { 2803 zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); 2804 ndp->ni_cnd.cn_flags &= ~HASBUF; 2805 } 2806 if (!(flags & NDF_NO_DVP_UNLOCK) && 2807 (ndp->ni_cnd.cn_flags & LOCKPARENT) && 2808 ndp->ni_dvp != ndp->ni_vp) 2809 VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc); 2810 if (!(flags & NDF_NO_DVP_RELE) && 2811 (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { 2812 vrele(ndp->ni_dvp); 2813 ndp->ni_dvp = NULL; 2814 } 2815 if (!(flags & NDF_NO_VP_UNLOCK) && 2816 (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) 2817 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc); 2818 if (!(flags & NDF_NO_VP_RELE) && 2819 ndp->ni_vp) { 2820 vrele(ndp->ni_vp); 2821 ndp->ni_vp = NULL; 2822 } 2823 if (!(flags & NDF_NO_STARTDIR_RELE) && 2824 (ndp->ni_cnd.cn_flags & SAVESTART)) { 2825 vrele(ndp->ni_startdir); 2826 ndp->ni_startdir = NULL; 2827 } 2828 } 2829 2830 /* 2831 * Common file system object access control check routine. Accepts a 2832 * vnode's type, "mode", uid and gid, requested access mode, credentials, 2833 * and optional call-by-reference privused argument allowing vaccess() 2834 * to indicate to the caller whether privilege was used to satisfy the 2835 * request. Returns 0 on success, or an errno on failure. 2836 */ 2837 int 2838 vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused) 2839 enum vtype type; 2840 mode_t file_mode; 2841 uid_t file_uid; 2842 gid_t file_gid; 2843 mode_t acc_mode; 2844 struct ucred *cred; 2845 int *privused; 2846 { 2847 mode_t dac_granted; 2848 #ifdef CAPABILITIES 2849 mode_t cap_granted; 2850 #endif 2851 2852 /* 2853 * Look for a normal, non-privileged way to access the file/directory 2854 * as requested. If it exists, go with that. 2855 */ 2856 2857 if (privused != NULL) 2858 *privused = 0; 2859 2860 dac_granted = 0; 2861 2862 /* Check the owner. */ 2863 if (cred->cr_uid == file_uid) { 2864 dac_granted |= VADMIN; 2865 if (file_mode & S_IXUSR) 2866 dac_granted |= VEXEC; 2867 if (file_mode & S_IRUSR) 2868 dac_granted |= VREAD; 2869 if (file_mode & S_IWUSR) 2870 dac_granted |= VWRITE; 2871 2872 if ((acc_mode & dac_granted) == acc_mode) 2873 return (0); 2874 2875 goto privcheck; 2876 } 2877 2878 /* Otherwise, check the groups (first match) */ 2879 if (groupmember(file_gid, cred)) { 2880 if (file_mode & S_IXGRP) 2881 dac_granted |= VEXEC; 2882 if (file_mode & S_IRGRP) 2883 dac_granted |= VREAD; 2884 if (file_mode & S_IWGRP) 2885 dac_granted |= VWRITE; 2886 2887 if ((acc_mode & dac_granted) == acc_mode) 2888 return (0); 2889 2890 goto privcheck; 2891 } 2892 2893 /* Otherwise, check everyone else. */ 2894 if (file_mode & S_IXOTH) 2895 dac_granted |= VEXEC; 2896 if (file_mode & S_IROTH) 2897 dac_granted |= VREAD; 2898 if (file_mode & S_IWOTH) 2899 dac_granted |= VWRITE; 2900 if ((acc_mode & dac_granted) == acc_mode) 2901 return (0); 2902 2903 privcheck: 2904 if (!suser_xxx(cred, NULL, PRISON_ROOT)) { 2905 /* XXX audit: privilege used */ 2906 if (privused != NULL) 2907 *privused = 1; 2908 return (0); 2909 } 2910 2911 #ifdef CAPABILITIES 2912 /* 2913 * Build a capability mask to determine if the set of capabilities 2914 * satisfies the requirements when combined with the granted mask 2915 * from above. 2916 * For each capability, if the capability is required, bitwise 2917 * or the request type onto the cap_granted mask. 
2918 */ 2919 cap_granted = 0; 2920 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && 2921 !cap_check_xxx(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) 2922 cap_granted |= VEXEC; 2923 2924 if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && 2925 !cap_check_xxx(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) 2926 cap_granted |= VREAD; 2927 2928 if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && 2929 !cap_check_xxx(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) 2930 cap_granted |= VWRITE; 2931 2932 if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && 2933 !cap_check_xxx(cred, NULL, CAP_FOWNER, PRISON_ROOT)) 2934 cap_granted |= VADMIN; 2935 2936 if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) { 2937 /* XXX audit: privilege used */ 2938 if (privused != NULL) 2939 *privused = 1; 2940 return (0); 2941 } 2942 #endif 2943 2944 return ((acc_mode & VADMIN) ? EPERM : EACCES); 2945 } 2946 2947
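
/*
 * Example: a filesystem's VOP_ACCESS() routine normally reduces to a single
 * vaccess() call once it has the file's attributes in hand, roughly as
 * ufs_access() does for UFS inodes ("ip" and "mode" below are the caller's
 * inode pointer and requested access mode):
 *
 *	return (vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
 *	    mode, ap->a_cred, NULL));
 */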