/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 * $FreeBSD$
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

static void	insmntque __P((struct vnode *vp, struct mount *mp));
static void	vclean __P((struct vnode *vp, int flags, struct proc *p));
static void	vfree __P((struct vnode *));
static unsigned long	numvnodes;
SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

static TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
struct tobefreelist vnode_tobefree_list;	/* vnode free list */

static u_long wantfreevnodes = 25;
SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
static u_long freevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");

static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");

#ifdef ENABLE_VFS_IOOPT
int vfs_ioopt = 0;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif

struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);	/* mounted fs */
struct simplelock mountlist_slock;
struct simplelock mntvnode_slock;
int	nfs_mount_type = -1;
#ifndef NULL_SIMPLELOCKS
static struct simplelock mntid_slock;
static struct simplelock vnode_free_list_slock;
static struct simplelock spechash_slock;
#endif
struct nfs_public nfs_pub;	/* publicly exported FS */
static vm_zone_t vnode_zone;

/*
 * The workitem queue.
 */
#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
time_t syncdelay = 30;		/* max time to delay syncing data */
time_t filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
time_t dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
time_t metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");

static int syncer_delayno = 0;
static long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;

int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");

static void	vfs_free_addrlist __P((struct netexport *nep));
static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
				       struct export_args *argp));

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit()
{

	desiredvnodes = maxproc + cnt.v_page_count / 4;
	simple_lock_init(&mntvnode_slock);
	simple_lock_init(&mntid_slock);
	simple_lock_init(&spechash_slock);
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_tobefree_list);
	simple_lock_init(&vnode_free_list_slock);
	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
}

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. Interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp, p)
	struct mount *mp;
	int flags;
	struct simplelock *interlkp;
	struct proc *p;
{
	int lkflags;

	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		if (interlkp) {
			simple_unlock(interlkp);
		}
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
		if (interlkp) {
			simple_lock(interlkp);
		}
		return (ENOENT);
	}
	lkflags = LK_SHARED | LK_NOPAUSE;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mp, p)
	struct mount *mp;
	struct proc *p;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char *fstypename;
	char *devname;
	struct mount **mpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	if (fstypename == NULL)
		return (ENODEV);
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_iosize_max = DFLTPHYS;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = 0;
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
#ifdef notdef	/* XXX JH */
int
lite2_vfs_mountroot()
{
	struct vfsconf *vfsp;
	extern int (*lite2_mountroot) __P((void));
	int error;

	if (lite2_mountroot != NULL)
		return ((*lite2_mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}
#endif

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	simple_lock(&mountlist_slock);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid
 *
 * Keep in mind that several mounts may be running in parallel,
 * so always increment mntid_base even if lower numbers are available.
 */

static u_short mntid_base;

void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);

	mtype = mp->mnt_vfc->vfc_typenum;
	for (;;) {
		tfsid.val[0] = makeudev(255, mtype + (mntid_base << 16));
		tfsid.val[1] = mtype;
		++mntid_base;
		if (vfs_getvfs(&tfsid) == NULL)
			break;
	}

	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];

	simple_unlock(&mntid_slock);
}

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
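 *
 * The values correspond to the TSP_* constants declared below.  The knob
 * can be tuned at run time through the vfs.timestamp_precision sysctl
 * declared underneath (for example, "sysctl -w vfs.timestamp_precision=3"
 * selects full precision).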
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(tsp)
	struct timespec *tsp;
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern vop_t **dead_vnodeop_p;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	int s;
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *tvp, *nvp;
	vm_object_t object;
	TAILQ_HEAD(freelst, vnode) vnode_tmp_list;

	/*
	 * We take the least recently used vnode from the freelist
	 * if we can get it and it has no cached pages, and no
	 * namecache entries are relative to it.
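	 * ("No cached pages" here means the vnode's VM object has neither
	 * resident pages nor references, and "no namecache entries" means
	 * its v_cache_src list is empty; see the checks in the scan below.)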
	 * Otherwise we allocate a new vnode
	 */

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	TAILQ_INIT(&vnode_tmp_list);

	for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
		nvp = TAILQ_NEXT(vp, v_freelist);
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		if (vp->v_flag & VAGE) {
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		} else {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		}
		vp->v_flag &= ~(VTBFREE|VAGE);
		vp->v_flag |= VFREE;
		if (vp->v_usecount)
			panic("tobe free vnode isn't");
		freevnodes++;
	}

	if (wantfreevnodes && freevnodes < wantfreevnodes) {
		vp = NULL;
	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
		/*
		 * XXX: this is only here to be backwards compatible
		 */
		vp = NULL;
	} else {
		for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {
			nvp = TAILQ_NEXT(vp, v_freelist);
			if (!simple_lock_try(&vp->v_interlock))
				continue;
			if (vp->v_usecount)
				panic("free vnode isn't");

			object = vp->v_object;
			if (object && (object->resident_page_count || object->ref_count)) {
				printf("object inconsistent state: RPC: %d, RC: %d\n",
				    object->resident_page_count, object->ref_count);
				/* Don't recycle if it's caching some pages */
				TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
				TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
				continue;
			} else if (LIST_FIRST(&vp->v_cache_src)) {
				/* Don't recycle if active in the namecache */
				simple_unlock(&vp->v_interlock);
				continue;
			} else {
				break;
			}
		}
	}

	for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
		nvp = TAILQ_NEXT(tvp, v_freelist);
		TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
		simple_unlock(&tvp->v_interlock);
	}

	if (vp) {
		vp->v_flag |= VDOOMED;
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		freevnodes--;
		simple_unlock(&vnode_free_list_slock);
		cache_purge(vp);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD) {
			vgonel(vp, p);
		} else {
			simple_unlock(&vp->v_interlock);
		}

#ifdef INVARIANTS
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
		}
#endif
		vp->v_flag = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
		vp->v_writecount = 0;	/* XXX */
	} else {
		simple_unlock(&vnode_free_list_slock);
		vp = (struct vnode *) zalloc(vnode_zone);
		bzero((char *) vp, sizeof *vp);
		simple_lock_init(&vp->v_interlock);
		vp->v_dd = vp;
		cache_purge(vp);
		LIST_INIT(&vp->v_cache_src);
		TAILQ_INIT(&vp->v_cache_dst);
		numvnodes++;
	}

	TAILQ_INIT(&vp->v_cleanblkhd);
	TAILQ_INIT(&vp->v_dirtyblkhd);
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	splx(s);

	vfs_object_create(vp, p, p->p_ucred);
	return (0);
}

/*
 * Move a vnode from one mount queue to another.
 */
static void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		simple_unlock(&mntvnode_slock);
		return;
	}
	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t) &vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;

	if (flags & V_SAVE) {
		s = splbio();
		while (vp->v_numoutput) {
			vp->v_flag |= VBWAIT;
			error = tsleep((caddr_t)&vp->v_numoutput,
			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
			if (error) {
				splx(s);
				return (error);
			}
		}
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			splx(s);
			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
				return (error);
			s = splbio();
			if (vp->v_numoutput > 0 ||
			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
				panic("vinvalbuf: dirty bufs");
		}
		splx(s);
	}
	s = splbio();
	for (;;) {
		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
		if (!blist)
			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
				error = BUF_TIMELOCK(bp,
				    LK_EXCLUSIVE | LK_SLEEPFAIL,
				    "vinvalbuf", slpflag, slptimeo);
				if (error == ENOLCK)
					break;
				splx(s);
				return (error);
			}
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it.  Note that vfs_bio_awrite expects
			 * buffers to reside on a queue, while VOP_BWRITE and
			 * brelse do not.
			 */
			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
			    (flags & V_SAVE)) {

				if (bp->b_vp == vp) {
					if (bp->b_flags & B_CLUSTEROK) {
						BUF_UNLOCK(bp);
						vfs_bio_awrite(bp);
					} else {
						bremfree(bp);
						bp->b_flags |= B_ASYNC;
						VOP_BWRITE(bp->b_vp, bp);
					}
				} else {
					bremfree(bp);
					(void) VOP_BWRITE(bp->b_vp, bp);
				}
				break;
			}
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
	}

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	simple_lock(&vp->v_interlock);
	object = vp->v_object;
	if (object != NULL) {
		vm_object_page_remove(object, 0, 0,
		    (flags & V_SAVE) ? TRUE : FALSE);
	}
	simple_unlock(&vp->v_interlock);

	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
		panic("vinvalbuf: flush failed");
	return (0);
}

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 */
int
vtruncbuf(vp, cred, p, length, blksize)
	register struct vnode *vp;
	struct ucred *cred;
	struct proc *p;
	off_t length;
	int blksize;
{
	register struct buf *bp;
	struct buf *nbp;
	int s, anyfreed;
	int trunclbn;

	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	s = splbio();
restart:
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
				    (nbp->b_vp != vp) ||
				    (nbp->b_flags & B_DELWRI))) {
					goto restart;
				}
			}
		}

		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
				    (nbp->b_vp != vp) ||
				    (nbp->b_flags & B_DELWRI) == 0)) {
					goto restart;
				}
			}
		}
	}

	if (length > 0) {
restartsync:
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					if (bp->b_vp == vp) {
						bp->b_flags |= B_ASYNC;
					} else {
						bp->b_flags &= ~B_ASYNC;
					}
					VOP_BWRITE(bp->b_vp, bp);
				}
				goto restartsync;
			}

		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
	}

	splx(s);

	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	int s;

	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));

	vhold(vp);
	bp->b_vp = vp;
	bp->b_dev = vn_todev(vp);
	/*
	 * Insert onto list for new vnode.
	 */
	s = splbio();
	bp->b_xflags |= BX_VNCLEAN;
	bp->b_xflags &= ~BX_VNDIRTY;
	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
	splx(s);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	struct buflists *listheadp;
	int s;

	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	s = splbio();
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &vp->v_dirtyblkhd;
		else
			listheadp = &vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
	}
	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
	splx(s);
	bp->b_vp = (struct vnode *) 0;
	vdrop(vp);
}

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, metadata on mounted block
 * devices is delayed only about half the time that file data is delayed.
 * Similarly, directory updates are more critical, so they are delayed
 * only about a third of the time that file data is delayed.  Thus, there
 * are SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syncer process).  The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */

/*
 * Add an item to the syncer work queue.
 */
static void
vn_syncer_add_to_worklist(struct vnode *vp, int delay)
{
	int s, slot;

	s = splbio();

	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
	}

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
	vp->v_flag |= VONWORKLST;
	splx(s);
}

struct proc *updateproc;
static void sched_sync __P((void));
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

/*
 * System filesystem synchronizer daemon.
 */
void
sched_sync(void)
{
	struct synclist *slp;
	struct vnode *vp;
	long starttime;
	int s;
	struct proc *p = updateproc;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p,
	    SHUTDOWN_PRI_LAST);

	for (;;) {
		kproc_suspend_loop(p);

		starttime = time_second;

		/*
		 * Push files whose dirty time has expired.  Be careful
		 * of interrupt race on slp queue.
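		 * The worklist is also manipulated from splbio context
		 * (e.g. by vn_syncer_add_to_worklist() and brelvp()), so
		 * the current slot is selected and advanced with splbio().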
		 */
		s = splbio();
		slp = &syncer_workitem_pending[syncer_delayno];
		syncer_delayno += 1;
		if (syncer_delayno == syncer_maxdelay)
			syncer_delayno = 0;
		splx(s);

		while ((vp = LIST_FIRST(slp)) != NULL) {
			if (VOP_ISLOCKED(vp, NULL) == 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
				(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
				VOP_UNLOCK(vp, 0, p);
			}
			s = splbio();
			if (LIST_FIRST(slp) == vp) {
				/*
				 * Note: v_tag VT_VFS vps can remain on the
				 * worklist too with no dirty blocks, but
				 * since sync_fsync() moves it to a different
				 * slot we are safe.
				 */
				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
				    !vn_isdisk(vp, NULL))
					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
				/*
				 * Put us back on the worklist.  The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 */
				vn_syncer_add_to_worklist(vp, syncdelay);
			}
			splx(s);
		}

		/*
		 * Do soft update processing.
		 */
		if (bioops.io_sync)
			(*bioops.io_sync)(NULL);

		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process. A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP. Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * If it has taken us less than a second to process the
		 * current work, then wait. Otherwise start right over
		 * again. We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (time_second == starttime)
			tsleep(&lbolt, PPAUSE, "syncer", 0);
	}
}

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 */
int
speedup_syncer()
{
	int s;

	s = splhigh();
	if (updateproc->p_wchan == &lbolt)
		setrunnable(updateproc);
	splx(s);
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		stat_rush_requests += 1;
		return (1);
	}
	return(0);
}

/*
 * Associate a p-buffer with a vnode.
 *
 * Also sets B_PAGING flag to indicate that vnode is not fully associated
 * with the buffer.  i.e. the bp has not been linked into the vnode or
 * ref-counted.
 */
void
pbgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{

	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));

	bp->b_vp = vp;
	bp->b_flags |= B_PAGING;
	bp->b_dev = vn_todev(vp);
}

/*
 * Disassociate a p-buffer from a vnode.
 */
void
pbrelvp(bp)
	register struct buf *bp;
{

	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));

#if !defined(MAX_PERF)
	/* XXX REMOVE ME */
	if (bp->b_vnbufs.tqe_next != NULL) {
		panic(
		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
		    bp,
		    (int)bp->b_flags
		);
	}
#endif
	bp->b_vp = (struct vnode *) 0;
	bp->b_flags &= ~B_PAGING;
}

void
pbreassignbuf(bp, newvp)
	struct buf *bp;
	struct vnode *newvp;
{
#if !defined(MAX_PERF)
	if ((bp->b_flags & B_PAGING) == 0) {
		panic(
		    "pbreassignbuf() on non phys bp %p",
		    bp
		);
	}
#endif
	bp->b_vp = newvp;
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	struct buflists *listheadp;
	int delay;
	int s;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}
	++reassignbufcalls;

#if !defined(MAX_PERF)
	/*
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	 */
	if (bp->b_flags & B_PAGING)
		panic("cannot reassign paging buffer");
#endif

	s = splbio();
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &bp->b_vp->v_dirtyblkhd;
		else
			listheadp = &bp->b_vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
		if (bp->b_vp != newvp) {
			vdrop(bp->b_vp);
			bp->b_vp = NULL;	/* for clarification */
		}
	}
	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI) {
		struct buf *tbp;

		listheadp = &newvp->v_dirtyblkhd;
		if ((newvp->v_flag & VONWORKLST) == 0) {
			switch (newvp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VCHR:
			case VBLK:
				if (newvp->v_specmountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
			}
			vn_syncer_add_to_worklist(newvp, delay);
		}
		bp->b_xflags |= BX_VNDIRTY;
		tbp = TAILQ_FIRST(listheadp);
		if (tbp == NULL ||
		    bp->b_lblkno == 0 ||
		    (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (bp->b_lblkno < 0) {
			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (reassignbufmethod == 1) {
			/*
			 * New sorting algorithm, only handle sequential case,
			 * otherwise append to end (but before metadata)
			 */
			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
			    (tbp->b_xflags & BX_VNDIRTY)) {
				/*
				 * Found the best place to insert the buffer
				 */
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortgood;
			} else {
				/*
				 * Missed, append to end, but before meta-data.
				 * We know that the head buffer in the list is
				 * not meta-data due to prior conditionals.
				 *
				 * Indirect effects: NFS second stage write
				 * tends to wind up here, giving maximum
				 * distance between the unstable write and the
				 * commit rpc.
				 */
				tbp = TAILQ_LAST(listheadp, buflists);
				while (tbp && tbp->b_lblkno < 0)
					tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortbad;
			}
		} else {
			/*
			 * Old sorting algorithm, scan queue and insert
			 */
			struct buf *ttbp;
			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
			    (ttbp->b_lblkno < bp->b_lblkno)) {
				++reassignbufloops;
				tbp = ttbp;
			}
			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
		}
	} else {
		bp->b_xflags |= BX_VNCLEAN;
		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
		if ((newvp->v_flag & VONWORKLST) &&
		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	}
	if (bp->b_vp != newvp) {
		bp->b_vp = newvp;
		vhold(bp->b_vp);
	}
	splx(s);
}

/*
 * Create a vnode for a block device.
 * Used for mounting the root file system.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VBLK;
	addalias(vp, dev);
	*vpp = vp;
	return (0);
}

/*
 * Add vnode to the alias list hung off the dev_t.
 *
 * The reason for this gunk is that multiple vnodes can reference
 * the same physical device, so checking vp->v_usecount to see
 * how many users there are is inadequate; the v_usecount for
 * the vnodes need to be accumulated.  vcount() does that.
 */
void
addaliasu(nvp, nvp_rdev)
	struct vnode *nvp;
	udev_t nvp_rdev;
{

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		panic("addaliasu on non-special vnode");
	addalias(nvp, udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0));
}

void
addalias(nvp, dev)
	struct vnode *nvp;
	dev_t dev;
{

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		panic("addalias on non-special vnode");

	nvp->v_rdev = dev;
	simple_lock(&spechash_slock);
	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
	simple_unlock(&spechash_slock);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. The vnode lock bit is set if the
 * vnode is being eliminated in vgone. The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VXLOCK flag is set.
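	 * In that case ENOENT is returned so the caller knows that the
	 * vnode is no longer usable.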
	 */
	if ((flags & LK_INTERLOCK) == 0) {
		simple_lock(&vp->v_interlock);
	}
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vget", 0);
		return (ENOENT);
	}

	vp->v_usecount++;

	if (VSHOULDBUSY(vp))
		vbusy(vp);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active. We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			simple_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (VSHOULDFREE(vp))
				vfree(vp);
			simple_unlock(&vp->v_interlock);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

void
vref(struct vnode *vp)
{
	simple_lock(&vp->v_interlock);
	vp->v_usecount++;
	simple_unlock(&vp->v_interlock);
}

/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vrele: null vp"));

	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		simple_unlock(&vp->v_interlock);

		return;
	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
		/*
		 * If we are doing a vput, the node is already locked, and we must
		 * call VOP_INACTIVE with the node locked.  So, in the case of
		 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
		 */
		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
			VOP_INACTIVE(vp, p);
		}

	} else {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
		simple_unlock(&vp->v_interlock);
#endif
		panic("vrele: negative ref cnt");
	}
}

void
vput(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vput: null vp"));

	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		VOP_UNLOCK(vp, LK_INTERLOCK, p);
		return;

	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
		/*
		 * If we are doing a vput, the node is already locked, and we must
		 * call VOP_INACTIVE with the node locked.  So, in the case of
		 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
		 */
		simple_unlock(&vp->v_interlock);
		VOP_INACTIVE(vp, p);

	} else {
#ifdef DIAGNOSTIC
		vprint("vput: negative ref count", vp);
#endif
		panic("vput: negative ref cnt");
	}
}

/*
 * Somebody doesn't want the vnode recycled.
 */
void
vhold(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	vp->v_holdcnt++;
	if (VSHOULDBUSY(vp))
		vbusy(vp);
	splx(s);
}

/*
 * One less who cares about this vnode.
 */
void
vdrop(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	if (vp->v_holdcnt <= 0)
		panic("vdrop: holdcnt");
	vp->v_holdcnt--;
	if (VSHOULDFREE(vp))
		vfree(vp);
	splx(s);
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = LIST_NEXT(vp, v_mntvnodes);
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		simple_lock(&vp->v_interlock);
		/*
		 * Skip over vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file vnodes
		 * open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode. For block
		 * or character devices, revert to an anonymous device. For
		 * all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 */
static void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	int active;
	vm_object_t obj;

	/*
	 * Check to see if the vnode is in use. If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		vp->v_usecount++;

	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
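	 * This is what the VXLOCK flag asserts; vget() and vgonel() sleep
	 * on the vnode until the flag is cleared again at the end of this
	 * routine.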
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

	/*
	 * Clean out any buffers associated with the vnode.
	 */
	vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
	if ((obj = vp->v_object) != NULL) {
		if (obj->ref_count == 0) {
			/*
			 * vclean() may be called twice.  The first time removes the
			 * primary reference to the object, the second time goes
			 * one further and is a special-case to terminate the object.
			 */
			vm_object_terminate(obj);
		} else {
			/*
			 * Woe to the process that tries to page now :-).
			 */
			vm_pager_deallocate(obj);
		}
	}

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");

	if (active) {
		/*
		 * Inline copy of vrele() since VOP_INACTIVE
		 * has already been called.
		 */
		simple_lock(&vp->v_interlock);
		if (--vp->v_usecount <= 0) {
#ifdef DIAGNOSTIC
			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
				vprint("vclean: bad ref count", vp);
				panic("vclean: ref cnt");
			}
#endif
			vfree(vp);
		}
		simple_unlock(&vp->v_interlock);
	}

	cache_purge(vp);
	if (vp->v_vnlock) {
		FREE(vp->v_vnlock, M_VNODE);
		vp->v_vnlock = NULL;
	}

	if (VSHOULDFREE(vp))
		vfree(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vn_pollgone(vp);
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t) vp);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
vop_revoke(ap)
	struct vop_revoke_args /* {
		struct vnode *a_vp;
		int a_flags;
	} */ *ap;
{
	struct vnode *vp, *vq;
	dev_t dev;

	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));

	vp = ap->a_vp;
	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
		return (0);
	}
	dev = vp->v_rdev;
	for (;;) {
		simple_lock(&spechash_slock);
		vq = SLIST_FIRST(&dev->si_hlist);
		simple_unlock(&spechash_slock);
		if (!vq)
			break;
		vgone(vq);
	}
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp) {
			simple_unlock(inter_lkp);
		}
		vgonel(vp, p);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int s;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vgone", 0);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	simple_lock(&vp->v_interlock);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
		simple_lock(&spechash_slock);
		SLIST_REMOVE(&vp->v_hashchain, vp, vnode, v_specnext);
		freedev(vp->v_rdev);
		simple_unlock(&spechash_slock);
		vp->v_rdev = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list. The test of the back
	 * pointer and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 */
	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
		s = splbio();
		simple_lock(&vnode_free_list_slock);
		if (vp->v_flag & VFREE) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		} else if (vp->v_flag & VTBFREE) {
			TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
			vp->v_flag &= ~VTBFREE;
			freevnodes++;
		} else
			freevnodes++;
		vp->v_flag |= VFREE;
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
		splx(s);
	}

	vp->v_type = VBAD;
	simple_unlock(&vp->v_interlock);
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	struct vnode *vp;

	simple_lock(&spechash_slock);
	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
		if (type == vp->v_type) {
			*vpp = vp;
			simple_unlock(&spechash_slock);
			return (1);
		}
	}
	simple_unlock(&spechash_slock);
	return (0);
}

/*
 * Calculate the total number of references to a special device.
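 * All vnodes aliased to the device (those on the dev_t's spec hash list)
 * contribute their v_usecount to the total.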
 */
int
vcount(vp)
	struct vnode *vp;
{
	struct vnode *vq;
	int count;

	count = 0;
	simple_lock(&spechash_slock);
	SLIST_FOREACH(vq, &vp->v_hashchain, v_specnext)
		count += vq->v_usecount;
	simple_unlock(&spechash_slock);
	return (count);
}

/*
 * Same as above, but using the dev_t as argument
 */

int
count_dev(dev)
	dev_t dev;
{
	struct vnode *vp;

	vp = SLIST_FIRST(&dev->si_hlist);
	if (vp == NULL)
		return (0);
	return(vcount(vp));
}

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

void
vprint(label, vp)
	char *label;
	struct vnode *vp;
{
	char buf[96];

	if (label != NULL)
		printf("%s: %p: ", label, (void *)vp);
	else
		printf("%p: ", (void *)vp);
	printf("type %s, usecount %d, writecount %d, refcount %d,",
	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VXLOCK)
		strcat(buf, "|VXLOCK");
	if (vp->v_flag & VXWANT)
		strcat(buf, "|VXWANT");
	if (vp->v_flag & VBWAIT)
		strcat(buf, "|VBWAIT");
	if (vp->v_flag & VDOOMED)
		strcat(buf, "|VDOOMED");
	if (vp->v_flag & VFREE)
		strcat(buf, "|VFREE");
	if (vp->v_flag & VOBJBUF)
		strcat(buf, "|VOBJBUF");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DDB
#include <ddb/ddb.h>
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	simple_lock(&mountlist_slock);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
			if (VOP_ISLOCKED(vp, NULL))
				vprint((char *)0, vp);
		}
		simple_lock(&mountlist_slock);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
}
#endif

/*
 * Top level filesystem related information gathering.
 */
static int	sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS);

static int
vfs_sysctl SYSCTL_HANDLER_ARGS
{
	int *name = (int *)arg1 - 1;	/* XXX */
	u_int namelen = arg2 + 1;	/* XXX */
	struct vfsconf *vfsp;

#if 1 || defined(COMPAT_PRELITE2)
	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
	if (namelen == 1)
		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
#endif

#ifdef notyet
	/* all sysctl names at this level are at least name and field */
	if (namelen < 2)
		return (ENOTDIR);		/* overloaded */
	if (name[0] != VFS_GENERIC) {
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[0])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
		    oldp, oldlenp, newp, newlen, p));
	}
#endif
	switch (name[1]) {
	case VFS_MAXTYPENUM:
		if (namelen != 2)
			return (ENOTDIR);
		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
	case VFS_CONF:
		if (namelen != 3)
			return (ENOTDIR);	/* overloaded */
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[2])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
	}
	return (EOPNOTSUPP);
}

SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
	"Generic filesystem");

#if 1 || defined(COMPAT_PRELITE2)

static int
sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
{
	int error;
	struct vfsconf *vfsp;
	struct ovfsconf ovfs;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
		strcpy(ovfs.vfc_name, vfsp->vfc_name);
		ovfs.vfc_index = vfsp->vfc_typenum;
		ovfs.vfc_refcount = vfsp->vfc_refcount;
		ovfs.vfc_flags = vfsp->vfc_flags;
		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
		if (error)
			return error;
	}
	return 0;
}

#endif /* 1 || COMPAT_PRELITE2 */

#if 0
#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
static int
sysctl_vnode SYSCTL_HANDLER_ARGS
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *nvp, *vp;
	int error;

#define VPTRSZ	sizeof (struct vnode *)
#define VNODESZ	sizeof (struct vnode)

	req->lock = 0;
	if (!req->oldptr) /* Make an estimate */
		return (SYSCTL_OUT(req, 0,
		    (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));

	simple_lock(&mountlist_slock);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
again:
		simple_lock(&mntvnode_slock);
		for (vp = LIST_FIRST(&mp->mnt_vnodelist);
		     vp != NULL;
		     vp = nvp) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem.  RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp) {
				simple_unlock(&mntvnode_slock);
				goto again;
			}
			nvp = LIST_NEXT(vp, v_mntvnodes);
			simple_unlock(&mntvnode_slock);
			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
			    (error = SYSCTL_OUT(req, vp, VNODESZ)))
				return (error);
			simple_lock(&mntvnode_slock);
		}
		simple_unlock(&mntvnode_slock);
		simple_lock(&mountlist_slock);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);

	return (0);
}
#endif

/*
 * XXX
 * Exporting the vnode list on large systems causes them to crash.

#if 1 || defined(COMPAT_PRELITE2)

static int
sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
{
	int error;
	struct vfsconf *vfsp;
	struct ovfsconf ovfs;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
		strcpy(ovfs.vfc_name, vfsp->vfc_name);
		ovfs.vfc_index = vfsp->vfc_typenum;
		ovfs.vfc_refcount = vfsp->vfc_refcount;
		ovfs.vfc_flags = vfsp->vfc_flags;
		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
		if (error)
			return error;
	}
	return 0;
}

#endif /* 1 || COMPAT_PRELITE2 */

#if 0
#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
static int
sysctl_vnode SYSCTL_HANDLER_ARGS
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *nvp, *vp;
	int error;

#define VPTRSZ	sizeof (struct vnode *)
#define VNODESZ	sizeof (struct vnode)

	req->lock = 0;
	if (!req->oldptr) /* Make an estimate */
		return (SYSCTL_OUT(req, 0,
		    (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));

	simple_lock(&mountlist_slock);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
again:
		simple_lock(&mntvnode_slock);
		for (vp = LIST_FIRST(&mp->mnt_vnodelist);
		     vp != NULL;
		     vp = nvp) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem.  RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp) {
				simple_unlock(&mntvnode_slock);
				goto again;
			}
			nvp = LIST_NEXT(vp, v_mntvnodes);
			simple_unlock(&mntvnode_slock);
			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
			    (error = SYSCTL_OUT(req, vp, VNODESZ)))
				return (error);
			simple_lock(&mntvnode_slock);
		}
		simple_unlock(&mntvnode_slock);
		simple_lock(&mountlist_slock);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);

	return (0);
}
#endif

/*
 * XXX
 * Exporting the vnode list on large systems causes them to crash.
 * Exporting the vnode list on medium systems causes sysctl to coredump.
 */
#if 0
SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
	0, 0, sysctl_vnode, "S,vnode", "");
#endif

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(vp)
	struct vnode *vp;
{

	if (vp->v_specmountpoint != NULL)
		return (EBUSY);
	return (0);
}

/*
 * Unmount all filesystems. The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */
void
vfs_unmountall()
{
	struct mount *mp;
	struct proc *p;
	int error;

	if (curproc != NULL)
		p = curproc;
	else
		p = initproc;	/* XXX XXX should this be proc0? */
	/*
	 * Since this only runs when rebooting, it is not interlocked.
	 */
	while (!TAILQ_EMPTY(&mountlist)) {
		mp = TAILQ_LAST(&mountlist, mntlist);
		error = dounmount(mp, MNT_FORCE, p);
		if (error) {
			TAILQ_REMOVE(&mountlist, mp, mnt_list);
			printf("unmount of %s failed (",
			    mp->mnt_stat.f_mntonname);
			if (error == EBUSY)
				printf("BUSY)\n");
			else
				printf("%d)\n", error);
		} else {
			/* The unmount has removed mp from the mountlist */
		}
	}
}

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
 */
static int
vfs_hang_addrlist(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	register int i;
	struct radix_node *rn;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		np->netc_anon = argp->ex_anon;
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}
	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
	bzero((caddr_t) np, i);
	saddr = (struct sockaddr *) (np + 1);
	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (argp->ex_masklen) {
		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not used,
		 * do so on demand here
		 */
		for (dom = domains; dom; dom = dom->dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **) &nep->ne_rtable[i],
				    dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}
	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
	    np->netc_rnodes);
	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
		error = EPERM;
		goto out;
	}
	np->netc_exflags = argp->ex_flags;
	np->netc_anon = argp->ex_anon;
	np->netc_anon.cr_ref = 1;
	return (0);
out:
	free(np, M_NETADDR);
	return (error);
}

/* ARGSUSED */
static int
vfs_free_netcred(rn, w)
	struct radix_node *rn;
	void *w;
{
	register struct radix_node_head *rnh = (struct radix_node_head *) w;

	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	free((caddr_t) rn, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(nep)
	struct netexport *nep;
{
	register int i;
	register struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i])) {
			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
			    (caddr_t) rnh);
			free((caddr_t) rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

int
vfs_export(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		if (mp->mnt_flag & MNT_EXPUBLIC) {
			vfs_setpublicfs(NULL, NULL, NULL);
			mp->mnt_flag &= ~MNT_EXPUBLIC;
		}
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (argp->ex_flags & MNT_EXPUBLIC) {
			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
				return (error);
			mp->mnt_flag |= MNT_EXPUBLIC;
		}
		if ((error = vfs_hang_addrlist(mp, nep, argp)))
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}

/*
 * Set the publicly exported filesystem (WebNFS). Currently, only
 * one public filesystem is possible in the spec (RFC 2054 and 2055).
 */
int
vfs_setpublicfs(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;
	struct vnode *rvp;
	char *cp;

	/*
	 * mp == NULL -> invalidate the current info, the FS is
	 * no longer exported. May be called from either vfs_export
	 * or unmount, so check if it hasn't already been done.
	 */
	if (mp == NULL) {
		if (nfs_pub.np_valid) {
			nfs_pub.np_valid = 0;
			if (nfs_pub.np_index != NULL) {
				FREE(nfs_pub.np_index, M_TEMP);
				nfs_pub.np_index = NULL;
			}
		}
		return (0);
	}

	/*
	 * Only one allowed at a time.
	 */
	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
		return (EBUSY);

	/*
	 * Get real filehandle for root of exported FS.
	 */
	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;

	if ((error = VFS_ROOT(mp, &rvp)))
		return (error);

	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
		return (error);

	vput(rvp);

	/*
	 * If an indexfile was specified, pull it in.
	 */
	if (argp->ex_indexfile != NULL) {
		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
		    M_WAITOK);
		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
		    MAXNAMLEN, (size_t *)0);
		if (!error) {
			/*
			 * Check for illegal filenames.
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}

struct netcred *
vfs_export_lookup(mp, nep, nam)
	register struct mount *mp;
	struct netexport *nep;
	struct sockaddr *nam;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = nam;
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
				    (*rnh->rnh_matchaddr)((caddr_t)saddr,
				    rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}
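
/*
 * Illustrative sketch (not compiled): how a filesystem's NFS export check
 * typically consults vfs_export_lookup() to map a client address to the
 * export's flags and anonymous credentials.  "nep", "nam", "exflagsp" and
 * "credanonp" are placeholders following the usual fhtovp-style calling
 * conventions.
 */
#if 0
static int
example_check_export(struct mount *mp, struct netexport *nep,
    struct sockaddr *nam, int *exflagsp, struct ucred **credanonp)
{
	struct netcred *np;

	np = vfs_export_lookup(mp, nep, nam);
	if (np == NULL)
		return (EACCES);	/* client is not in the export list */
	*exflagsp = np->netc_exflags;
	*credanonp = &np->netc_anon;
	return (0);
}
#endif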

/*
 * Perform msync on all vnodes under a mount point.
 * The mount point must be locked.
 */
void
vfs_msync(struct mount *mp, int flags) {
	struct vnode *vp, *nvp;
	struct vm_object *obj;
	int anyio, tries;

	tries = 5;
loop:
	anyio = 0;
	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) {

		nvp = LIST_NEXT(vp, v_mntvnodes);

		if (vp->v_mount != mp) {
			goto loop;
		}

		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
			continue;

		if (flags != MNT_WAIT) {
			obj = vp->v_object;
			if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
				continue;
			if (VOP_ISLOCKED(vp, NULL))
				continue;
		}

		simple_lock(&vp->v_interlock);
		if (vp->v_object &&
		    (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
			if (!vget(vp,
			    LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
				if (vp->v_object) {
					vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
					anyio = 1;
				}
				vput(vp);
			}
		} else {
			simple_unlock(&vp->v_interlock);
		}
	}
	if (anyio && (--tries > 0))
		goto loop;
}

/*
 * Create the VM object needed for VMIO and mmap support.  This
 * is done for all VREG files in the system.  Some filesystems might
 * afford the additional metadata buffering capability of the
 * VMIO code by making the device node be VMIO mode also.
 *
 * vp must be locked when vfs_object_create is called.
 */
int
vfs_object_create(vp, p, cred)
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
{
	struct vattr vat;
	vm_object_t object;
	int error = 0;

	if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE)
		return 0;

retry:
	if ((object = vp->v_object) == NULL) {
		if (vp->v_type == VREG || vp->v_type == VDIR) {
			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
				goto retn;
			object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
		} else if (devsw(vp->v_rdev) != NULL) {
			/*
			 * This simply allocates the biggest object possible
			 * for a disk vnode.  This should be fixed, but doesn't
			 * cause any problems (yet).
			 */
			object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
		} else {
			goto retn;
		}
		/*
		 * Dereference the reference we just created.  This assumes
		 * that the object is associated with the vp.
		 */
		object->ref_count--;
		vp->v_usecount--;
	} else {
		if (object->flags & OBJ_DEAD) {
			VOP_UNLOCK(vp, 0, p);
			tsleep(object, PVM, "vodead", 0);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			goto retry;
		}
	}

	KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object"));
	vp->v_flag |= VOBJBUF;

retn:
	return error;
}

static void
vfree(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	if (vp->v_flag & VTBFREE) {
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		vp->v_flag &= ~VTBFREE;
	}
	if (vp->v_flag & VAGE) {
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	} else {
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	}
	freevnodes++;
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~VAGE;
	vp->v_flag |= VFREE;
	splx(s);
}

void
vbusy(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	if (vp->v_flag & VTBFREE) {
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		vp->v_flag &= ~VTBFREE;
	} else {
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		freevnodes--;
	}
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~(VFREE|VAGE);
	splx(s);
}

/*
 * Record a process's interest in events which might happen to
 * a vnode.  Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(vp, p, events)
	struct vnode *vp;
	struct proc *p;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo.vpi_revents;
		vp->v_pollinfo.vpi_revents &= ~events;

		simple_unlock(&vp->v_pollinfo.vpi_lock);
		return events;
	}
	vp->v_pollinfo.vpi_events |= events;
	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
	simple_unlock(&vp->v_pollinfo.vpi_lock);
	return 0;
}

/*
 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
 * it is possible for us to miss an event due to race conditions, but
 * that condition is expected to be rare, so for the moment it is the
 * preferred interface.
 */
void
vn_pollevent(vp, events)
	struct vnode *vp;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events & events) {
		/*
		 * We clear vpi_events so that we don't
		 * call selwakeup() twice if two events are
		 * posted before the polling process(es) is
		 * awakened.  This also ensures that we take at
		 * most one selwakeup() if the polling process
		 * is no longer interested.  However, it does
		 * mean that only one event can be noticed at
		 * a time.  (Perhaps we should only clear those
		 * event bits which we note?) XXX
		 */
		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
		vp->v_pollinfo.vpi_revents |= events;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}
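
/*
 * Illustrative sketch (not compiled): a filesystem's VOP_POLL method can
 * defer to vn_pollrecord(), and its data path can post readability with
 * VN_POLLEVENT() once new data is available.  The function names below
 * are placeholders; only the vop_poll_args layout and the two routines
 * above are relied upon.
 */
#if 0
static int
example_fs_poll(ap)
	struct vop_poll_args /* {
		struct vnode *a_vp;
		int a_events;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{

	return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events));
}

static void
example_fs_data_arrived(struct vnode *vp)
{

	/* Wake any poll(2)/select(2) sleepers interested in reading. */
	VN_POLLEVENT(vp, POLLIN | POLLRDNORM);
}
#endif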

/*
 * Wake up anyone polling on vp because it is being revoked.
 * This depends on dead_poll() returning POLLHUP for correct
 * behavior.
 */
void
vn_pollgone(vp)
	struct vnode *vp;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events) {
		vp->v_pollinfo.vpi_events = 0;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}

/*
 * Routine to create and manage a filesystem syncer vnode.
 */
#define sync_close ((int (*) __P((struct vop_close_args *)))nullop)
static int	sync_fsync __P((struct vop_fsync_args *));
static int	sync_inactive __P((struct vop_inactive_args *));
static int	sync_reclaim __P((struct vop_reclaim_args *));
#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock)
#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock)
static int	sync_print __P((struct vop_print_args *));
#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)

static vop_t **sync_vnodeop_p;
static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
	{ NULL, NULL }
};
static struct vnodeopv_desc sync_vnodeop_opv_desc =
	{ &sync_vnodeop_p, sync_vnodeop_entries };

VNODEOP_SET(sync_vnodeop_opv_desc);

/*
 * Create a new filesystem syncer vnode for the specified mount point.
 */
int
vfs_allocate_syncvnode(mp)
	struct mount *mp;
{
	struct vnode *vp;
	static long start, incr, next;
	int error;

	/* Allocate a new vnode */
	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
		mp->mnt_syncer = NULL;
		return (error);
	}
	vp->v_type = VNON;
	/*
	 * Place the vnode onto the syncer worklist.  We attempt to
	 * scatter them about on the list so that they will go off
	 * at evenly distributed times even if all the filesystems
	 * are mounted at once.
	 */
	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
	mp->mnt_syncer = vp;
	return (0);
}
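
/*
 * Illustrative sketch (not compiled): a filesystem that wants the syncer
 * to push its dirty data periodically would typically do this near the
 * end of a successful read-write mount.  "example_fs_mountfs" and its
 * cleanup path are placeholders.
 */
#if 0
static int
example_fs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p)
{
	int error;

	/* ... filesystem-specific mount work ... */

	if ((error = vfs_allocate_syncvnode(mp)) != 0) {
		/* undo the filesystem-specific mount work done above */
		return (error);
	}
	return (0);
}
#endif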

/*
 * Do a lazy sync of the filesystem.
 */
static int
sync_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		struct ucred *a_cred;
		int a_waitfor;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *syncvp = ap->a_vp;
	struct mount *mp = syncvp->v_mount;
	struct proc *p = ap->a_p;
	int asyncflag;

	/*
	 * We only need to do something if this is a lazy evaluation.
	 */
	if (ap->a_waitfor != MNT_LAZY)
		return (0);

	/*
	 * Move ourselves to the back of the sync list.
	 */
	vn_syncer_add_to_worklist(syncvp, syncdelay);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 */
	simple_lock(&mountlist_slock);
	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
		simple_unlock(&mountlist_slock);
		return (0);
	}
	asyncflag = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	vfs_msync(mp, MNT_NOWAIT);
	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
	if (asyncflag)
		mp->mnt_flag |= MNT_ASYNC;
	vfs_unbusy(mp, p);
	return (0);
}

/*
 * The syncer vnode is no longer referenced.
 */
static int
sync_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct proc *a_p;
	} */ *ap;
{

	vgone(ap->a_vp);
	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 *
 * Modifications to the worklist must be protected at splbio().
 */
static int
sync_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	int s;

	s = splbio();
	vp->v_mount->mnt_syncer = NULL;
	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
		vp->v_flag &= ~VONWORKLST;
	}
	splx(s);

	return (0);
}

/*
 * Print out a syncer vnode.
 */
static int
sync_print(ap)
	struct vop_print_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;

	printf("syncer vnode");
	if (vp->v_vnlock != NULL)
		lockmgr_printinfo(vp->v_vnlock);
	printf("\n");
	return (0);
}

/*
 * Extract the dev_t from a VBLK or VCHR vnode.
 */
dev_t
vn_todev(vp)
	struct vnode *vp;
{
	if (vp->v_type != VBLK && vp->v_type != VCHR)
		return (NODEV);
	return (vp->v_rdev);
}

/*
 * Check if a vnode represents a disk device.
 */
int
vn_isdisk(vp, errp)
	struct vnode *vp;
	int *errp;
{
	if (vp->v_type != VBLK && vp->v_type != VCHR) {
		if (errp != NULL)
			*errp = ENOTBLK;
		return (0);
	}
	if (!devsw(vp->v_rdev)) {
		if (errp != NULL)
			*errp = ENXIO;
		return (0);
	}
	if (!(devsw(vp->v_rdev)->d_flags & D_DISK)) {
		if (errp != NULL)
			*errp = ENOTBLK;
		return (0);
	}
	if (errp != NULL)
		*errp = 0;
	return (1);
}
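
/*
 * Illustrative sketch (not compiled): callers that need a real disk
 * device, such as a mount path handed a device vnode by namei(),
 * commonly validate it with vn_isdisk() and propagate the errno it
 * reports.  "example_require_disk" is a placeholder name.
 */
#if 0
static int
example_require_disk(struct vnode *devvp)
{
	int error;

	if (!vn_isdisk(devvp, &error))
		return (error);	/* ENOTBLK or ENXIO from vn_isdisk() */
	return (0);
}
#endif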

void
NDFREE(ndp, flags)
	struct nameidata *ndp;
	const uint flags;
{
	if (!(flags & NDF_NO_FREE_PNBUF) &&
	    (ndp->ni_cnd.cn_flags & HASBUF)) {
		zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
		ndp->ni_cnd.cn_flags &= ~HASBUF;
	}
	if (!(flags & NDF_NO_DVP_UNLOCK) &&
	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
	    ndp->ni_dvp != ndp->ni_vp)
		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc);
	if (!(flags & NDF_NO_DVP_RELE) &&
	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
		vrele(ndp->ni_dvp);
		ndp->ni_dvp = NULL;
	}
	if (!(flags & NDF_NO_VP_UNLOCK) &&
	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc);
	if (!(flags & NDF_NO_VP_RELE) &&
	    ndp->ni_vp) {
		vrele(ndp->ni_vp);
		ndp->ni_vp = NULL;
	}
	if (!(flags & NDF_NO_STARTDIR_RELE) &&
	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
		vrele(ndp->ni_startdir);
		ndp->ni_startdir = NULL;
	}
}
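
/*
 * Illustrative sketch (not compiled): the common namei()/NDFREE() pattern.
 * After a successful lookup the caller releases the pieces of the
 * nameidata it will not keep; here only the pathname buffer is freed and
 * the vnode is handled explicitly.  NDF_ONLY_PNBUF is assumed to be the
 * usual "free just the path buffer" convenience flag, and the function
 * name is a placeholder.
 */
#if 0
static int
example_lookup(char *path, struct proc *p)
{
	struct nameidata nd;
	int error;

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, path, p);
	if ((error = namei(&nd)) != 0)
		return (error);
	NDFREE(&nd, NDF_ONLY_PNBUF);	/* free only the path buffer */
	/* ... use nd.ni_vp, which is locked and referenced ... */
	vput(nd.ni_vp);
	return (0);
}
#endif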