/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 * $FreeBSD$
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"
#include "opt_ffs.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

static void	addalias __P((struct vnode *vp, dev_t nvp_rdev));
static void	insmntque __P((struct vnode *vp, struct mount *mp));
static void	vclean __P((struct vnode *vp, int flags, struct proc *p));

/*
 * Number of vnodes in existence.  Increased whenever getnewvnode()
 * allocates a new vnode, never decreased.
 */
static unsigned long	numvnodes;
SYSCTL_LONG(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

/*
 * List of vnodes that are ready for recycling.
 */
static TAILQ_HEAD(freelst, vnode) vnode_free_list;

/*
 * Minimum number of free vnodes.  If there are fewer free vnodes than
 * this, getnewvnode() will return a newly allocated vnode.
 */
static u_long wantfreevnodes = 25;
SYSCTL_LONG(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
/* Number of vnodes in the free list. */
static u_long freevnodes = 0;
SYSCTL_LONG(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");

/*
 * Various variables used for debugging the new implementation of
 * reassignbuf().
 * XXX these are probably of (very) limited utility now.
 */
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
/* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");

#ifdef ENABLE_VFS_IOOPT
/* See NOTES for a description of this setting. */
int vfs_ioopt = 0;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif

/* List of mounted filesystems. */
struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);

/* For any iteration/modification of mountlist */
struct mtx mountlist_mtx;

/* For any iteration/modification of mnt_vnodelist */
struct simplelock mntvnode_slock;

/*
 * Cache for the mount type id assigned to NFS.  This is used for
 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
 */
int	nfs_mount_type = -1;

#ifndef NULL_SIMPLELOCKS
/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct simplelock mntid_slock;

/* For any iteration/modification of vnode_free_list */
static struct simplelock vnode_free_list_slock;

/*
 * For any iteration/modification of dev->si_hlist (linked through
 * v_specnext)
 */
static struct simplelock spechash_slock;
#endif

/* Publicly exported FS */
struct nfs_public nfs_pub;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static vm_zone_t vnode_zone;

/* Set to 1 to print out reclaim of active vnodes */
int	prtactive = 0;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, metadata for filesystems
 * mounted on block devices is delayed only about half the time that
 * file data is delayed.  Similarly, directory updates are more critical,
 * so they are delayed only about a third of the time that file data is
 * delayed.  Thus, there are SYNCER_MAXDELAY queues that are processed
 * round-robin at a rate of one each second (driven off the filesystem
 * syncer process).  The syncer_delayno variable indicates the next queue
 * that is to be processed.  Items that need to be processed soon are
 * placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
static int syncer_delayno = 0;
static long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;

#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
time_t syncdelay = 30;		/* max time to delay syncing data */
time_t filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
time_t dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
time_t metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
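
/*
 * Illustrative sketch only (not part of the original code): with the ring
 * described above, scheduling a vnode "delay" seconds into the future is
 * just an index computation modulo the ring size, clamped to its length.
 * This is what vn_syncer_add_to_worklist() does later in this file:
 *
 *	if (delay > syncer_maxdelay - 2)
 *		delay = syncer_maxdelay - 2;
 *	slot = (syncer_delayno + delay) & syncer_mask;
 *	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
 */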

/*
 * Number of vnodes we want to exist at any one time.  This is mostly used
 * to size hash tables in vnode-related code.  It is normally not used in
 * getnewvnode(), as wantfreevnodes is normally nonzero.
 *
 * XXX desiredvnodes is historical cruft and should not exist.
 */
int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");

static void	vfs_free_addrlist __P((struct netexport *nep));
static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
				       struct export_args *argp));

/*
 * Initialize the vnode management data structures.
 */
static void
vntblinit(void *dummy __unused)
{

	desiredvnodes = maxproc + cnt.v_page_count / 4;
	mtx_init(&mountlist_mtx, "mountlist", MTX_DEF);
	simple_lock_init(&mntvnode_slock);
	simple_lock_init(&mntid_slock);
	simple_lock_init(&spechash_slock);
	TAILQ_INIT(&vnode_free_list);
	simple_lock_init(&vnode_free_list_slock);
	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)


/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp, p)
	struct mount *mp;
	int flags;
	struct mtx *interlkp;
	struct proc *p;
{
	int lkflags;

	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		msleep((caddr_t)mp, interlkp, PVFS, "vfs_busy", 0);
		return (ENOENT);
	}
	lkflags = LK_SHARED | LK_NOPAUSE;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mp, p)
	struct mount *mp;
	struct proc *p;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char	*fstypename;
	char	*devname;
	struct mount **mpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	if (fstypename == NULL)
		return (ENODEV);
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO);
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_iosize_max = DFLTPHYS;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = 0;
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Find an appropriate filesystem to use for the root.  If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
#ifdef notdef	/* XXX JH */
int
lite2_vfs_mountroot()
{
	struct vfsconf *vfsp;
	extern int (*lite2_mountroot) __P((void));
	int error;

	if (lite2_mountroot != NULL)
		return ((*lite2_mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}
#endif

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	mtx_enter(&mountlist_mtx, MTX_DEF);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			mtx_exit(&mountlist_mtx, MTX_DEF);
			return (mp);
		}
	}
	mtx_exit(&mountlist_mtx, MTX_DEF);
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators
 * only support 16-bit device numbers.  We end up with unique val[0]'s for
 * the first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_int16_t mntid_base;
	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makeudev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if (vfs_getvfs(&tfsid) == NULL)
			break;
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	simple_unlock(&mntid_slock);
}

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(tsp)
	struct timespec *tsp;
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}
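
/*
 * Illustrative sketch only (not part of the original code): a hypothetical
 * caller that wants to update a single attribute typically starts from a
 * fully "unspecified" vattr via vattr_null() and fills in only the fields
 * it cares about, using vfs_timestamp() above for the time values.  The
 * vnode, credentials and process below are assumed to come from the caller:
 *
 *	struct vattr va;
 *
 *	vattr_null(&va);
 *	vfs_timestamp(&va.va_mtime);
 *	(void) VOP_SETATTR(vp, &va, cred, p);
 */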

/*
 * Routines having to do with the management of the vnode table.
 */

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	int s, count;
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp = NULL;
	struct mount *vnmp;
	vm_object_t object;

	/*
	 * We take the least recently used vnode from the freelist
	 * if we can get it and it has no cached pages, and no
	 * namecache entries are relative to it.  Otherwise we allocate
	 * a new vnode.
	 */

	s = splbio();
	simple_lock(&vnode_free_list_slock);

	if (wantfreevnodes && freevnodes < wantfreevnodes) {
		vp = NULL;
	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
		/*
		 * XXX: this is only here to be backwards compatible
		 */
		vp = NULL;
	} else for (count = 0; count < freevnodes; count++) {
		vp = TAILQ_FIRST(&vnode_free_list);
		if (vp == NULL || vp->v_usecount)
			panic("getnewvnode: free vnode isn't");
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		/*
		 * Don't recycle if active in the namecache or
		 * if it still has cached pages or we cannot get
		 * its interlock.
		 */
		if (LIST_FIRST(&vp->v_cache_src) != NULL ||
		    (VOP_GETVOBJECT(vp, &object) == 0 &&
		    (object->resident_page_count || object->ref_count)) ||
		    !mtx_try_enter(&vp->v_interlock, MTX_DEF)) {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			vp = NULL;
			continue;
		}
		/*
		 * Skip over it if its filesystem is being suspended.
		 */
		if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0)
			break;
		mtx_exit(&vp->v_interlock, MTX_DEF);
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		vp = NULL;
	}
	if (vp) {
		vp->v_flag |= VDOOMED;
		vp->v_flag &= ~VFREE;
		freevnodes--;
		simple_unlock(&vnode_free_list_slock);
		cache_purge(vp);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD) {
			vgonel(vp, p);
		} else {
			mtx_exit(&vp->v_interlock, MTX_DEF);
		}
		vn_finished_write(vnmp);

#ifdef INVARIANTS
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
			if (vp->v_writecount != 0)
				panic("Non-zero write count");
		}
#endif
		vp->v_flag = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
	} else {
		simple_unlock(&vnode_free_list_slock);
		vp = (struct vnode *) zalloc(vnode_zone);
		bzero((char *) vp, sizeof *vp);
		mtx_init(&vp->v_interlock, "vnode interlock", MTX_DEF);
		vp->v_dd = vp;
		cache_purge(vp);
		LIST_INIT(&vp->v_cache_src);
		TAILQ_INIT(&vp->v_cache_dst);
		numvnodes++;
	}

	TAILQ_INIT(&vp->v_cleanblkhd);
	TAILQ_INIT(&vp->v_dirtyblkhd);
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	lockinit(&vp->v_lock, PVFS, "vnlock", 0, LK_NOPAUSE);
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	splx(s);

	vfs_object_create(vp, p, p->p_ucred);
	return (0);
}

/*
 * Move a vnode from one mount queue to another.
 */
static void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		simple_unlock(&mntvnode_slock);
		return;
	}
	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t) &vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;

	if (flags & V_SAVE) {
		s = splbio();
		while (vp->v_numoutput) {
			vp->v_flag |= VBWAIT;
			error = tsleep((caddr_t)&vp->v_numoutput,
			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
			if (error) {
				splx(s);
				return (error);
			}
		}
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			splx(s);
			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
				return (error);
			s = splbio();
			if (vp->v_numoutput > 0 ||
			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
				panic("vinvalbuf: dirty bufs");
		}
		splx(s);
	}
	s = splbio();
	for (;;) {
		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
		if (!blist)
			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
				error = BUF_TIMELOCK(bp,
				    LK_EXCLUSIVE | LK_SLEEPFAIL,
				    "vinvalbuf", slpflag, slptimeo);
				if (error == ENOLCK)
					break;
				splx(s);
				return (error);
			}
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it.  Note that vfs_bio_awrite expects
			 * buffers to reside on a queue, while VOP_BWRITE and
			 * brelse do not.
			 */
			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
			    (flags & V_SAVE)) {

				if (bp->b_vp == vp) {
					if (bp->b_flags & B_CLUSTEROK) {
						BUF_UNLOCK(bp);
						vfs_bio_awrite(bp);
					} else {
						bremfree(bp);
						bp->b_flags |= B_ASYNC;
						BUF_WRITE(bp);
					}
				} else {
					bremfree(bp);
					(void) BUF_WRITE(bp);
				}
				break;
			}
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
	}

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	mtx_enter(&vp->v_interlock, MTX_DEF);
	if (VOP_GETVOBJECT(vp, &object) == 0) {
		vm_object_page_remove(object, 0, 0,
		    (flags & V_SAVE) ? TRUE : FALSE);
	}
	mtx_exit(&vp->v_interlock, MTX_DEF);

	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
		panic("vinvalbuf: flush failed");
	return (0);
}
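
/*
 * Illustrative sketch only (not part of the original code): callers that
 * are about to reclaim or revoke a vnode flush everything it has buffered
 * through vinvalbuf(), first trying to save dirty data and then falling
 * back to discarding it.  vclean() later in this file does exactly this:
 *
 *	if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
 *		vinvalbuf(vp, 0, NOCRED, p, 0, 0);
 */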

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 */
int
vtruncbuf(vp, cred, p, length, blksize)
	register struct vnode *vp;
	struct ucred *cred;
	struct proc *p;
	off_t length;
	int blksize;
{
	register struct buf *bp;
	struct buf *nbp;
	int s, anyfreed;
	int trunclbn;

	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	s = splbio();
restart:
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
				    (nbp->b_vp != vp) ||
				    (nbp->b_flags & B_DELWRI))) {
					goto restart;
				}
			}
		}

		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
				    (nbp->b_vp != vp) ||
				    (nbp->b_flags & B_DELWRI) == 0)) {
					goto restart;
				}
			}
		}
	}

	if (length > 0) {
restartsync:
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					if (bp->b_vp == vp) {
						bp->b_flags |= B_ASYNC;
					} else {
						bp->b_flags &= ~B_ASYNC;
					}
					BUF_WRITE(bp);
				}
				goto restartsync;
			}

		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
	}

	splx(s);

	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	int s;

	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));

	vhold(vp);
	bp->b_vp = vp;
	bp->b_dev = vn_todev(vp);
	/*
	 * Insert onto list for new vnode.
	 */
	s = splbio();
	bp->b_xflags |= BX_VNCLEAN;
	bp->b_xflags &= ~BX_VNDIRTY;
	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
	splx(s);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	struct buflists *listheadp;
	int s;

	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	s = splbio();
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &vp->v_dirtyblkhd;
		else
			listheadp = &vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
	}
	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
	splx(s);
	bp->b_vp = (struct vnode *) 0;
	vdrop(vp);
}

/*
 * Add an item to the syncer work queue.
 */
static void
vn_syncer_add_to_worklist(struct vnode *vp, int delay)
{
	int s, slot;

	s = splbio();

	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
	}

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
	vp->v_flag |= VONWORKLST;
	splx(s);
}

struct proc *updateproc;
static void sched_sync __P((void));
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

/*
 * System filesystem synchronizer daemon.
 */
void
sched_sync(void)
{
	struct synclist *slp;
	struct vnode *vp;
	struct mount *mp;
	long starttime;
	int s;
	struct proc *p = updateproc;

	mtx_enter(&Giant, MTX_DEF);

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
	    SHUTDOWN_PRI_LAST);

	for (;;) {
		kthread_suspend_check(p);

		starttime = time_second;

		/*
		 * Push files whose dirty time has expired.  Be careful
		 * of interrupt race on slp queue.
		 */
		s = splbio();
		slp = &syncer_workitem_pending[syncer_delayno];
		syncer_delayno += 1;
		if (syncer_delayno == syncer_maxdelay)
			syncer_delayno = 0;
		splx(s);

		while ((vp = LIST_FIRST(slp)) != NULL) {
			if (VOP_ISLOCKED(vp, NULL) == 0 &&
			    vn_start_write(vp, &mp, V_NOWAIT) == 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
				(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
				VOP_UNLOCK(vp, 0, p);
				vn_finished_write(mp);
			}
			s = splbio();
			if (LIST_FIRST(slp) == vp) {
				/*
				 * Note: v_tag VT_VFS vps can remain on the
				 * worklist too with no dirty blocks, but
				 * since sync_fsync() moves it to a different
				 * slot we are safe.
				 */
				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
				    !vn_isdisk(vp, NULL))
					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
				/*
				 * Put us back on the worklist.  The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 */
				vn_syncer_add_to_worklist(vp, syncdelay);
			}
			splx(s);
		}

		/*
		 * Do soft update processing.
		 */
#ifdef SOFTUPDATES
		softdep_process_worklist(NULL);
#endif

		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process.  A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP.
		 * Currently rushjob is used by the soft update code to speed
		 * up the filesystem syncer process when the incore state is
		 * getting so far ahead of the disk that the kernel memory
		 * pool is being threatened with exhaustion.
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * If it has taken us less than a second to process the
		 * current work, then wait.  Otherwise start right over
		 * again.  We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (time_second == starttime)
			tsleep(&lbolt, PPAUSE, "syncer", 0);
	}
}

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 */
int
speedup_syncer()
{

	mtx_enter(&sched_lock, MTX_SPIN);
	if (updateproc->p_wchan == &lbolt)
		setrunnable(updateproc);
	mtx_exit(&sched_lock, MTX_SPIN);
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		stat_rush_requests += 1;
		return (1);
	}
	return(0);
}

/*
 * Associate a p-buffer with a vnode.
 *
 * Also sets B_PAGING flag to indicate that vnode is not fully associated
 * with the buffer.  i.e. the bp has not been linked into the vnode or
 * ref-counted.
 */
void
pbgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{

	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));

	bp->b_vp = vp;
	bp->b_flags |= B_PAGING;
	bp->b_dev = vn_todev(vp);
}

/*
 * Disassociate a p-buffer from a vnode.
 */
void
pbrelvp(bp)
	register struct buf *bp;
{

	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));

	/* XXX REMOVE ME */
	if (bp->b_vnbufs.tqe_next != NULL) {
		panic(
		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
		    bp,
		    (int)bp->b_flags
		);
	}
	bp->b_vp = (struct vnode *) 0;
	bp->b_flags &= ~B_PAGING;
}

/*
 * Change the vnode a pager buffer is associated with.
 */
void
pbreassignbuf(bp, newvp)
	struct buf *bp;
	struct vnode *newvp;
{

	KASSERT(bp->b_flags & B_PAGING,
	    ("pbreassignbuf() on non phys bp %p", bp));
	bp->b_vp = newvp;
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	struct buflists *listheadp;
	int delay;
	int s;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}
	++reassignbufcalls;

	/*
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	 */
	if (bp->b_flags & B_PAGING)
		panic("cannot reassign paging buffer");

	s = splbio();
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &bp->b_vp->v_dirtyblkhd;
		else
			listheadp = &bp->b_vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
		if (bp->b_vp != newvp) {
			vdrop(bp->b_vp);
			bp->b_vp = NULL;	/* for clarification */
		}
	}
	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI) {
		struct buf *tbp;

		listheadp = &newvp->v_dirtyblkhd;
		if ((newvp->v_flag & VONWORKLST) == 0) {
			switch (newvp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VCHR:
				if (newvp->v_rdev->si_mountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
			}
			vn_syncer_add_to_worklist(newvp, delay);
		}
		bp->b_xflags |= BX_VNDIRTY;
		tbp = TAILQ_FIRST(listheadp);
		if (tbp == NULL ||
		    bp->b_lblkno == 0 ||
		    (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (bp->b_lblkno < 0) {
			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (reassignbufmethod == 1) {
			/*
			 * New sorting algorithm, only handle sequential case,
			 * otherwise append to end (but before metadata)
			 */
			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
			    (tbp->b_xflags & BX_VNDIRTY)) {
				/*
				 * Found the best place to insert the buffer
				 */
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortgood;
			} else {
				/*
				 * Missed, append to end, but before meta-data.
				 * We know that the head buffer in the list is
				 * not meta-data due to prior conditionals.
				 *
				 * Indirect effects: NFS second stage write
				 * tends to wind up here, giving maximum
				 * distance between the unstable write and the
				 * commit rpc.
				 */
				tbp = TAILQ_LAST(listheadp, buflists);
				while (tbp && tbp->b_lblkno < 0)
					tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortbad;
			}
		} else {
			/*
			 * Old sorting algorithm, scan queue and insert
			 */
			struct buf *ttbp;
			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
			    (ttbp->b_lblkno < bp->b_lblkno)) {
				++reassignbufloops;
				tbp = ttbp;
			}
			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
		}
	} else {
		bp->b_xflags |= BX_VNCLEAN;
		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
		if ((newvp->v_flag & VONWORKLST) &&
		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	}
	if (bp->b_vp != newvp) {
		bp->b_vp = newvp;
		vhold(bp->b_vp);
	}
	splx(s);
}

/*
 * Create a vnode for a device.
 * Used for mounting the root file system.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	if (vfinddev(dev, VCHR, vpp))
		return (0);
	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VCHR;
	addalias(vp, dev);
	*vpp = vp;
	return (0);
}

/*
 * Add vnode to the alias list hung off the dev_t.
 *
 * The reason for this gunk is that multiple vnodes can reference
 * the same physical device, so checking vp->v_usecount to see
 * how many users there are is inadequate; the v_usecount for
 * the vnodes needs to be accumulated.  vcount() does that.
 */
struct vnode *
addaliasu(nvp, nvp_rdev)
	struct vnode *nvp;
	udev_t nvp_rdev;
{
	struct vnode *ovp;
	vop_t **ops;
	dev_t dev;

	if (nvp->v_type == VBLK)
		return (nvp);
	if (nvp->v_type != VCHR)
		panic("addaliasu on non-special vnode");
	dev = udev2dev(nvp_rdev, 0);
	/*
	 * Check to see if we have a bdevvp vnode with no associated
	 * filesystem.  If so, we want to associate the filesystem of
	 * the newly instigated vnode with the bdevvp vnode and
	 * discard the newly created vnode rather than leaving the
	 * bdevvp vnode lying around with no associated filesystem.
	 */
	if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
		addalias(nvp, dev);
		return (nvp);
	}
	/*
	 * Discard unneeded vnode, but save its node specific data.
	 * Note that if there is a lock, it is carried over in the
	 * node specific data to the replacement vnode.
	 */
	vref(ovp);
	ovp->v_data = nvp->v_data;
	ovp->v_tag = nvp->v_tag;
	nvp->v_data = NULL;
	ops = nvp->v_op;
	nvp->v_op = ovp->v_op;
	ovp->v_op = ops;
	lockinit(&ovp->v_lock, PVFS, "vnlock", 0, LK_NOPAUSE);
	if (nvp->v_vnlock)
		ovp->v_vnlock = &ovp->v_lock;
	insmntque(ovp, nvp->v_mount);
	vrele(nvp);
	vgone(nvp);
	return (ovp);
}

/*
 * This is a local helper function that does the same as addaliasu, but
 * for a dev_t instead of an udev_t.
 */
static void
addalias(nvp, dev)
	struct vnode *nvp;
	dev_t dev;
{

	KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
	nvp->v_rdev = dev;
	simple_lock(&spechash_slock);
	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
	simple_unlock(&spechash_slock);
}
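
/*
 * Illustrative sketch only (not part of the original code): the usual
 * reference pattern followed by callers of the routines below.  A
 * hypothetical consumer takes a locked reference with vget(), operates
 * on the vnode, and then drops it with vput() (still locked) or vrele()
 * (already unlocked):
 *
 *	if (vget(vp, LK_EXCLUSIVE, p) == 0) {
 *		... use the locked, referenced vnode ...
 *		vput(vp);	(unlocks and releases in one step)
 *	}
 */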

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it.  The vnode lock bit is set if the
 * vnode is being eliminated in vgone.  The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0)
		mtx_enter(&vp->v_interlock, MTX_DEF);
	if (vp->v_flag & VXLOCK) {
		if (vp->v_vxproc == curproc) {
			printf("VXLOCK interlock avoided\n");
		} else {
			vp->v_flag |= VXWANT;
			msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
			    "vget", 0);
			return (ENOENT);
		}
	}

	vp->v_usecount++;

	if (VSHOULDBUSY(vp))
		vbusy(vp);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active.  We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			mtx_enter(&vp->v_interlock, MTX_DEF);
			vp->v_usecount--;
			if (VSHOULDFREE(vp))
				vfree(vp);
			mtx_exit(&vp->v_interlock, MTX_DEF);
		}
		return (error);
	}
	mtx_exit(&vp->v_interlock, MTX_DEF);
	return (0);
}

/*
 * Increase the reference count of a vnode.
 */
void
vref(struct vnode *vp)
{
	mtx_enter(&vp->v_interlock, MTX_DEF);
	vp->v_usecount++;
	mtx_exit(&vp->v_interlock, MTX_DEF);
}

/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vrele: null vp"));

	mtx_enter(&vp->v_interlock, MTX_DEF);

	KASSERT(vp->v_writecount < vp->v_usecount, ("vrele: missed vn_close"));

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		mtx_exit(&vp->v_interlock, MTX_DEF);

		return;
	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
		/*
		 * If we are doing a vput, the node is already locked, and we
		 * must call VOP_INACTIVE with the node locked.  So, in the
		 * case of vrele, we explicitly lock the vnode before calling
		 * VOP_INACTIVE.
		 */
		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
			VOP_INACTIVE(vp, p);
		}

	} else {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
		mtx_exit(&vp->v_interlock, MTX_DEF);
#endif
		panic("vrele: negative ref cnt");
	}
}

/*
 * Release an already locked vnode.  This gives the same effects as
 * unlock+vrele(), but takes less time and avoids releasing and
 * re-acquiring the lock (as vrele() acquires the lock internally.)
 */
void
vput(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vput: null vp"));
	mtx_enter(&vp->v_interlock, MTX_DEF);
	KASSERT(vp->v_writecount < vp->v_usecount, ("vput: missed vn_close"));

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		VOP_UNLOCK(vp, LK_INTERLOCK, p);
		return;

	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
		/*
		 * If we are doing a vput, the node is already locked, and we
		 * must call VOP_INACTIVE with the node locked.  So, in the
		 * case of vrele, we explicitly lock the vnode before calling
		 * VOP_INACTIVE.
		 */
		mtx_exit(&vp->v_interlock, MTX_DEF);
		VOP_INACTIVE(vp, p);

	} else {
#ifdef DIAGNOSTIC
		vprint("vput: negative ref count", vp);
#endif
		panic("vput: negative ref cnt");
	}
}

/*
 * Somebody doesn't want the vnode recycled.
 */
void
vhold(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	vp->v_holdcnt++;
	if (VSHOULDBUSY(vp))
		vbusy(vp);
	splx(s);
}

/*
 * Note that there is one less who cares about this vnode.  vdrop() is the
 * opposite of vhold().
 */
void
vdrop(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	if (vp->v_holdcnt <= 0)
		panic("vdrop: holdcnt");
	vp->v_holdcnt--;
	if (VSHOULDFREE(vp))
		vfree(vp);
	splx(s);
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error).  If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = LIST_NEXT(vp, v_mntvnodes);
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		mtx_enter(&vp->v_interlock, MTX_DEF);
		/*
		 * Skip over vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			mtx_exit(&vp->v_interlock, MTX_DEF);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file vnodes
		 * open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			mtx_exit(&vp->v_interlock, MTX_DEF);
			continue;
		}

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode.  For block
		 * or character devices, revert to an anonymous device.  For
		 * all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		mtx_exit(&vp->v_interlock, MTX_DEF);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 */
static void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	int active;

	/*
	 * Check to see if the vnode is in use.  If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		vp->v_usecount++;

	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	vp->v_vxproc = curproc;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out.  The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

	/*
	 * Clean out any buffers associated with the vnode.
	 * If the flush fails, just toss the buffers.
	 */
	if (flags & DOCLOSE) {
		if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
			(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
		if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
			vinvalbuf(vp, 0, NOCRED, p, 0, 0);
	}

	VOP_DESTROYVOBJECT(vp);

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");

	if (active) {
		/*
		 * Inline copy of vrele() since VOP_INACTIVE
		 * has already been called.
		 */
		mtx_enter(&vp->v_interlock, MTX_DEF);
		if (--vp->v_usecount <= 0) {
#ifdef DIAGNOSTIC
			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
				vprint("vclean: bad ref count", vp);
				panic("vclean: ref cnt");
			}
#endif
			vfree(vp);
		}
		mtx_exit(&vp->v_interlock, MTX_DEF);
	}

	cache_purge(vp);
	vp->v_vnlock = NULL;
	lockdestroy(&vp->v_lock);

	if (VSHOULDFREE(vp))
		vfree(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vn_pollgone(vp);
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	vp->v_vxproc = NULL;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t) vp);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
vop_revoke(ap)
	struct vop_revoke_args /* {
		struct vnode *a_vp;
		int a_flags;
	} */ *ap;
{
	struct vnode *vp, *vq;
	dev_t dev;

	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));

	vp = ap->a_vp;
	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
		    "vop_revokeall", 0);
		return (0);
	}
	dev = vp->v_rdev;
	for (;;) {
		simple_lock(&spechash_slock);
		vq = SLIST_FIRST(&dev->si_hlist);
		simple_unlock(&spechash_slock);
		if (!vq)
			break;
		vgone(vq);
	}
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	mtx_enter(&vp->v_interlock, MTX_DEF);
	if (vp->v_usecount == 0) {
		if (inter_lkp) {
			simple_unlock(inter_lkp);
		}
		vgonel(vp, p);
		return (1);
	}
	mtx_exit(&vp->v_interlock, MTX_DEF);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	mtx_enter(&vp->v_interlock, MTX_DEF);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int s;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
		    "vgone", 0);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	mtx_enter(&vp->v_interlock, MTX_DEF);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) {
		simple_lock(&spechash_slock);
		SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
		freedev(vp->v_rdev);
		simple_unlock(&spechash_slock);
		vp->v_rdev = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list.  The test of the
	 * VDOOMED flag and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone.  If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 */
	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
		s = splbio();
		simple_lock(&vnode_free_list_slock);
		if (vp->v_flag & VFREE)
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		else
			freevnodes++;
		vp->v_flag |= VFREE;
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
		splx(s);
	}

	vp->v_type = VBAD;
	mtx_exit(&vp->v_interlock, MTX_DEF);
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	struct vnode *vp;

	simple_lock(&spechash_slock);
	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
		if (type == vp->v_type) {
			*vpp = vp;
			simple_unlock(&spechash_slock);
			return (1);
		}
	}
	simple_unlock(&spechash_slock);
	return (0);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
	struct vnode *vp;
{
	struct vnode *vq;
	int count;

	count = 0;
	simple_lock(&spechash_slock);
	SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext)
		count += vq->v_usecount;
	simple_unlock(&spechash_slock);
	return (count);
}

/*
 * Same as above, but using the dev_t as argument
 */
int
count_dev(dev)
	dev_t dev;
{
	struct vnode *vp;

	vp = SLIST_FIRST(&dev->si_hlist);
	if (vp == NULL)
		return (0);
	return(vcount(vp));
}

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

void
vprint(label, vp)
	char *label;
	struct vnode *vp;
{
	char buf[96];

	if (label != NULL)
		printf("%s: %p: ", label, (void *)vp);
	else
		printf("%p: ", (void *)vp);
	printf("type %s, usecount %d, writecount %d, refcount %d,",
	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VXLOCK)
		strcat(buf, "|VXLOCK");
	if (vp->v_flag & VXWANT)
		strcat(buf, "|VXWANT");
	if (vp->v_flag & VBWAIT)
		strcat(buf, "|VBWAIT");
	if (vp->v_flag & VDOOMED)
		strcat(buf, "|VDOOMED");
	if (vp->v_flag & VFREE)
		strcat(buf, "|VFREE");
	if (vp->v_flag & VOBJBUF)
		strcat(buf, "|VOBJBUF");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DDB
#include <ddb/ddb.h>
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
 */
DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	mtx_enter(&mountlist_mtx, MTX_DEF);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
			if (VOP_ISLOCKED(vp, NULL))
				vprint((char *)0, vp);
		}
		mtx_enter(&mountlist_mtx, MTX_DEF);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, p);
	}
	mtx_exit(&mountlist_mtx, MTX_DEF);
}
#endif

/*
 * Top level filesystem related information gathering.
 */
static int	sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS));

static int
vfs_sysctl(SYSCTL_HANDLER_ARGS)
{
	int *name = (int *)arg1 - 1;	/* XXX */
	u_int namelen = arg2 + 1;	/* XXX */
	struct vfsconf *vfsp;

#if 1 || defined(COMPAT_PRELITE2)
	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
	if (namelen == 1)
		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
#endif

	/* XXX the below code does not compile; vfs_sysctl does not exist. */
#ifdef notyet
	/* all sysctl names at this level are at least name and field */
	if (namelen < 2)
		return (ENOTDIR);		/* overloaded */
	if (name[0] != VFS_GENERIC) {
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[0])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
		    oldp, oldlenp, newp, newlen, p));
	}
#endif
	switch (name[1]) {
	case VFS_MAXTYPENUM:
		if (namelen != 2)
			return (ENOTDIR);
		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
	case VFS_CONF:
		if (namelen != 3)
			return (ENOTDIR);	/* overloaded */
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[2])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
	}
	return (EOPNOTSUPP);
}

SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
	"Generic filesystem");

#if 1 || defined(COMPAT_PRELITE2)

static int
sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct vfsconf *vfsp;
	struct ovfsconf ovfs;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
		strcpy(ovfs.vfc_name, vfsp->vfc_name);
		ovfs.vfc_index = vfsp->vfc_typenum;
		ovfs.vfc_refcount = vfsp->vfc_refcount;
		ovfs.vfc_flags = vfsp->vfc_flags;
		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
		if (error)
			return error;
	}
	return 0;
}

#endif /* 1 || COMPAT_PRELITE2 */

#if COMPILING_LINT
#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
static int
sysctl_vnode(SYSCTL_HANDLER_ARGS)
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *nvp, *vp;
	int error;

#define VPTRSZ	sizeof (struct vnode *)
#define VNODESZ	sizeof (struct vnode)

	req->lock = 0;
	if (!req->oldptr) /* Make an estimate */
		return (SYSCTL_OUT(req, 0,
		    (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));

	mtx_enter(&mountlist_mtx, MTX_DEF);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
again:
		simple_lock(&mntvnode_slock);
		for (vp = LIST_FIRST(&mp->mnt_vnodelist);
		     vp != NULL;
		     vp = nvp) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem.  RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp) {
				simple_unlock(&mntvnode_slock);
				goto again;
			}
			nvp = LIST_NEXT(vp, v_mntvnodes);
			simple_unlock(&mntvnode_slock);
			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
			    (error = SYSCTL_OUT(req, vp, VNODESZ)))
				return (error);
			simple_lock(&mntvnode_slock);
		}
		simple_unlock(&mntvnode_slock);
		mtx_enter(&mountlist_mtx, MTX_DEF);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, p);
	}
	mtx_exit(&mountlist_mtx, MTX_DEF);

	return (0);
}

/*
 * XXX
 * Exporting the vnode list on large systems causes them to crash.
 * Exporting the vnode list on medium systems causes sysctl to coredump.
 */
SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
	0, 0, sysctl_vnode, "S,vnode", "");
#endif

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(vp)
	struct vnode *vp;
{

	if (vp->v_rdev->si_mountpoint != NULL)
		return (EBUSY);
	return (0);
}

/*
 * Unmount all filesystems. The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */
void
vfs_unmountall()
{
	struct mount *mp;
	struct proc *p;
	int error;

	if (curproc != NULL)
		p = curproc;
	else
		p = initproc;	/* XXX XXX should this be proc0? */
	/*
	 * Since this only runs when rebooting, it is not interlocked.
	 */
	while (!TAILQ_EMPTY(&mountlist)) {
		mp = TAILQ_LAST(&mountlist, mntlist);
		error = dounmount(mp, MNT_FORCE, p);
		if (error) {
			TAILQ_REMOVE(&mountlist, mp, mnt_list);
			printf("unmount of %s failed (",
			    mp->mnt_stat.f_mntonname);
			if (error == EBUSY)
				printf("BUSY)\n");
			else
				printf("%d)\n", error);
		} else {
			/* The unmount has removed mp from the mountlist */
		}
	}
}

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
 */
static int
vfs_hang_addrlist(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	register int i;
	struct radix_node *rn;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		np->netc_anon = argp->ex_anon;
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}
	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO);
	saddr = (struct sockaddr *) (np + 1);
	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (argp->ex_masklen) {
		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not used,
		 * do so on demand here
		 */
		for (dom = domains; dom; dom = dom->dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **) &nep->ne_rtable[i],
				    dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}
	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
	    np->netc_rnodes);
	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
		error = EPERM;
		goto out;
	}
	np->netc_exflags = argp->ex_flags;
	np->netc_anon = argp->ex_anon;
	np->netc_anon.cr_ref = 1;
	return (0);
out:
	free(np, M_NETADDR);
	return (error);
}

/* Helper for vfs_free_addrlist. */
/* ARGSUSED */
static int
vfs_free_netcred(rn, w)
	struct radix_node *rn;
	void *w;
{
	register struct radix_node_head *rnh = (struct radix_node_head *) w;

	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	free((caddr_t) rn, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(nep)
	struct netexport *nep;
{
	register int i;
	register struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i])) {
			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
			    (caddr_t) rnh);
			free((caddr_t) rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

/*
 * High level function to manipulate export options on a mount point
 * and the passed in netexport.
 * Struct export_args *argp is the variable used to twiddle options;
 * the structure is described in sys/mount.h.
 */
int
vfs_export(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		if (mp->mnt_flag & MNT_EXPUBLIC) {
			vfs_setpublicfs(NULL, NULL, NULL);
			mp->mnt_flag &= ~MNT_EXPUBLIC;
		}
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (argp->ex_flags & MNT_EXPUBLIC) {
			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
				return (error);
			mp->mnt_flag |= MNT_EXPUBLIC;
		}
		if ((error = vfs_hang_addrlist(mp, nep, argp)))
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}

/*
 * Set the publicly exported filesystem (WebNFS).  Currently, only
 * one public filesystem is possible in the spec (RFC 2054 and 2055).
 */
int
vfs_setpublicfs(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;
	struct vnode *rvp;
	char *cp;

	/*
	 * mp == NULL -> invalidate the current info; the FS is
	 * no longer exported.  May be called from either vfs_export
	 * or unmount, so check if it hasn't already been done.
	 */
	if (mp == NULL) {
		if (nfs_pub.np_valid) {
			nfs_pub.np_valid = 0;
			if (nfs_pub.np_index != NULL) {
				FREE(nfs_pub.np_index, M_TEMP);
				nfs_pub.np_index = NULL;
			}
		}
		return (0);
	}

	/*
	 * Only one allowed at a time.
	 */
	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
		return (EBUSY);

	/*
	 * Get real filehandle for root of exported FS.
	 */
	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;

	if ((error = VFS_ROOT(mp, &rvp)))
		return (error);

	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
		return (error);

	vput(rvp);

	/*
	 * If an indexfile was specified, pull it in.
	 */
	if (argp->ex_indexfile != NULL) {
		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
		    M_WAITOK);
		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
		    MAXNAMLEN, (size_t *)0);
		if (!error) {
			/*
			 * Check for illegal filenames.
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}

/*
 * Used by the filesystems to determine if a given network address
 * (passed in 'nam') is present in their exports list; returns a pointer
 * to struct netcred so that the filesystem can examine it for
 * access rights (read/write/etc).
 */
struct netcred *
vfs_export_lookup(mp, nep, nam)
	register struct mount *mp;
	struct netexport *nep;
	struct sockaddr *nam;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = nam;
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
				    (*rnh->rnh_matchaddr)((caddr_t)saddr,
				    rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}

/*
 * Perform msync on all vnodes under a mount point.
 * The mount point must be locked.
 */
void
vfs_msync(struct mount *mp, int flags)
{
	struct vnode *vp, *nvp;
	struct vm_object *obj;
	int anyio, tries;

	tries = 5;
loop:
	anyio = 0;
	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) {

		nvp = LIST_NEXT(vp, v_mntvnodes);

		if (vp->v_mount != mp) {
			goto loop;
		}

		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
			continue;

		if (flags != MNT_WAIT) {
			if (VOP_GETVOBJECT(vp, &obj) != 0 ||
			    (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
				continue;
			if (VOP_ISLOCKED(vp, NULL))
				continue;
		}

		mtx_enter(&vp->v_interlock, MTX_DEF);
		if (VOP_GETVOBJECT(vp, &obj) == 0 &&
		    (obj->flags & OBJ_MIGHTBEDIRTY)) {
			if (!vget(vp,
			    LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
				if (VOP_GETVOBJECT(vp, &obj) == 0) {
					vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
					anyio = 1;
				}
				vput(vp);
			}
		} else {
			mtx_exit(&vp->v_interlock, MTX_DEF);
		}
	}
	if (anyio && (--tries > 0))
		goto loop;
}

/*
 * Create the VM object needed for VMIO and mmap support.  This
 * is done for all VREG files in the system.  Some filesystems might
 * want to take advantage of the additional metadata buffering
 * capability of the VMIO code by making the device node be VMIO
 * mode also.
 *
 * vp must be locked when vfs_object_create is called.
 */
int
vfs_object_create(vp, p, cred)
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
{
	return (VOP_CREATEVOBJECT(vp, cred, p));
}

/*
 * Mark a vnode as free, putting it up for recycling.
 */
void
vfree(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
	if (vp->v_flag & VAGE) {
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	} else {
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	}
	freevnodes++;
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~VAGE;
	vp->v_flag |= VFREE;
	splx(s);
}

/*
 * Opposite of vfree() - mark a vnode as in use.
 */
void
vbusy(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free"));
	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	freevnodes--;
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~(VFREE|VAGE);
	splx(s);
}

/*
 * Record a process's interest in events which might happen to
 * a vnode.
 * Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(vp, p, events)
	struct vnode *vp;
	struct proc *p;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo.vpi_revents;
		vp->v_pollinfo.vpi_revents &= ~events;

		simple_unlock(&vp->v_pollinfo.vpi_lock);
		return events;
	}
	vp->v_pollinfo.vpi_events |= events;
	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
	simple_unlock(&vp->v_pollinfo.vpi_lock);
	return 0;
}

/*
 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
 * it is possible for us to miss an event due to race conditions, but
 * that condition is expected to be rare, so for the moment it is the
 * preferred interface.
 */
void
vn_pollevent(vp, events)
	struct vnode *vp;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events & events) {
		/*
		 * We clear vpi_events so that we don't
		 * call selwakeup() twice if two events are
		 * posted before the polling process(es) is
		 * awakened.  This also ensures that we take at
		 * most one selwakeup() if the polling process
		 * is no longer interested.  However, it does
		 * mean that only one event can be noticed at
		 * a time.  (Perhaps we should only clear those
		 * event bits which we note?) XXX
		 */
		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
		vp->v_pollinfo.vpi_revents |= events;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}

/*
 * Wake up anyone polling on vp because it is being revoked.
 * This depends on dead_poll() returning POLLHUP for correct
 * behavior.
 */
void
vn_pollgone(vp)
	struct vnode *vp;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events) {
		vp->v_pollinfo.vpi_events = 0;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}



/*
 * Routine to create and manage a filesystem syncer vnode.
 */
#define sync_close ((int (*) __P((struct vop_close_args *)))nullop)
static int	sync_fsync __P((struct vop_fsync_args *));
static int	sync_inactive __P((struct vop_inactive_args *));
static int	sync_reclaim __P((struct vop_reclaim_args *));
#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock)
#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock)
static int	sync_print __P((struct vop_print_args *));
#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)

static vop_t **sync_vnodeop_p;
static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
	{ NULL, NULL }
};
static struct vnodeopv_desc sync_vnodeop_opv_desc =
	{ &sync_vnodeop_p, sync_vnodeop_entries };

VNODEOP_SET(sync_vnodeop_opv_desc);

/*
 * Create a new filesystem syncer vnode for the specified mount point.
 */
int
vfs_allocate_syncvnode(mp)
	struct mount *mp;
{
	struct vnode *vp;
	static long start, incr, next;
	int error;

	/* Allocate a new vnode */
	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
		mp->mnt_syncer = NULL;
		return (error);
	}
	vp->v_type = VNON;
	/*
	 * Place the vnode onto the syncer worklist. We attempt to
	 * scatter them about on the list so that they will go off
	 * at evenly distributed times even if all the filesystems
	 * are mounted at once.
	 */
	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
	mp->mnt_syncer = vp;
	return (0);
}

/*
 * Do a lazy sync of the filesystem.
 */
static int
sync_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		struct ucred *a_cred;
		int a_waitfor;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *syncvp = ap->a_vp;
	struct mount *mp = syncvp->v_mount;
	struct proc *p = ap->a_p;
	int asyncflag;

	/*
	 * We only need to do something if this is a lazy evaluation.
	 */
	if (ap->a_waitfor != MNT_LAZY)
		return (0);

	/*
	 * Move ourselves to the back of the sync list.
	 */
	vn_syncer_add_to_worklist(syncvp, syncdelay);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 */
	mtx_enter(&mountlist_mtx, MTX_DEF);
	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, p) != 0) {
		mtx_exit(&mountlist_mtx, MTX_DEF);
		return (0);
	}
	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
		vfs_unbusy(mp, p);
		return (0);
	}
	asyncflag = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	vfs_msync(mp, MNT_NOWAIT);
	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
	if (asyncflag)
		mp->mnt_flag |= MNT_ASYNC;
	vn_finished_write(mp);
	vfs_unbusy(mp, p);
	return (0);
}

/*
 * The syncer vnode is no longer referenced.
 */
static int
sync_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct proc *a_p;
	} */ *ap;
{

	vgone(ap->a_vp);
	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 *
 * Modifications to the worklist must be protected at splbio().
 */
static int
sync_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	int s;

	s = splbio();
	vp->v_mount->mnt_syncer = NULL;
	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
		vp->v_flag &= ~VONWORKLST;
	}
	splx(s);

	return (0);
}

/*
 * Print out a syncer vnode.
 */
static int
sync_print(ap)
	struct vop_print_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;

	printf("syncer vnode");
	if (vp->v_vnlock != NULL)
		lockmgr_printinfo(vp->v_vnlock);
	printf("\n");
	return (0);
}

/*
 * Extract the dev_t from a VCHR vnode.
 */
dev_t
vn_todev(vp)
	struct vnode *vp;
{
	if (vp->v_type != VCHR)
		return (NODEV);
	return (vp->v_rdev);
}

/*
 * Check if a vnode represents a disk device.
 */
int
vn_isdisk(vp, errp)
	struct vnode *vp;
	int *errp;
{
	struct cdevsw *cdevsw;

	if (vp->v_type != VCHR) {
		if (errp != NULL)
			*errp = ENOTBLK;
		return (0);
	}
	if (vp->v_rdev == NULL) {
		if (errp != NULL)
			*errp = ENXIO;
		return (0);
	}
	cdevsw = devsw(vp->v_rdev);
	if (cdevsw == NULL) {
		if (errp != NULL)
			*errp = ENXIO;
		return (0);
	}
	if (!(cdevsw->d_flags & D_DISK)) {
		if (errp != NULL)
			*errp = ENOTBLK;
		return (0);
	}
	if (errp != NULL)
		*errp = 0;
	return (1);
}

/*
 * Free data allocated by namei(); see namei(9) for details.
 */
void
NDFREE(ndp, flags)
	struct nameidata *ndp;
	const uint flags;
{
	if (!(flags & NDF_NO_FREE_PNBUF) &&
	    (ndp->ni_cnd.cn_flags & HASBUF)) {
		zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
		ndp->ni_cnd.cn_flags &= ~HASBUF;
	}
	if (!(flags & NDF_NO_DVP_UNLOCK) &&
	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
	    ndp->ni_dvp != ndp->ni_vp)
		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc);
	if (!(flags & NDF_NO_DVP_RELE) &&
	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
		vrele(ndp->ni_dvp);
		ndp->ni_dvp = NULL;
	}
	if (!(flags & NDF_NO_VP_UNLOCK) &&
	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc);
	if (!(flags & NDF_NO_VP_RELE) &&
	    ndp->ni_vp) {
		vrele(ndp->ni_vp);
		ndp->ni_vp = NULL;
	}
	if (!(flags & NDF_NO_STARTDIR_RELE) &&
	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
		vrele(ndp->ni_startdir);
		ndp->ni_startdir = NULL;
	}
}

/*
 * Common file system object access control check routine.  Accepts a
 * vnode's type, "mode", uid and gid, requested access mode, credentials,
 * and optional call-by-reference privused argument allowing vaccess()
 * to indicate to the caller whether privilege was used to satisfy the
 * request.  Returns 0 on success, or an errno on failure.
 */
int
vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
	enum vtype type;
	mode_t file_mode;
	uid_t file_uid;
	gid_t file_gid;
	mode_t acc_mode;
	struct ucred *cred;
	int *privused;
{
	mode_t dac_granted;
#ifdef CAPABILITIES
	mode_t cap_granted;
#endif

	/*
	 * Look for a normal, non-privileged way to access the file/directory
	 * as requested.  If it exists, go with that.
	 */

	if (privused != NULL)
		*privused = 0;

	dac_granted = 0;

	/* Check the owner. */
	if (cred->cr_uid == file_uid) {
		dac_granted |= VADMIN;
		if (file_mode & S_IXUSR)
			dac_granted |= VEXEC;
		if (file_mode & S_IRUSR)
			dac_granted |= VREAD;
		if (file_mode & S_IWUSR)
			dac_granted |= VWRITE;

		if ((acc_mode & dac_granted) == acc_mode)
			return (0);

		goto privcheck;
	}

	/* Otherwise, check the groups (first match) */
	if (groupmember(file_gid, cred)) {
		if (file_mode & S_IXGRP)
			dac_granted |= VEXEC;
		if (file_mode & S_IRGRP)
			dac_granted |= VREAD;
		if (file_mode & S_IWGRP)
			dac_granted |= VWRITE;

		if ((acc_mode & dac_granted) == acc_mode)
			return (0);

		goto privcheck;
	}

	/* Otherwise, check everyone else. */
	if (file_mode & S_IXOTH)
		dac_granted |= VEXEC;
	if (file_mode & S_IROTH)
		dac_granted |= VREAD;
	if (file_mode & S_IWOTH)
		dac_granted |= VWRITE;
	if ((acc_mode & dac_granted) == acc_mode)
		return (0);

privcheck:
	if (!suser_xxx(cred, NULL, PRISON_ROOT)) {
		/* XXX audit: privilege used */
		if (privused != NULL)
			*privused = 1;
		return (0);
	}

#ifdef CAPABILITIES
	/*
	 * Build a capability mask to determine if the set of capabilities
	 * satisfies the requirements when combined with the granted mask
	 * from above.
	 * For each capability, if the capability is required, bitwise
	 * or the request type onto the cap_granted mask.
	 */
	cap_granted = 0;
	if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
	    !cap_check_xxx(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT))
		cap_granted |= VEXEC;

	if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
	    !cap_check_xxx(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
		cap_granted |= VREAD;

	if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
	    !cap_check_xxx(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT))
		cap_granted |= VWRITE;

	if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
	    !cap_check_xxx(cred, NULL, CAP_FOWNER, PRISON_ROOT))
		cap_granted |= VADMIN;

	if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
		/* XXX audit: privilege used */
		if (privused != NULL)
			*privused = 1;
		return (0);
	}
#endif

	return (EACCES);
}
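
/*
 * Editorial note, illustrative only: the block below is a minimal sketch of
 * how a filesystem's access VOP might call vaccess() above.  It is not part
 * of the original file and is excluded from compilation with #if 0.  The
 * names example_node and example_access, and the ex_mode/ex_uid/ex_gid
 * fields, are hypothetical stand-ins for a real filesystem's in-core inode
 * and VOP_ACCESS handler; only the vaccess() interface itself comes from
 * this file.
 */
#if 0
struct example_node {			/* hypothetical per-vnode private data */
	mode_t	ex_mode;		/* permission bits, e.g. 0644 */
	uid_t	ex_uid;			/* owning user */
	gid_t	ex_gid;			/* owning group */
};

static int
example_access(ap)
	struct vop_access_args /* {
		struct vnode *a_vp;
		int a_mode;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct example_node *np = (struct example_node *)vp->v_data;

	/*
	 * Hand the file's type, permission bits and ownership to the
	 * generic check; vaccess() returns 0 if access is granted or an
	 * errno (typically EACCES) if it is not.
	 */
	return (vaccess(vp->v_type, np->ex_mode, np->ex_uid, np->ex_gid,
	    ap->a_mode, ap->a_cred, NULL));
}
#endif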