1 /* 2 * Copyright (c) 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 
37 * 38 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 39 * $FreeBSD$ 40 */ 41 42 /* 43 * External virtual filesystem routines 44 */ 45 #include "opt_ddb.h" 46 #include "opt_ffs.h" 47 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/bio.h> 51 #include <sys/buf.h> 52 #include <sys/conf.h> 53 #include <sys/dirent.h> 54 #include <sys/domain.h> 55 #include <sys/eventhandler.h> 56 #include <sys/event.h> 57 #include <sys/fcntl.h> 58 #include <sys/kernel.h> 59 #include <sys/kthread.h> 60 #include <sys/ktr.h> 61 #include <sys/malloc.h> 62 #include <sys/mount.h> 63 #include <sys/mutex.h> 64 #include <sys/namei.h> 65 #include <sys/proc.h> 66 #include <sys/reboot.h> 67 #include <sys/socket.h> 68 #include <sys/stat.h> 69 #include <sys/sysctl.h> 70 #include <sys/vmmeter.h> 71 #include <sys/vnode.h> 72 73 #include <machine/limits.h> 74 75 #include <vm/vm.h> 76 #include <vm/vm_object.h> 77 #include <vm/vm_extern.h> 78 #include <vm/pmap.h> 79 #include <vm/vm_map.h> 80 #include <vm/vm_page.h> 81 #include <vm/vm_pager.h> 82 #include <vm/vnode_pager.h> 83 #include <vm/vm_zone.h> 84 85 static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); 86 87 static void addalias __P((struct vnode *vp, dev_t nvp_rdev)); 88 static void insmntque __P((struct vnode *vp, struct mount *mp)); 89 static void vclean __P((struct vnode *vp, int flags, struct proc *p)); 90 91 /* 92 * Number of vnodes in existence. Increased whenever getnewvnode() 93 * allocates a new vnode, never decreased. 94 */ 95 static unsigned long numvnodes; 96 SYSCTL_LONG(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); 97 98 /* 99 * Conversion tables for conversion from vnode types to inode formats 100 * and back. 101 */ 102 enum vtype iftovt_tab[16] = { 103 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 104 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, 105 }; 106 int vttoif_tab[9] = { 107 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 108 S_IFSOCK, S_IFIFO, S_IFMT, 109 }; 110 111 /* 112 * List of vnodes that are ready for recycling. 113 */ 114 static TAILQ_HEAD(freelst, vnode) vnode_free_list; 115 116 /* 117 * Minimum number of free vnodes. If there are fewer than this free vnodes, 118 * getnewvnode() will return a newly allocated vnode. 119 */ 120 static u_long wantfreevnodes = 25; 121 SYSCTL_LONG(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); 122 /* Number of vnodes in the free list. */ 123 static u_long freevnodes = 0; 124 SYSCTL_LONG(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); 125 126 /* 127 * Various variables used for debugging the new implementation of 128 * reassignbuf(). 129 * XXX these are probably of (very) limited utility now. 130 */ 131 static int reassignbufcalls; 132 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); 133 static int reassignbufloops; 134 SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, ""); 135 static int reassignbufsortgood; 136 SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, ""); 137 static int reassignbufsortbad; 138 SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, ""); 139 /* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */ 140 static int reassignbufmethod = 1; 141 SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, ""); 142 143 #ifdef ENABLE_VFS_IOOPT 144 /* See NOTES for a description of this setting. 
*/ 145 int vfs_ioopt = 0; 146 SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); 147 #endif 148 149 /* List of mounted filesystems. */ 150 struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); 151 152 /* For any iteration/modification of mountlist */ 153 struct mtx mountlist_mtx; 154 155 /* For any iteration/modification of mnt_vnodelist */ 156 struct mtx mntvnode_mtx; 157 158 /* 159 * Cache for the mount type id assigned to NFS. This is used for 160 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c. 161 */ 162 int nfs_mount_type = -1; 163 164 /* To keep more than one thread at a time from running vfs_getnewfsid */ 165 static struct mtx mntid_mtx; 166 167 /* For any iteration/modification of vnode_free_list */ 168 static struct mtx vnode_free_list_mtx; 169 170 /* 171 * For any iteration/modification of dev->si_hlist (linked through 172 * v_specnext) 173 */ 174 static struct mtx spechash_mtx; 175 176 /* Publicly exported FS */ 177 struct nfs_public nfs_pub; 178 179 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ 180 static vm_zone_t vnode_zone; 181 182 /* Set to 1 to print out reclaim of active vnodes */ 183 int prtactive = 0; 184 185 /* 186 * The workitem queue. 187 * 188 * It is useful to delay writes of file data and filesystem metadata 189 * for tens of seconds so that quickly created and deleted files need 190 * not waste disk bandwidth being created and removed. To realize this, 191 * we append vnodes to a "workitem" queue. When running with a soft 192 * updates implementation, most pending metadata dependencies should 193 * not wait for more than a few seconds. Thus, mounted on block devices 194 * are delayed only about a half the time that file data is delayed. 195 * Similarly, directory updates are more critical, so are only delayed 196 * about a third the time that file data is delayed. Thus, there are 197 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of 198 * one each second (driven off the filesystem syncer process). The 199 * syncer_delayno variable indicates the next queue that is to be processed. 200 * Items that need to be processed soon are placed in this queue: 201 * 202 * syncer_workitem_pending[syncer_delayno] 203 * 204 * A delay of fifteen seconds is done by placing the request fifteen 205 * entries later in the queue: 206 * 207 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] 208 * 209 */ 210 static int syncer_delayno = 0; 211 static long syncer_mask; 212 LIST_HEAD(synclist, vnode); 213 static struct synclist *syncer_workitem_pending; 214 215 #define SYNCER_MAXDELAY 32 216 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ 217 time_t syncdelay = 30; /* max time to delay syncing data */ 218 time_t filedelay = 30; /* time to delay syncing files */ 219 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); 220 time_t dirdelay = 29; /* time to delay syncing directories */ 221 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); 222 time_t metadelay = 28; /* time to delay syncing metadata */ 223 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); 224 static int rushjob; /* number of slots to run ASAP */ 225 static int stat_rush_requests; /* number of times I/O speeded up */ 226 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); 227 228 /* 229 * Number of vnodes we want to exist at any one time. This is mostly used 230 * to size hash tables in vnode-related code. 
It is normally not used in 231 * getnewvnode(), as wantfreevnodes is normally nonzero.) 232 * 233 * XXX desiredvnodes is historical cruft and should not exist. 234 */ 235 int desiredvnodes; 236 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, 237 &desiredvnodes, 0, "Maximum number of vnodes"); 238 239 static void vfs_free_addrlist __P((struct netexport *nep)); 240 static int vfs_free_netcred __P((struct radix_node *rn, void *w)); 241 static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, 242 struct export_args *argp)); 243 244 /* 245 * Initialize the vnode management data structures. 246 */ 247 static void 248 vntblinit(void *dummy __unused) 249 { 250 251 desiredvnodes = maxproc + cnt.v_page_count / 4; 252 mtx_init(&mountlist_mtx, "mountlist", MTX_DEF); 253 mtx_init(&mntvnode_mtx, "mntvnode", MTX_DEF); 254 mtx_init(&mntid_mtx, "mntid", MTX_DEF); 255 mtx_init(&spechash_mtx, "spechash", MTX_DEF); 256 TAILQ_INIT(&vnode_free_list); 257 mtx_init(&vnode_free_list_mtx, "vnode_free_list", MTX_DEF); 258 vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5); 259 /* 260 * Initialize the filesystem syncer. 261 */ 262 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, 263 &syncer_mask); 264 syncer_maxdelay = syncer_mask + 1; 265 } 266 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL) 267 268 269 /* 270 * Mark a mount point as busy. Used to synchronize access and to delay 271 * unmounting. Interlock is not released on failure. 272 */ 273 int 274 vfs_busy(mp, flags, interlkp, p) 275 struct mount *mp; 276 int flags; 277 struct mtx *interlkp; 278 struct proc *p; 279 { 280 int lkflags; 281 282 if (mp->mnt_kern_flag & MNTK_UNMOUNT) { 283 if (flags & LK_NOWAIT) 284 return (ENOENT); 285 mp->mnt_kern_flag |= MNTK_MWAIT; 286 /* 287 * Since all busy locks are shared except the exclusive 288 * lock granted when unmounting, the only place that a 289 * wakeup needs to be done is at the release of the 290 * exclusive lock at the end of dounmount. 291 */ 292 msleep((caddr_t)mp, interlkp, PVFS, "vfs_busy", 0); 293 return (ENOENT); 294 } 295 lkflags = LK_SHARED | LK_NOPAUSE; 296 if (interlkp) 297 lkflags |= LK_INTERLOCK; 298 if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p)) 299 panic("vfs_busy: unexpected lock failure"); 300 return (0); 301 } 302 303 /* 304 * Free a busy filesystem. 305 */ 306 void 307 vfs_unbusy(mp, p) 308 struct mount *mp; 309 struct proc *p; 310 { 311 312 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p); 313 } 314 315 /* 316 * Lookup a filesystem type, and if found allocate and initialize 317 * a mount structure for it. 318 * 319 * Devname is usually updated by mount(8) after booting. 
320 */ 321 int 322 vfs_rootmountalloc(fstypename, devname, mpp) 323 char *fstypename; 324 char *devname; 325 struct mount **mpp; 326 { 327 struct proc *p = curproc; /* XXX */ 328 struct vfsconf *vfsp; 329 struct mount *mp; 330 331 if (fstypename == NULL) 332 return (ENODEV); 333 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 334 if (!strcmp(vfsp->vfc_name, fstypename)) 335 break; 336 if (vfsp == NULL) 337 return (ENODEV); 338 mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); 339 lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); 340 (void)vfs_busy(mp, LK_NOWAIT, 0, p); 341 LIST_INIT(&mp->mnt_vnodelist); 342 mp->mnt_vfc = vfsp; 343 mp->mnt_op = vfsp->vfc_vfsops; 344 mp->mnt_flag = MNT_RDONLY; 345 mp->mnt_vnodecovered = NULLVP; 346 vfsp->vfc_refcount++; 347 mp->mnt_iosize_max = DFLTPHYS; 348 mp->mnt_stat.f_type = vfsp->vfc_typenum; 349 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; 350 strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); 351 mp->mnt_stat.f_mntonname[0] = '/'; 352 mp->mnt_stat.f_mntonname[1] = 0; 353 (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); 354 *mpp = mp; 355 return (0); 356 } 357 358 /* 359 * Find an appropriate filesystem to use for the root. If a filesystem 360 * has not been preselected, walk through the list of known filesystems 361 * trying those that have mountroot routines, and try them until one 362 * works or we have tried them all. 363 */ 364 #ifdef notdef /* XXX JH */ 365 int 366 lite2_vfs_mountroot() 367 { 368 struct vfsconf *vfsp; 369 extern int (*lite2_mountroot) __P((void)); 370 int error; 371 372 if (lite2_mountroot != NULL) 373 return ((*lite2_mountroot)()); 374 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { 375 if (vfsp->vfc_mountroot == NULL) 376 continue; 377 if ((error = (*vfsp->vfc_mountroot)()) == 0) 378 return (0); 379 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); 380 } 381 return (ENODEV); 382 } 383 #endif 384 385 /* 386 * Lookup a mount point by filesystem identifier. 387 */ 388 struct mount * 389 vfs_getvfs(fsid) 390 fsid_t *fsid; 391 { 392 register struct mount *mp; 393 394 mtx_lock(&mountlist_mtx); 395 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 396 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && 397 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { 398 mtx_unlock(&mountlist_mtx); 399 return (mp); 400 } 401 } 402 mtx_unlock(&mountlist_mtx); 403 return ((struct mount *) 0); 404 } 405 406 /* 407 * Get a new unique fsid. Try to make its val[0] unique, since this value 408 * will be used to create fake device numbers for stat(). Also try (but 409 * not so hard) make its val[0] unique mod 2^16, since some emulators only 410 * support 16-bit device numbers. We end up with unique val[0]'s for the 411 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. 412 * 413 * Keep in mind that several mounts may be running in parallel. Starting 414 * the search one past where the previous search terminated is both a 415 * micro-optimization and a defense against returning the same fsid to 416 * different mounts. 
417 */ 418 void 419 vfs_getnewfsid(mp) 420 struct mount *mp; 421 { 422 static u_int16_t mntid_base; 423 fsid_t tfsid; 424 int mtype; 425 426 mtx_lock(&mntid_mtx); 427 mtype = mp->mnt_vfc->vfc_typenum; 428 tfsid.val[1] = mtype; 429 mtype = (mtype & 0xFF) << 24; 430 for (;;) { 431 tfsid.val[0] = makeudev(255, 432 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); 433 mntid_base++; 434 if (vfs_getvfs(&tfsid) == NULL) 435 break; 436 } 437 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; 438 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; 439 mtx_unlock(&mntid_mtx); 440 } 441 442 /* 443 * Knob to control the precision of file timestamps: 444 * 445 * 0 = seconds only; nanoseconds zeroed. 446 * 1 = seconds and nanoseconds, accurate within 1/HZ. 447 * 2 = seconds and nanoseconds, truncated to microseconds. 448 * >=3 = seconds and nanoseconds, maximum precision. 449 */ 450 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; 451 452 static int timestamp_precision = TSP_SEC; 453 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, 454 ×tamp_precision, 0, ""); 455 456 /* 457 * Get a current timestamp. 458 */ 459 void 460 vfs_timestamp(tsp) 461 struct timespec *tsp; 462 { 463 struct timeval tv; 464 465 switch (timestamp_precision) { 466 case TSP_SEC: 467 tsp->tv_sec = time_second; 468 tsp->tv_nsec = 0; 469 break; 470 case TSP_HZ: 471 getnanotime(tsp); 472 break; 473 case TSP_USEC: 474 microtime(&tv); 475 TIMEVAL_TO_TIMESPEC(&tv, tsp); 476 break; 477 case TSP_NSEC: 478 default: 479 nanotime(tsp); 480 break; 481 } 482 } 483 484 /* 485 * Set vnode attributes to VNOVAL 486 */ 487 void 488 vattr_null(vap) 489 register struct vattr *vap; 490 { 491 492 vap->va_type = VNON; 493 vap->va_size = VNOVAL; 494 vap->va_bytes = VNOVAL; 495 vap->va_mode = VNOVAL; 496 vap->va_nlink = VNOVAL; 497 vap->va_uid = VNOVAL; 498 vap->va_gid = VNOVAL; 499 vap->va_fsid = VNOVAL; 500 vap->va_fileid = VNOVAL; 501 vap->va_blocksize = VNOVAL; 502 vap->va_rdev = VNOVAL; 503 vap->va_atime.tv_sec = VNOVAL; 504 vap->va_atime.tv_nsec = VNOVAL; 505 vap->va_mtime.tv_sec = VNOVAL; 506 vap->va_mtime.tv_nsec = VNOVAL; 507 vap->va_ctime.tv_sec = VNOVAL; 508 vap->va_ctime.tv_nsec = VNOVAL; 509 vap->va_flags = VNOVAL; 510 vap->va_gen = VNOVAL; 511 vap->va_vaflags = 0; 512 } 513 514 /* 515 * Routines having to do with the management of the vnode table. 516 */ 517 518 /* 519 * Return the next vnode from the free list. 520 */ 521 int 522 getnewvnode(tag, mp, vops, vpp) 523 enum vtagtype tag; 524 struct mount *mp; 525 vop_t **vops; 526 struct vnode **vpp; 527 { 528 int s, count; 529 struct proc *p = curproc; /* XXX */ 530 struct vnode *vp = NULL; 531 struct mount *vnmp; 532 vm_object_t object; 533 534 /* 535 * We take the least recently used vnode from the freelist 536 * if we can get it and it has no cached pages, and no 537 * namecache entries are relative to it. 538 * Otherwise we allocate a new vnode 539 */ 540 541 s = splbio(); 542 mtx_lock(&vnode_free_list_mtx); 543 544 if (wantfreevnodes && freevnodes < wantfreevnodes) { 545 vp = NULL; 546 } else if (!wantfreevnodes && freevnodes <= desiredvnodes) { 547 /* 548 * XXX: this is only here to be backwards compatible 549 */ 550 vp = NULL; 551 } else for (count = 0; count < freevnodes; count++) { 552 vp = TAILQ_FIRST(&vnode_free_list); 553 if (vp == NULL || vp->v_usecount) 554 panic("getnewvnode: free vnode isn't"); 555 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 556 /* 557 * Don't recycle if active in the namecache or 558 * if it still has cached pages or we cannot get 559 * its interlock. 
560 */ 561 if (LIST_FIRST(&vp->v_cache_src) != NULL || 562 (VOP_GETVOBJECT(vp, &object) == 0 && 563 (object->resident_page_count || object->ref_count)) || 564 !mtx_trylock(&vp->v_interlock)) { 565 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 566 vp = NULL; 567 continue; 568 } 569 /* 570 * Skip over it if its filesystem is being suspended. 571 */ 572 if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0) 573 break; 574 mtx_unlock(&vp->v_interlock); 575 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 576 vp = NULL; 577 } 578 if (vp) { 579 vp->v_flag |= VDOOMED; 580 vp->v_flag &= ~VFREE; 581 freevnodes--; 582 mtx_unlock(&vnode_free_list_mtx); 583 cache_purge(vp); 584 vp->v_lease = NULL; 585 if (vp->v_type != VBAD) { 586 vgonel(vp, p); 587 } else { 588 mtx_unlock(&vp->v_interlock); 589 } 590 vn_finished_write(vnmp); 591 592 #ifdef INVARIANTS 593 { 594 int s; 595 596 if (vp->v_data) 597 panic("cleaned vnode isn't"); 598 s = splbio(); 599 if (vp->v_numoutput) 600 panic("Clean vnode has pending I/O's"); 601 splx(s); 602 if (vp->v_writecount != 0) 603 panic("Non-zero write count"); 604 } 605 #endif 606 vp->v_flag = 0; 607 vp->v_lastw = 0; 608 vp->v_lasta = 0; 609 vp->v_cstart = 0; 610 vp->v_clen = 0; 611 vp->v_socket = 0; 612 } else { 613 mtx_unlock(&vnode_free_list_mtx); 614 vp = (struct vnode *) zalloc(vnode_zone); 615 bzero((char *) vp, sizeof *vp); 616 mtx_init(&vp->v_interlock, "vnode interlock", MTX_DEF); 617 vp->v_dd = vp; 618 mtx_init(&vp->v_pollinfo.vpi_lock, "vnode pollinfo", MTX_DEF); 619 cache_purge(vp); 620 LIST_INIT(&vp->v_cache_src); 621 TAILQ_INIT(&vp->v_cache_dst); 622 numvnodes++; 623 } 624 625 TAILQ_INIT(&vp->v_cleanblkhd); 626 TAILQ_INIT(&vp->v_dirtyblkhd); 627 vp->v_type = VNON; 628 vp->v_tag = tag; 629 vp->v_op = vops; 630 lockinit(&vp->v_lock, PVFS, "vnlock", 0, LK_NOPAUSE); 631 insmntque(vp, mp); 632 *vpp = vp; 633 vp->v_usecount = 1; 634 vp->v_data = 0; 635 splx(s); 636 637 vfs_object_create(vp, p, p->p_ucred); 638 return (0); 639 } 640 641 /* 642 * Move a vnode from one mount queue to another. 643 */ 644 static void 645 insmntque(vp, mp) 646 register struct vnode *vp; 647 register struct mount *mp; 648 { 649 650 mtx_lock(&mntvnode_mtx); 651 /* 652 * Delete from old mount point vnode list, if on one. 653 */ 654 if (vp->v_mount != NULL) 655 LIST_REMOVE(vp, v_mntvnodes); 656 /* 657 * Insert into list of vnodes for the new mount point, if available. 658 */ 659 if ((vp->v_mount = mp) == NULL) { 660 mtx_unlock(&mntvnode_mtx); 661 return; 662 } 663 LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); 664 mtx_unlock(&mntvnode_mtx); 665 } 666 667 /* 668 * Update outstanding I/O count and do wakeup if requested. 669 */ 670 void 671 vwakeup(bp) 672 register struct buf *bp; 673 { 674 register struct vnode *vp; 675 676 bp->b_flags &= ~B_WRITEINPROG; 677 if ((vp = bp->b_vp)) { 678 vp->v_numoutput--; 679 if (vp->v_numoutput < 0) 680 panic("vwakeup: neg numoutput"); 681 if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { 682 vp->v_flag &= ~VBWAIT; 683 wakeup((caddr_t) &vp->v_numoutput); 684 } 685 } 686 } 687 688 /* 689 * Flush out and invalidate all buffers associated with a vnode. 690 * Called with the underlying object locked. 
691 */ 692 int 693 vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) 694 register struct vnode *vp; 695 int flags; 696 struct ucred *cred; 697 struct proc *p; 698 int slpflag, slptimeo; 699 { 700 register struct buf *bp; 701 struct buf *nbp, *blist; 702 int s, error; 703 vm_object_t object; 704 705 if (flags & V_SAVE) { 706 s = splbio(); 707 while (vp->v_numoutput) { 708 vp->v_flag |= VBWAIT; 709 error = tsleep((caddr_t)&vp->v_numoutput, 710 slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo); 711 if (error) { 712 splx(s); 713 return (error); 714 } 715 } 716 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { 717 splx(s); 718 if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0) 719 return (error); 720 s = splbio(); 721 if (vp->v_numoutput > 0 || 722 !TAILQ_EMPTY(&vp->v_dirtyblkhd)) 723 panic("vinvalbuf: dirty bufs"); 724 } 725 splx(s); 726 } 727 s = splbio(); 728 for (;;) { 729 blist = TAILQ_FIRST(&vp->v_cleanblkhd); 730 if (!blist) 731 blist = TAILQ_FIRST(&vp->v_dirtyblkhd); 732 if (!blist) 733 break; 734 735 for (bp = blist; bp; bp = nbp) { 736 nbp = TAILQ_NEXT(bp, b_vnbufs); 737 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 738 error = BUF_TIMELOCK(bp, 739 LK_EXCLUSIVE | LK_SLEEPFAIL, 740 "vinvalbuf", slpflag, slptimeo); 741 if (error == ENOLCK) 742 break; 743 splx(s); 744 return (error); 745 } 746 /* 747 * XXX Since there are no node locks for NFS, I 748 * believe there is a slight chance that a delayed 749 * write will occur while sleeping just above, so 750 * check for it. Note that vfs_bio_awrite expects 751 * buffers to reside on a queue, while BUF_WRITE and 752 * brelse do not. 753 */ 754 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 755 (flags & V_SAVE)) { 756 757 if (bp->b_vp == vp) { 758 if (bp->b_flags & B_CLUSTEROK) { 759 BUF_UNLOCK(bp); 760 vfs_bio_awrite(bp); 761 } else { 762 bremfree(bp); 763 bp->b_flags |= B_ASYNC; 764 BUF_WRITE(bp); 765 } 766 } else { 767 bremfree(bp); 768 (void) BUF_WRITE(bp); 769 } 770 break; 771 } 772 bremfree(bp); 773 bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF); 774 bp->b_flags &= ~B_ASYNC; 775 brelse(bp); 776 } 777 } 778 779 while (vp->v_numoutput > 0) { 780 vp->v_flag |= VBWAIT; 781 tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); 782 } 783 784 splx(s); 785 786 /* 787 * Destroy the copy in the VM cache, too. 788 */ 789 mtx_lock(&vp->v_interlock); 790 if (VOP_GETVOBJECT(vp, &object) == 0) { 791 vm_object_page_remove(object, 0, 0, 792 (flags & V_SAVE) ? TRUE : FALSE); 793 } 794 mtx_unlock(&vp->v_interlock); 795 796 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd)) 797 panic("vinvalbuf: flush failed"); 798 return (0); 799 } 800 801 /* 802 * Truncate a file's buffer and pages to a specified length. This 803 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 804 * sync activity. 805 */ 806 int 807 vtruncbuf(vp, cred, p, length, blksize) 808 register struct vnode *vp; 809 struct ucred *cred; 810 struct proc *p; 811 off_t length; 812 int blksize; 813 { 814 register struct buf *bp; 815 struct buf *nbp; 816 int s, anyfreed; 817 int trunclbn; 818 819 /* 820 * Round up to the *next* lbn. 
821 */ 822 trunclbn = (length + blksize - 1) / blksize; 823 824 s = splbio(); 825 restart: 826 anyfreed = 1; 827 for (;anyfreed;) { 828 anyfreed = 0; 829 for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { 830 nbp = TAILQ_NEXT(bp, b_vnbufs); 831 if (bp->b_lblkno >= trunclbn) { 832 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 833 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); 834 goto restart; 835 } else { 836 bremfree(bp); 837 bp->b_flags |= (B_INVAL | B_RELBUF); 838 bp->b_flags &= ~B_ASYNC; 839 brelse(bp); 840 anyfreed = 1; 841 } 842 if (nbp && 843 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 844 (nbp->b_vp != vp) || 845 (nbp->b_flags & B_DELWRI))) { 846 goto restart; 847 } 848 } 849 } 850 851 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 852 nbp = TAILQ_NEXT(bp, b_vnbufs); 853 if (bp->b_lblkno >= trunclbn) { 854 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 855 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); 856 goto restart; 857 } else { 858 bremfree(bp); 859 bp->b_flags |= (B_INVAL | B_RELBUF); 860 bp->b_flags &= ~B_ASYNC; 861 brelse(bp); 862 anyfreed = 1; 863 } 864 if (nbp && 865 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 866 (nbp->b_vp != vp) || 867 (nbp->b_flags & B_DELWRI) == 0)) { 868 goto restart; 869 } 870 } 871 } 872 } 873 874 if (length > 0) { 875 restartsync: 876 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 877 nbp = TAILQ_NEXT(bp, b_vnbufs); 878 if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) { 879 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 880 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL); 881 goto restart; 882 } else { 883 bremfree(bp); 884 if (bp->b_vp == vp) { 885 bp->b_flags |= B_ASYNC; 886 } else { 887 bp->b_flags &= ~B_ASYNC; 888 } 889 BUF_WRITE(bp); 890 } 891 goto restartsync; 892 } 893 894 } 895 } 896 897 while (vp->v_numoutput > 0) { 898 vp->v_flag |= VBWAIT; 899 tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0); 900 } 901 902 splx(s); 903 904 vnode_pager_setsize(vp, length); 905 906 return (0); 907 } 908 909 /* 910 * Associate a buffer with a vnode. 911 */ 912 void 913 bgetvp(vp, bp) 914 register struct vnode *vp; 915 register struct buf *bp; 916 { 917 int s; 918 919 KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); 920 921 vhold(vp); 922 bp->b_vp = vp; 923 bp->b_dev = vn_todev(vp); 924 /* 925 * Insert onto list for new vnode. 926 */ 927 s = splbio(); 928 bp->b_xflags |= BX_VNCLEAN; 929 bp->b_xflags &= ~BX_VNDIRTY; 930 TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs); 931 splx(s); 932 } 933 934 /* 935 * Disassociate a buffer from a vnode. 936 */ 937 void 938 brelvp(bp) 939 register struct buf *bp; 940 { 941 struct vnode *vp; 942 struct buflists *listheadp; 943 int s; 944 945 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 946 947 /* 948 * Delete from old vnode list, if on one. 949 */ 950 vp = bp->b_vp; 951 s = splbio(); 952 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { 953 if (bp->b_xflags & BX_VNDIRTY) 954 listheadp = &vp->v_dirtyblkhd; 955 else 956 listheadp = &vp->v_cleanblkhd; 957 TAILQ_REMOVE(listheadp, bp, b_vnbufs); 958 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 959 } 960 if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) { 961 vp->v_flag &= ~VONWORKLST; 962 LIST_REMOVE(vp, v_synclist); 963 } 964 splx(s); 965 bp->b_vp = (struct vnode *) 0; 966 vdrop(vp); 967 } 968 969 /* 970 * Add an item to the syncer work queue. 
971 */ 972 static void 973 vn_syncer_add_to_worklist(struct vnode *vp, int delay) 974 { 975 int s, slot; 976 977 s = splbio(); 978 979 if (vp->v_flag & VONWORKLST) { 980 LIST_REMOVE(vp, v_synclist); 981 } 982 983 if (delay > syncer_maxdelay - 2) 984 delay = syncer_maxdelay - 2; 985 slot = (syncer_delayno + delay) & syncer_mask; 986 987 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist); 988 vp->v_flag |= VONWORKLST; 989 splx(s); 990 } 991 992 struct proc *updateproc; 993 static void sched_sync __P((void)); 994 static struct kproc_desc up_kp = { 995 "syncer", 996 sched_sync, 997 &updateproc 998 }; 999 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) 1000 1001 /* 1002 * System filesystem synchronizer daemon. 1003 */ 1004 void 1005 sched_sync(void) 1006 { 1007 struct synclist *slp; 1008 struct vnode *vp; 1009 struct mount *mp; 1010 long starttime; 1011 int s; 1012 struct proc *p = updateproc; 1013 1014 mtx_lock(&Giant); 1015 1016 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, 1017 SHUTDOWN_PRI_LAST); 1018 1019 for (;;) { 1020 kthread_suspend_check(p); 1021 1022 starttime = time_second; 1023 1024 /* 1025 * Push files whose dirty time has expired. Be careful 1026 * of interrupt race on slp queue. 1027 */ 1028 s = splbio(); 1029 slp = &syncer_workitem_pending[syncer_delayno]; 1030 syncer_delayno += 1; 1031 if (syncer_delayno == syncer_maxdelay) 1032 syncer_delayno = 0; 1033 splx(s); 1034 1035 while ((vp = LIST_FIRST(slp)) != NULL) { 1036 if (VOP_ISLOCKED(vp, NULL) == 0 && 1037 vn_start_write(vp, &mp, V_NOWAIT) == 0) { 1038 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); 1039 (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p); 1040 VOP_UNLOCK(vp, 0, p); 1041 vn_finished_write(mp); 1042 } 1043 s = splbio(); 1044 if (LIST_FIRST(slp) == vp) { 1045 /* 1046 * Note: v_tag VT_VFS vps can remain on the 1047 * worklist too with no dirty blocks, but 1048 * since sync_fsync() moves it to a different 1049 * slot we are safe. 1050 */ 1051 if (TAILQ_EMPTY(&vp->v_dirtyblkhd) && 1052 !vn_isdisk(vp, NULL)) 1053 panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag); 1054 /* 1055 * Put us back on the worklist. The worklist 1056 * routine will remove us from our current 1057 * position and then add us back in at a later 1058 * position. 1059 */ 1060 vn_syncer_add_to_worklist(vp, syncdelay); 1061 } 1062 splx(s); 1063 } 1064 1065 /* 1066 * Do soft update processing. 1067 */ 1068 #ifdef SOFTUPDATES 1069 softdep_process_worklist(NULL); 1070 #endif 1071 1072 /* 1073 * The variable rushjob allows the kernel to speed up the 1074 * processing of the filesystem syncer process. A rushjob 1075 * value of N tells the filesystem syncer to process the next 1076 * N seconds worth of work on its queue ASAP. Currently rushjob 1077 * is used by the soft update code to speed up the filesystem 1078 * syncer process when the incore state is getting so far 1079 * ahead of the disk that the kernel memory pool is being 1080 * threatened with exhaustion. 1081 */ 1082 if (rushjob > 0) { 1083 rushjob -= 1; 1084 continue; 1085 } 1086 /* 1087 * If it has taken us less than a second to process the 1088 * current work, then wait. Otherwise start right over 1089 * again. We can still lose time if any single round 1090 * takes more than two seconds, but it does not really 1091 * matter as we are just trying to generally pace the 1092 * filesystem activity. 
1093 */ 1094 if (time_second == starttime) 1095 tsleep(&lbolt, PPAUSE, "syncer", 0); 1096 } 1097 } 1098 1099 /* 1100 * Request the syncer daemon to speed up its work. 1101 * We never push it to speed up more than half of its 1102 * normal turn time, otherwise it could take over the cpu. 1103 */ 1104 int 1105 speedup_syncer() 1106 { 1107 1108 mtx_lock_spin(&sched_lock); 1109 if (updateproc->p_wchan == &lbolt) 1110 setrunnable(updateproc); 1111 mtx_unlock_spin(&sched_lock); 1112 if (rushjob < syncdelay / 2) { 1113 rushjob += 1; 1114 stat_rush_requests += 1; 1115 return (1); 1116 } 1117 return(0); 1118 } 1119 1120 /* 1121 * Associate a p-buffer with a vnode. 1122 * 1123 * Also sets B_PAGING flag to indicate that vnode is not fully associated 1124 * with the buffer. i.e. the bp has not been linked into the vnode or 1125 * ref-counted. 1126 */ 1127 void 1128 pbgetvp(vp, bp) 1129 register struct vnode *vp; 1130 register struct buf *bp; 1131 { 1132 1133 KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); 1134 1135 bp->b_vp = vp; 1136 bp->b_flags |= B_PAGING; 1137 bp->b_dev = vn_todev(vp); 1138 } 1139 1140 /* 1141 * Disassociate a p-buffer from a vnode. 1142 */ 1143 void 1144 pbrelvp(bp) 1145 register struct buf *bp; 1146 { 1147 1148 KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); 1149 1150 /* XXX REMOVE ME */ 1151 if (TAILQ_NEXT(bp, b_vnbufs) != NULL) { 1152 panic( 1153 "relpbuf(): b_vp was probably reassignbuf()d %p %x", 1154 bp, 1155 (int)bp->b_flags 1156 ); 1157 } 1158 bp->b_vp = (struct vnode *) 0; 1159 bp->b_flags &= ~B_PAGING; 1160 } 1161 1162 /* 1163 * Change the vnode a pager buffer is associated with. 1164 */ 1165 void 1166 pbreassignbuf(bp, newvp) 1167 struct buf *bp; 1168 struct vnode *newvp; 1169 { 1170 1171 KASSERT(bp->b_flags & B_PAGING, 1172 ("pbreassignbuf() on non phys bp %p", bp)); 1173 bp->b_vp = newvp; 1174 } 1175 1176 /* 1177 * Reassign a buffer from one vnode to another. 1178 * Used to assign file specific control information 1179 * (indirect blocks) to the vnode to which they belong. 1180 */ 1181 void 1182 reassignbuf(bp, newvp) 1183 register struct buf *bp; 1184 register struct vnode *newvp; 1185 { 1186 struct buflists *listheadp; 1187 int delay; 1188 int s; 1189 1190 if (newvp == NULL) { 1191 printf("reassignbuf: NULL"); 1192 return; 1193 } 1194 ++reassignbufcalls; 1195 1196 /* 1197 * B_PAGING flagged buffers cannot be reassigned because their vp 1198 * is not fully linked in. 1199 */ 1200 if (bp->b_flags & B_PAGING) 1201 panic("cannot reassign paging buffer"); 1202 1203 s = splbio(); 1204 /* 1205 * Delete from old vnode list, if on one. 1206 */ 1207 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) { 1208 if (bp->b_xflags & BX_VNDIRTY) 1209 listheadp = &bp->b_vp->v_dirtyblkhd; 1210 else 1211 listheadp = &bp->b_vp->v_cleanblkhd; 1212 TAILQ_REMOVE(listheadp, bp, b_vnbufs); 1213 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 1214 if (bp->b_vp != newvp) { 1215 vdrop(bp->b_vp); 1216 bp->b_vp = NULL; /* for clarification */ 1217 } 1218 } 1219 /* 1220 * If dirty, put on list of dirty buffers; otherwise insert onto list 1221 * of clean buffers. 
1222 */ 1223 if (bp->b_flags & B_DELWRI) { 1224 struct buf *tbp; 1225 1226 listheadp = &newvp->v_dirtyblkhd; 1227 if ((newvp->v_flag & VONWORKLST) == 0) { 1228 switch (newvp->v_type) { 1229 case VDIR: 1230 delay = dirdelay; 1231 break; 1232 case VCHR: 1233 if (newvp->v_rdev->si_mountpoint != NULL) { 1234 delay = metadelay; 1235 break; 1236 } 1237 /* fall through */ 1238 default: 1239 delay = filedelay; 1240 } 1241 vn_syncer_add_to_worklist(newvp, delay); 1242 } 1243 bp->b_xflags |= BX_VNDIRTY; 1244 tbp = TAILQ_FIRST(listheadp); 1245 if (tbp == NULL || 1246 bp->b_lblkno == 0 || 1247 (bp->b_lblkno > 0 && tbp->b_lblkno < 0) || 1248 (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) { 1249 TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); 1250 ++reassignbufsortgood; 1251 } else if (bp->b_lblkno < 0) { 1252 TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); 1253 ++reassignbufsortgood; 1254 } else if (reassignbufmethod == 1) { 1255 /* 1256 * New sorting algorithm, only handle sequential case, 1257 * otherwise append to end (but before metadata) 1258 */ 1259 if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL && 1260 (tbp->b_xflags & BX_VNDIRTY)) { 1261 /* 1262 * Found the best place to insert the buffer 1263 */ 1264 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); 1265 ++reassignbufsortgood; 1266 } else { 1267 /* 1268 * Missed, append to end, but before meta-data. 1269 * We know that the head buffer in the list is 1270 * not meta-data due to prior conditionals. 1271 * 1272 * Indirect effects: NFS second stage write 1273 * tends to wind up here, giving maximum 1274 * distance between the unstable write and the 1275 * commit rpc. 1276 */ 1277 tbp = TAILQ_LAST(listheadp, buflists); 1278 while (tbp && tbp->b_lblkno < 0) 1279 tbp = TAILQ_PREV(tbp, buflists, b_vnbufs); 1280 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); 1281 ++reassignbufsortbad; 1282 } 1283 } else { 1284 /* 1285 * Old sorting algorithm, scan queue and insert 1286 */ 1287 struct buf *ttbp; 1288 while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && 1289 (ttbp->b_lblkno < bp->b_lblkno)) { 1290 ++reassignbufloops; 1291 tbp = ttbp; 1292 } 1293 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); 1294 } 1295 } else { 1296 bp->b_xflags |= BX_VNCLEAN; 1297 TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); 1298 if ((newvp->v_flag & VONWORKLST) && 1299 TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { 1300 newvp->v_flag &= ~VONWORKLST; 1301 LIST_REMOVE(newvp, v_synclist); 1302 } 1303 } 1304 if (bp->b_vp != newvp) { 1305 bp->b_vp = newvp; 1306 vhold(bp->b_vp); 1307 } 1308 splx(s); 1309 } 1310 1311 /* 1312 * Create a vnode for a device. 1313 * Used for mounting the root file system. 1314 */ 1315 int 1316 bdevvp(dev, vpp) 1317 dev_t dev; 1318 struct vnode **vpp; 1319 { 1320 register struct vnode *vp; 1321 struct vnode *nvp; 1322 int error; 1323 1324 if (dev == NODEV) { 1325 *vpp = NULLVP; 1326 return (ENXIO); 1327 } 1328 if (vfinddev(dev, VCHR, vpp)) 1329 return (0); 1330 error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp); 1331 if (error) { 1332 *vpp = NULLVP; 1333 return (error); 1334 } 1335 vp = nvp; 1336 vp->v_type = VCHR; 1337 addalias(vp, dev); 1338 *vpp = vp; 1339 return (0); 1340 } 1341 1342 /* 1343 * Add vnode to the alias list hung off the dev_t. 1344 * 1345 * The reason for this gunk is that multiple vnodes can reference 1346 * the same physical device, so checking vp->v_usecount to see 1347 * how many users there are is inadequate; the v_usecount for 1348 * the vnodes need to be accumulated. vcount() does that. 
1349 */ 1350 struct vnode * 1351 addaliasu(nvp, nvp_rdev) 1352 struct vnode *nvp; 1353 udev_t nvp_rdev; 1354 { 1355 struct vnode *ovp; 1356 vop_t **ops; 1357 dev_t dev; 1358 1359 if (nvp->v_type == VBLK) 1360 return (nvp); 1361 if (nvp->v_type != VCHR) 1362 panic("addaliasu on non-special vnode"); 1363 dev = udev2dev(nvp_rdev, 0); 1364 /* 1365 * Check to see if we have a bdevvp vnode with no associated 1366 * filesystem. If so, we want to associate the filesystem of 1367 * the new newly instigated vnode with the bdevvp vnode and 1368 * discard the newly created vnode rather than leaving the 1369 * bdevvp vnode lying around with no associated filesystem. 1370 */ 1371 if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) { 1372 addalias(nvp, dev); 1373 return (nvp); 1374 } 1375 /* 1376 * Discard unneeded vnode, but save its node specific data. 1377 * Note that if there is a lock, it is carried over in the 1378 * node specific data to the replacement vnode. 1379 */ 1380 vref(ovp); 1381 ovp->v_data = nvp->v_data; 1382 ovp->v_tag = nvp->v_tag; 1383 nvp->v_data = NULL; 1384 lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg, 1385 nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK); 1386 if (nvp->v_vnlock) 1387 ovp->v_vnlock = &ovp->v_lock; 1388 ops = ovp->v_op; 1389 ovp->v_op = nvp->v_op; 1390 if (VOP_ISLOCKED(nvp, curproc)) { 1391 VOP_UNLOCK(nvp, 0, curproc); 1392 vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curproc); 1393 } 1394 nvp->v_op = ops; 1395 insmntque(ovp, nvp->v_mount); 1396 vrele(nvp); 1397 vgone(nvp); 1398 return (ovp); 1399 } 1400 1401 /* This is a local helper function that do the same as addaliasu, but for a 1402 * dev_t instead of an udev_t. */ 1403 static void 1404 addalias(nvp, dev) 1405 struct vnode *nvp; 1406 dev_t dev; 1407 { 1408 1409 KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode")); 1410 nvp->v_rdev = dev; 1411 mtx_lock(&spechash_mtx); 1412 SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext); 1413 mtx_unlock(&spechash_mtx); 1414 } 1415 1416 /* 1417 * Grab a particular vnode from the free list, increment its 1418 * reference count and lock it. The vnode lock bit is set if the 1419 * vnode is being eliminated in vgone. The process is awakened 1420 * when the transition is completed, and an error returned to 1421 * indicate that the vnode is no longer usable (possibly having 1422 * been changed to a new file system type). 1423 */ 1424 int 1425 vget(vp, flags, p) 1426 register struct vnode *vp; 1427 int flags; 1428 struct proc *p; 1429 { 1430 int error; 1431 1432 /* 1433 * If the vnode is in the process of being cleaned out for 1434 * another use, we wait for the cleaning to finish and then 1435 * return failure. Cleaning is determined by checking that 1436 * the VXLOCK flag is set. 1437 */ 1438 if ((flags & LK_INTERLOCK) == 0) 1439 mtx_lock(&vp->v_interlock); 1440 if (vp->v_flag & VXLOCK) { 1441 if (vp->v_vxproc == curproc) { 1442 printf("VXLOCK interlock avoided\n"); 1443 } else { 1444 vp->v_flag |= VXWANT; 1445 msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP, 1446 "vget", 0); 1447 return (ENOENT); 1448 } 1449 } 1450 1451 vp->v_usecount++; 1452 1453 if (VSHOULDBUSY(vp)) 1454 vbusy(vp); 1455 if (flags & LK_TYPE_MASK) { 1456 if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) { 1457 /* 1458 * must expand vrele here because we do not want 1459 * to call VOP_INACTIVE if the reference count 1460 * drops back to zero since it was never really 1461 * active. 
We must remove it from the free list 1462 * before sleeping so that multiple processes do 1463 * not try to recycle it. 1464 */ 1465 mtx_lock(&vp->v_interlock); 1466 vp->v_usecount--; 1467 if (VSHOULDFREE(vp)) 1468 vfree(vp); 1469 mtx_unlock(&vp->v_interlock); 1470 } 1471 return (error); 1472 } 1473 mtx_unlock(&vp->v_interlock); 1474 return (0); 1475 } 1476 1477 /* 1478 * Increase the reference count of a vnode. 1479 */ 1480 void 1481 vref(struct vnode *vp) 1482 { 1483 mtx_lock(&vp->v_interlock); 1484 vp->v_usecount++; 1485 mtx_unlock(&vp->v_interlock); 1486 } 1487 1488 /* 1489 * Vnode put/release. 1490 * If count drops to zero, call inactive routine and return to freelist. 1491 */ 1492 void 1493 vrele(vp) 1494 struct vnode *vp; 1495 { 1496 struct proc *p = curproc; /* XXX */ 1497 1498 KASSERT(vp != NULL, ("vrele: null vp")); 1499 1500 mtx_lock(&vp->v_interlock); 1501 1502 KASSERT(vp->v_writecount < vp->v_usecount, ("vrele: missed vn_close")); 1503 1504 if (vp->v_usecount > 1) { 1505 1506 vp->v_usecount--; 1507 mtx_unlock(&vp->v_interlock); 1508 1509 return; 1510 } 1511 1512 if (vp->v_usecount == 1) { 1513 1514 vp->v_usecount--; 1515 if (VSHOULDFREE(vp)) 1516 vfree(vp); 1517 /* 1518 * If we are doing a vput, the node is already locked, and we must 1519 * call VOP_INACTIVE with the node locked. So, in the case of 1520 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. 1521 */ 1522 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) { 1523 VOP_INACTIVE(vp, p); 1524 } 1525 1526 } else { 1527 #ifdef DIAGNOSTIC 1528 vprint("vrele: negative ref count", vp); 1529 mtx_unlock(&vp->v_interlock); 1530 #endif 1531 panic("vrele: negative ref cnt"); 1532 } 1533 } 1534 1535 /* 1536 * Release an already locked vnode. This give the same effects as 1537 * unlock+vrele(), but takes less time and avoids releasing and 1538 * re-aquiring the lock (as vrele() aquires the lock internally.) 1539 */ 1540 void 1541 vput(vp) 1542 struct vnode *vp; 1543 { 1544 struct proc *p = curproc; /* XXX */ 1545 1546 KASSERT(vp != NULL, ("vput: null vp")); 1547 mtx_lock(&vp->v_interlock); 1548 KASSERT(vp->v_writecount < vp->v_usecount, ("vput: missed vn_close")); 1549 1550 if (vp->v_usecount > 1) { 1551 1552 vp->v_usecount--; 1553 VOP_UNLOCK(vp, LK_INTERLOCK, p); 1554 return; 1555 1556 } 1557 1558 if (vp->v_usecount == 1) { 1559 1560 vp->v_usecount--; 1561 if (VSHOULDFREE(vp)) 1562 vfree(vp); 1563 /* 1564 * If we are doing a vput, the node is already locked, and we must 1565 * call VOP_INACTIVE with the node locked. So, in the case of 1566 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. 1567 */ 1568 mtx_unlock(&vp->v_interlock); 1569 VOP_INACTIVE(vp, p); 1570 1571 } else { 1572 #ifdef DIAGNOSTIC 1573 vprint("vput: negative ref count", vp); 1574 #endif 1575 panic("vput: negative ref cnt"); 1576 } 1577 } 1578 1579 /* 1580 * Somebody doesn't want the vnode recycled. 1581 */ 1582 void 1583 vhold(vp) 1584 register struct vnode *vp; 1585 { 1586 int s; 1587 1588 s = splbio(); 1589 vp->v_holdcnt++; 1590 if (VSHOULDBUSY(vp)) 1591 vbusy(vp); 1592 splx(s); 1593 } 1594 1595 /* 1596 * Note that there is one less who cares about this vnode. vdrop() is the 1597 * opposite of vhold(). 
1598 */ 1599 void 1600 vdrop(vp) 1601 register struct vnode *vp; 1602 { 1603 int s; 1604 1605 s = splbio(); 1606 if (vp->v_holdcnt <= 0) 1607 panic("vdrop: holdcnt"); 1608 vp->v_holdcnt--; 1609 if (VSHOULDFREE(vp)) 1610 vfree(vp); 1611 splx(s); 1612 } 1613 1614 /* 1615 * Remove any vnodes in the vnode table belonging to mount point mp. 1616 * 1617 * If MNT_NOFORCE is specified, there should not be any active ones, 1618 * return error if any are found (nb: this is a user error, not a 1619 * system error). If MNT_FORCE is specified, detach any active vnodes 1620 * that are found. 1621 */ 1622 #ifdef DIAGNOSTIC 1623 static int busyprt = 0; /* print out busy vnodes */ 1624 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); 1625 #endif 1626 1627 int 1628 vflush(mp, skipvp, flags) 1629 struct mount *mp; 1630 struct vnode *skipvp; 1631 int flags; 1632 { 1633 struct proc *p = curproc; /* XXX */ 1634 struct vnode *vp, *nvp; 1635 int busy = 0; 1636 1637 mtx_lock(&mntvnode_mtx); 1638 loop: 1639 for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) { 1640 /* 1641 * Make sure this vnode wasn't reclaimed in getnewvnode(). 1642 * Start over if it has (it won't be on the list anymore). 1643 */ 1644 if (vp->v_mount != mp) 1645 goto loop; 1646 nvp = LIST_NEXT(vp, v_mntvnodes); 1647 /* 1648 * Skip over a selected vnode. 1649 */ 1650 if (vp == skipvp) 1651 continue; 1652 1653 mtx_lock(&vp->v_interlock); 1654 /* 1655 * Skip over a vnodes marked VSYSTEM. 1656 */ 1657 if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { 1658 mtx_unlock(&vp->v_interlock); 1659 continue; 1660 } 1661 /* 1662 * If WRITECLOSE is set, only flush out regular file vnodes 1663 * open for writing. 1664 */ 1665 if ((flags & WRITECLOSE) && 1666 (vp->v_writecount == 0 || vp->v_type != VREG)) { 1667 mtx_unlock(&vp->v_interlock); 1668 continue; 1669 } 1670 1671 /* 1672 * With v_usecount == 0, all we need to do is clear out the 1673 * vnode data structures and we are done. 1674 */ 1675 if (vp->v_usecount == 0) { 1676 mtx_unlock(&mntvnode_mtx); 1677 vgonel(vp, p); 1678 mtx_lock(&mntvnode_mtx); 1679 continue; 1680 } 1681 1682 /* 1683 * If FORCECLOSE is set, forcibly close the vnode. For block 1684 * or character devices, revert to an anonymous device. For 1685 * all other files, just kill them. 1686 */ 1687 if (flags & FORCECLOSE) { 1688 mtx_unlock(&mntvnode_mtx); 1689 if (vp->v_type != VCHR) { 1690 vgonel(vp, p); 1691 } else { 1692 vclean(vp, 0, p); 1693 vp->v_op = spec_vnodeop_p; 1694 insmntque(vp, (struct mount *) 0); 1695 } 1696 mtx_lock(&mntvnode_mtx); 1697 continue; 1698 } 1699 #ifdef DIAGNOSTIC 1700 if (busyprt) 1701 vprint("vflush: busy vnode", vp); 1702 #endif 1703 mtx_unlock(&vp->v_interlock); 1704 busy++; 1705 } 1706 mtx_unlock(&mntvnode_mtx); 1707 if (busy) 1708 return (EBUSY); 1709 return (0); 1710 } 1711 1712 /* 1713 * Disassociate the underlying file system from a vnode. 1714 */ 1715 static void 1716 vclean(vp, flags, p) 1717 struct vnode *vp; 1718 int flags; 1719 struct proc *p; 1720 { 1721 int active; 1722 1723 /* 1724 * Check to see if the vnode is in use. If so we have to reference it 1725 * before we clean it out so that its count cannot fall to zero and 1726 * generate a race against ourselves to recycle it. 1727 */ 1728 if ((active = vp->v_usecount)) 1729 vp->v_usecount++; 1730 1731 /* 1732 * Prevent the vnode from being recycled or brought into use while we 1733 * clean it out. 
1734 */ 1735 if (vp->v_flag & VXLOCK) 1736 panic("vclean: deadlock"); 1737 vp->v_flag |= VXLOCK; 1738 vp->v_vxproc = curproc; 1739 /* 1740 * Even if the count is zero, the VOP_INACTIVE routine may still 1741 * have the object locked while it cleans it out. The VOP_LOCK 1742 * ensures that the VOP_INACTIVE routine is done with its work. 1743 * For active vnodes, it ensures that no other activity can 1744 * occur while the underlying object is being cleaned out. 1745 */ 1746 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p); 1747 1748 /* 1749 * Clean out any buffers associated with the vnode. 1750 * If the flush fails, just toss the buffers. 1751 */ 1752 if (flags & DOCLOSE) { 1753 if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL) 1754 (void) vn_write_suspend_wait(vp, NULL, V_WAIT); 1755 if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0) 1756 vinvalbuf(vp, 0, NOCRED, p, 0, 0); 1757 } 1758 1759 VOP_DESTROYVOBJECT(vp); 1760 1761 /* 1762 * If purging an active vnode, it must be closed and 1763 * deactivated before being reclaimed. Note that the 1764 * VOP_INACTIVE will unlock the vnode. 1765 */ 1766 if (active) { 1767 if (flags & DOCLOSE) 1768 VOP_CLOSE(vp, FNONBLOCK, NOCRED, p); 1769 VOP_INACTIVE(vp, p); 1770 } else { 1771 /* 1772 * Any other processes trying to obtain this lock must first 1773 * wait for VXLOCK to clear, then call the new lock operation. 1774 */ 1775 VOP_UNLOCK(vp, 0, p); 1776 } 1777 /* 1778 * Reclaim the vnode. 1779 */ 1780 if (VOP_RECLAIM(vp, p)) 1781 panic("vclean: cannot reclaim"); 1782 1783 if (active) { 1784 /* 1785 * Inline copy of vrele() since VOP_INACTIVE 1786 * has already been called. 1787 */ 1788 mtx_lock(&vp->v_interlock); 1789 if (--vp->v_usecount <= 0) { 1790 #ifdef DIAGNOSTIC 1791 if (vp->v_usecount < 0 || vp->v_writecount != 0) { 1792 vprint("vclean: bad ref count", vp); 1793 panic("vclean: ref cnt"); 1794 } 1795 #endif 1796 vfree(vp); 1797 } 1798 mtx_unlock(&vp->v_interlock); 1799 } 1800 1801 cache_purge(vp); 1802 vp->v_vnlock = NULL; 1803 lockdestroy(&vp->v_lock); 1804 1805 if (VSHOULDFREE(vp)) 1806 vfree(vp); 1807 1808 /* 1809 * Done with purge, notify sleepers of the grim news. 1810 */ 1811 vp->v_op = dead_vnodeop_p; 1812 vn_pollgone(vp); 1813 vp->v_tag = VT_NON; 1814 vp->v_flag &= ~VXLOCK; 1815 vp->v_vxproc = NULL; 1816 if (vp->v_flag & VXWANT) { 1817 vp->v_flag &= ~VXWANT; 1818 wakeup((caddr_t) vp); 1819 } 1820 } 1821 1822 /* 1823 * Eliminate all activity associated with the requested vnode 1824 * and with all vnodes aliased to the requested vnode. 1825 */ 1826 int 1827 vop_revoke(ap) 1828 struct vop_revoke_args /* { 1829 struct vnode *a_vp; 1830 int a_flags; 1831 } */ *ap; 1832 { 1833 struct vnode *vp, *vq; 1834 dev_t dev; 1835 1836 KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke")); 1837 1838 vp = ap->a_vp; 1839 /* 1840 * If a vgone (or vclean) is already in progress, 1841 * wait until it is done and return. 1842 */ 1843 if (vp->v_flag & VXLOCK) { 1844 vp->v_flag |= VXWANT; 1845 msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP, 1846 "vop_revokeall", 0); 1847 return (0); 1848 } 1849 dev = vp->v_rdev; 1850 for (;;) { 1851 mtx_lock(&spechash_mtx); 1852 vq = SLIST_FIRST(&dev->si_hlist); 1853 mtx_unlock(&spechash_mtx); 1854 if (!vq) 1855 break; 1856 vgone(vq); 1857 } 1858 return (0); 1859 } 1860 1861 /* 1862 * Recycle an unused vnode to the front of the free list. 1863 * Release the passed interlock if the vnode will be recycled. 
1864 */ 1865 int 1866 vrecycle(vp, inter_lkp, p) 1867 struct vnode *vp; 1868 struct mtx *inter_lkp; 1869 struct proc *p; 1870 { 1871 1872 mtx_lock(&vp->v_interlock); 1873 if (vp->v_usecount == 0) { 1874 if (inter_lkp) { 1875 mtx_unlock(inter_lkp); 1876 } 1877 vgonel(vp, p); 1878 return (1); 1879 } 1880 mtx_unlock(&vp->v_interlock); 1881 return (0); 1882 } 1883 1884 /* 1885 * Eliminate all activity associated with a vnode 1886 * in preparation for reuse. 1887 */ 1888 void 1889 vgone(vp) 1890 register struct vnode *vp; 1891 { 1892 struct proc *p = curproc; /* XXX */ 1893 1894 mtx_lock(&vp->v_interlock); 1895 vgonel(vp, p); 1896 } 1897 1898 /* 1899 * vgone, with the vp interlock held. 1900 */ 1901 void 1902 vgonel(vp, p) 1903 struct vnode *vp; 1904 struct proc *p; 1905 { 1906 int s; 1907 1908 /* 1909 * If a vgone (or vclean) is already in progress, 1910 * wait until it is done and return. 1911 */ 1912 if (vp->v_flag & VXLOCK) { 1913 vp->v_flag |= VXWANT; 1914 msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP, 1915 "vgone", 0); 1916 return; 1917 } 1918 1919 /* 1920 * Clean out the filesystem specific data. 1921 */ 1922 vclean(vp, DOCLOSE, p); 1923 mtx_lock(&vp->v_interlock); 1924 1925 /* 1926 * Delete from old mount point vnode list, if on one. 1927 */ 1928 if (vp->v_mount != NULL) 1929 insmntque(vp, (struct mount *)0); 1930 /* 1931 * If special device, remove it from special device alias list 1932 * if it is on one. 1933 */ 1934 if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) { 1935 mtx_lock(&spechash_mtx); 1936 SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext); 1937 freedev(vp->v_rdev); 1938 mtx_unlock(&spechash_mtx); 1939 vp->v_rdev = NULL; 1940 } 1941 1942 /* 1943 * If it is on the freelist and not already at the head, 1944 * move it to the head of the list. The test of the 1945 * VDOOMED flag and the reference count of zero is because 1946 * it will be removed from the free list by getnewvnode, 1947 * but will not have its reference count incremented until 1948 * after calling vgone. If the reference count were 1949 * incremented first, vgone would (incorrectly) try to 1950 * close the previous instance of the underlying object. 1951 */ 1952 if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) { 1953 s = splbio(); 1954 mtx_lock(&vnode_free_list_mtx); 1955 if (vp->v_flag & VFREE) 1956 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 1957 else 1958 freevnodes++; 1959 vp->v_flag |= VFREE; 1960 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 1961 mtx_unlock(&vnode_free_list_mtx); 1962 splx(s); 1963 } 1964 1965 vp->v_type = VBAD; 1966 mtx_unlock(&vp->v_interlock); 1967 } 1968 1969 /* 1970 * Lookup a vnode by device number. 1971 */ 1972 int 1973 vfinddev(dev, type, vpp) 1974 dev_t dev; 1975 enum vtype type; 1976 struct vnode **vpp; 1977 { 1978 struct vnode *vp; 1979 1980 mtx_lock(&spechash_mtx); 1981 SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) { 1982 if (type == vp->v_type) { 1983 *vpp = vp; 1984 mtx_unlock(&spechash_mtx); 1985 return (1); 1986 } 1987 } 1988 mtx_unlock(&spechash_mtx); 1989 return (0); 1990 } 1991 1992 /* 1993 * Calculate the total number of references to a special device. 
1994 */ 1995 int 1996 vcount(vp) 1997 struct vnode *vp; 1998 { 1999 struct vnode *vq; 2000 int count; 2001 2002 count = 0; 2003 mtx_lock(&spechash_mtx); 2004 SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext) 2005 count += vq->v_usecount; 2006 mtx_unlock(&spechash_mtx); 2007 return (count); 2008 } 2009 2010 /* 2011 * Same as above, but using the dev_t as argument 2012 */ 2013 int 2014 count_dev(dev) 2015 dev_t dev; 2016 { 2017 struct vnode *vp; 2018 2019 vp = SLIST_FIRST(&dev->si_hlist); 2020 if (vp == NULL) 2021 return (0); 2022 return(vcount(vp)); 2023 } 2024 2025 /* 2026 * Print out a description of a vnode. 2027 */ 2028 static char *typename[] = 2029 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; 2030 2031 void 2032 vprint(label, vp) 2033 char *label; 2034 struct vnode *vp; 2035 { 2036 char buf[96]; 2037 2038 if (label != NULL) 2039 printf("%s: %p: ", label, (void *)vp); 2040 else 2041 printf("%p: ", (void *)vp); 2042 printf("type %s, usecount %d, writecount %d, refcount %d,", 2043 typename[vp->v_type], vp->v_usecount, vp->v_writecount, 2044 vp->v_holdcnt); 2045 buf[0] = '\0'; 2046 if (vp->v_flag & VROOT) 2047 strcat(buf, "|VROOT"); 2048 if (vp->v_flag & VTEXT) 2049 strcat(buf, "|VTEXT"); 2050 if (vp->v_flag & VSYSTEM) 2051 strcat(buf, "|VSYSTEM"); 2052 if (vp->v_flag & VXLOCK) 2053 strcat(buf, "|VXLOCK"); 2054 if (vp->v_flag & VXWANT) 2055 strcat(buf, "|VXWANT"); 2056 if (vp->v_flag & VBWAIT) 2057 strcat(buf, "|VBWAIT"); 2058 if (vp->v_flag & VDOOMED) 2059 strcat(buf, "|VDOOMED"); 2060 if (vp->v_flag & VFREE) 2061 strcat(buf, "|VFREE"); 2062 if (vp->v_flag & VOBJBUF) 2063 strcat(buf, "|VOBJBUF"); 2064 if (buf[0] != '\0') 2065 printf(" flags (%s)", &buf[1]); 2066 if (vp->v_data == NULL) { 2067 printf("\n"); 2068 } else { 2069 printf("\n\t"); 2070 VOP_PRINT(vp); 2071 } 2072 } 2073 2074 #ifdef DDB 2075 #include <ddb/ddb.h> 2076 /* 2077 * List all of the locked vnodes in the system. 2078 * Called when debugging the kernel. 2079 */ 2080 DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) 2081 { 2082 struct proc *p = curproc; /* XXX */ 2083 struct mount *mp, *nmp; 2084 struct vnode *vp; 2085 2086 printf("Locked vnodes\n"); 2087 mtx_lock(&mountlist_mtx); 2088 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2089 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) { 2090 nmp = TAILQ_NEXT(mp, mnt_list); 2091 continue; 2092 } 2093 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 2094 if (VOP_ISLOCKED(vp, NULL)) 2095 vprint((char *)0, vp); 2096 } 2097 mtx_lock(&mountlist_mtx); 2098 nmp = TAILQ_NEXT(mp, mnt_list); 2099 vfs_unbusy(mp, p); 2100 } 2101 mtx_unlock(&mountlist_mtx); 2102 } 2103 #endif 2104 2105 /* 2106 * Top level filesystem related information gathering. 2107 */ 2108 static int sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS)); 2109 2110 static int 2111 vfs_sysctl(SYSCTL_HANDLER_ARGS) 2112 { 2113 int *name = (int *)arg1 - 1; /* XXX */ 2114 u_int namelen = arg2 + 1; /* XXX */ 2115 struct vfsconf *vfsp; 2116 2117 #if 1 || defined(COMPAT_PRELITE2) 2118 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ 2119 if (namelen == 1) 2120 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 2121 #endif 2122 2123 /* XXX the below code does not compile; vfs_sysctl does not exist. 
*/ 2124 #ifdef notyet 2125 /* all sysctl names at this level are at least name and field */ 2126 if (namelen < 2) 2127 return (ENOTDIR); /* overloaded */ 2128 if (name[0] != VFS_GENERIC) { 2129 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2130 if (vfsp->vfc_typenum == name[0]) 2131 break; 2132 if (vfsp == NULL) 2133 return (EOPNOTSUPP); 2134 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, 2135 oldp, oldlenp, newp, newlen, p)); 2136 } 2137 #endif 2138 switch (name[1]) { 2139 case VFS_MAXTYPENUM: 2140 if (namelen != 2) 2141 return (ENOTDIR); 2142 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 2143 case VFS_CONF: 2144 if (namelen != 3) 2145 return (ENOTDIR); /* overloaded */ 2146 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2147 if (vfsp->vfc_typenum == name[2]) 2148 break; 2149 if (vfsp == NULL) 2150 return (EOPNOTSUPP); 2151 return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); 2152 } 2153 return (EOPNOTSUPP); 2154 } 2155 2156 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, 2157 "Generic filesystem"); 2158 2159 #if 1 || defined(COMPAT_PRELITE2) 2160 2161 static int 2162 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 2163 { 2164 int error; 2165 struct vfsconf *vfsp; 2166 struct ovfsconf ovfs; 2167 2168 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { 2169 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 2170 strcpy(ovfs.vfc_name, vfsp->vfc_name); 2171 ovfs.vfc_index = vfsp->vfc_typenum; 2172 ovfs.vfc_refcount = vfsp->vfc_refcount; 2173 ovfs.vfc_flags = vfsp->vfc_flags; 2174 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 2175 if (error) 2176 return error; 2177 } 2178 return 0; 2179 } 2180 2181 #endif /* 1 || COMPAT_PRELITE2 */ 2182 2183 #if COMPILING_LINT 2184 #define KINFO_VNODESLOP 10 2185 /* 2186 * Dump vnode list (via sysctl). 2187 * Copyout address of vnode followed by vnode. 2188 */ 2189 /* ARGSUSED */ 2190 static int 2191 sysctl_vnode(SYSCTL_HANDLER_ARGS) 2192 { 2193 struct proc *p = curproc; /* XXX */ 2194 struct mount *mp, *nmp; 2195 struct vnode *nvp, *vp; 2196 int error; 2197 2198 #define VPTRSZ sizeof (struct vnode *) 2199 #define VNODESZ sizeof (struct vnode) 2200 2201 req->lock = 0; 2202 if (!req->oldptr) /* Make an estimate */ 2203 return (SYSCTL_OUT(req, 0, 2204 (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); 2205 2206 mtx_lock(&mountlist_mtx); 2207 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2208 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) { 2209 nmp = TAILQ_NEXT(mp, mnt_list); 2210 continue; 2211 } 2212 again: 2213 mtx_lock(&mntvnode_mtx); 2214 for (vp = LIST_FIRST(&mp->mnt_vnodelist); 2215 vp != NULL; 2216 vp = nvp) { 2217 /* 2218 * Check that the vp is still associated with 2219 * this filesystem. RACE: could have been 2220 * recycled onto the same filesystem. 2221 */ 2222 if (vp->v_mount != mp) { 2223 mtx_unlock(&mntvnode_mtx); 2224 goto again; 2225 } 2226 nvp = LIST_NEXT(vp, v_mntvnodes); 2227 mtx_unlock(&mntvnode_mtx); 2228 if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || 2229 (error = SYSCTL_OUT(req, vp, VNODESZ))) 2230 return (error); 2231 mtx_lock(&mntvnode_mtx); 2232 } 2233 mtx_unlock(&mntvnode_mtx); 2234 mtx_lock(&mountlist_mtx); 2235 nmp = TAILQ_NEXT(mp, mnt_list); 2236 vfs_unbusy(mp, p); 2237 } 2238 mtx_unlock(&mountlist_mtx); 2239 2240 return (0); 2241 } 2242 2243 /* 2244 * XXX 2245 * Exporting the vnode list on large systems causes them to crash. 2246 * Exporting the vnode list on medium systems causes sysctl to coredump. 
2247 */ 2248 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 2249 0, 0, sysctl_vnode, "S,vnode", ""); 2250 #endif 2251 2252 /* 2253 * Check to see if a filesystem is mounted on a block device. 2254 */ 2255 int 2256 vfs_mountedon(vp) 2257 struct vnode *vp; 2258 { 2259 2260 if (vp->v_rdev->si_mountpoint != NULL) 2261 return (EBUSY); 2262 return (0); 2263 } 2264 2265 /* 2266 * Unmount all filesystems. The list is traversed in reverse order 2267 * of mounting to avoid dependencies. 2268 */ 2269 void 2270 vfs_unmountall() 2271 { 2272 struct mount *mp; 2273 struct proc *p; 2274 int error; 2275 2276 if (curproc != NULL) 2277 p = curproc; 2278 else 2279 p = initproc; /* XXX XXX should this be proc0? */ 2280 /* 2281 * Since this only runs when rebooting, it is not interlocked. 2282 */ 2283 while(!TAILQ_EMPTY(&mountlist)) { 2284 mp = TAILQ_LAST(&mountlist, mntlist); 2285 error = dounmount(mp, MNT_FORCE, p); 2286 if (error) { 2287 TAILQ_REMOVE(&mountlist, mp, mnt_list); 2288 printf("unmount of %s failed (", 2289 mp->mnt_stat.f_mntonname); 2290 if (error == EBUSY) 2291 printf("BUSY)\n"); 2292 else 2293 printf("%d)\n", error); 2294 } else { 2295 /* The unmount has removed mp from the mountlist */ 2296 } 2297 } 2298 } 2299 2300 /* 2301 * Build hash lists of net addresses and hang them off the mount point. 2302 * Called by ufs_mount() to set up the lists of export addresses. 2303 */ 2304 static int 2305 vfs_hang_addrlist(mp, nep, argp) 2306 struct mount *mp; 2307 struct netexport *nep; 2308 struct export_args *argp; 2309 { 2310 register struct netcred *np; 2311 register struct radix_node_head *rnh; 2312 register int i; 2313 struct radix_node *rn; 2314 struct sockaddr *saddr, *smask = 0; 2315 struct domain *dom; 2316 int error; 2317 2318 if (argp->ex_addrlen == 0) { 2319 if (mp->mnt_flag & MNT_DEFEXPORTED) 2320 return (EPERM); 2321 np = &nep->ne_defexported; 2322 np->netc_exflags = argp->ex_flags; 2323 bzero(&np->netc_anon, sizeof(np->netc_anon)); 2324 np->netc_anon.cr_uid = argp->ex_anon.cr_uid; 2325 np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups; 2326 bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups, 2327 sizeof(np->netc_anon.cr_groups)); 2328 np->netc_anon.cr_ref = 1; 2329 mp->mnt_flag |= MNT_DEFEXPORTED; 2330 return (0); 2331 } 2332 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; 2333 np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO); 2334 saddr = (struct sockaddr *) (np + 1); 2335 if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) 2336 goto out; 2337 if (saddr->sa_len > argp->ex_addrlen) 2338 saddr->sa_len = argp->ex_addrlen; 2339 if (argp->ex_masklen) { 2340 smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); 2341 error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen); 2342 if (error) 2343 goto out; 2344 if (smask->sa_len > argp->ex_masklen) 2345 smask->sa_len = argp->ex_masklen; 2346 } 2347 i = saddr->sa_family; 2348 if ((rnh = nep->ne_rtable[i]) == 0) { 2349 /* 2350 * Seems silly to initialize every AF when most are not used, 2351 * do so on demand here 2352 */ 2353 for (dom = domains; dom; dom = dom->dom_next) 2354 if (dom->dom_family == i && dom->dom_rtattach) { 2355 dom->dom_rtattach((void **) &nep->ne_rtable[i], 2356 dom->dom_rtoffset); 2357 break; 2358 } 2359 if ((rnh = nep->ne_rtable[i]) == 0) { 2360 error = ENOBUFS; 2361 goto out; 2362 } 2363 } 2364 rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, 2365 np->netc_rnodes); 2366 if (rn == 0 || np != (struct netcred *) rn) { 
/* already exists */ 2367 error = EPERM; 2368 goto out; 2369 } 2370 np->netc_exflags = argp->ex_flags; 2371 bzero(&np->netc_anon, sizeof(np->netc_anon)); 2372 np->netc_anon.cr_uid = argp->ex_anon.cr_uid; 2373 np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups; 2374 bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups, 2375 sizeof(np->netc_anon.cr_groups)); 2376 np->netc_anon.cr_ref = 1; 2377 return (0); 2378 out: 2379 free(np, M_NETADDR); 2380 return (error); 2381 } 2382 2383 /* Helper for vfs_free_addrlist. */ 2384 /* ARGSUSED */ 2385 static int 2386 vfs_free_netcred(rn, w) 2387 struct radix_node *rn; 2388 void *w; 2389 { 2390 register struct radix_node_head *rnh = (struct radix_node_head *) w; 2391 2392 (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); 2393 free((caddr_t) rn, M_NETADDR); 2394 return (0); 2395 } 2396 2397 /* 2398 * Free the net address hash lists that are hanging off the mount points. 2399 */ 2400 static void 2401 vfs_free_addrlist(nep) 2402 struct netexport *nep; 2403 { 2404 register int i; 2405 register struct radix_node_head *rnh; 2406 2407 for (i = 0; i <= AF_MAX; i++) 2408 if ((rnh = nep->ne_rtable[i])) { 2409 (*rnh->rnh_walktree) (rnh, vfs_free_netcred, 2410 (caddr_t) rnh); 2411 free((caddr_t) rnh, M_RTABLE); 2412 nep->ne_rtable[i] = 0; 2413 } 2414 } 2415 2416 /* 2417 * High level function to manipulate export options on a mount point 2418 * and the passed in netexport. 2419 * Struct export_args *argp is the variable used to twiddle options, 2420 * the structure is described in sys/mount.h 2421 */ 2422 int 2423 vfs_export(mp, nep, argp) 2424 struct mount *mp; 2425 struct netexport *nep; 2426 struct export_args *argp; 2427 { 2428 int error; 2429 2430 if (argp->ex_flags & MNT_DELEXPORT) { 2431 if (mp->mnt_flag & MNT_EXPUBLIC) { 2432 vfs_setpublicfs(NULL, NULL, NULL); 2433 mp->mnt_flag &= ~MNT_EXPUBLIC; 2434 } 2435 vfs_free_addrlist(nep); 2436 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); 2437 } 2438 if (argp->ex_flags & MNT_EXPORTED) { 2439 if (argp->ex_flags & MNT_EXPUBLIC) { 2440 if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) 2441 return (error); 2442 mp->mnt_flag |= MNT_EXPUBLIC; 2443 } 2444 if ((error = vfs_hang_addrlist(mp, nep, argp))) 2445 return (error); 2446 mp->mnt_flag |= MNT_EXPORTED; 2447 } 2448 return (0); 2449 } 2450 2451 /* 2452 * Set the publicly exported filesystem (WebNFS). Currently, only 2453 * one public filesystem is possible in the spec (RFC 2054 and 2055) 2454 */ 2455 int 2456 vfs_setpublicfs(mp, nep, argp) 2457 struct mount *mp; 2458 struct netexport *nep; 2459 struct export_args *argp; 2460 { 2461 int error; 2462 struct vnode *rvp; 2463 char *cp; 2464 2465 /* 2466 * mp == NULL -> invalidate the current info, the FS is 2467 * no longer exported. May be called from either vfs_export 2468 * or unmount, so check if it hasn't already been done. 2469 */ 2470 if (mp == NULL) { 2471 if (nfs_pub.np_valid) { 2472 nfs_pub.np_valid = 0; 2473 if (nfs_pub.np_index != NULL) { 2474 FREE(nfs_pub.np_index, M_TEMP); 2475 nfs_pub.np_index = NULL; 2476 } 2477 } 2478 return (0); 2479 } 2480 2481 /* 2482 * Only one allowed at a time. 2483 */ 2484 if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) 2485 return (EBUSY); 2486 2487 /* 2488 * Get real filehandle for root of exported FS. 
2489 */ 2490 bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); 2491 nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; 2492 2493 if ((error = VFS_ROOT(mp, &rvp))) 2494 return (error); 2495 2496 if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) { 2497 vput(rvp); 2498 return (error); 2499 } 2500 vput(rvp); 2501 /* 2502 * If an indexfile was specified, pull it in. 2503 */ 2504 if (argp->ex_indexfile != NULL) { 2505 MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, 2506 M_WAITOK); 2507 error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, 2508 MAXNAMLEN, (size_t *)0); 2509 if (!error) { 2510 /* 2511 * Check for illegal filenames. 2512 */ 2513 for (cp = nfs_pub.np_index; *cp; cp++) { 2514 if (*cp == '/') { 2515 error = EINVAL; 2516 break; 2517 } 2518 } 2519 } 2520 if (error) { 2521 FREE(nfs_pub.np_index, M_TEMP); 2522 return (error); 2523 } 2524 } 2525 2526 nfs_pub.np_mount = mp; 2527 nfs_pub.np_valid = 1; 2528 return (0); 2529 } 2530 2531 /* 2532 * Used by the filesystems to determine if a given network address 2533 * (passed in 'nam') is present in their exports list; if so, a pointer 2534 * to struct netcred is returned so that the filesystem can examine it 2535 * for access rights (read/write/etc). 2536 */ 2537 struct netcred * 2538 vfs_export_lookup(mp, nep, nam) 2539 register struct mount *mp; 2540 struct netexport *nep; 2541 struct sockaddr *nam; 2542 { 2543 register struct netcred *np; 2544 register struct radix_node_head *rnh; 2545 struct sockaddr *saddr; 2546 2547 np = NULL; 2548 if (mp->mnt_flag & MNT_EXPORTED) { 2549 /* 2550 * Lookup in the export list first. 2551 */ 2552 if (nam != NULL) { 2553 saddr = nam; 2554 rnh = nep->ne_rtable[saddr->sa_family]; 2555 if (rnh != NULL) { 2556 np = (struct netcred *) 2557 (*rnh->rnh_matchaddr)((caddr_t)saddr, 2558 rnh); 2559 if (np && np->netc_rnodes->rn_flags & RNF_ROOT) 2560 np = NULL; 2561 } 2562 } 2563 /* 2564 * If no address match, use the default if it exists. 2565 */ 2566 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) 2567 np = &nep->ne_defexported; 2568 } 2569 return (np); 2570 } 2571 2572 /* 2573 * Perform msync on all vnodes under a mount point. 2574 * The mount point must be locked. 2575 */ 2576 void 2577 vfs_msync(struct mount *mp, int flags) { 2578 struct vnode *vp, *nvp; 2579 struct vm_object *obj; 2580 int anyio, tries; 2581 2582 tries = 5; 2583 loop: 2584 anyio = 0; 2585 for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) { 2586 2587 nvp = LIST_NEXT(vp, v_mntvnodes); 2588 2589 if (vp->v_mount != mp) { 2590 goto loop; 2591 } 2592 2593 if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */ 2594 continue; 2595 2596 if (flags != MNT_WAIT) { 2597 if (VOP_GETVOBJECT(vp, &obj) != 0 || 2598 (obj->flags & OBJ_MIGHTBEDIRTY) == 0) 2599 continue; 2600 if (VOP_ISLOCKED(vp, NULL)) 2601 continue; 2602 } 2603 2604 mtx_lock(&vp->v_interlock); 2605 if (VOP_GETVOBJECT(vp, &obj) == 0 && 2606 (obj->flags & OBJ_MIGHTBEDIRTY)) { 2607 if (!vget(vp, 2608 LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) { 2609 if (VOP_GETVOBJECT(vp, &obj) == 0) { 2610 vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); 2611 anyio = 1; 2612 } 2613 vput(vp); 2614 } 2615 } else { 2616 mtx_unlock(&vp->v_interlock); 2617 } 2618 } 2619 if (anyio && (--tries > 0)) 2620 goto loop; 2621 } 2622 2623 /* 2624 * Create the VM object needed for VMIO and mmap support. This 2625 * is done for all VREG files in the system.
Some filesystems might 2626 * want to take advantage of the additional metadata buffering 2627 * capability of the VMIO code by making the device node be VMIO mode also. 2628 * 2629 * vp must be locked when vfs_object_create is called. 2630 */ 2631 int 2632 vfs_object_create(vp, p, cred) 2633 struct vnode *vp; 2634 struct proc *p; 2635 struct ucred *cred; 2636 { 2637 return (VOP_CREATEVOBJECT(vp, cred, p)); 2638 } 2639 2640 /* 2641 * Mark a vnode as free, putting it up for recycling. 2642 */ 2643 void 2644 vfree(vp) 2645 struct vnode *vp; 2646 { 2647 int s; 2648 2649 s = splbio(); 2650 mtx_lock(&vnode_free_list_mtx); 2651 KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free")); 2652 if (vp->v_flag & VAGE) { 2653 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 2654 } else { 2655 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 2656 } 2657 freevnodes++; 2658 mtx_unlock(&vnode_free_list_mtx); 2659 vp->v_flag &= ~VAGE; 2660 vp->v_flag |= VFREE; 2661 splx(s); 2662 } 2663 2664 /* 2665 * Opposite of vfree() - mark a vnode as in use. 2666 */ 2667 void 2668 vbusy(vp) 2669 struct vnode *vp; 2670 { 2671 int s; 2672 2673 s = splbio(); 2674 mtx_lock(&vnode_free_list_mtx); 2675 KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free")); 2676 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 2677 freevnodes--; 2678 mtx_unlock(&vnode_free_list_mtx); 2679 vp->v_flag &= ~(VFREE|VAGE); 2680 splx(s); 2681 } 2682 2683 /* 2684 * Record a process's interest in events which might happen to 2685 * a vnode. Because poll uses the historic select-style interface 2686 * internally, this routine serves as both the ``check for any 2687 * pending events'' and the ``record my interest in future events'' 2688 * functions. (These are done together, while the lock is held, 2689 * to avoid race conditions.) 2690 */ 2691 int 2692 vn_pollrecord(vp, p, events) 2693 struct vnode *vp; 2694 struct proc *p; 2695 short events; 2696 { 2697 mtx_lock(&vp->v_pollinfo.vpi_lock); 2698 if (vp->v_pollinfo.vpi_revents & events) { 2699 /* 2700 * This leaves events we are not interested 2701 * in available for the other process which 2702 * presumably had requested them 2703 * (otherwise they would never have been 2704 * recorded). 2705 */ 2706 events &= vp->v_pollinfo.vpi_revents; 2707 vp->v_pollinfo.vpi_revents &= ~events; 2708 2709 mtx_unlock(&vp->v_pollinfo.vpi_lock); 2710 return events; 2711 } 2712 vp->v_pollinfo.vpi_events |= events; 2713 selrecord(p, &vp->v_pollinfo.vpi_selinfo); 2714 mtx_unlock(&vp->v_pollinfo.vpi_lock); 2715 return 0; 2716 } 2717 2718 /* 2719 * Note the occurrence of an event. If the VN_POLLEVENT macro is used, 2720 * it is possible for us to miss an event due to race conditions, but 2721 * that condition is expected to be rare, so for the moment it is the 2722 * preferred interface. 2723 */ 2724 void 2725 vn_pollevent(vp, events) 2726 struct vnode *vp; 2727 short events; 2728 { 2729 mtx_lock(&vp->v_pollinfo.vpi_lock); 2730 if (vp->v_pollinfo.vpi_events & events) { 2731 /* 2732 * We clear vpi_events so that we don't 2733 * call selwakeup() twice if two events are 2734 * posted before the polling process(es) is 2735 * awakened. This also ensures that we take at 2736 * most one selwakeup() if the polling process 2737 * is no longer interested. However, it does 2738 * mean that only one event can be noticed at 2739 * a time. (Perhaps we should only clear those 2740 * event bits which we note?) XXX 2741 */ 2742 vp->v_pollinfo.vpi_events = 0; /* &= ~events ???
*/ 2743 vp->v_pollinfo.vpi_revents |= events; 2744 selwakeup(&vp->v_pollinfo.vpi_selinfo); 2745 } 2746 mtx_unlock(&vp->v_pollinfo.vpi_lock); 2747 } 2748 2749 #define VN_KNOTE(vp, b) \ 2750 KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b)) 2751 2752 /* 2753 * Wake up anyone polling on vp because it is being revoked. 2754 * This depends on dead_poll() returning POLLHUP for correct 2755 * behavior. 2756 */ 2757 void 2758 vn_pollgone(vp) 2759 struct vnode *vp; 2760 { 2761 mtx_lock(&vp->v_pollinfo.vpi_lock); 2762 VN_KNOTE(vp, NOTE_REVOKE); 2763 if (vp->v_pollinfo.vpi_events) { 2764 vp->v_pollinfo.vpi_events = 0; 2765 selwakeup(&vp->v_pollinfo.vpi_selinfo); 2766 } 2767 mtx_unlock(&vp->v_pollinfo.vpi_lock); 2768 } 2769 2770 2771 2772 /* 2773 * Routine to create and manage a filesystem syncer vnode. 2774 */ 2775 #define sync_close ((int (*) __P((struct vop_close_args *)))nullop) 2776 static int sync_fsync __P((struct vop_fsync_args *)); 2777 static int sync_inactive __P((struct vop_inactive_args *)); 2778 static int sync_reclaim __P((struct vop_reclaim_args *)); 2779 #define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock) 2780 #define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock) 2781 static int sync_print __P((struct vop_print_args *)); 2782 #define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked) 2783 2784 static vop_t **sync_vnodeop_p; 2785 static struct vnodeopv_entry_desc sync_vnodeop_entries[] = { 2786 { &vop_default_desc, (vop_t *) vop_eopnotsupp }, 2787 { &vop_close_desc, (vop_t *) sync_close }, /* close */ 2788 { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */ 2789 { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */ 2790 { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */ 2791 { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */ 2792 { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */ 2793 { &vop_print_desc, (vop_t *) sync_print }, /* print */ 2794 { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */ 2795 { NULL, NULL } 2796 }; 2797 static struct vnodeopv_desc sync_vnodeop_opv_desc = 2798 { &sync_vnodeop_p, sync_vnodeop_entries }; 2799 2800 VNODEOP_SET(sync_vnodeop_opv_desc); 2801 2802 /* 2803 * Create a new filesystem syncer vnode for the specified mount point. 2804 */ 2805 int 2806 vfs_allocate_syncvnode(mp) 2807 struct mount *mp; 2808 { 2809 struct vnode *vp; 2810 static long start, incr, next; 2811 int error; 2812 2813 /* Allocate a new vnode */ 2814 if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { 2815 mp->mnt_syncer = NULL; 2816 return (error); 2817 } 2818 vp->v_type = VNON; 2819 /* 2820 * Place the vnode onto the syncer worklist. We attempt to 2821 * scatter them about on the list so that they will go off 2822 * at evenly distributed times even if all the filesystems 2823 * are mounted at once. 2824 */ 2825 next += incr; 2826 if (next == 0 || next > syncer_maxdelay) { 2827 start /= 2; 2828 incr /= 2; 2829 if (start == 0) { 2830 start = syncer_maxdelay / 2; 2831 incr = syncer_maxdelay; 2832 } 2833 next = start; 2834 } 2835 vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); 2836 mp->mnt_syncer = vp; 2837 return (0); 2838 } 2839 2840 /* 2841 * Do a lazy sync of the filesystem. 
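 * This is reached when the syncer daemon issues a VOP_FSYNC() against the
 * per-mount syncer vnode created by vfs_allocate_syncvnode() above.
 * Roughly (an illustrative sketch, not the exact code elsewhere in this
 * file), the caller looks like:
 *
 *	(void) VOP_FSYNC(syncvp, cred, MNT_LAZY, p);
 *
 * which is why anything other than MNT_LAZY is ignored below.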
2842 */ 2843 static int 2844 sync_fsync(ap) 2845 struct vop_fsync_args /* { 2846 struct vnode *a_vp; 2847 struct ucred *a_cred; 2848 int a_waitfor; 2849 struct proc *a_p; 2850 } */ *ap; 2851 { 2852 struct vnode *syncvp = ap->a_vp; 2853 struct mount *mp = syncvp->v_mount; 2854 struct proc *p = ap->a_p; 2855 int asyncflag; 2856 2857 /* 2858 * We only need to do something if this is a lazy evaluation. 2859 */ 2860 if (ap->a_waitfor != MNT_LAZY) 2861 return (0); 2862 2863 /* 2864 * Move ourselves to the back of the sync list. 2865 */ 2866 vn_syncer_add_to_worklist(syncvp, syncdelay); 2867 2868 /* 2869 * Walk the list of vnodes pushing all that are dirty and 2870 * not already on the sync list. 2871 */ 2872 mtx_lock(&mountlist_mtx); 2873 if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, p) != 0) { 2874 mtx_unlock(&mountlist_mtx); 2875 return (0); 2876 } 2877 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { 2878 vfs_unbusy(mp, p); 2879 return (0); 2880 } 2881 asyncflag = mp->mnt_flag & MNT_ASYNC; 2882 mp->mnt_flag &= ~MNT_ASYNC; 2883 vfs_msync(mp, MNT_NOWAIT); 2884 VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p); 2885 if (asyncflag) 2886 mp->mnt_flag |= MNT_ASYNC; 2887 vn_finished_write(mp); 2888 vfs_unbusy(mp, p); 2889 return (0); 2890 } 2891 2892 /* 2893 * The syncer vnode is no longer referenced. 2894 */ 2895 static int 2896 sync_inactive(ap) 2897 struct vop_inactive_args /* { 2898 struct vnode *a_vp; 2899 struct proc *a_p; 2900 } */ *ap; 2901 { 2902 2903 vgone(ap->a_vp); 2904 return (0); 2905 } 2906 2907 /* 2908 * The syncer vnode is no longer needed and is being decommissioned. 2909 * 2910 * Modifications to the worklist must be protected at splbio(). 2911 */ 2912 static int 2913 sync_reclaim(ap) 2914 struct vop_reclaim_args /* { 2915 struct vnode *a_vp; 2916 } */ *ap; 2917 { 2918 struct vnode *vp = ap->a_vp; 2919 int s; 2920 2921 s = splbio(); 2922 vp->v_mount->mnt_syncer = NULL; 2923 if (vp->v_flag & VONWORKLST) { 2924 LIST_REMOVE(vp, v_synclist); 2925 vp->v_flag &= ~VONWORKLST; 2926 } 2927 splx(s); 2928 2929 return (0); 2930 } 2931 2932 /* 2933 * Print out a syncer vnode. 2934 */ 2935 static int 2936 sync_print(ap) 2937 struct vop_print_args /* { 2938 struct vnode *a_vp; 2939 } */ *ap; 2940 { 2941 struct vnode *vp = ap->a_vp; 2942 2943 printf("syncer vnode"); 2944 if (vp->v_vnlock != NULL) 2945 lockmgr_printinfo(vp->v_vnlock); 2946 printf("\n"); 2947 return (0); 2948 } 2949 2950 /* 2951 * Extract the dev_t from a VCHR vnode. 2952 */ 2953 dev_t 2954 vn_todev(vp) 2955 struct vnode *vp; 2956 { 2957 if (vp->v_type != VCHR) 2958 return (NODEV); 2959 return (vp->v_rdev); 2960 } 2961 2962 /* 2963 * Check whether a vnode represents a disk device. 2964 */ 2965 int 2966 vn_isdisk(vp, errp) 2967 struct vnode *vp; 2968 int *errp; 2969 { 2970 struct cdevsw *cdevsw; 2971 2972 if (vp->v_type != VCHR) { 2973 if (errp != NULL) 2974 *errp = ENOTBLK; 2975 return (0); 2976 } 2977 if (vp->v_rdev == NULL) { 2978 if (errp != NULL) 2979 *errp = ENXIO; 2980 return (0); 2981 } 2982 cdevsw = devsw(vp->v_rdev); 2983 if (cdevsw == NULL) { 2984 if (errp != NULL) 2985 *errp = ENXIO; 2986 return (0); 2987 } 2988 if (!(cdevsw->d_flags & D_DISK)) { 2989 if (errp != NULL) 2990 *errp = ENOTBLK; 2991 return (0); 2992 } 2993 if (errp != NULL) 2994 *errp = 0; 2995 return (1); 2996 } 2997 2998 /* 2999 * Free data allocated by namei(); see namei(9) for details.
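 * A typical caller (an illustrative sketch, not code from this file,
 * assuming the NDF_ONLY_PNBUF convenience flag from <sys/namei.h>)
 * releases the pathname buffer as soon as the lookup is done and then
 * drops the vnode itself when it is finished:
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, path, p);
 *	if ((error = namei(&nd)) != 0)
 *		return (error);
 *	NDFREE(&nd, NDF_ONLY_PNBUF);
 *	...
 *	vput(nd.ni_vp);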
3000 */ 3001 void 3002 NDFREE(ndp, flags) 3003 struct nameidata *ndp; 3004 const uint flags; 3005 { 3006 if (!(flags & NDF_NO_FREE_PNBUF) && 3007 (ndp->ni_cnd.cn_flags & HASBUF)) { 3008 zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); 3009 ndp->ni_cnd.cn_flags &= ~HASBUF; 3010 } 3011 if (!(flags & NDF_NO_DVP_UNLOCK) && 3012 (ndp->ni_cnd.cn_flags & LOCKPARENT) && 3013 ndp->ni_dvp != ndp->ni_vp) 3014 VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc); 3015 if (!(flags & NDF_NO_DVP_RELE) && 3016 (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { 3017 vrele(ndp->ni_dvp); 3018 ndp->ni_dvp = NULL; 3019 } 3020 if (!(flags & NDF_NO_VP_UNLOCK) && 3021 (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) 3022 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc); 3023 if (!(flags & NDF_NO_VP_RELE) && 3024 ndp->ni_vp) { 3025 vrele(ndp->ni_vp); 3026 ndp->ni_vp = NULL; 3027 } 3028 if (!(flags & NDF_NO_STARTDIR_RELE) && 3029 (ndp->ni_cnd.cn_flags & SAVESTART)) { 3030 vrele(ndp->ni_startdir); 3031 ndp->ni_startdir = NULL; 3032 } 3033 } 3034 3035 /* 3036 * Common file system object access control check routine. Accepts a 3037 * vnode's type, "mode", uid and gid, requested access mode, credentials, 3038 * and optional call-by-reference privused argument allowing vaccess() 3039 * to indicate to the caller whether privilege was used to satisfy the 3040 * request. Returns 0 on success, or an errno on failure. 3041 */ 3042 int 3043 vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused) 3044 enum vtype type; 3045 mode_t file_mode; 3046 uid_t file_uid; 3047 gid_t file_gid; 3048 mode_t acc_mode; 3049 struct ucred *cred; 3050 int *privused; 3051 { 3052 mode_t dac_granted; 3053 #ifdef CAPABILITIES 3054 mode_t cap_granted; 3055 #endif 3056 3057 /* 3058 * Look for a normal, non-privileged way to access the file/directory 3059 * as requested. If it exists, go with that. 3060 */ 3061 3062 if (privused != NULL) 3063 *privused = 0; 3064 3065 dac_granted = 0; 3066 3067 /* Check the owner. */ 3068 if (cred->cr_uid == file_uid) { 3069 dac_granted |= VADMIN; 3070 if (file_mode & S_IXUSR) 3071 dac_granted |= VEXEC; 3072 if (file_mode & S_IRUSR) 3073 dac_granted |= VREAD; 3074 if (file_mode & S_IWUSR) 3075 dac_granted |= VWRITE; 3076 3077 if ((acc_mode & dac_granted) == acc_mode) 3078 return (0); 3079 3080 goto privcheck; 3081 } 3082 3083 /* Otherwise, check the groups (first match) */ 3084 if (groupmember(file_gid, cred)) { 3085 if (file_mode & S_IXGRP) 3086 dac_granted |= VEXEC; 3087 if (file_mode & S_IRGRP) 3088 dac_granted |= VREAD; 3089 if (file_mode & S_IWGRP) 3090 dac_granted |= VWRITE; 3091 3092 if ((acc_mode & dac_granted) == acc_mode) 3093 return (0); 3094 3095 goto privcheck; 3096 } 3097 3098 /* Otherwise, check everyone else. */ 3099 if (file_mode & S_IXOTH) 3100 dac_granted |= VEXEC; 3101 if (file_mode & S_IROTH) 3102 dac_granted |= VREAD; 3103 if (file_mode & S_IWOTH) 3104 dac_granted |= VWRITE; 3105 if ((acc_mode & dac_granted) == acc_mode) 3106 return (0); 3107 3108 privcheck: 3109 if (!suser_xxx(cred, NULL, PRISON_ROOT)) { 3110 /* XXX audit: privilege used */ 3111 if (privused != NULL) 3112 *privused = 1; 3113 return (0); 3114 } 3115 3116 #ifdef CAPABILITIES 3117 /* 3118 * Build a capability mask to determine if the set of capabilities 3119 * satisfies the requirements when combined with the granted mask 3120 * from above. 3121 * For each capability, if the capability is required, bitwise 3122 * or the request type onto the cap_granted mask. 
3123 */ 3124 cap_granted = 0; 3125 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3126 !cap_check_xxx(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) 3127 cap_granted |= VEXEC; 3128 3129 if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && 3130 !cap_check_xxx(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) 3131 cap_granted |= VREAD; 3132 3133 if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && 3134 !cap_check_xxx(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) 3135 cap_granted |= VWRITE; 3136 3137 if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && 3138 !cap_check_xxx(cred, NULL, CAP_FOWNER, PRISON_ROOT)) 3139 cap_granted |= VADMIN; 3140 3141 if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) { 3142 /* XXX audit: privilege used */ 3143 if (privused != NULL) 3144 *privused = 1; 3145 return (0); 3146 } 3147 #endif 3148 3149 return ((acc_mode & VADMIN) ? EPERM : EACCES); 3150 } 3151
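
/*
 * Example (an illustrative sketch, not part of this file): a filesystem's
 * access vop can defer the generic mode/uid/gid check to vaccess().  A
 * hypothetical "examplefs" whose private inode carries the usual ownership
 * fields might implement it roughly as:
 *
 *	static int
 *	examplefs_access(ap)
 *		struct vop_access_args *ap;
 *	{
 *		struct vnode *vp = ap->a_vp;
 *		struct examplefs_inode *ip = EXAMPLEFS_VTOI(vp);
 *
 *		return (vaccess(vp->v_type, ip->i_mode, ip->i_uid,
 *		    ip->i_gid, ap->a_mode, ap->a_cred, NULL));
 *	}
 */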