/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 * $FreeBSD$
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"
#include "opt_ffs.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/event.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/malloc.h>
#include <net/radix.h>
#include <sys/socket.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

static void	addalias __P((struct vnode *vp, dev_t nvp_rdev));
static void	insmntque __P((struct vnode *vp, struct mount *mp));
static void	vclean __P((struct vnode *vp, int flags, struct proc *p));

/*
 * Number of vnodes in existence.  Increased whenever getnewvnode()
 * allocates a new vnode, never decreased.
 */
static unsigned long	numvnodes;
SYSCTL_LONG(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

/*
 * List of vnodes that are ready for recycling.
 */
static TAILQ_HEAD(freelst, vnode) vnode_free_list;

/*
 * Minimum number of free vnodes.  If there are fewer free vnodes than
 * this, getnewvnode() will return a newly allocated vnode.
 */
static u_long wantfreevnodes = 25;
SYSCTL_LONG(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
/* Number of vnodes in the free list. */
static u_long freevnodes = 0;
SYSCTL_LONG(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
/* Number of vnode allocations. */
static u_long vnodeallocs = 0;
SYSCTL_LONG(_debug, OID_AUTO, vnodeallocs, CTLFLAG_RD, &vnodeallocs, 0, "");
/* Period of vnode recycle from namecache in vnode allocation times. */
static u_long vnoderecycleperiod = 1000;
SYSCTL_LONG(_debug, OID_AUTO, vnoderecycleperiod, CTLFLAG_RW, &vnoderecycleperiod, 0, "");
/* Minimum number of total vnodes required to invoke vnode recycle from namecache. */
static u_long vnoderecyclemintotalvn = 2000;
SYSCTL_LONG(_debug, OID_AUTO, vnoderecyclemintotalvn, CTLFLAG_RW, &vnoderecyclemintotalvn, 0, "");
/* Minimum number of free vnodes required to invoke vnode recycle from namecache. */
static u_long vnoderecycleminfreevn = 2000;
SYSCTL_LONG(_debug, OID_AUTO, vnoderecycleminfreevn, CTLFLAG_RW, &vnoderecycleminfreevn, 0, "");
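/*
 * Editorial note (sketch of how the knobs above and below interact,
 * based on the check at the end of getnewvnode()): once every
 * vnoderecycleperiod vnode allocations, and only while the system is
 * both large (numvnodes > vnoderecyclemintotalvn) and short on free
 * vnodes (freevnodes < vnoderecycleminfreevn), getnewvnode() calls
 * cache_purgeleafdirs(vnoderecyclenumber) to release namecache
 * references so that more vnodes become recyclable.
 */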
/* Number of vnodes attempted to recycle at a time. */
static u_long vnoderecyclenumber = 3000;
SYSCTL_LONG(_debug, OID_AUTO, vnoderecyclenumber, CTLFLAG_RW, &vnoderecyclenumber, 0, "");

/*
 * Various variables used for debugging the new implementation of
 * reassignbuf().
 * XXX these are probably of (very) limited utility now.
 */
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
/* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");

#ifdef ENABLE_VFS_IOOPT
/* See NOTES for a description of this setting. */
int vfs_ioopt = 0;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif

/* List of mounted filesystems. */
struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);

/* For any iteration/modification of mountlist */
struct mtx mountlist_mtx;

/* For any iteration/modification of mnt_vnodelist */
struct mtx mntvnode_mtx;

/*
 * Cache for the mount type id assigned to NFS.  This is used for
 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
 */
int	nfs_mount_type = -1;

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/* For any iteration/modification of vnode_free_list */
static struct mtx vnode_free_list_mtx;

/*
 * For any iteration/modification of dev->si_hlist (linked through
 * v_specnext)
 */
static struct mtx spechash_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static vm_zone_t vnode_zone;

/* Set to 1 to print out reclaim of active vnodes */
int	prtactive = 0;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, metadata written to
 * mounted block devices is delayed only about half the time that file
 * data is delayed.  Similarly, directory updates are more critical, so
 * they are delayed only about a third of the time that file data is
 * delayed.  Thus, there are SYNCER_MAXDELAY queues that are processed
 * round-robin at a rate of one each second (driven off the filesystem
 * syncer process).  The syncer_delayno variable indicates the next
 * queue that is to be processed.  Items that need to be processed soon
 * are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
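/*
 * Editorial example (sketch): with SYNCER_MAXDELAY == 32, hashinit()
 * in vntblinit() yields syncer_mask == 31.  If syncer_delayno is
 * currently 20, a vnode queued with a delay of 15 seconds lands in
 * slot (20 + 15) & 31 == 3, and the syncer, which advances one slot
 * per second, reaches it roughly 15 seconds later.
 */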
static int syncer_delayno = 0;
static long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;

#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
time_t syncdelay = 30;		/* max time to delay syncing data */
time_t filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
time_t dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
time_t metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");

/*
 * Number of vnodes we want to exist at any one time.  This is mostly used
 * to size hash tables in vnode-related code.  It is normally not used in
 * getnewvnode(), as wantfreevnodes is normally nonzero.
 *
 * XXX desiredvnodes is historical cruft and should not exist.
 */
int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");

static void	vfs_free_addrlist __P((struct netexport *nep));
static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
				       struct export_args *argp));

/*
 * Initialize the vnode management data structures.
 */
static void
vntblinit(void *dummy __unused)
{

	desiredvnodes = maxproc + cnt.v_page_count / 4;
	mtx_init(&mountlist_mtx, "mountlist", MTX_DEF);
	mtx_init(&mntvnode_mtx, "mntvnode", MTX_DEF);
	mtx_init(&mntid_mtx, "mntid", MTX_DEF);
	mtx_init(&spechash_mtx, "spechash", MTX_DEF);
	TAILQ_INIT(&vnode_free_list);
	mtx_init(&vnode_free_list_mtx, "vnode_free_list", MTX_DEF);
	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)


/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp, p)
	struct mount *mp;
	int flags;
	struct mtx *interlkp;
	struct proc *p;
{
	int lkflags;

	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		msleep((caddr_t)mp, interlkp, PVFS, "vfs_busy", 0);
		return (ENOENT);
	}
	lkflags = LK_SHARED | LK_NOPAUSE;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mp, p)
	struct mount *mp;
	struct proc *p;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char *fstypename;
	char *devname;
	struct mount **mpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	if (fstypename == NULL)
		return (ENODEV);
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO);
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_iosize_max = DFLTPHYS;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = 0;
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Find an appropriate filesystem to use for the root.  If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
#ifdef notdef	/* XXX JH */
int
lite2_vfs_mountroot()
{
	struct vfsconf *vfsp;
	extern int (*lite2_mountroot) __P((void));
	int error;

	if (lite2_mountroot != NULL)
		return ((*lite2_mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}
#endif

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			mtx_unlock(&mountlist_mtx);
			return (mp);
		}
	}
	mtx_unlock(&mountlist_mtx);
	return ((struct mount *) 0);
}
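/*
 * Editorial note (sketch of the encoding built by vfs_getnewfsid()
 * below): the filesystem type and a rolling 16-bit counter are packed
 * into val[0] via makeudev(), roughly
 *
 *	val[0] = makeudev(255, (mtype & 0xff) << 24 |
 *	    ((mntid_base & 0xff00) << 8) | (mntid_base & 0x00ff));
 *	val[1] = mtype;
 */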
/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators
 * only support 16-bit device numbers.  We end up with unique val[0]'s
 * for the first 2^16 calls and unique val[0]'s mod 2^16 for the first
 * 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_int16_t mntid_base;
	fsid_t tfsid;
	int mtype;

	mtx_lock(&mntid_mtx);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makeudev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if (vfs_getvfs(&tfsid) == NULL)
			break;
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	mtx_unlock(&mntid_mtx);
}

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(tsp)
	struct timespec *tsp;
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
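/*
 * Editorial overview (sketch): a vnode returned by getnewvnode() below
 * is referenced (v_usecount == 1) but not locked.  A filesystem's vget
 * routine might, roughly (hypothetical example):
 *
 *	error = getnewvnode(VT_UFS, mp, ffs_vnodeop_p, &vp);
 *	...initialize vp->v_data, vp->v_type...
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
 *
 * The caller later drops the reference with vput() (if it still holds
 * the vnode lock) or vrele() (if it has already unlocked the vnode).
 * An unreferenced vnode moves to vnode_free_list via vfree() and is
 * eventually reused here or torn down through vgone()/vclean().
 */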
/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	int s, count;
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp = NULL;
	struct mount *vnmp;
	vm_object_t object;

	/*
	 * We take the least recently used vnode from the freelist
	 * if we can get it and it has no cached pages, and no
	 * namecache entries are relative to it.
	 * Otherwise we allocate a new vnode.
	 */

	s = splbio();
	mtx_lock(&vnode_free_list_mtx);

	if (wantfreevnodes && freevnodes < wantfreevnodes) {
		vp = NULL;
	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
		/*
		 * XXX: this is only here to be backwards compatible
		 */
		vp = NULL;
	} else for (count = 0; count < freevnodes; count++) {
		vp = TAILQ_FIRST(&vnode_free_list);
		if (vp == NULL || vp->v_usecount)
			panic("getnewvnode: free vnode isn't");
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);

		/*
		 * Don't recycle if active in the namecache or
		 * if it still has cached pages or we cannot get
		 * its interlock.
		 */
		if (LIST_FIRST(&vp->v_cache_src) != NULL ||
		    (VOP_GETVOBJECT(vp, &object) == 0 &&
		     (object->resident_page_count || object->ref_count)) ||
		    !mtx_trylock(&vp->v_interlock)) {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			vp = NULL;
			continue;
		}
		/*
		 * Skip over it if its filesystem is being suspended.
		 */
		if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0)
			break;
		mtx_unlock(&vp->v_interlock);
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		vp = NULL;
	}
	if (vp) {
		vp->v_flag |= VDOOMED;
		vp->v_flag &= ~VFREE;
		freevnodes--;
		mtx_unlock(&vnode_free_list_mtx);
		cache_purge(vp);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD) {
			vgonel(vp, p);
		} else {
			mtx_unlock(&vp->v_interlock);
		}
		vn_finished_write(vnmp);

#ifdef INVARIANTS
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
			if (vp->v_writecount != 0)
				panic("Non-zero write count");
		}
#endif
		vp->v_flag = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
	} else {
		mtx_unlock(&vnode_free_list_mtx);
		vp = (struct vnode *) zalloc(vnode_zone);
		bzero((char *) vp, sizeof *vp);
		mtx_init(&vp->v_interlock, "vnode interlock", MTX_DEF);
		vp->v_dd = vp;
		mtx_init(&vp->v_pollinfo.vpi_lock, "vnode pollinfo", MTX_DEF);
		cache_purge(vp);
		LIST_INIT(&vp->v_cache_src);
		TAILQ_INIT(&vp->v_cache_dst);
		numvnodes++;
	}

	TAILQ_INIT(&vp->v_cleanblkhd);
	TAILQ_INIT(&vp->v_dirtyblkhd);
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	lockinit(&vp->v_lock, PVFS, "vnlock", 0, LK_NOPAUSE);
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;

	splx(s);

	vfs_object_create(vp, p, p->p_ucred);

	vnodeallocs++;
	if (vnodeallocs % vnoderecycleperiod == 0 &&
	    freevnodes < vnoderecycleminfreevn &&
	    vnoderecyclemintotalvn < numvnodes) {
		/* Recycle vnodes. */
		cache_purgeleafdirs(vnoderecyclenumber);
	}

	return (0);
}

/*
 * Move a vnode from one mount queue to another.
 */
static void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	mtx_lock(&mntvnode_mtx);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		mtx_unlock(&mntvnode_mtx);
		return;
	}
	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	mtx_unlock(&mntvnode_mtx);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t) &vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;

	if (flags & V_SAVE) {
		s = splbio();
		while (vp->v_numoutput) {
			vp->v_flag |= VBWAIT;
			error = tsleep((caddr_t)&vp->v_numoutput,
			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
			if (error) {
				splx(s);
				return (error);
			}
		}
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			splx(s);
			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
				return (error);
			s = splbio();
			if (vp->v_numoutput > 0 ||
			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
				panic("vinvalbuf: dirty bufs");
		}
		splx(s);
	}
	s = splbio();
	for (;;) {
		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
		if (!blist)
			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
				error = BUF_TIMELOCK(bp,
				    LK_EXCLUSIVE | LK_SLEEPFAIL,
				    "vinvalbuf", slpflag, slptimeo);
				if (error == ENOLCK)
					break;
				splx(s);
				return (error);
			}
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it.  Note that vfs_bio_awrite expects
			 * buffers to reside on a queue, while BUF_WRITE and
			 * brelse do not.
			 */
			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
			    (flags & V_SAVE)) {

				if (bp->b_vp == vp) {
					if (bp->b_flags & B_CLUSTEROK) {
						BUF_UNLOCK(bp);
						vfs_bio_awrite(bp);
					} else {
						bremfree(bp);
						bp->b_flags |= B_ASYNC;
						BUF_WRITE(bp);
					}
				} else {
					bremfree(bp);
					(void) BUF_WRITE(bp);
				}
				break;
			}
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
	}

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	mtx_lock(&vp->v_interlock);
	if (VOP_GETVOBJECT(vp, &object) == 0) {
		vm_object_page_remove(object, 0, 0,
		    (flags & V_SAVE) ? TRUE : FALSE);
	}
	mtx_unlock(&vp->v_interlock);

	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
		panic("vinvalbuf: flush failed");
	return (0);
}
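/*
 * Editorial note (sketch): callers that want file data preserved pass
 * V_SAVE; for example vclean() below does
 *
 *	vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
 *
 * which pushes dirty buffers out via VOP_FSYNC() before invalidating,
 * while a flags value of 0 discards all buffers without writing them.
 */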
/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 */
int
vtruncbuf(vp, cred, p, length, blksize)
	register struct vnode *vp;
	struct ucred *cred;
	struct proc *p;
	off_t length;
	int blksize;
{
	register struct buf *bp;
	struct buf *nbp;
	int s, anyfreed;
	int trunclbn;

	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	s = splbio();
restart:
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
				     (nbp->b_vp != vp) ||
				     (nbp->b_flags & B_DELWRI))) {
					goto restart;
				}
			}
		}

		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
				     (nbp->b_vp != vp) ||
				     (nbp->b_flags & B_DELWRI) == 0)) {
					goto restart;
				}
			}
		}
	}

	if (length > 0) {
restartsync:
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					if (bp->b_vp == vp) {
						bp->b_flags |= B_ASYNC;
					} else {
						bp->b_flags &= ~B_ASYNC;
					}
					BUF_WRITE(bp);
				}
				goto restartsync;
			}

		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
	}

	splx(s);

	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	int s;

	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));

	vhold(vp);
	bp->b_vp = vp;
	bp->b_dev = vn_todev(vp);
	/*
	 * Insert onto list for new vnode.
	 */
	s = splbio();
	bp->b_xflags |= BX_VNCLEAN;
	bp->b_xflags &= ~BX_VNDIRTY;
	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
	splx(s);
}
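/*
 * Editorial note (sketch): bgetvp() always starts a buffer on the
 * vnode's clean list (BX_VNCLEAN); a buffer migrates to the dirty list
 * only when reassignbuf() below is called for it after it has picked
 * up B_DELWRI.  brelvp() removes the buffer from whichever list it is
 * on and drops the vhold() reference taken by bgetvp().
 */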
/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	struct buflists *listheadp;
	int s;

	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	s = splbio();
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &vp->v_dirtyblkhd;
		else
			listheadp = &vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
	}
	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
	splx(s);
	bp->b_vp = (struct vnode *) 0;
	vdrop(vp);
}

/*
 * Add an item to the syncer work queue.
 */
static void
vn_syncer_add_to_worklist(struct vnode *vp, int delay)
{
	int s, slot;

	s = splbio();

	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
	}

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
	vp->v_flag |= VONWORKLST;
	splx(s);
}

struct proc *updateproc;
static void sched_sync __P((void));
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

/*
 * System filesystem synchronizer daemon.
 */
void
sched_sync(void)
{
	struct synclist *slp;
	struct vnode *vp;
	struct mount *mp;
	long starttime;
	int s;
	struct proc *p = updateproc;

	mtx_lock(&Giant);

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
	    SHUTDOWN_PRI_LAST);

	for (;;) {
		kthread_suspend_check(p);

		starttime = time_second;

		/*
		 * Push files whose dirty time has expired.  Be careful
		 * of interrupt race on slp queue.
		 */
		s = splbio();
		slp = &syncer_workitem_pending[syncer_delayno];
		syncer_delayno += 1;
		if (syncer_delayno == syncer_maxdelay)
			syncer_delayno = 0;
		splx(s);

		while ((vp = LIST_FIRST(slp)) != NULL) {
			if (VOP_ISLOCKED(vp, NULL) == 0 &&
			    vn_start_write(vp, &mp, V_NOWAIT) == 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
				(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
				VOP_UNLOCK(vp, 0, p);
				vn_finished_write(mp);
			}
			s = splbio();
			if (LIST_FIRST(slp) == vp) {
				/*
				 * Note: v_tag VT_VFS vps can remain on the
				 * worklist too with no dirty blocks, but
				 * since sync_fsync() moves it to a different
				 * slot we are safe.
				 */
				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
				    !vn_isdisk(vp, NULL))
					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
				/*
				 * Put us back on the worklist.  The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 */
				vn_syncer_add_to_worklist(vp, syncdelay);
			}
			splx(s);
		}

		/*
		 * Do soft update processing.
		 */
#ifdef SOFTUPDATES
		softdep_process_worklist(NULL);
#endif

		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process.  A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP.
		 * Currently rushjob is used by the soft update code to
		 * speed up the filesystem syncer process when the incore
		 * state is getting so far ahead of the disk that the
		 * kernel memory pool is being threatened with exhaustion.
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * If it has taken us less than a second to process the
		 * current work, then wait.  Otherwise start right over
		 * again.  We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (time_second == starttime)
			tsleep(&lbolt, PPAUSE, "syncer", 0);
	}
}

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 */
int
speedup_syncer()
{

	mtx_lock_spin(&sched_lock);
	if (updateproc->p_wchan == &lbolt)
		setrunnable(updateproc);
	mtx_unlock_spin(&sched_lock);
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		stat_rush_requests += 1;
		return (1);
	}
	return(0);
}

/*
 * Associate a p-buffer with a vnode.
 *
 * Also sets B_PAGING flag to indicate that vnode is not fully associated
 * with the buffer.  i.e. the bp has not been linked into the vnode or
 * ref-counted.
 */
void
pbgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{

	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));

	bp->b_vp = vp;
	bp->b_flags |= B_PAGING;
	bp->b_dev = vn_todev(vp);
}

/*
 * Disassociate a p-buffer from a vnode.
 */
void
pbrelvp(bp)
	register struct buf *bp;
{

	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));

	/* XXX REMOVE ME */
	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
		panic(
		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
		    bp,
		    (int)bp->b_flags
		);
	}
	bp->b_vp = (struct vnode *) 0;
	bp->b_flags &= ~B_PAGING;
}

/*
 * Change the vnode a pager buffer is associated with.
 */
void
pbreassignbuf(bp, newvp)
	struct buf *bp;
	struct vnode *newvp;
{

	KASSERT(bp->b_flags & B_PAGING,
	    ("pbreassignbuf() on non phys bp %p", bp));
	bp->b_vp = newvp;
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	struct buflists *listheadp;
	int delay;
	int s;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}
	++reassignbufcalls;

	/*
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	 */
	if (bp->b_flags & B_PAGING)
		panic("cannot reassign paging buffer");

	s = splbio();
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &bp->b_vp->v_dirtyblkhd;
		else
			listheadp = &bp->b_vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
		if (bp->b_vp != newvp) {
			vdrop(bp->b_vp);
			bp->b_vp = NULL;	/* for clarification */
		}
	}
	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI) {
		struct buf *tbp;

		listheadp = &newvp->v_dirtyblkhd;
		if ((newvp->v_flag & VONWORKLST) == 0) {
			switch (newvp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VCHR:
				if (newvp->v_rdev->si_mountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
			}
			vn_syncer_add_to_worklist(newvp, delay);
		}
		bp->b_xflags |= BX_VNDIRTY;
		tbp = TAILQ_FIRST(listheadp);
		if (tbp == NULL ||
		    bp->b_lblkno == 0 ||
		    (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (bp->b_lblkno < 0) {
			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (reassignbufmethod == 1) {
			/*
			 * New sorting algorithm, only handle sequential case,
			 * otherwise append to end (but before metadata)
			 */
			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
			    (tbp->b_xflags & BX_VNDIRTY)) {
				/*
				 * Found the best place to insert the buffer
				 */
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortgood;
			} else {
				/*
				 * Missed, append to end, but before meta-data.
				 * We know that the head buffer in the list is
				 * not meta-data due to prior conditionals.
				 *
				 * Indirect effects: NFS second stage write
				 * tends to wind up here, giving maximum
				 * distance between the unstable write and the
				 * commit rpc.
				 */
				tbp = TAILQ_LAST(listheadp, buflists);
				while (tbp && tbp->b_lblkno < 0)
					tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortbad;
			}
		} else {
			/*
			 * Old sorting algorithm, scan queue and insert
			 */
			struct buf *ttbp;
			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
			    (ttbp->b_lblkno < bp->b_lblkno)) {
				++reassignbufloops;
				tbp = ttbp;
			}
			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
		}
	} else {
		bp->b_xflags |= BX_VNCLEAN;
		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
		if ((newvp->v_flag & VONWORKLST) &&
		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	}
	if (bp->b_vp != newvp) {
		bp->b_vp = newvp;
		vhold(bp->b_vp);
	}
	splx(s);
}
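/*
 * Editorial note (sketch of the ordering reassignbuf() maintains on
 * v_dirtyblkhd): data buffers are kept roughly sorted by ascending
 * b_lblkno at the front of the list, while metadata buffers (negative
 * b_lblkno, e.g. indirect blocks) accumulate at the tail, so that
 * sequential writeback can sweep the data blocks before the metadata.
 */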
/*
 * Create a vnode for a device.
 * Used for mounting the root file system.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	if (vfinddev(dev, VCHR, vpp))
		return (0);
	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VCHR;
	addalias(vp, dev);
	*vpp = vp;
	return (0);
}

/*
 * Add vnode to the alias list hung off the dev_t.
 *
 * The reason for this gunk is that multiple vnodes can reference
 * the same physical device, so checking vp->v_usecount to see
 * how many users there are is inadequate; the v_usecount for
 * the vnodes need to be accumulated.  vcount() does that.
 */
struct vnode *
addaliasu(nvp, nvp_rdev)
	struct vnode *nvp;
	udev_t nvp_rdev;
{
	struct vnode *ovp;
	vop_t **ops;
	dev_t dev;

	if (nvp->v_type == VBLK)
		return (nvp);
	if (nvp->v_type != VCHR)
		panic("addaliasu on non-special vnode");
	dev = udev2dev(nvp_rdev, 0);
	/*
	 * Check to see if we have a bdevvp vnode with no associated
	 * filesystem.  If so, we want to associate the filesystem of
	 * the newly instantiated vnode with the bdevvp vnode and
	 * discard the newly created vnode rather than leaving the
	 * bdevvp vnode lying around with no associated filesystem.
	 */
	if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
		addalias(nvp, dev);
		return (nvp);
	}
	/*
	 * Discard unneeded vnode, but save its node specific data.
	 * Note that if there is a lock, it is carried over in the
	 * node specific data to the replacement vnode.
	 */
	vref(ovp);
	ovp->v_data = nvp->v_data;
	ovp->v_tag = nvp->v_tag;
	nvp->v_data = NULL;
	lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg,
	    nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK);
	if (nvp->v_vnlock)
		ovp->v_vnlock = &ovp->v_lock;
	ops = ovp->v_op;
	ovp->v_op = nvp->v_op;
	if (VOP_ISLOCKED(nvp, curproc)) {
		VOP_UNLOCK(nvp, 0, curproc);
		vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curproc);
	}
	nvp->v_op = ops;
	insmntque(ovp, nvp->v_mount);
	vrele(nvp);
	vgone(nvp);
	return (ovp);
}

/*
 * This is a local helper function that does the same as addaliasu, but
 * for a dev_t instead of a udev_t.
 */
static void
addalias(nvp, dev)
	struct vnode *nvp;
	dev_t dev;
{

	KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
	nvp->v_rdev = dev;
	mtx_lock(&spechash_mtx);
	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
	mtx_unlock(&spechash_mtx);
}
/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it.  The vnode lock bit is set if the
 * vnode is being eliminated in vgone.  The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0)
		mtx_lock(&vp->v_interlock);
	if (vp->v_flag & VXLOCK) {
		if (vp->v_vxproc == curproc) {
			printf("VXLOCK interlock avoided\n");
		} else {
			vp->v_flag |= VXWANT;
			msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
			    "vget", 0);
			return (ENOENT);
		}
	}

	vp->v_usecount++;

	if (VSHOULDBUSY(vp))
		vbusy(vp);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active.  We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			mtx_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (VSHOULDFREE(vp))
				vfree(vp);
			mtx_unlock(&vp->v_interlock);
		}
		return (error);
	}
	mtx_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Increase the reference count of a vnode.
 */
void
vref(struct vnode *vp)
{
	mtx_lock(&vp->v_interlock);
	vp->v_usecount++;
	mtx_unlock(&vp->v_interlock);
}

/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vrele: null vp"));

	mtx_lock(&vp->v_interlock);

	KASSERT(vp->v_writecount < vp->v_usecount, ("vrele: missed vn_close"));

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		mtx_unlock(&vp->v_interlock);

		return;
	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
		/*
		 * If we are doing a vput, the node is already locked, and we must
		 * call VOP_INACTIVE with the node locked.  So, in the case of
		 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
		 */
		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
			VOP_INACTIVE(vp, p);
		}

	} else {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
		mtx_unlock(&vp->v_interlock);
#endif
		panic("vrele: negative ref cnt");
	}
}
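/*
 * Editorial usage sketch (hypothetical caller): code that wants a
 * referenced and locked vnode typically pairs vget() with vput():
 *
 *	if ((error = vget(vp, LK_EXCLUSIVE, p)) == 0) {
 *		... use the locked vnode ...
 *		vput(vp);		(drops the lock and the reference)
 *	}
 *
 * A caller holding only a reference (no lock) uses vrele() instead,
 * which acquires the lock itself when it must call VOP_INACTIVE().
 */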
/*
 * Release an already locked vnode.  This gives the same effect as
 * unlock+vrele(), but takes less time and avoids releasing and
 * re-acquiring the lock (as vrele() acquires the lock internally).
 */
void
vput(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vput: null vp"));
	mtx_lock(&vp->v_interlock);
	KASSERT(vp->v_writecount < vp->v_usecount, ("vput: missed vn_close"));

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		VOP_UNLOCK(vp, LK_INTERLOCK, p);
		return;

	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
		/*
		 * If we are doing a vput, the node is already locked, and we must
		 * call VOP_INACTIVE with the node locked.  So, in the case of
		 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
		 */
		mtx_unlock(&vp->v_interlock);
		VOP_INACTIVE(vp, p);

	} else {
#ifdef DIAGNOSTIC
		vprint("vput: negative ref count", vp);
#endif
		panic("vput: negative ref cnt");
	}
}

/*
 * Somebody doesn't want the vnode recycled.
 */
void
vhold(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	vp->v_holdcnt++;
	if (VSHOULDBUSY(vp))
		vbusy(vp);
	splx(s);
}

/*
 * Note that there is one less who cares about this vnode.  vdrop() is the
 * opposite of vhold().
 */
void
vdrop(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	if (vp->v_holdcnt <= 0)
		panic("vdrop: holdcnt");
	vp->v_holdcnt--;
	if (VSHOULDFREE(vp))
		vfree(vp);
	splx(s);
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error).  If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	mtx_lock(&mntvnode_mtx);
loop:
	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = LIST_NEXT(vp, v_mntvnodes);
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		mtx_lock(&vp->v_interlock);
		/*
		 * Skip over vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			mtx_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file vnodes
		 * open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			mtx_unlock(&vp->v_interlock);
			continue;
		}

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			mtx_unlock(&mntvnode_mtx);
			vgonel(vp, p);
			mtx_lock(&mntvnode_mtx);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device.  For all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			mtx_unlock(&mntvnode_mtx);
			if (vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			mtx_lock(&mntvnode_mtx);
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		mtx_unlock(&vp->v_interlock);
		busy++;
	}
	mtx_unlock(&mntvnode_mtx);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 */
static void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	int active;

	/*
	 * Check to see if the vnode is in use.  If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		vp->v_usecount++;

	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	vp->v_vxproc = curproc;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out.  The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

	/*
	 * Clean out any buffers associated with the vnode.
	 * If the flush fails, just toss the buffers.
	 */
	if (flags & DOCLOSE) {
		if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
			(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
		if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
			vinvalbuf(vp, 0, NOCRED, p, 0, 0);
	}

	VOP_DESTROYVOBJECT(vp);

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");

	if (active) {
		/*
		 * Inline copy of vrele() since VOP_INACTIVE
		 * has already been called.
		 */
		mtx_lock(&vp->v_interlock);
		if (--vp->v_usecount <= 0) {
#ifdef DIAGNOSTIC
			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
				vprint("vclean: bad ref count", vp);
				panic("vclean: ref cnt");
			}
#endif
			vfree(vp);
		}
		mtx_unlock(&vp->v_interlock);
	}

	cache_purge(vp);
	vp->v_vnlock = NULL;
	lockdestroy(&vp->v_lock);

	if (VSHOULDFREE(vp))
		vfree(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vn_pollgone(vp);
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	vp->v_vxproc = NULL;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t) vp);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
vop_revoke(ap)
	struct vop_revoke_args /* {
		struct vnode *a_vp;
		int a_flags;
	} */ *ap;
{
	struct vnode *vp, *vq;
	dev_t dev;

	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));

	vp = ap->a_vp;
	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
		    "vop_revokeall", 0);
		return (0);
	}
	dev = vp->v_rdev;
	for (;;) {
		mtx_lock(&spechash_mtx);
		vq = SLIST_FIRST(&dev->si_hlist);
		mtx_unlock(&spechash_mtx);
		if (!vq)
			break;
		vgone(vq);
	}
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct mtx *inter_lkp;
	struct proc *p;
{

	mtx_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp) {
			mtx_unlock(inter_lkp);
		}
		vgonel(vp, p);
		return (1);
	}
	mtx_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	mtx_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int s;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
		    "vgone", 0);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	mtx_lock(&vp->v_interlock);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) {
		mtx_lock(&spechash_mtx);
		SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
		freedev(vp->v_rdev);
		mtx_unlock(&spechash_mtx);
		vp->v_rdev = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list.  The test of the
	 * VDOOMED flag and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone.  If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 */
	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
		s = splbio();
		mtx_lock(&vnode_free_list_mtx);
		if (vp->v_flag & VFREE)
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		else
			freevnodes++;
		vp->v_flag |= VFREE;
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		mtx_unlock(&vnode_free_list_mtx);
		splx(s);
	}

	vp->v_type = VBAD;
	mtx_unlock(&vp->v_interlock);
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	struct vnode *vp;

	mtx_lock(&spechash_mtx);
	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
		if (type == vp->v_type) {
			*vpp = vp;
			mtx_unlock(&spechash_mtx);
			return (1);
		}
	}
	mtx_unlock(&spechash_mtx);
	return (0);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
	struct vnode *vp;
{
	struct vnode *vq;
	int count;

	count = 0;
	mtx_lock(&spechash_mtx);
	SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext)
		count += vq->v_usecount;
	mtx_unlock(&spechash_mtx);
	return (count);
}

/*
 * Same as above, but using the dev_t as argument
 */
int
count_dev(dev)
	dev_t dev;
{
	struct vnode *vp;

	vp = SLIST_FIRST(&dev->si_hlist);
	if (vp == NULL)
		return (0);
	return(vcount(vp));
}

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

void
vprint(label, vp)
	char *label;
	struct vnode *vp;
{
	char buf[96];

	if (label != NULL)
		printf("%s: %p: ", label, (void *)vp);
	else
		printf("%p: ", (void *)vp);
	printf("type %s, usecount %d, writecount %d, refcount %d,",
	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VXLOCK)
		strcat(buf, "|VXLOCK");
	if (vp->v_flag & VXWANT)
		strcat(buf, "|VXWANT");
	if (vp->v_flag & VBWAIT)
		strcat(buf, "|VBWAIT");
	if (vp->v_flag & VDOOMED)
		strcat(buf, "|VDOOMED");
	if (vp->v_flag & VFREE)
		strcat(buf, "|VFREE");
	if (vp->v_flag & VOBJBUF)
		strcat(buf, "|VOBJBUF");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DDB
#include <ddb/ddb.h>
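/*
 * Editorial note: DB_SHOW_COMMAND() below hooks this routine into the
 * kernel debugger, so it is run from the ddb prompt as
 * "show lockedvnodes".
 */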
2107 */ 2108 DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) 2109 { 2110 struct proc *p = curproc; /* XXX */ 2111 struct mount *mp, *nmp; 2112 struct vnode *vp; 2113 2114 printf("Locked vnodes\n"); 2115 mtx_lock(&mountlist_mtx); 2116 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2117 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) { 2118 nmp = TAILQ_NEXT(mp, mnt_list); 2119 continue; 2120 } 2121 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 2122 if (VOP_ISLOCKED(vp, NULL)) 2123 vprint((char *)0, vp); 2124 } 2125 mtx_lock(&mountlist_mtx); 2126 nmp = TAILQ_NEXT(mp, mnt_list); 2127 vfs_unbusy(mp, p); 2128 } 2129 mtx_unlock(&mountlist_mtx); 2130 } 2131 #endif 2132 2133 /* 2134 * Top level filesystem related information gathering. 2135 */ 2136 static int sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS)); 2137 2138 static int 2139 vfs_sysctl(SYSCTL_HANDLER_ARGS) 2140 { 2141 int *name = (int *)arg1 - 1; /* XXX */ 2142 u_int namelen = arg2 + 1; /* XXX */ 2143 struct vfsconf *vfsp; 2144 2145 #if 1 || defined(COMPAT_PRELITE2) 2146 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ 2147 if (namelen == 1) 2148 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 2149 #endif 2150 2151 /* XXX the below code does not compile; vfs_sysctl does not exist. */ 2152 #ifdef notyet 2153 /* all sysctl names at this level are at least name and field */ 2154 if (namelen < 2) 2155 return (ENOTDIR); /* overloaded */ 2156 if (name[0] != VFS_GENERIC) { 2157 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2158 if (vfsp->vfc_typenum == name[0]) 2159 break; 2160 if (vfsp == NULL) 2161 return (EOPNOTSUPP); 2162 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, 2163 oldp, oldlenp, newp, newlen, p)); 2164 } 2165 #endif 2166 switch (name[1]) { 2167 case VFS_MAXTYPENUM: 2168 if (namelen != 2) 2169 return (ENOTDIR); 2170 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 2171 case VFS_CONF: 2172 if (namelen != 3) 2173 return (ENOTDIR); /* overloaded */ 2174 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2175 if (vfsp->vfc_typenum == name[2]) 2176 break; 2177 if (vfsp == NULL) 2178 return (EOPNOTSUPP); 2179 return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); 2180 } 2181 return (EOPNOTSUPP); 2182 } 2183 2184 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, 2185 "Generic filesystem"); 2186 2187 #if 1 || defined(COMPAT_PRELITE2) 2188 2189 static int 2190 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 2191 { 2192 int error; 2193 struct vfsconf *vfsp; 2194 struct ovfsconf ovfs; 2195 2196 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { 2197 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 2198 strcpy(ovfs.vfc_name, vfsp->vfc_name); 2199 ovfs.vfc_index = vfsp->vfc_typenum; 2200 ovfs.vfc_refcount = vfsp->vfc_refcount; 2201 ovfs.vfc_flags = vfsp->vfc_flags; 2202 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 2203 if (error) 2204 return error; 2205 } 2206 return 0; 2207 } 2208 2209 #endif /* 1 || COMPAT_PRELITE2 */ 2210 2211 #if COMPILING_LINT 2212 #define KINFO_VNODESLOP 10 2213 /* 2214 * Dump vnode list (via sysctl). 2215 * Copyout address of vnode followed by vnode. 
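 *
 * A userland consumer reads the opaque data back as (pointer, vnode)
 * pairs.  A minimal sketch, with error handling omitted:
 *
 *	int mib[2] = { CTL_KERN, KERN_VNODE };
 *	size_t len;
 *	char *buf;
 *
 *	sysctl(mib, 2, NULL, &len, NULL, 0);
 *	buf = malloc(len);
 *	sysctl(mib, 2, buf, &len, NULL, 0);
 *
 * after which buf holds, for each vnode, a struct vnode pointer
 * followed by the struct vnode itself.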
2216 */ 2217 /* ARGSUSED */ 2218 static int 2219 sysctl_vnode(SYSCTL_HANDLER_ARGS) 2220 { 2221 struct proc *p = curproc; /* XXX */ 2222 struct mount *mp, *nmp; 2223 struct vnode *nvp, *vp; 2224 int error; 2225 2226 #define VPTRSZ sizeof (struct vnode *) 2227 #define VNODESZ sizeof (struct vnode) 2228 2229 req->lock = 0; 2230 if (!req->oldptr) /* Make an estimate */ 2231 return (SYSCTL_OUT(req, 0, 2232 (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); 2233 2234 mtx_lock(&mountlist_mtx); 2235 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2236 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) { 2237 nmp = TAILQ_NEXT(mp, mnt_list); 2238 continue; 2239 } 2240 again: 2241 mtx_lock(&mntvnode_mtx); 2242 for (vp = LIST_FIRST(&mp->mnt_vnodelist); 2243 vp != NULL; 2244 vp = nvp) { 2245 /* 2246 * Check that the vp is still associated with 2247 * this filesystem. RACE: could have been 2248 * recycled onto the same filesystem. 2249 */ 2250 if (vp->v_mount != mp) { 2251 mtx_unlock(&mntvnode_mtx); 2252 goto again; 2253 } 2254 nvp = LIST_NEXT(vp, v_mntvnodes); 2255 mtx_unlock(&mntvnode_mtx); 2256 if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || 2257 (error = SYSCTL_OUT(req, vp, VNODESZ))) 2258 return (error); 2259 mtx_lock(&mntvnode_mtx); 2260 } 2261 mtx_unlock(&mntvnode_mtx); 2262 mtx_lock(&mountlist_mtx); 2263 nmp = TAILQ_NEXT(mp, mnt_list); 2264 vfs_unbusy(mp, p); 2265 } 2266 mtx_unlock(&mountlist_mtx); 2267 2268 return (0); 2269 } 2270 2271 /* 2272 * XXX 2273 * Exporting the vnode list on large systems causes them to crash. 2274 * Exporting the vnode list on medium systems causes sysctl to coredump. 2275 */ 2276 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 2277 0, 0, sysctl_vnode, "S,vnode", ""); 2278 #endif 2279 2280 /* 2281 * Check to see if a filesystem is mounted on a block device. 2282 */ 2283 int 2284 vfs_mountedon(vp) 2285 struct vnode *vp; 2286 { 2287 2288 if (vp->v_rdev->si_mountpoint != NULL) 2289 return (EBUSY); 2290 return (0); 2291 } 2292 2293 /* 2294 * Unmount all filesystems. The list is traversed in reverse order 2295 * of mounting to avoid dependencies. 2296 */ 2297 void 2298 vfs_unmountall() 2299 { 2300 struct mount *mp; 2301 struct proc *p; 2302 int error; 2303 2304 if (curproc != NULL) 2305 p = curproc; 2306 else 2307 p = initproc; /* XXX XXX should this be proc0? */ 2308 /* 2309 * Since this only runs when rebooting, it is not interlocked. 2310 */ 2311 while(!TAILQ_EMPTY(&mountlist)) { 2312 mp = TAILQ_LAST(&mountlist, mntlist); 2313 error = dounmount(mp, MNT_FORCE, p); 2314 if (error) { 2315 TAILQ_REMOVE(&mountlist, mp, mnt_list); 2316 printf("unmount of %s failed (", 2317 mp->mnt_stat.f_mntonname); 2318 if (error == EBUSY) 2319 printf("BUSY)\n"); 2320 else 2321 printf("%d)\n", error); 2322 } else { 2323 /* The unmount has removed mp from the mountlist */ 2324 } 2325 } 2326 } 2327 2328 /* 2329 * Build hash lists of net addresses and hang them off the mount point. 2330 * Called by ufs_mount() to set up the lists of export addresses. 
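 *
 * The caller describes one export entry in a struct export_args; a
 * sketch of the simplest case, a default entry covering all client
 * addresses (the field values are only illustrative):
 *
 *	struct export_args ea;
 *
 *	bzero(&ea, sizeof(ea));
 *	ea.ex_flags = MNT_EXPORTED | MNT_EXRDONLY;
 *	ea.ex_addrlen = 0;	(a zero length selects the default export)
 *	ea.ex_anon.cr_uid = -2;	(credentials applied to unknown clients)
 *	error = vfs_export(mp, nep, &ea);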
2331 */ 2332 static int 2333 vfs_hang_addrlist(mp, nep, argp) 2334 struct mount *mp; 2335 struct netexport *nep; 2336 struct export_args *argp; 2337 { 2338 register struct netcred *np; 2339 register struct radix_node_head *rnh; 2340 register int i; 2341 struct radix_node *rn; 2342 struct sockaddr *saddr, *smask = 0; 2343 struct domain *dom; 2344 int error; 2345 2346 if (argp->ex_addrlen == 0) { 2347 if (mp->mnt_flag & MNT_DEFEXPORTED) 2348 return (EPERM); 2349 np = &nep->ne_defexported; 2350 np->netc_exflags = argp->ex_flags; 2351 bzero(&np->netc_anon, sizeof(np->netc_anon)); 2352 np->netc_anon.cr_uid = argp->ex_anon.cr_uid; 2353 np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups; 2354 bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups, 2355 sizeof(np->netc_anon.cr_groups)); 2356 np->netc_anon.cr_ref = 1; 2357 mp->mnt_flag |= MNT_DEFEXPORTED; 2358 return (0); 2359 } 2360 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; 2361 np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO); 2362 saddr = (struct sockaddr *) (np + 1); 2363 if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) 2364 goto out; 2365 if (saddr->sa_len > argp->ex_addrlen) 2366 saddr->sa_len = argp->ex_addrlen; 2367 if (argp->ex_masklen) { 2368 smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); 2369 error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen); 2370 if (error) 2371 goto out; 2372 if (smask->sa_len > argp->ex_masklen) 2373 smask->sa_len = argp->ex_masklen; 2374 } 2375 i = saddr->sa_family; 2376 if ((rnh = nep->ne_rtable[i]) == 0) { 2377 /* 2378 * Seems silly to initialize every AF when most are not used, 2379 * do so on demand here 2380 */ 2381 for (dom = domains; dom; dom = dom->dom_next) 2382 if (dom->dom_family == i && dom->dom_rtattach) { 2383 dom->dom_rtattach((void **) &nep->ne_rtable[i], 2384 dom->dom_rtoffset); 2385 break; 2386 } 2387 if ((rnh = nep->ne_rtable[i]) == 0) { 2388 error = ENOBUFS; 2389 goto out; 2390 } 2391 } 2392 rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, 2393 np->netc_rnodes); 2394 if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ 2395 error = EPERM; 2396 goto out; 2397 } 2398 np->netc_exflags = argp->ex_flags; 2399 bzero(&np->netc_anon, sizeof(np->netc_anon)); 2400 np->netc_anon.cr_uid = argp->ex_anon.cr_uid; 2401 np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups; 2402 bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups, 2403 sizeof(np->netc_anon.cr_groups)); 2404 np->netc_anon.cr_ref = 1; 2405 return (0); 2406 out: 2407 free(np, M_NETADDR); 2408 return (error); 2409 } 2410 2411 /* Helper for vfs_free_addrlist. */ 2412 /* ARGSUSED */ 2413 static int 2414 vfs_free_netcred(rn, w) 2415 struct radix_node *rn; 2416 void *w; 2417 { 2418 register struct radix_node_head *rnh = (struct radix_node_head *) w; 2419 2420 (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); 2421 free((caddr_t) rn, M_NETADDR); 2422 return (0); 2423 } 2424 2425 /* 2426 * Free the net address hash lists that are hanging off the mount points. 
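 *
 * This is normally reached through vfs_export() below when a caller
 * clears an export with MNT_DELEXPORT, roughly:
 *
 *	argp->ex_flags = MNT_DELEXPORT;
 *	error = vfs_export(mp, nep, argp);
 *
 * which walks and frees every per-address-family radix tree built up
 * by vfs_hang_addrlist().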
2427 */ 2428 static void 2429 vfs_free_addrlist(nep) 2430 struct netexport *nep; 2431 { 2432 register int i; 2433 register struct radix_node_head *rnh; 2434 2435 for (i = 0; i <= AF_MAX; i++) 2436 if ((rnh = nep->ne_rtable[i])) { 2437 (*rnh->rnh_walktree) (rnh, vfs_free_netcred, 2438 (caddr_t) rnh); 2439 free((caddr_t) rnh, M_RTABLE); 2440 nep->ne_rtable[i] = 0; 2441 } 2442 } 2443 2444 /* 2445 * High level function to manipulate export options on a mount point 2446 * and the passed in netexport. 2447 * Struct export_args *argp is the variable used to twiddle options, 2448 * the structure is described in sys/mount.h 2449 */ 2450 int 2451 vfs_export(mp, nep, argp) 2452 struct mount *mp; 2453 struct netexport *nep; 2454 struct export_args *argp; 2455 { 2456 int error; 2457 2458 if (argp->ex_flags & MNT_DELEXPORT) { 2459 if (mp->mnt_flag & MNT_EXPUBLIC) { 2460 vfs_setpublicfs(NULL, NULL, NULL); 2461 mp->mnt_flag &= ~MNT_EXPUBLIC; 2462 } 2463 vfs_free_addrlist(nep); 2464 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); 2465 } 2466 if (argp->ex_flags & MNT_EXPORTED) { 2467 if (argp->ex_flags & MNT_EXPUBLIC) { 2468 if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) 2469 return (error); 2470 mp->mnt_flag |= MNT_EXPUBLIC; 2471 } 2472 if ((error = vfs_hang_addrlist(mp, nep, argp))) 2473 return (error); 2474 mp->mnt_flag |= MNT_EXPORTED; 2475 } 2476 return (0); 2477 } 2478 2479 /* 2480 * Set the publicly exported filesystem (WebNFS). Currently, only 2481 * one public filesystem is possible in the spec (RFC 2054 and 2055) 2482 */ 2483 int 2484 vfs_setpublicfs(mp, nep, argp) 2485 struct mount *mp; 2486 struct netexport *nep; 2487 struct export_args *argp; 2488 { 2489 int error; 2490 struct vnode *rvp; 2491 char *cp; 2492 2493 /* 2494 * mp == NULL -> invalidate the current info, the FS is 2495 * no longer exported. May be called from either vfs_export 2496 * or unmount, so check if it hasn't already been done. 2497 */ 2498 if (mp == NULL) { 2499 if (nfs_pub.np_valid) { 2500 nfs_pub.np_valid = 0; 2501 if (nfs_pub.np_index != NULL) { 2502 FREE(nfs_pub.np_index, M_TEMP); 2503 nfs_pub.np_index = NULL; 2504 } 2505 } 2506 return (0); 2507 } 2508 2509 /* 2510 * Only one allowed at a time. 2511 */ 2512 if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) 2513 return (EBUSY); 2514 2515 /* 2516 * Get real filehandle for root of exported FS. 2517 */ 2518 bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); 2519 nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; 2520 2521 if ((error = VFS_ROOT(mp, &rvp))) 2522 return (error); 2523 2524 if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) 2525 return (error); 2526 2527 vput(rvp); 2528 2529 /* 2530 * If an indexfile was specified, pull it in. 2531 */ 2532 if (argp->ex_indexfile != NULL) { 2533 MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, 2534 M_WAITOK); 2535 error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, 2536 MAXNAMLEN, (size_t *)0); 2537 if (!error) { 2538 /* 2539 * Check for illegal filenames. 
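			 * The WebNFS index is expected to be a single
			 * pathname component, so any '/' in the copied-in
			 * string is rejected with EINVAL below.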
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}

/*
 * Used by the filesystems to determine if a given network address
 * (passed in 'nam') is present in their export list; returns a pointer
 * to a struct netcred so that the filesystem can examine it for
 * access rights (read/write/etc.).
 */
struct netcred *
vfs_export_lookup(mp, nep, nam)
	register struct mount *mp;
	struct netexport *nep;
	struct sockaddr *nam;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = nam;
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
				    (*rnh->rnh_matchaddr)((caddr_t)saddr,
				    rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}

/*
 * Perform msync on all vnodes under a mount point.
 * The mount point must be locked.
 */
void
vfs_msync(struct mount *mp, int flags)
{
	struct vnode *vp, *nvp;
	struct vm_object *obj;
	int anyio, tries;

	tries = 5;
loop:
	anyio = 0;
	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) {

		nvp = LIST_NEXT(vp, v_mntvnodes);

		if (vp->v_mount != mp) {
			goto loop;
		}

		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
			continue;

		if (flags != MNT_WAIT) {
			if (VOP_GETVOBJECT(vp, &obj) != 0 ||
			    (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
				continue;
			if (VOP_ISLOCKED(vp, NULL))
				continue;
		}

		mtx_lock(&vp->v_interlock);
		if (VOP_GETVOBJECT(vp, &obj) == 0 &&
		    (obj->flags & OBJ_MIGHTBEDIRTY)) {
			if (!vget(vp,
			    LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ,
			    curproc)) {
				if (VOP_GETVOBJECT(vp, &obj) == 0) {
					vm_object_page_clean(obj, 0, 0,
					    flags == MNT_WAIT ?
					    OBJPC_SYNC : OBJPC_NOSYNC);
					anyio = 1;
				}
				vput(vp);
			}
		} else {
			mtx_unlock(&vp->v_interlock);
		}
	}
	if (anyio && (--tries > 0))
		goto loop;
}

/*
 * Create the VM object needed for VMIO and mmap support.  This
 * is done for all VREG files in the system.  Some filesystems can
 * also take advantage of the additional metadata buffering of the
 * VMIO code by putting the device node into VMIO mode.
 *
 * vp must be locked when vfs_object_create is called.
 */
int
vfs_object_create(vp, p, cred)
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
{
	return (VOP_CREATEVOBJECT(vp, cred, p));
}

/*
 * Mark a vnode as free, putting it up for recycling.
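 *
 * A vnode that carries the VAGE hint is inserted at the head of the
 * free list so that it is recycled ahead of the others; everything
 * else goes to the tail.  vbusy() below is the inverse operation and
 * may only be applied to a vnode that is actually on the free list.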
 */
void
vfree(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	mtx_lock(&vnode_free_list_mtx);
	KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
	if (vp->v_flag & VAGE) {
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	} else {
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	}
	freevnodes++;
	mtx_unlock(&vnode_free_list_mtx);
	vp->v_flag &= ~VAGE;
	vp->v_flag |= VFREE;
	splx(s);
}

/*
 * Opposite of vfree() - mark a vnode as in use.
 */
void
vbusy(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	mtx_lock(&vnode_free_list_mtx);
	KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free"));
	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	freevnodes--;
	mtx_unlock(&vnode_free_list_mtx);
	vp->v_flag &= ~(VFREE|VAGE);
	splx(s);
}

/*
 * Record a process's interest in events which might happen to
 * a vnode.  Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(vp, p, events)
	struct vnode *vp;
	struct proc *p;
	short events;
{
	mtx_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo.vpi_revents;
		vp->v_pollinfo.vpi_revents &= ~events;

		mtx_unlock(&vp->v_pollinfo.vpi_lock);
		return events;
	}
	vp->v_pollinfo.vpi_events |= events;
	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
	mtx_unlock(&vp->v_pollinfo.vpi_lock);
	return 0;
}

/*
 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
 * it is possible for us to miss an event due to race conditions, but
 * that condition is expected to be rare, so for the moment it is the
 * preferred interface.
 */
void
vn_pollevent(vp, events)
	struct vnode *vp;
	short events;
{
	mtx_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events & events) {
		/*
		 * We clear vpi_events so that we don't
		 * call selwakeup() twice if two events are
		 * posted before the polling process(es) is
		 * awakened.  This also ensures that we take at
		 * most one selwakeup() if the polling process
		 * is no longer interested.  However, it does
		 * mean that only one event can be noticed at
		 * a time.  (Perhaps we should only clear those
		 * event bits which we note?) XXX
		 */
		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
		vp->v_pollinfo.vpi_revents |= events;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	mtx_unlock(&vp->v_pollinfo.vpi_lock);
}

#define VN_KNOTE(vp, b) \
	KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b))

/*
 * Wake up anyone polling on vp because it is being revoked.
 * This depends on dead_poll() returning POLLHUP for correct
 * behavior.
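 *
 * This is called from the vclean()/vgone() teardown path, after the
 * vnode has been switched to the dead vnode operations.  For the other
 * half of the machinery, a filesystem's VOP_POLL implementation would
 * typically look roughly like this (an illustrative sketch, not any
 * particular filesystem):
 *
 *	static int
 *	fs_poll(ap)
 *		struct vop_poll_args *ap;
 *	{
 *		return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events));
 *	}
 *
 * with the filesystem invoking the VN_POLLEVENT() macro (which in turn
 * calls vn_pollevent() above) whenever the vnode changes state.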
2784 */ 2785 void 2786 vn_pollgone(vp) 2787 struct vnode *vp; 2788 { 2789 mtx_lock(&vp->v_pollinfo.vpi_lock); 2790 VN_KNOTE(vp, NOTE_REVOKE); 2791 if (vp->v_pollinfo.vpi_events) { 2792 vp->v_pollinfo.vpi_events = 0; 2793 selwakeup(&vp->v_pollinfo.vpi_selinfo); 2794 } 2795 mtx_unlock(&vp->v_pollinfo.vpi_lock); 2796 } 2797 2798 2799 2800 /* 2801 * Routine to create and manage a filesystem syncer vnode. 2802 */ 2803 #define sync_close ((int (*) __P((struct vop_close_args *)))nullop) 2804 static int sync_fsync __P((struct vop_fsync_args *)); 2805 static int sync_inactive __P((struct vop_inactive_args *)); 2806 static int sync_reclaim __P((struct vop_reclaim_args *)); 2807 #define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock) 2808 #define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock) 2809 static int sync_print __P((struct vop_print_args *)); 2810 #define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked) 2811 2812 static vop_t **sync_vnodeop_p; 2813 static struct vnodeopv_entry_desc sync_vnodeop_entries[] = { 2814 { &vop_default_desc, (vop_t *) vop_eopnotsupp }, 2815 { &vop_close_desc, (vop_t *) sync_close }, /* close */ 2816 { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */ 2817 { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */ 2818 { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */ 2819 { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */ 2820 { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */ 2821 { &vop_print_desc, (vop_t *) sync_print }, /* print */ 2822 { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */ 2823 { NULL, NULL } 2824 }; 2825 static struct vnodeopv_desc sync_vnodeop_opv_desc = 2826 { &sync_vnodeop_p, sync_vnodeop_entries }; 2827 2828 VNODEOP_SET(sync_vnodeop_opv_desc); 2829 2830 /* 2831 * Create a new filesystem syncer vnode for the specified mount point. 2832 */ 2833 int 2834 vfs_allocate_syncvnode(mp) 2835 struct mount *mp; 2836 { 2837 struct vnode *vp; 2838 static long start, incr, next; 2839 int error; 2840 2841 /* Allocate a new vnode */ 2842 if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { 2843 mp->mnt_syncer = NULL; 2844 return (error); 2845 } 2846 vp->v_type = VNON; 2847 /* 2848 * Place the vnode onto the syncer worklist. We attempt to 2849 * scatter them about on the list so that they will go off 2850 * at evenly distributed times even if all the filesystems 2851 * are mounted at once. 2852 */ 2853 next += incr; 2854 if (next == 0 || next > syncer_maxdelay) { 2855 start /= 2; 2856 incr /= 2; 2857 if (start == 0) { 2858 start = syncer_maxdelay / 2; 2859 incr = syncer_maxdelay; 2860 } 2861 next = start; 2862 } 2863 vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); 2864 mp->mnt_syncer = vp; 2865 return (0); 2866 } 2867 2868 /* 2869 * Do a lazy sync of the filesystem. 2870 */ 2871 static int 2872 sync_fsync(ap) 2873 struct vop_fsync_args /* { 2874 struct vnode *a_vp; 2875 struct ucred *a_cred; 2876 int a_waitfor; 2877 struct proc *a_p; 2878 } */ *ap; 2879 { 2880 struct vnode *syncvp = ap->a_vp; 2881 struct mount *mp = syncvp->v_mount; 2882 struct proc *p = ap->a_p; 2883 int asyncflag; 2884 2885 /* 2886 * We only need to do something if this is a lazy evaluation. 2887 */ 2888 if (ap->a_waitfor != MNT_LAZY) 2889 return (0); 2890 2891 /* 2892 * Move ourselves to the back of the sync list. 
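	 * Requeueing with the full syncdelay before doing any work means
	 * the syncer vnode will not come up for consideration again until
	 * a whole syncer period has passed, regardless of how long the
	 * sync below takes.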
	 */
	vn_syncer_add_to_worklist(syncvp, syncdelay);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 */
	mtx_lock(&mountlist_mtx);
	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, p) != 0) {
		mtx_unlock(&mountlist_mtx);
		return (0);
	}
	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
		vfs_unbusy(mp, p);
		return (0);
	}
	asyncflag = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	vfs_msync(mp, MNT_NOWAIT);
	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
	if (asyncflag)
		mp->mnt_flag |= MNT_ASYNC;
	vn_finished_write(mp);
	vfs_unbusy(mp, p);
	return (0);
}

/*
 * The syncer vnode is no longer referenced.
 */
static int
sync_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct proc *a_p;
	} */ *ap;
{

	vgone(ap->a_vp);
	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 *
 * Modifications to the worklist must be protected at splbio().
 */
static int
sync_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	int s;

	s = splbio();
	vp->v_mount->mnt_syncer = NULL;
	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
		vp->v_flag &= ~VONWORKLST;
	}
	splx(s);

	return (0);
}

/*
 * Print out a syncer vnode.
 */
static int
sync_print(ap)
	struct vop_print_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;

	printf("syncer vnode");
	if (vp->v_vnlock != NULL)
		lockmgr_printinfo(vp->v_vnlock);
	printf("\n");
	return (0);
}

/*
 * Extract the dev_t from a VCHR vnode.
 */
dev_t
vn_todev(vp)
	struct vnode *vp;
{
	if (vp->v_type != VCHR)
		return (NODEV);
	return (vp->v_rdev);
}

/*
 * Check if a vnode represents a disk device.
 */
int
vn_isdisk(vp, errp)
	struct vnode *vp;
	int *errp;
{
	struct cdevsw *cdevsw;

	if (vp->v_type != VCHR) {
		if (errp != NULL)
			*errp = ENOTBLK;
		return (0);
	}
	if (vp->v_rdev == NULL) {
		if (errp != NULL)
			*errp = ENXIO;
		return (0);
	}
	cdevsw = devsw(vp->v_rdev);
	if (cdevsw == NULL) {
		if (errp != NULL)
			*errp = ENXIO;
		return (0);
	}
	if (!(cdevsw->d_flags & D_DISK)) {
		if (errp != NULL)
			*errp = ENOTBLK;
		return (0);
	}
	if (errp != NULL)
		*errp = 0;
	return (1);
}

/*
 * Free data allocated by namei(); see namei(9) for details.
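 *
 * A typical caller pattern (an illustrative sketch; "nd" is a local
 * struct nameidata and "path" a user-space pathname):
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, path, p);
 *	if ((error = namei(&nd)) != 0)
 *		return (error);
 *	vp = nd.ni_vp;
 *	NDFREE(&nd, NDF_NO_VP_UNLOCK | NDF_NO_VP_RELE);
 *	...operate on the locked, referenced vp...
 *	vput(vp);
 *
 * Passing 0 for the flags releases and unlocks everything namei()
 * left behind.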
3028 */ 3029 void 3030 NDFREE(ndp, flags) 3031 struct nameidata *ndp; 3032 const uint flags; 3033 { 3034 if (!(flags & NDF_NO_FREE_PNBUF) && 3035 (ndp->ni_cnd.cn_flags & HASBUF)) { 3036 zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); 3037 ndp->ni_cnd.cn_flags &= ~HASBUF; 3038 } 3039 if (!(flags & NDF_NO_DVP_UNLOCK) && 3040 (ndp->ni_cnd.cn_flags & LOCKPARENT) && 3041 ndp->ni_dvp != ndp->ni_vp) 3042 VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc); 3043 if (!(flags & NDF_NO_DVP_RELE) && 3044 (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { 3045 vrele(ndp->ni_dvp); 3046 ndp->ni_dvp = NULL; 3047 } 3048 if (!(flags & NDF_NO_VP_UNLOCK) && 3049 (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) 3050 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc); 3051 if (!(flags & NDF_NO_VP_RELE) && 3052 ndp->ni_vp) { 3053 vrele(ndp->ni_vp); 3054 ndp->ni_vp = NULL; 3055 } 3056 if (!(flags & NDF_NO_STARTDIR_RELE) && 3057 (ndp->ni_cnd.cn_flags & SAVESTART)) { 3058 vrele(ndp->ni_startdir); 3059 ndp->ni_startdir = NULL; 3060 } 3061 } 3062 3063 /* 3064 * Common file system object access control check routine. Accepts a 3065 * vnode's type, "mode", uid and gid, requested access mode, credentials, 3066 * and optional call-by-reference privused argument allowing vaccess() 3067 * to indicate to the caller whether privilege was used to satisfy the 3068 * request. Returns 0 on success, or an errno on failure. 3069 */ 3070 int 3071 vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused) 3072 enum vtype type; 3073 mode_t file_mode; 3074 uid_t file_uid; 3075 gid_t file_gid; 3076 mode_t acc_mode; 3077 struct ucred *cred; 3078 int *privused; 3079 { 3080 mode_t dac_granted; 3081 #ifdef CAPABILITIES 3082 mode_t cap_granted; 3083 #endif 3084 3085 /* 3086 * Look for a normal, non-privileged way to access the file/directory 3087 * as requested. If it exists, go with that. 3088 */ 3089 3090 if (privused != NULL) 3091 *privused = 0; 3092 3093 dac_granted = 0; 3094 3095 /* Check the owner. */ 3096 if (cred->cr_uid == file_uid) { 3097 dac_granted |= VADMIN; 3098 if (file_mode & S_IXUSR) 3099 dac_granted |= VEXEC; 3100 if (file_mode & S_IRUSR) 3101 dac_granted |= VREAD; 3102 if (file_mode & S_IWUSR) 3103 dac_granted |= VWRITE; 3104 3105 if ((acc_mode & dac_granted) == acc_mode) 3106 return (0); 3107 3108 goto privcheck; 3109 } 3110 3111 /* Otherwise, check the groups (first match) */ 3112 if (groupmember(file_gid, cred)) { 3113 if (file_mode & S_IXGRP) 3114 dac_granted |= VEXEC; 3115 if (file_mode & S_IRGRP) 3116 dac_granted |= VREAD; 3117 if (file_mode & S_IWGRP) 3118 dac_granted |= VWRITE; 3119 3120 if ((acc_mode & dac_granted) == acc_mode) 3121 return (0); 3122 3123 goto privcheck; 3124 } 3125 3126 /* Otherwise, check everyone else. */ 3127 if (file_mode & S_IXOTH) 3128 dac_granted |= VEXEC; 3129 if (file_mode & S_IROTH) 3130 dac_granted |= VREAD; 3131 if (file_mode & S_IWOTH) 3132 dac_granted |= VWRITE; 3133 if ((acc_mode & dac_granted) == acc_mode) 3134 return (0); 3135 3136 privcheck: 3137 if (!suser_xxx(cred, NULL, PRISON_ROOT)) { 3138 /* XXX audit: privilege used */ 3139 if (privused != NULL) 3140 *privused = 1; 3141 return (0); 3142 } 3143 3144 #ifdef CAPABILITIES 3145 /* 3146 * Build a capability mask to determine if the set of capabilities 3147 * satisfies the requirements when combined with the granted mask 3148 * from above. 3149 * For each capability, if the capability is required, bitwise 3150 * or the request type onto the cap_granted mask. 
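	 *
	 * For example, a process requesting VREAD|VWRITE whose
	 * discretionary bits only granted VREAD will still succeed here
	 * if cap_check_xxx() approves CAP_DAC_WRITE, because the final
	 * test accepts any mix of dac_granted and cap_granted bits that
	 * covers acc_mode.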
3151 */ 3152 cap_granted = 0; 3153 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3154 !cap_check_xxx(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) 3155 cap_granted |= VEXEC; 3156 3157 if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && 3158 !cap_check_xxx(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) 3159 cap_granted |= VREAD; 3160 3161 if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && 3162 !cap_check_xxx(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) 3163 cap_granted |= VWRITE; 3164 3165 if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && 3166 !cap_check_xxx(cred, NULL, CAP_FOWNER, PRISON_ROOT)) 3167 cap_granted |= VADMIN; 3168 3169 if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) { 3170 /* XXX audit: privilege used */ 3171 if (privused != NULL) 3172 *privused = 1; 3173 return (0); 3174 } 3175 #endif 3176 3177 return ((acc_mode & VADMIN) ? EPERM : EACCES); 3178 } 3179
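
/*
 * Example: how a filesystem's VOP_ACCESS routine would typically defer
 * to vaccess() above.  This is an illustrative sketch only; "inode" and
 * VTOI() stand in for whatever per-filesystem metadata holds the file's
 * mode, owner and group, and are not defined in this file.
 *
 *	static int
 *	fs_access(ap)
 *		struct vop_access_args *ap;
 *	{
 *		struct inode *ip = VTOI(ap->a_vp);
 *
 *		return (vaccess(ap->a_vp->v_type, ip->i_mode, ip->i_uid,
 *		    ip->i_gid, ap->a_mode, ap->a_cred, NULL));
 *	}
 */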