/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 * $FreeBSD$
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"
#include "opt_ffs.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

static void	addalias __P((struct vnode *vp, dev_t nvp_rdev));
static void	insmntque __P((struct vnode *vp, struct mount *mp));
static void	vclean __P((struct vnode *vp, int flags, struct proc *p));

/*
 * Number of vnodes in existence.  Increased whenever getnewvnode()
 * allocates a new vnode, never decreased.
 */
static unsigned long numvnodes;
SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

/*
 * List of vnodes that are ready for recycling.
 */
static TAILQ_HEAD(freelst, vnode) vnode_free_list;

/*
 * Minimum number of free vnodes.  If there are fewer free vnodes than
 * this, getnewvnode() will return a newly allocated vnode.
 */
static u_long wantfreevnodes = 25;
SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
/* Number of vnodes in the free list. */
static u_long freevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");

/*
 * Various variables used for debugging the new implementation of
 * reassignbuf().
 * XXX these are probably of (very) limited utility now.
 */
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
/* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");

#ifdef ENABLE_VFS_IOOPT
/* See NOTES for a description of this setting. */
int vfs_ioopt = 0;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif

/* List of mounted filesystems. */
struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);

/* For any iteration/modification of mountlist */
struct mtx mountlist_mtx;

/* For any iteration/modification of mnt_vnodelist */
struct simplelock mntvnode_slock;
/*
 * Cache for the mount type id assigned to NFS.  This is used for
 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
 */
int nfs_mount_type = -1;

#ifndef NULL_SIMPLELOCKS
/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct simplelock mntid_slock;

/* For any iteration/modification of vnode_free_list */
static struct simplelock vnode_free_list_slock;

/*
 * For any iteration/modification of dev->si_hlist (linked through
 * v_specnext)
 */
static struct simplelock spechash_slock;
#endif

/* Publicly exported FS */
struct nfs_public nfs_pub;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static vm_zone_t vnode_zone;

/* Set to 1 to print out reclaim of active vnodes */
int prtactive = 0;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, metadata written through
 * mounted block devices is delayed only about half the time that file
 * data is delayed.  Similarly, directory updates are more critical, so
 * they are delayed only about a third of the time that file data is
 * delayed.  Thus, there are SYNCER_MAXDELAY queues that are processed
 * round-robin at a rate of one each second (driven off the filesystem
 * syncer process).  The syncer_delayno variable indicates the next queue
 * that is to be processed.  Items that need to be processed soon are
 * placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
static int syncer_delayno = 0;
static long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;

#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
time_t syncdelay = 30;		/* max time to delay syncing data */
time_t filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
time_t dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
time_t metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
static int rushjob;			/* number of slots to run ASAP */
static int stat_rush_requests;		/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");

/*
 * Number of vnodes we want to exist at any one time.
 * This is mostly used to size hash tables in vnode-related code.  It is
 * normally not used in getnewvnode(), as wantfreevnodes is normally nonzero.
 *
 * XXX desiredvnodes is historical cruft and should not exist.
 */
int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");

static void	vfs_free_addrlist __P((struct netexport *nep));
static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
				       struct export_args *argp));

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit()
{

	desiredvnodes = maxproc + cnt.v_page_count / 4;
	mtx_init(&mountlist_mtx, "mountlist", MTX_DEF);
	simple_lock_init(&mntvnode_slock);
	simple_lock_init(&mntid_slock);
	simple_lock_init(&spechash_slock);
	TAILQ_INIT(&vnode_free_list);
	simple_lock_init(&vnode_free_list_slock);
	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
}

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. Interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp, p)
	struct mount *mp;
	int flags;
	struct mtx *interlkp;
	struct proc *p;
{
	int lkflags;

	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		if (interlkp) {
			mtx_exit(interlkp, MTX_DEF);
		}
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
		if (interlkp) {
			mtx_enter(interlkp, MTX_DEF);
		}
		return (ENOENT);
	}
	lkflags = LK_SHARED | LK_NOPAUSE;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mp, p)
	struct mount *mp;
	struct proc *p;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
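 *
 * A sketch of typical use (illustrative only; the filesystem name and
 * the "root_device" string are examples, not requirements of this
 * interface):
 *
 *	struct mount *mp;
 *
 *	if (vfs_rootmountalloc("ufs", "root_device", &mp) == 0) {
 *		... perform the filesystem's own root mount work ...
 *		vfs_unbusy(mp, p);
 *	}
 *
 * Note that the mount returned below is already busied (via vfs_busy)
 * and marked MNT_RDONLY.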
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char *fstypename;
	char *devname;
	struct mount **mpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	if (fstypename == NULL)
		return (ENODEV);
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_iosize_max = DFLTPHYS;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = 0;
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
#ifdef notdef	/* XXX JH */
int
lite2_vfs_mountroot()
{
	struct vfsconf *vfsp;
	extern int (*lite2_mountroot) __P((void));
	int error;

	if (lite2_mountroot != NULL)
		return ((*lite2_mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}
#endif

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	mtx_enter(&mountlist_mtx, MTX_DEF);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			mtx_exit(&mountlist_mtx, MTX_DEF);
			return (mp);
		}
	}
	mtx_exit(&mountlist_mtx, MTX_DEF);
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators
 * only support 16-bit device numbers.  We end up with unique val[0]'s for
 * the first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
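 *
 * As a sketch of the packing computed by the loop below (this simply
 * restates what the expressions do, it is not a normative format):
 *
 *	val[0] = makeudev(255, ((type & 0xFF) << 24) |
 *	    ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF))
 *	val[1] = filesystem type number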
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_int16_t mntid_base;
	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makeudev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if (vfs_getvfs(&tfsid) == NULL)
			break;
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	simple_unlock(&mntid_slock);
}

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(tsp)
	struct timespec *tsp;
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	int s, count;
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp = NULL;
	struct mount *vnmp;
	vm_object_t object;

	/*
	 * We take the least recently used vnode from the freelist
	 * if we can get it and it has no cached pages, and no
	 * namecache entries are relative to it.
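	 * (wantfreevnodes is the knob here: while fewer than
	 * wantfreevnodes vnodes sit on the free list we skip recycling
	 * entirely and fall through to a fresh zalloc() from vnode_zone.)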
	 * Otherwise we allocate a new vnode
	 */

	s = splbio();
	simple_lock(&vnode_free_list_slock);

	if (wantfreevnodes && freevnodes < wantfreevnodes) {
		vp = NULL;
	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
		/*
		 * XXX: this is only here to be backwards compatible
		 */
		vp = NULL;
	} else for (count = 0; count < freevnodes; count++) {
		vp = TAILQ_FIRST(&vnode_free_list);
		if (vp == NULL || vp->v_usecount)
			panic("getnewvnode: free vnode isn't");
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		/*
		 * Don't recycle if active in the namecache or
		 * if it still has cached pages or we cannot get
		 * its interlock.
		 */
		if (LIST_FIRST(&vp->v_cache_src) != NULL ||
		    (VOP_GETVOBJECT(vp, &object) == 0 &&
		     (object->resident_page_count || object->ref_count)) ||
		    !mtx_try_enter(&vp->v_interlock, MTX_DEF)) {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			vp = NULL;
			continue;
		}
		/*
		 * Skip over it if its filesystem is being suspended.
		 */
		if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0)
			break;
		mtx_exit(&vp->v_interlock, MTX_DEF);
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		vp = NULL;
	}
	if (vp) {
		vp->v_flag |= VDOOMED;
		vp->v_flag &= ~VFREE;
		freevnodes--;
		simple_unlock(&vnode_free_list_slock);
		cache_purge(vp);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD) {
			vgonel(vp, p);
		} else {
			mtx_exit(&vp->v_interlock, MTX_DEF);
		}
		vn_finished_write(vnmp);

#ifdef INVARIANTS
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
			if (vp->v_writecount != 0)
				panic("Non-zero write count");
		}
#endif
		vp->v_flag = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
	} else {
		simple_unlock(&vnode_free_list_slock);
		vp = (struct vnode *) zalloc(vnode_zone);
		bzero((char *) vp, sizeof *vp);
		mtx_init(&vp->v_interlock, "vnode interlock", MTX_DEF);
		vp->v_dd = vp;
		cache_purge(vp);
		LIST_INIT(&vp->v_cache_src);
		TAILQ_INIT(&vp->v_cache_dst);
		numvnodes++;
	}

	TAILQ_INIT(&vp->v_cleanblkhd);
	TAILQ_INIT(&vp->v_dirtyblkhd);
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	lockinit(&vp->v_lock, PVFS, "vnlock", 0, LK_NOPAUSE);
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	splx(s);

	vfs_object_create(vp, p, p->p_ucred);
	return (0);
}

/*
 * Move a vnode from one mount queue to another.
 */
static void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		simple_unlock(&mntvnode_slock);
		return;
	}
	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
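 *
 * (Writers that need to drain I/O set VBWAIT and sleep on
 * &vp->v_numoutput, as vinvalbuf() and vtruncbuf() do below; the last
 * completing buffer wakes them up here.)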
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t) &vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;

	if (flags & V_SAVE) {
		s = splbio();
		while (vp->v_numoutput) {
			vp->v_flag |= VBWAIT;
			error = tsleep((caddr_t)&vp->v_numoutput,
			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
			if (error) {
				splx(s);
				return (error);
			}
		}
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			splx(s);
			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
				return (error);
			s = splbio();
			if (vp->v_numoutput > 0 ||
			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
				panic("vinvalbuf: dirty bufs");
		}
		splx(s);
	}
	s = splbio();
	for (;;) {
		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
		if (!blist)
			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
				error = BUF_TIMELOCK(bp,
				    LK_EXCLUSIVE | LK_SLEEPFAIL,
				    "vinvalbuf", slpflag, slptimeo);
				if (error == ENOLCK)
					break;
				splx(s);
				return (error);
			}
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it.  Note that vfs_bio_awrite expects
			 * buffers to reside on a queue, while VOP_BWRITE and
			 * brelse do not.
			 */
			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
			    (flags & V_SAVE)) {

				if (bp->b_vp == vp) {
					if (bp->b_flags & B_CLUSTEROK) {
						BUF_UNLOCK(bp);
						vfs_bio_awrite(bp);
					} else {
						bremfree(bp);
						bp->b_flags |= B_ASYNC;
						BUF_WRITE(bp);
					}
				} else {
					bremfree(bp);
					(void) BUF_WRITE(bp);
				}
				break;
			}
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
	}

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	mtx_enter(&vp->v_interlock, MTX_DEF);
	if (VOP_GETVOBJECT(vp, &object) == 0) {
		vm_object_page_remove(object, 0, 0,
		    (flags & V_SAVE) ? TRUE : FALSE);
	}
	mtx_exit(&vp->v_interlock, MTX_DEF);

	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
		panic("vinvalbuf: flush failed");
	return (0);
}

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
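 *
 * A worked example of the rounding done below: with blksize = 8192 and
 * length = 20000, trunclbn = (20000 + 8191) / 8192 = 3, so buffers with
 * b_lblkno >= 3 are invalidated while block 2 (which still holds valid
 * data) is kept.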
 */
int
vtruncbuf(vp, cred, p, length, blksize)
	register struct vnode *vp;
	struct ucred *cred;
	struct proc *p;
	off_t length;
	int blksize;
{
	register struct buf *bp;
	struct buf *nbp;
	int s, anyfreed;
	int trunclbn;

	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	s = splbio();
restart:
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
				     (nbp->b_vp != vp) ||
				     (nbp->b_flags & B_DELWRI))) {
					goto restart;
				}
			}
		}

		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
				     (nbp->b_vp != vp) ||
				     (nbp->b_flags & B_DELWRI) == 0)) {
					goto restart;
				}
			}
		}
	}

	if (length > 0) {
restartsync:
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					if (bp->b_vp == vp) {
						bp->b_flags |= B_ASYNC;
					} else {
						bp->b_flags &= ~B_ASYNC;
					}
					BUF_WRITE(bp);
				}
				goto restartsync;
			}

		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
	}

	splx(s);

	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	int s;

	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));

	vhold(vp);
	bp->b_vp = vp;
	bp->b_dev = vn_todev(vp);
	/*
	 * Insert onto list for new vnode.
	 */
	s = splbio();
	bp->b_xflags |= BX_VNCLEAN;
	bp->b_xflags &= ~BX_VNDIRTY;
	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
	splx(s);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	struct buflists *listheadp;
	int s;

	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	s = splbio();
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &vp->v_dirtyblkhd;
		else
			listheadp = &vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
	}
	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
	splx(s);
	bp->b_vp = (struct vnode *) 0;
	vdrop(vp);
}

/*
 * Add an item to the syncer work queue.
 */
static void
vn_syncer_add_to_worklist(struct vnode *vp, int delay)
{
	int s, slot;

	s = splbio();

	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
	}

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
	vp->v_flag |= VONWORKLST;
	splx(s);
}

struct proc *updateproc;
static void sched_sync __P((void));
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

/*
 * System filesystem synchronizer daemon.
 */
void
sched_sync(void)
{
	struct synclist *slp;
	struct vnode *vp;
	struct mount *mp;
	long starttime;
	int s;
	struct proc *p = updateproc;

	mtx_enter(&Giant, MTX_DEF);

	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p,
	    SHUTDOWN_PRI_LAST);

	for (;;) {
		kproc_suspend_loop(p);

		starttime = time_second;

		/*
		 * Push files whose dirty time has expired.  Be careful
		 * of interrupt race on slp queue.
		 */
		s = splbio();
		slp = &syncer_workitem_pending[syncer_delayno];
		syncer_delayno += 1;
		if (syncer_delayno == syncer_maxdelay)
			syncer_delayno = 0;
		splx(s);

		while ((vp = LIST_FIRST(slp)) != NULL) {
			if (VOP_ISLOCKED(vp, NULL) == 0 &&
			    vn_start_write(vp, &mp, V_NOWAIT) == 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
				(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
				VOP_UNLOCK(vp, 0, p);
				vn_finished_write(mp);
			}
			s = splbio();
			if (LIST_FIRST(slp) == vp) {
				/*
				 * Note: v_tag VT_VFS vps can remain on the
				 * worklist too with no dirty blocks, but
				 * since sync_fsync() moves it to a different
				 * slot we are safe.
				 */
				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
				    !vn_isdisk(vp, NULL))
					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
				/*
				 * Put us back on the worklist.  The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 */
				vn_syncer_add_to_worklist(vp, syncdelay);
			}
			splx(s);
		}

		/*
		 * Do soft update processing.
		 */
#ifdef SOFTUPDATES
		softdep_process_worklist(NULL);
#endif

		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process.  A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP.
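		 * (rushjob is bumped one unit per call, and capped at
		 * syncdelay / 2, by speedup_syncer() below.)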
		 * Currently rushjob is used by the soft update code to
		 * speed up the filesystem syncer process when the incore
		 * state is getting so far ahead of the disk that the kernel
		 * memory pool is being threatened with exhaustion.
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * If it has taken us less than a second to process the
		 * current work, then wait.  Otherwise start right over
		 * again.  We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (time_second == starttime)
			tsleep(&lbolt, PPAUSE, "syncer", 0);
	}
}

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 */
int
speedup_syncer()
{
	int s;

	s = splhigh();
	if (updateproc->p_wchan == &lbolt)
		setrunnable(updateproc);
	splx(s);
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		stat_rush_requests += 1;
		return (1);
	}
	return(0);
}

/*
 * Associate a p-buffer with a vnode.
 *
 * Also sets B_PAGING flag to indicate that vnode is not fully associated
 * with the buffer.  i.e. the bp has not been linked into the vnode or
 * ref-counted.
 */
void
pbgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{

	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));

	bp->b_vp = vp;
	bp->b_flags |= B_PAGING;
	bp->b_dev = vn_todev(vp);
}

/*
 * Disassociate a p-buffer from a vnode.
 */
void
pbrelvp(bp)
	register struct buf *bp;
{

	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));

	/* XXX REMOVE ME */
	if (bp->b_vnbufs.tqe_next != NULL) {
		panic(
		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
		    bp,
		    (int)bp->b_flags
		);
	}
	bp->b_vp = (struct vnode *) 0;
	bp->b_flags &= ~B_PAGING;
}

/*
 * Change the vnode a pager buffer is associated with.
 */
void
pbreassignbuf(bp, newvp)
	struct buf *bp;
	struct vnode *newvp;
{

	KASSERT(bp->b_flags & B_PAGING,
	    ("pbreassignbuf() on non phys bp %p", bp));
	bp->b_vp = newvp;
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	struct buflists *listheadp;
	int delay;
	int s;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}
	++reassignbufcalls;

	/*
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	 */
	if (bp->b_flags & B_PAGING)
		panic("cannot reassign paging buffer");

	s = splbio();
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &bp->b_vp->v_dirtyblkhd;
		else
			listheadp = &bp->b_vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
		if (bp->b_vp != newvp) {
			vdrop(bp->b_vp);
			bp->b_vp = NULL;	/* for clarification */
		}
	}
	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI) {
		struct buf *tbp;

		listheadp = &newvp->v_dirtyblkhd;
		if ((newvp->v_flag & VONWORKLST) == 0) {
			switch (newvp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VCHR:
				if (newvp->v_rdev->si_mountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
			}
			vn_syncer_add_to_worklist(newvp, delay);
		}
		bp->b_xflags |= BX_VNDIRTY;
		tbp = TAILQ_FIRST(listheadp);
		if (tbp == NULL ||
		    bp->b_lblkno == 0 ||
		    (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (bp->b_lblkno < 0) {
			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (reassignbufmethod == 1) {
			/*
			 * New sorting algorithm, only handle sequential case,
			 * otherwise append to end (but before metadata)
			 */
			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
			    (tbp->b_xflags & BX_VNDIRTY)) {
				/*
				 * Found the best place to insert the buffer
				 */
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortgood;
			} else {
				/*
				 * Missed, append to end, but before meta-data.
				 * We know that the head buffer in the list is
				 * not meta-data due to prior conditionals.
				 *
				 * Indirect effects: NFS second stage write
				 * tends to wind up here, giving maximum
				 * distance between the unstable write and the
				 * commit rpc.
				 */
				tbp = TAILQ_LAST(listheadp, buflists);
				while (tbp && tbp->b_lblkno < 0)
					tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortbad;
			}
		} else {
			/*
			 * Old sorting algorithm, scan queue and insert
			 */
			struct buf *ttbp;
			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
			    (ttbp->b_lblkno < bp->b_lblkno)) {
				++reassignbufloops;
				tbp = ttbp;
			}
			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
		}
	} else {
		bp->b_xflags |= BX_VNCLEAN;
		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
		if ((newvp->v_flag & VONWORKLST) &&
		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	}
	if (bp->b_vp != newvp) {
		bp->b_vp = newvp;
		vhold(bp->b_vp);
	}
	splx(s);
}

/*
 * Create a vnode for a device.
 * Used for mounting the root file system.
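 *
 * (For illustration only: root mount code would typically do something
 * like "bdevvp(rootdev, &rootvp)" before mounting; this is a sketch of
 * intended use, not part of this interface's contract.)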
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	if (vfinddev(dev, VCHR, vpp))
		return (0);
	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VCHR;
	addalias(vp, dev);
	*vpp = vp;
	return (0);
}

/*
 * Add vnode to the alias list hung off the dev_t.
 *
 * The reason for this gunk is that multiple vnodes can reference
 * the same physical device, so checking vp->v_usecount to see
 * how many users there are is inadequate; the v_usecount for
 * the vnodes need to be accumulated.  vcount() does that.
 */
struct vnode *
addaliasu(nvp, nvp_rdev)
	struct vnode *nvp;
	udev_t nvp_rdev;
{
	struct vnode *ovp;
	vop_t **ops;
	dev_t dev;

	if (nvp->v_type == VBLK)
		return (nvp);
	if (nvp->v_type != VCHR)
		panic("addaliasu on non-special vnode");
	dev = udev2dev(nvp_rdev, 0);
	/*
	 * Check to see if we have a bdevvp vnode with no associated
	 * filesystem.  If so, we want to associate the filesystem of
	 * the newly created vnode with the bdevvp vnode and discard
	 * the newly created vnode rather than leaving the bdevvp vnode
	 * lying around with no associated filesystem.
	 */
	if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
		addalias(nvp, dev);
		return (nvp);
	}
	/*
	 * Discard unneeded vnode, but save its node specific data.
	 * Note that if there is a lock, it is carried over in the
	 * node specific data to the replacement vnode.
	 */
	vref(ovp);
	ovp->v_data = nvp->v_data;
	ovp->v_tag = nvp->v_tag;
	nvp->v_data = NULL;
	ops = nvp->v_op;
	nvp->v_op = ovp->v_op;
	ovp->v_op = ops;
	lockinit(&ovp->v_lock, PVFS, "vnlock", 0, LK_NOPAUSE);
	if (nvp->v_vnlock)
		ovp->v_vnlock = &ovp->v_lock;
	insmntque(ovp, nvp->v_mount);
	vrele(nvp);
	vgone(nvp);
	return (ovp);
}

/*
 * This is a local helper function that does the same as addaliasu, but
 * for a dev_t instead of a udev_t.
 */
static void
addalias(nvp, dev)
	struct vnode *nvp;
	dev_t dev;
{

	KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
	nvp->v_rdev = dev;
	simple_lock(&spechash_slock);
	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
	simple_unlock(&spechash_slock);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it.  The vnode lock bit is set if the
 * vnode is being eliminated in vgone.  The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking that
	 * the VXLOCK flag is set.
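	 * (One exception, visible below: if we are the cleaner ourselves,
	 * i.e. v_vxproc == curproc, we skip the sleep to avoid
	 * self-deadlock and merely print a warning.)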
	 */
	if ((flags & LK_INTERLOCK) == 0)
		mtx_enter(&vp->v_interlock, MTX_DEF);
	if (vp->v_flag & VXLOCK) {
		if (vp->v_vxproc == curproc) {
			printf("VXLOCK interlock avoided\n");
		} else {
			vp->v_flag |= VXWANT;
			mtx_exit(&vp->v_interlock, MTX_DEF);
			tsleep((caddr_t)vp, PINOD, "vget", 0);
			return (ENOENT);
		}
	}

	vp->v_usecount++;

	if (VSHOULDBUSY(vp))
		vbusy(vp);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active.  We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			mtx_enter(&vp->v_interlock, MTX_DEF);
			vp->v_usecount--;
			if (VSHOULDFREE(vp))
				vfree(vp);
			mtx_exit(&vp->v_interlock, MTX_DEF);
		}
		return (error);
	}
	mtx_exit(&vp->v_interlock, MTX_DEF);
	return (0);
}

/*
 * Increase the reference count of a vnode.
 */
void
vref(struct vnode *vp)
{
	mtx_enter(&vp->v_interlock, MTX_DEF);
	vp->v_usecount++;
	mtx_exit(&vp->v_interlock, MTX_DEF);
}

/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vrele: null vp"));

	mtx_enter(&vp->v_interlock, MTX_DEF);

	KASSERT(vp->v_writecount < vp->v_usecount, ("vrele: missed vn_close"));

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		mtx_exit(&vp->v_interlock, MTX_DEF);

		return;
	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
		/*
		 * If we are doing a vput, the node is already locked, and we
		 * must call VOP_INACTIVE with the node locked.  So, in the
		 * case of vrele, we explicitly lock the vnode before calling
		 * VOP_INACTIVE.
		 */
		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
			VOP_INACTIVE(vp, p);
		}

	} else {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
		mtx_exit(&vp->v_interlock, MTX_DEF);
#endif
		panic("vrele: negative ref cnt");
	}
}

/*
 * Release an already locked vnode.  This gives the same effect as
 * unlock+vrele(), but takes less time and avoids releasing and
 * re-acquiring the lock (as vrele() acquires the lock internally).
 */
void
vput(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vput: null vp"));
	mtx_enter(&vp->v_interlock, MTX_DEF);
	KASSERT(vp->v_writecount < vp->v_usecount, ("vput: missed vn_close"));

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		VOP_UNLOCK(vp, LK_INTERLOCK, p);
		return;

	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
		/*
		 * If we are doing a vput, the node is already locked, and we
		 * must call VOP_INACTIVE with the node locked.  So, in the
		 * case of vrele, we explicitly lock the vnode before calling
		 * VOP_INACTIVE.
		 */
		mtx_exit(&vp->v_interlock, MTX_DEF);
		VOP_INACTIVE(vp, p);

	} else {
#ifdef DIAGNOSTIC
		vprint("vput: negative ref count", vp);
#endif
		panic("vput: negative ref cnt");
	}
}

/*
 * Somebody doesn't want the vnode recycled.
 */
void
vhold(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	vp->v_holdcnt++;
	if (VSHOULDBUSY(vp))
		vbusy(vp);
	splx(s);
}

/*
 * Note that there is one less who cares about this vnode.  vdrop() is the
 * opposite of vhold().
 */
void
vdrop(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	if (vp->v_holdcnt <= 0)
		panic("vdrop: holdcnt");
	vp->v_holdcnt--;
	if (VSHOULDFREE(vp))
		vfree(vp);
	splx(s);
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = LIST_NEXT(vp, v_mntvnodes);
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		mtx_enter(&vp->v_interlock, MTX_DEF);
		/*
		 * Skip over vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			mtx_exit(&vp->v_interlock, MTX_DEF);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file vnodes
		 * open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			mtx_exit(&vp->v_interlock, MTX_DEF);
			continue;
		}

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode. For block
		 * or character devices, revert to an anonymous device. For
		 * all other files, just kill them.
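		 * ("Revert" here means the vnode is vclean()ed and left with
		 * spec_vnodeop_p and no mount, rather than being vgone()d,
		 * as the code below shows.)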
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		mtx_exit(&vp->v_interlock, MTX_DEF);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 */
static void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	int active;

	/*
	 * Check to see if the vnode is in use. If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		vp->v_usecount++;

	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	vp->v_vxproc = curproc;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

	/*
	 * Clean out any buffers associated with the vnode.
	 * If the flush fails, just toss the buffers.
	 */
	if (flags & DOCLOSE) {
		if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
			(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
		if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
			vinvalbuf(vp, 0, NOCRED, p, 0, 0);
	}

	VOP_DESTROYVOBJECT(vp);

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");

	if (active) {
		/*
		 * Inline copy of vrele() since VOP_INACTIVE
		 * has already been called.
		 */
		mtx_enter(&vp->v_interlock, MTX_DEF);
		if (--vp->v_usecount <= 0) {
#ifdef DIAGNOSTIC
			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
				vprint("vclean: bad ref count", vp);
				panic("vclean: ref cnt");
			}
#endif
			vfree(vp);
		}
		mtx_exit(&vp->v_interlock, MTX_DEF);
	}

	cache_purge(vp);
	vp->v_vnlock = NULL;
	lockdestroy(&vp->v_lock);

	if (VSHOULDFREE(vp))
		vfree(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vn_pollgone(vp);
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	vp->v_vxproc = NULL;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t) vp);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
vop_revoke(ap)
	struct vop_revoke_args /* {
		struct vnode *a_vp;
		int a_flags;
	} */ *ap;
{
	struct vnode *vp, *vq;
	dev_t dev;

	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));

	vp = ap->a_vp;
	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		mtx_exit(&vp->v_interlock, MTX_DEF);
		tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
		return (0);
	}
	dev = vp->v_rdev;
	for (;;) {
		simple_lock(&spechash_slock);
		vq = SLIST_FIRST(&dev->si_hlist);
		simple_unlock(&spechash_slock);
		if (!vq)
			break;
		vgone(vq);
	}
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	mtx_enter(&vp->v_interlock, MTX_DEF);
	if (vp->v_usecount == 0) {
		if (inter_lkp) {
			simple_unlock(inter_lkp);
		}
		vgonel(vp, p);
		return (1);
	}
	mtx_exit(&vp->v_interlock, MTX_DEF);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	mtx_enter(&vp->v_interlock, MTX_DEF);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int s;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		mtx_exit(&vp->v_interlock, MTX_DEF);
		tsleep((caddr_t)vp, PINOD, "vgone", 0);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	mtx_enter(&vp->v_interlock, MTX_DEF);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) {
		simple_lock(&spechash_slock);
		SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
		freedev(vp->v_rdev);
		simple_unlock(&spechash_slock);
		vp->v_rdev = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list. The test of the
	 * VDOOMED flag and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone.
	 * If the reference count were incremented first, vgone would
	 * (incorrectly) try to close the previous instance of the
	 * underlying object.
	 */
	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
		s = splbio();
		simple_lock(&vnode_free_list_slock);
		if (vp->v_flag & VFREE)
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		else
			freevnodes++;
		vp->v_flag |= VFREE;
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
		splx(s);
	}

	vp->v_type = VBAD;
	mtx_exit(&vp->v_interlock, MTX_DEF);
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	struct vnode *vp;

	simple_lock(&spechash_slock);
	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
		if (type == vp->v_type) {
			*vpp = vp;
			simple_unlock(&spechash_slock);
			return (1);
		}
	}
	simple_unlock(&spechash_slock);
	return (0);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
	struct vnode *vp;
{
	struct vnode *vq;
	int count;

	count = 0;
	simple_lock(&spechash_slock);
	SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext)
		count += vq->v_usecount;
	simple_unlock(&spechash_slock);
	return (count);
}

/*
 * Same as above, but using the dev_t as argument
 */
int
count_dev(dev)
	dev_t dev;
{
	struct vnode *vp;

	vp = SLIST_FIRST(&dev->si_hlist);
	if (vp == NULL)
		return (0);
	return(vcount(vp));
}

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

void
vprint(label, vp)
	char *label;
	struct vnode *vp;
{
	char buf[96];

	if (label != NULL)
		printf("%s: %p: ", label, (void *)vp);
	else
		printf("%p: ", (void *)vp);
	printf("type %s, usecount %d, writecount %d, refcount %d,",
	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VXLOCK)
		strcat(buf, "|VXLOCK");
	if (vp->v_flag & VXWANT)
		strcat(buf, "|VXWANT");
	if (vp->v_flag & VBWAIT)
		strcat(buf, "|VBWAIT");
	if (vp->v_flag & VDOOMED)
		strcat(buf, "|VDOOMED");
	if (vp->v_flag & VFREE)
		strcat(buf, "|VFREE");
	if (vp->v_flag & VOBJBUF)
		strcat(buf, "|VOBJBUF");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DDB
#include <ddb/ddb.h>
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
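 *
 * (Invoked from the DDB prompt as "show lockedvnodes".)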
2079 */ 2080 DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) 2081 { 2082 struct proc *p = curproc; /* XXX */ 2083 struct mount *mp, *nmp; 2084 struct vnode *vp; 2085 2086 printf("Locked vnodes\n"); 2087 mtx_enter(&mountlist_mtx, MTX_DEF); 2088 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2089 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) { 2090 nmp = TAILQ_NEXT(mp, mnt_list); 2091 continue; 2092 } 2093 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 2094 if (VOP_ISLOCKED(vp, NULL)) 2095 vprint((char *)0, vp); 2096 } 2097 mtx_enter(&mountlist_mtx, MTX_DEF); 2098 nmp = TAILQ_NEXT(mp, mnt_list); 2099 vfs_unbusy(mp, p); 2100 } 2101 mtx_exit(&mountlist_mtx, MTX_DEF); 2102 } 2103 #endif 2104 2105 /* 2106 * Top level filesystem related information gathering. 2107 */ 2108 static int sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS)); 2109 2110 static int 2111 vfs_sysctl(SYSCTL_HANDLER_ARGS) 2112 { 2113 int *name = (int *)arg1 - 1; /* XXX */ 2114 u_int namelen = arg2 + 1; /* XXX */ 2115 struct vfsconf *vfsp; 2116 2117 #if 1 || defined(COMPAT_PRELITE2) 2118 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ 2119 if (namelen == 1) 2120 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 2121 #endif 2122 2123 /* XXX the below code does not compile; vfs_sysctl does not exist. */ 2124 #ifdef notyet 2125 /* all sysctl names at this level are at least name and field */ 2126 if (namelen < 2) 2127 return (ENOTDIR); /* overloaded */ 2128 if (name[0] != VFS_GENERIC) { 2129 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2130 if (vfsp->vfc_typenum == name[0]) 2131 break; 2132 if (vfsp == NULL) 2133 return (EOPNOTSUPP); 2134 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, 2135 oldp, oldlenp, newp, newlen, p)); 2136 } 2137 #endif 2138 switch (name[1]) { 2139 case VFS_MAXTYPENUM: 2140 if (namelen != 2) 2141 return (ENOTDIR); 2142 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 2143 case VFS_CONF: 2144 if (namelen != 3) 2145 return (ENOTDIR); /* overloaded */ 2146 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2147 if (vfsp->vfc_typenum == name[2]) 2148 break; 2149 if (vfsp == NULL) 2150 return (EOPNOTSUPP); 2151 return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); 2152 } 2153 return (EOPNOTSUPP); 2154 } 2155 2156 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, 2157 "Generic filesystem"); 2158 2159 #if 1 || defined(COMPAT_PRELITE2) 2160 2161 static int 2162 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 2163 { 2164 int error; 2165 struct vfsconf *vfsp; 2166 struct ovfsconf ovfs; 2167 2168 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { 2169 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 2170 strcpy(ovfs.vfc_name, vfsp->vfc_name); 2171 ovfs.vfc_index = vfsp->vfc_typenum; 2172 ovfs.vfc_refcount = vfsp->vfc_refcount; 2173 ovfs.vfc_flags = vfsp->vfc_flags; 2174 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 2175 if (error) 2176 return error; 2177 } 2178 return 0; 2179 } 2180 2181 #endif /* 1 || COMPAT_PRELITE2 */ 2182 2183 #if COMPILING_LINT 2184 #define KINFO_VNODESLOP 10 2185 /* 2186 * Dump vnode list (via sysctl). 2187 * Copyout address of vnode followed by vnode. 
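 *
 * A rough sketch of a userland consumer (illustrative only; error
 * handling and headers omitted, and note that this handler is only
 * compiled under COMPILING_LINT above):
 *
 *	int mib[2] = { CTL_KERN, KERN_VNODE };
 *	size_t len, rec = sizeof(struct vnode *) + sizeof(struct vnode);
 *	char *buf, *p;
 *
 *	sysctl(mib, 2, NULL, &len, NULL, 0);
 *	buf = malloc(len);
 *	sysctl(mib, 2, buf, &len, NULL, 0);
 *	for (p = buf; p + rec <= buf + len; p += rec)
 *		examine((struct vnode *)(p + sizeof(struct vnode *)));
 *
 * where each record is the vnode's kernel address followed by a copy of
 * the vnode itself, and examine() stands in for whatever inspection the
 * consumer wants to perform.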
2188 */ 2189 /* ARGSUSED */ 2190 static int 2191 sysctl_vnode(SYSCTL_HANDLER_ARGS) 2192 { 2193 struct proc *p = curproc; /* XXX */ 2194 struct mount *mp, *nmp; 2195 struct vnode *nvp, *vp; 2196 int error; 2197 2198 #define VPTRSZ sizeof (struct vnode *) 2199 #define VNODESZ sizeof (struct vnode) 2200 2201 req->lock = 0; 2202 if (!req->oldptr) /* Make an estimate */ 2203 return (SYSCTL_OUT(req, 0, 2204 (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); 2205 2206 mtx_enter(&mountlist_mtx, MTX_DEF); 2207 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2208 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) { 2209 nmp = TAILQ_NEXT(mp, mnt_list); 2210 continue; 2211 } 2212 again: 2213 simple_lock(&mntvnode_slock); 2214 for (vp = LIST_FIRST(&mp->mnt_vnodelist); 2215 vp != NULL; 2216 vp = nvp) { 2217 /* 2218 * Check that the vp is still associated with 2219 * this filesystem. RACE: could have been 2220 * recycled onto the same filesystem. 2221 */ 2222 if (vp->v_mount != mp) { 2223 simple_unlock(&mntvnode_slock); 2224 goto again; 2225 } 2226 nvp = LIST_NEXT(vp, v_mntvnodes); 2227 simple_unlock(&mntvnode_slock); 2228 if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || 2229 (error = SYSCTL_OUT(req, vp, VNODESZ))) 2230 return (error); 2231 simple_lock(&mntvnode_slock); 2232 } 2233 simple_unlock(&mntvnode_slock); 2234 mtx_enter(&mountlist_mtx, MTX_DEF); 2235 nmp = TAILQ_NEXT(mp, mnt_list); 2236 vfs_unbusy(mp, p); 2237 } 2238 mtx_exit(&mountlist_mtx, MTX_DEF); 2239 2240 return (0); 2241 } 2242 2243 /* 2244 * XXX 2245 * Exporting the vnode list on large systems causes them to crash. 2246 * Exporting the vnode list on medium systems causes sysctl to coredump. 2247 */ 2248 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 2249 0, 0, sysctl_vnode, "S,vnode", ""); 2250 #endif 2251 2252 /* 2253 * Check to see if a filesystem is mounted on a block device. 2254 */ 2255 int 2256 vfs_mountedon(vp) 2257 struct vnode *vp; 2258 { 2259 2260 if (vp->v_rdev->si_mountpoint != NULL) 2261 return (EBUSY); 2262 return (0); 2263 } 2264 2265 /* 2266 * Unmount all filesystems. The list is traversed in reverse order 2267 * of mounting to avoid dependencies. 2268 */ 2269 void 2270 vfs_unmountall() 2271 { 2272 struct mount *mp; 2273 struct proc *p; 2274 int error; 2275 2276 if (curproc != NULL) 2277 p = curproc; 2278 else 2279 p = initproc; /* XXX XXX should this be proc0? */ 2280 /* 2281 * Since this only runs when rebooting, it is not interlocked. 2282 */ 2283 while(!TAILQ_EMPTY(&mountlist)) { 2284 mp = TAILQ_LAST(&mountlist, mntlist); 2285 error = dounmount(mp, MNT_FORCE, p); 2286 if (error) { 2287 TAILQ_REMOVE(&mountlist, mp, mnt_list); 2288 printf("unmount of %s failed (", 2289 mp->mnt_stat.f_mntonname); 2290 if (error == EBUSY) 2291 printf("BUSY)\n"); 2292 else 2293 printf("%d)\n", error); 2294 } else { 2295 /* The unmount has removed mp from the mountlist */ 2296 } 2297 } 2298 } 2299 2300 /* 2301 * Build hash lists of net addresses and hang them off the mount point. 2302 * Called by ufs_mount() to set up the lists of export addresses. 
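 *
 * Each non-default entry ends up in a single malloc(M_NETADDR) block,
 * laid out as a struct netcred immediately followed by the export address
 * (ex_addrlen bytes) and then the optional mask (ex_masklen bytes), which
 * is why saddr and smask below are derived from np by pointer arithmetic
 * rather than allocated separately.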
2303 */ 2304 static int 2305 vfs_hang_addrlist(mp, nep, argp) 2306 struct mount *mp; 2307 struct netexport *nep; 2308 struct export_args *argp; 2309 { 2310 register struct netcred *np; 2311 register struct radix_node_head *rnh; 2312 register int i; 2313 struct radix_node *rn; 2314 struct sockaddr *saddr, *smask = 0; 2315 struct domain *dom; 2316 int error; 2317 2318 if (argp->ex_addrlen == 0) { 2319 if (mp->mnt_flag & MNT_DEFEXPORTED) 2320 return (EPERM); 2321 np = &nep->ne_defexported; 2322 np->netc_exflags = argp->ex_flags; 2323 np->netc_anon = argp->ex_anon; 2324 np->netc_anon.cr_ref = 1; 2325 mp->mnt_flag |= MNT_DEFEXPORTED; 2326 return (0); 2327 } 2328 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; 2329 np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); 2330 bzero((caddr_t) np, i); 2331 saddr = (struct sockaddr *) (np + 1); 2332 if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) 2333 goto out; 2334 if (saddr->sa_len > argp->ex_addrlen) 2335 saddr->sa_len = argp->ex_addrlen; 2336 if (argp->ex_masklen) { 2337 smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); 2338 error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen); 2339 if (error) 2340 goto out; 2341 if (smask->sa_len > argp->ex_masklen) 2342 smask->sa_len = argp->ex_masklen; 2343 } 2344 i = saddr->sa_family; 2345 if ((rnh = nep->ne_rtable[i]) == 0) { 2346 /* 2347 * Seems silly to initialize every AF when most are not used, 2348 * do so on demand here 2349 */ 2350 for (dom = domains; dom; dom = dom->dom_next) 2351 if (dom->dom_family == i && dom->dom_rtattach) { 2352 dom->dom_rtattach((void **) &nep->ne_rtable[i], 2353 dom->dom_rtoffset); 2354 break; 2355 } 2356 if ((rnh = nep->ne_rtable[i]) == 0) { 2357 error = ENOBUFS; 2358 goto out; 2359 } 2360 } 2361 rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, 2362 np->netc_rnodes); 2363 if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ 2364 error = EPERM; 2365 goto out; 2366 } 2367 np->netc_exflags = argp->ex_flags; 2368 np->netc_anon = argp->ex_anon; 2369 np->netc_anon.cr_ref = 1; 2370 return (0); 2371 out: 2372 free(np, M_NETADDR); 2373 return (error); 2374 } 2375 2376 /* Helper for vfs_free_addrlist. */ 2377 /* ARGSUSED */ 2378 static int 2379 vfs_free_netcred(rn, w) 2380 struct radix_node *rn; 2381 void *w; 2382 { 2383 register struct radix_node_head *rnh = (struct radix_node_head *) w; 2384 2385 (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); 2386 free((caddr_t) rn, M_NETADDR); 2387 return (0); 2388 } 2389 2390 /* 2391 * Free the net address hash lists that are hanging off the mount points. 2392 */ 2393 static void 2394 vfs_free_addrlist(nep) 2395 struct netexport *nep; 2396 { 2397 register int i; 2398 register struct radix_node_head *rnh; 2399 2400 for (i = 0; i <= AF_MAX; i++) 2401 if ((rnh = nep->ne_rtable[i])) { 2402 (*rnh->rnh_walktree) (rnh, vfs_free_netcred, 2403 (caddr_t) rnh); 2404 free((caddr_t) rnh, M_RTABLE); 2405 nep->ne_rtable[i] = 0; 2406 } 2407 } 2408 2409 /* 2410 * High level function to manipulate export options on a mount point 2411 * and the passed in netexport. 
2412 * Struct export_args *argp is the variable used to twiddle options; 2413 * the structure is described in sys/mount.h. 2414 */ 2415 int 2416 vfs_export(mp, nep, argp) 2417 struct mount *mp; 2418 struct netexport *nep; 2419 struct export_args *argp; 2420 { 2421 int error; 2422 2423 if (argp->ex_flags & MNT_DELEXPORT) { 2424 if (mp->mnt_flag & MNT_EXPUBLIC) { 2425 vfs_setpublicfs(NULL, NULL, NULL); 2426 mp->mnt_flag &= ~MNT_EXPUBLIC; 2427 } 2428 vfs_free_addrlist(nep); 2429 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); 2430 } 2431 if (argp->ex_flags & MNT_EXPORTED) { 2432 if (argp->ex_flags & MNT_EXPUBLIC) { 2433 if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) 2434 return (error); 2435 mp->mnt_flag |= MNT_EXPUBLIC; 2436 } 2437 if ((error = vfs_hang_addrlist(mp, nep, argp))) 2438 return (error); 2439 mp->mnt_flag |= MNT_EXPORTED; 2440 } 2441 return (0); 2442 } 2443 2444 /* 2445 * Set the publicly exported filesystem (WebNFS). Currently, only 2446 * one public filesystem is possible in the spec (RFC 2054 and 2055) 2447 */ 2448 int 2449 vfs_setpublicfs(mp, nep, argp) 2450 struct mount *mp; 2451 struct netexport *nep; 2452 struct export_args *argp; 2453 { 2454 int error; 2455 struct vnode *rvp; 2456 char *cp; 2457 2458 /* 2459 * mp == NULL -> invalidate the current info, the FS is 2460 * no longer exported. May be called from either vfs_export 2461 * or unmount, so check if it hasn't already been done. 2462 */ 2463 if (mp == NULL) { 2464 if (nfs_pub.np_valid) { 2465 nfs_pub.np_valid = 0; 2466 if (nfs_pub.np_index != NULL) { 2467 FREE(nfs_pub.np_index, M_TEMP); 2468 nfs_pub.np_index = NULL; 2469 } 2470 } 2471 return (0); 2472 } 2473 2474 /* 2475 * Only one allowed at a time. 2476 */ 2477 if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) 2478 return (EBUSY); 2479 2480 /* 2481 * Get real filehandle for root of exported FS. 2482 */ 2483 bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); 2484 nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; 2485 2486 if ((error = VFS_ROOT(mp, &rvp))) 2487 return (error); 2488 2489 if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) 2490 return (error); 2491 2492 vput(rvp); 2493 2494 /* 2495 * If an indexfile was specified, pull it in. 2496 */ 2497 if (argp->ex_indexfile != NULL) { 2498 MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, 2499 M_WAITOK); 2500 error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, 2501 MAXNAMLEN, (size_t *)0); 2502 if (!error) { 2503 /* 2504 * Check for illegal filenames. 2505 */ 2506 for (cp = nfs_pub.np_index; *cp; cp++) { 2507 if (*cp == '/') { 2508 error = EINVAL; 2509 break; 2510 } 2511 } 2512 } 2513 if (error) { 2514 FREE(nfs_pub.np_index, M_TEMP); 2515 return (error); 2516 } 2517 } 2518 2519 nfs_pub.np_mount = mp; 2520 nfs_pub.np_valid = 1; 2521 return (0); 2522 } 2523 2524 /* 2525 * Used by the filesystems to determine if a given network address 2526 * (passed in 'nam') is present in their export list and returns a pointer 2527 * to struct netcred so that the filesystem can examine it for 2528 * access rights (read/write/etc). 2529 */ 2530 struct netcred * 2531 vfs_export_lookup(mp, nep, nam) 2532 register struct mount *mp; 2533 struct netexport *nep; 2534 struct sockaddr *nam; 2535 { 2536 register struct netcred *np; 2537 register struct radix_node_head *rnh; 2538 struct sockaddr *saddr; 2539 2540 np = NULL; 2541 if (mp->mnt_flag & MNT_EXPORTED) { 2542 /* 2543 * Lookup in the export list first.
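 *
 * (rnh_matchaddr() walks the radix tree built by vfs_hang_addrlist();
 * a hit that carries RNF_ROOT is one of the tree's internal end-marker
 * nodes rather than a real export entry, which is why it is discarded
 * below.)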
2544 */ 2545 if (nam != NULL) { 2546 saddr = nam; 2547 rnh = nep->ne_rtable[saddr->sa_family]; 2548 if (rnh != NULL) { 2549 np = (struct netcred *) 2550 (*rnh->rnh_matchaddr)((caddr_t)saddr, 2551 rnh); 2552 if (np && np->netc_rnodes->rn_flags & RNF_ROOT) 2553 np = NULL; 2554 } 2555 } 2556 /* 2557 * If no address match, use the default if it exists. 2558 */ 2559 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) 2560 np = &nep->ne_defexported; 2561 } 2562 return (np); 2563 } 2564 2565 /* 2566 * perform msync on all vnodes under a mount point 2567 * the mount point must be locked. 2568 */ 2569 void 2570 vfs_msync(struct mount *mp, int flags) { 2571 struct vnode *vp, *nvp; 2572 struct vm_object *obj; 2573 int anyio, tries; 2574 2575 tries = 5; 2576 loop: 2577 anyio = 0; 2578 for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) { 2579 2580 nvp = LIST_NEXT(vp, v_mntvnodes); 2581 2582 if (vp->v_mount != mp) { 2583 goto loop; 2584 } 2585 2586 if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */ 2587 continue; 2588 2589 if (flags != MNT_WAIT) { 2590 if (VOP_GETVOBJECT(vp, &obj) != 0 || 2591 (obj->flags & OBJ_MIGHTBEDIRTY) == 0) 2592 continue; 2593 if (VOP_ISLOCKED(vp, NULL)) 2594 continue; 2595 } 2596 2597 mtx_enter(&vp->v_interlock, MTX_DEF); 2598 if (VOP_GETVOBJECT(vp, &obj) == 0 && 2599 (obj->flags & OBJ_MIGHTBEDIRTY)) { 2600 if (!vget(vp, 2601 LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) { 2602 if (VOP_GETVOBJECT(vp, &obj) == 0) { 2603 vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); 2604 anyio = 1; 2605 } 2606 vput(vp); 2607 } 2608 } else { 2609 mtx_exit(&vp->v_interlock, MTX_DEF); 2610 } 2611 } 2612 if (anyio && (--tries > 0)) 2613 goto loop; 2614 } 2615 2616 /* 2617 * Create the VM object needed for VMIO and mmap support. This 2618 * is done for all VREG files in the system. Some filesystems might 2619 * afford the additional metadata buffering capability of the 2620 * VMIO code by making the device node be VMIO mode also. 2621 * 2622 * vp must be locked when vfs_object_create is called. 2623 */ 2624 int 2625 vfs_object_create(vp, p, cred) 2626 struct vnode *vp; 2627 struct proc *p; 2628 struct ucred *cred; 2629 { 2630 return (VOP_CREATEVOBJECT(vp, cred, p)); 2631 } 2632 2633 /* 2634 * Mark a vnode as free, putting it up for recycling. 2635 */ 2636 void 2637 vfree(vp) 2638 struct vnode *vp; 2639 { 2640 int s; 2641 2642 s = splbio(); 2643 simple_lock(&vnode_free_list_slock); 2644 KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free")); 2645 if (vp->v_flag & VAGE) { 2646 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 2647 } else { 2648 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 2649 } 2650 freevnodes++; 2651 simple_unlock(&vnode_free_list_slock); 2652 vp->v_flag &= ~VAGE; 2653 vp->v_flag |= VFREE; 2654 splx(s); 2655 } 2656 2657 /* 2658 * Opposite of vfree() - mark a vnode as in use. 2659 */ 2660 void 2661 vbusy(vp) 2662 struct vnode *vp; 2663 { 2664 int s; 2665 2666 s = splbio(); 2667 simple_lock(&vnode_free_list_slock); 2668 KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free")); 2669 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 2670 freevnodes--; 2671 simple_unlock(&vnode_free_list_slock); 2672 vp->v_flag &= ~(VFREE|VAGE); 2673 splx(s); 2674 } 2675 2676 /* 2677 * Record a process's interest in events which might happen to 2678 * a vnode. 
Because poll uses the historic select-style interface 2679 * internally, this routine serves as both the ``check for any 2680 * pending events'' and the ``record my interest in future events'' 2681 * functions. (These are done together, while the lock is held, 2682 * to avoid race conditions.) 2683 */ 2684 int 2685 vn_pollrecord(vp, p, events) 2686 struct vnode *vp; 2687 struct proc *p; 2688 short events; 2689 { 2690 simple_lock(&vp->v_pollinfo.vpi_lock); 2691 if (vp->v_pollinfo.vpi_revents & events) { 2692 /* 2693 * This leaves events we are not interested 2694 * in available for the other process which 2695 * presumably had requested them 2696 * (otherwise they would never have been 2697 * recorded). 2698 */ 2699 events &= vp->v_pollinfo.vpi_revents; 2700 vp->v_pollinfo.vpi_revents &= ~events; 2701 2702 simple_unlock(&vp->v_pollinfo.vpi_lock); 2703 return events; 2704 } 2705 vp->v_pollinfo.vpi_events |= events; 2706 selrecord(p, &vp->v_pollinfo.vpi_selinfo); 2707 simple_unlock(&vp->v_pollinfo.vpi_lock); 2708 return 0; 2709 } 2710 2711 /* 2712 * Note the occurrence of an event. If the VN_POLLEVENT macro is used, 2713 * it is possible for us to miss an event due to race conditions, but 2714 * that condition is expected to be rare, so for the moment it is the 2715 * preferred interface. 2716 */ 2717 void 2718 vn_pollevent(vp, events) 2719 struct vnode *vp; 2720 short events; 2721 { 2722 simple_lock(&vp->v_pollinfo.vpi_lock); 2723 if (vp->v_pollinfo.vpi_events & events) { 2724 /* 2725 * We clear vpi_events so that we don't 2726 * call selwakeup() twice if two events are 2727 * posted before the polling process(es) is 2728 * awakened. This also ensures that we take at 2729 * most one selwakeup() if the polling process 2730 * is no longer interested. However, it does 2731 * mean that only one event can be noticed at 2732 * a time. (Perhaps we should only clear those 2733 * event bits which we note?) XXX 2734 */ 2735 vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */ 2736 vp->v_pollinfo.vpi_revents |= events; 2737 selwakeup(&vp->v_pollinfo.vpi_selinfo); 2738 } 2739 simple_unlock(&vp->v_pollinfo.vpi_lock); 2740 } 2741 2742 /* 2743 * Wake up anyone polling on vp because it is being revoked. 2744 * This depends on dead_poll() returning POLLHUP for correct 2745 * behavior. 2746 */ 2747 void 2748 vn_pollgone(vp) 2749 struct vnode *vp; 2750 { 2751 simple_lock(&vp->v_pollinfo.vpi_lock); 2752 if (vp->v_pollinfo.vpi_events) { 2753 vp->v_pollinfo.vpi_events = 0; 2754 selwakeup(&vp->v_pollinfo.vpi_selinfo); 2755 } 2756 simple_unlock(&vp->v_pollinfo.vpi_lock); 2757 } 2758 2759 2760 2761 /* 2762 * Routine to create and manage a filesystem syncer vnode.
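 *
 * Each mounted filesystem gets one of these pseudo-vnodes, hung off
 * mp->mnt_syncer by vfs_allocate_syncvnode() below.  The syncer keeps it
 * on its worklist via vn_syncer_add_to_worklist(), and each time the
 * entry comes due its VOP_FSYNC (sync_fsync() below) performs a lazy
 * sync of the whole mount and then reschedules itself.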
2763 */ 2764 #define sync_close ((int (*) __P((struct vop_close_args *)))nullop) 2765 static int sync_fsync __P((struct vop_fsync_args *)); 2766 static int sync_inactive __P((struct vop_inactive_args *)); 2767 static int sync_reclaim __P((struct vop_reclaim_args *)); 2768 #define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock) 2769 #define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock) 2770 static int sync_print __P((struct vop_print_args *)); 2771 #define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked) 2772 2773 static vop_t **sync_vnodeop_p; 2774 static struct vnodeopv_entry_desc sync_vnodeop_entries[] = { 2775 { &vop_default_desc, (vop_t *) vop_eopnotsupp }, 2776 { &vop_close_desc, (vop_t *) sync_close }, /* close */ 2777 { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */ 2778 { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */ 2779 { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */ 2780 { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */ 2781 { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */ 2782 { &vop_print_desc, (vop_t *) sync_print }, /* print */ 2783 { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */ 2784 { NULL, NULL } 2785 }; 2786 static struct vnodeopv_desc sync_vnodeop_opv_desc = 2787 { &sync_vnodeop_p, sync_vnodeop_entries }; 2788 2789 VNODEOP_SET(sync_vnodeop_opv_desc); 2790 2791 /* 2792 * Create a new filesystem syncer vnode for the specified mount point. 2793 */ 2794 int 2795 vfs_allocate_syncvnode(mp) 2796 struct mount *mp; 2797 { 2798 struct vnode *vp; 2799 static long start, incr, next; 2800 int error; 2801 2802 /* Allocate a new vnode */ 2803 if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { 2804 mp->mnt_syncer = NULL; 2805 return (error); 2806 } 2807 vp->v_type = VNON; 2808 /* 2809 * Place the vnode onto the syncer worklist. We attempt to 2810 * scatter them about on the list so that they will go off 2811 * at evenly distributed times even if all the filesystems 2812 * are mounted at once. 2813 */ 2814 next += incr; 2815 if (next == 0 || next > syncer_maxdelay) { 2816 start /= 2; 2817 incr /= 2; 2818 if (start == 0) { 2819 start = syncer_maxdelay / 2; 2820 incr = syncer_maxdelay; 2821 } 2822 next = start; 2823 } 2824 vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); 2825 mp->mnt_syncer = vp; 2826 return (0); 2827 } 2828 2829 /* 2830 * Do a lazy sync of the filesystem. 2831 */ 2832 static int 2833 sync_fsync(ap) 2834 struct vop_fsync_args /* { 2835 struct vnode *a_vp; 2836 struct ucred *a_cred; 2837 int a_waitfor; 2838 struct proc *a_p; 2839 } */ *ap; 2840 { 2841 struct vnode *syncvp = ap->a_vp; 2842 struct mount *mp = syncvp->v_mount; 2843 struct proc *p = ap->a_p; 2844 int asyncflag; 2845 2846 /* 2847 * We only need to do something if this is a lazy evaluation. 2848 */ 2849 if (ap->a_waitfor != MNT_LAZY) 2850 return (0); 2851 2852 /* 2853 * Move ourselves to the back of the sync list. 2854 */ 2855 vn_syncer_add_to_worklist(syncvp, syncdelay); 2856 2857 /* 2858 * Walk the list of vnodes pushing all that are dirty and 2859 * not already on the sync list. 
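 *
 * (The walk itself is delegated to vfs_msync() and VFS_SYNC(MNT_LAZY)
 * below; MNT_ASYNC is cleared for the duration, presumably so the
 * underlying filesystem actually starts the writes rather than delaying
 * them again.)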
2860 */ 2861 mtx_enter(&mountlist_mtx, MTX_DEF); 2862 if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, p) != 0) { 2863 mtx_exit(&mountlist_mtx, MTX_DEF); 2864 return (0); 2865 } 2866 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { 2867 vfs_unbusy(mp, p); 2868 mtx_exit(&mountlist_mtx, MTX_DEF); 2869 return (0); 2870 } 2871 asyncflag = mp->mnt_flag & MNT_ASYNC; 2872 mp->mnt_flag &= ~MNT_ASYNC; 2873 vfs_msync(mp, MNT_NOWAIT); 2874 VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p); 2875 if (asyncflag) 2876 mp->mnt_flag |= MNT_ASYNC; 2877 vn_finished_write(mp); 2878 vfs_unbusy(mp, p); 2879 return (0); 2880 } 2881 2882 /* 2883 * The syncer vnode is no longer referenced. 2884 */ 2885 static int 2886 sync_inactive(ap) 2887 struct vop_inactive_args /* { 2888 struct vnode *a_vp; 2889 struct proc *a_p; 2890 } */ *ap; 2891 { 2892 2893 vgone(ap->a_vp); 2894 return (0); 2895 } 2896 2897 /* 2898 * The syncer vnode is no longer needed and is being decommissioned. 2899 * 2900 * Modifications to the worklist must be protected at splbio(). 2901 */ 2902 static int 2903 sync_reclaim(ap) 2904 struct vop_reclaim_args /* { 2905 struct vnode *a_vp; 2906 } */ *ap; 2907 { 2908 struct vnode *vp = ap->a_vp; 2909 int s; 2910 2911 s = splbio(); 2912 vp->v_mount->mnt_syncer = NULL; 2913 if (vp->v_flag & VONWORKLST) { 2914 LIST_REMOVE(vp, v_synclist); 2915 vp->v_flag &= ~VONWORKLST; 2916 } 2917 splx(s); 2918 2919 return (0); 2920 } 2921 2922 /* 2923 * Print out a syncer vnode. 2924 */ 2925 static int 2926 sync_print(ap) 2927 struct vop_print_args /* { 2928 struct vnode *a_vp; 2929 } */ *ap; 2930 { 2931 struct vnode *vp = ap->a_vp; 2932 2933 printf("syncer vnode"); 2934 if (vp->v_vnlock != NULL) 2935 lockmgr_printinfo(vp->v_vnlock); 2936 printf("\n"); 2937 return (0); 2938 } 2939 2940 /* 2941 * Extract the dev_t from a VCHR 2942 */ 2943 dev_t 2944 vn_todev(vp) 2945 struct vnode *vp; 2946 { 2947 if (vp->v_type != VCHR) 2948 return (NODEV); 2949 return (vp->v_rdev); 2950 } 2951 2952 /* 2953 * Check if vnode represents a disk device 2954 */ 2955 int 2956 vn_isdisk(vp, errp) 2957 struct vnode *vp; 2958 int *errp; 2959 { 2960 struct cdevsw *cdevsw; 2961 2962 if (vp->v_type != VCHR) { 2963 if (errp != NULL) 2964 *errp = ENOTBLK; 2965 return (0); 2966 } 2967 if (vp->v_rdev == NULL) { 2968 if (errp != NULL) 2969 *errp = ENXIO; 2970 return (0); 2971 } 2972 cdevsw = devsw(vp->v_rdev); 2973 if (cdevsw == NULL) { 2974 if (errp != NULL) 2975 *errp = ENXIO; 2976 return (0); 2977 } 2978 if (!(cdevsw->d_flags & D_DISK)) { 2979 if (errp != NULL) 2980 *errp = ENOTBLK; 2981 return (0); 2982 } 2983 if (errp != NULL) 2984 *errp = 0; 2985 return (1); 2986 } 2987 2988 /* 2989 * Free data allocated by namei(); see namei(9) for details.
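 *
 * A minimal sketch of a typical caller (hypothetical; assumes the usual
 * NDINIT()/namei() sequence and that only the looked-up vnode is kept):
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, path, p);
 *	if ((error = namei(&nd)) != 0)
 *		return (error);
 *	NDFREE(&nd, NDF_ONLY_PNBUF);
 *	vp = nd.ni_vp;
 *	...
 *	vput(vp);
 *
 * where NDF_ONLY_PNBUF releases just the pathname buffer and leaves the
 * vnode references for the caller to drop.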
2990 */ 2991 void 2992 NDFREE(ndp, flags) 2993 struct nameidata *ndp; 2994 const uint flags; 2995 { 2996 if (!(flags & NDF_NO_FREE_PNBUF) && 2997 (ndp->ni_cnd.cn_flags & HASBUF)) { 2998 zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); 2999 ndp->ni_cnd.cn_flags &= ~HASBUF; 3000 } 3001 if (!(flags & NDF_NO_DVP_UNLOCK) && 3002 (ndp->ni_cnd.cn_flags & LOCKPARENT) && 3003 ndp->ni_dvp != ndp->ni_vp) 3004 VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc); 3005 if (!(flags & NDF_NO_DVP_RELE) && 3006 (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { 3007 vrele(ndp->ni_dvp); 3008 ndp->ni_dvp = NULL; 3009 } 3010 if (!(flags & NDF_NO_VP_UNLOCK) && 3011 (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) 3012 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc); 3013 if (!(flags & NDF_NO_VP_RELE) && 3014 ndp->ni_vp) { 3015 vrele(ndp->ni_vp); 3016 ndp->ni_vp = NULL; 3017 } 3018 if (!(flags & NDF_NO_STARTDIR_RELE) && 3019 (ndp->ni_cnd.cn_flags & SAVESTART)) { 3020 vrele(ndp->ni_startdir); 3021 ndp->ni_startdir = NULL; 3022 } 3023 } 3024 3025 /* 3026 * Common file system object access control check routine. Accepts a 3027 * vnode's type, "mode", uid and gid, requested access mode, credentials, 3028 * and optional call-by-reference privused argument allowing vaccess() 3029 * to indicate to the caller whether privilege was used to satisfy the 3030 * request. Returns 0 on success, or an errno on failure. 3031 */ 3032 int 3033 vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused) 3034 enum vtype type; 3035 mode_t file_mode; 3036 uid_t file_uid; 3037 gid_t file_gid; 3038 mode_t acc_mode; 3039 struct ucred *cred; 3040 int *privused; 3041 { 3042 mode_t dac_granted; 3043 #ifdef CAPABILITIES 3044 mode_t cap_granted; 3045 #endif 3046 3047 /* 3048 * Look for a normal, non-privileged way to access the file/directory 3049 * as requested. If it exists, go with that. 3050 */ 3051 3052 if (privused != NULL) 3053 *privused = 0; 3054 3055 dac_granted = 0; 3056 3057 /* Check the owner. */ 3058 if (cred->cr_uid == file_uid) { 3059 dac_granted |= VADMIN; 3060 if (file_mode & S_IXUSR) 3061 dac_granted |= VEXEC; 3062 if (file_mode & S_IRUSR) 3063 dac_granted |= VREAD; 3064 if (file_mode & S_IWUSR) 3065 dac_granted |= VWRITE; 3066 3067 if ((acc_mode & dac_granted) == acc_mode) 3068 return (0); 3069 3070 goto privcheck; 3071 } 3072 3073 /* Otherwise, check the groups (first match) */ 3074 if (groupmember(file_gid, cred)) { 3075 if (file_mode & S_IXGRP) 3076 dac_granted |= VEXEC; 3077 if (file_mode & S_IRGRP) 3078 dac_granted |= VREAD; 3079 if (file_mode & S_IWGRP) 3080 dac_granted |= VWRITE; 3081 3082 if ((acc_mode & dac_granted) == acc_mode) 3083 return (0); 3084 3085 goto privcheck; 3086 } 3087 3088 /* Otherwise, check everyone else. */ 3089 if (file_mode & S_IXOTH) 3090 dac_granted |= VEXEC; 3091 if (file_mode & S_IROTH) 3092 dac_granted |= VREAD; 3093 if (file_mode & S_IWOTH) 3094 dac_granted |= VWRITE; 3095 if ((acc_mode & dac_granted) == acc_mode) 3096 return (0); 3097 3098 privcheck: 3099 if (!suser_xxx(cred, NULL, PRISON_ROOT)) { 3100 /* XXX audit: privilege used */ 3101 if (privused != NULL) 3102 *privused = 1; 3103 return (0); 3104 } 3105 3106 #ifdef CAPABILITIES 3107 /* 3108 * Build a capability mask to determine if the set of capabilities 3109 * satisfies the requirements when combined with the granted mask 3110 * from above. 3111 * For each capability, if the capability is required, bitwise 3112 * or the request type onto the cap_granted mask. 
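 *
 * Concretely, the checks below map VEXEC to CAP_DAC_EXECUTE, VREAD to
 * CAP_DAC_READ_SEARCH, VWRITE to CAP_DAC_WRITE and VADMIN to CAP_FOWNER,
 * and a capability is only consulted for bits that were not already
 * granted by the owner/group/other checks above.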
3113 */ 3114 cap_granted = 0; 3115 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3116 !cap_check_xxx(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) 3117 cap_granted |= VEXEC; 3118 3119 if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && 3120 !cap_check_xxx(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) 3121 cap_granted |= VREAD; 3122 3123 if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && 3124 !cap_check_xxx(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) 3125 cap_granted |= VWRITE; 3126 3127 if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && 3128 !cap_check_xxx(cred, NULL, CAP_FOWNER, PRISON_ROOT)) 3129 cap_granted |= VADMIN; 3130 3131 if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) { 3132 /* XXX audit: privilege used */ 3133 if (privused != NULL) 3134 *privused = 1; 3135 return (0); 3136 } 3137 #endif 3138 3139 return (EACCES); 3140 } 3141
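/*
 * A minimal sketch of how a filesystem's VOP_ACCESS routine might funnel
 * into vaccess() (hypothetical filesystem and inode layout, shown for
 * illustration only):
 *
 *	static int
 *	examplefs_access(ap)
 *		struct vop_access_args *ap;
 *	{
 *		struct vnode *vp = ap->a_vp;
 *		struct exampleinode *ip = vp->v_data;
 *
 *		return (vaccess(vp->v_type, ip->i_mode, ip->i_uid,
 *		    ip->i_gid, ap->a_mode, ap->a_cred, NULL));
 *	}
 */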