/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 */

/*
 * External virtual filesystem routines
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/extattr.h>
#include <sys/fcntl.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/reboot.h>
#include <sys/sleepqueue.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/stdarg.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#include <vm/uma.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

static void	delmntque(struct vnode *vp);
static void	insmntque(struct vnode *vp, struct mount *mp);
static void	vlruvp(struct vnode *vp);
static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
		    int slpflag, int slptimeo);
static void	syncer_shutdown(void *arg, int howto);
static int	vtryrecycle(struct vnode *vp);
static void	vbusy(struct vnode *vp);
static void	vdropl(struct vnode *vp);
static void	vinactive(struct vnode *, struct thread *);
static void	v_incr_usecount(struct vnode *, int);
static void	vfree(struct vnode *);

/*
 * Enable Giant pushdown based on whether or not the vm is mpsafe in this
 * build.  Without mpsafevm the buffer cache can not run Giant free.
 */
#if defined(__alpha__) || defined(__amd64__) || defined(__i386__)
int mpsafe_vfs = 1;
#else
int mpsafe_vfs;
#endif
TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs);
SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0,
    "MPSAFE VFS");

/*
 * Number of vnodes in existence.  Increased whenever getnewvnode()
 * allocates a new vnode, never decreased.
 */
static unsigned long	numvnodes;

SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

/*
 * List of vnodes that are ready for recycling.
 */
static TAILQ_HEAD(freelst, vnode) vnode_free_list;

/*
 * Minimum number of free vnodes.  If there are fewer free vnodes than
 * this, getnewvnode() will return a newly allocated vnode.
 */
static u_long wantfreevnodes = 25;
SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
/* Number of vnodes in the free list. */
static u_long freevnodes;
SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");

/*
 * Various variables used for debugging the new implementation of
 * reassignbuf().
 * XXX these are probably of (very) limited utility now.
 */
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");

/*
 * Cache for the mount type id assigned to NFS.  This is used for
 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
 */
int	nfs_mount_type = -1;

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/*
 * Lock for any access to the following:
 *	vnode_free_list
 *	numvnodes
 *	freevnodes
 */
static struct mtx vnode_free_list_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
static uma_zone_t vnodepoll_zone;

/* Set to 1 to print out reclaim of active vnodes */
int	prtactive;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, metadata updates are
 * delayed only about half the time that file data is delayed.
 * Similarly, directory updates are more critical, so they are delayed
 * only about a third of the time that file data is delayed.  Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syncer process).  The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
static int syncer_delayno;
static long syncer_mask;
LIST_HEAD(synclist, bufobj);
static struct synclist *syncer_workitem_pending;
/*
 * The sync_mtx protects:
 *	bo->bo_synclist
 *	sync_vnode_count
 *	syncer_delayno
 *	syncer_state
 *	syncer_workitem_pending
 *	syncer_worklist_len
 *	rushjob
 */
static struct mtx sync_mtx;

#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
static int syncdelay = 30;		/* max time to delay syncing data */
static int filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
static int dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
static int metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
static int rushjob;			/* number of slots to run ASAP */
static int stat_rush_requests;		/* number of times I/O sped up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");

/*
 * When shutting down the syncer, run it at four times normal speed.
 */
#define SYNCER_SHUTDOWN_SPEEDUP		4
static int sync_vnode_count;
static int syncer_worklist_len;
static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
    syncer_state;

/*
 * Number of vnodes we want to exist at any one time.  This is mostly used
 * to size hash tables in vnode-related code.  It is normally not used in
 * getnewvnode(), as wantfreevnodes is normally nonzero.
 *
 * XXX desiredvnodes is historical cruft and should not exist.
 */
int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");
static int minvnodes;
SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
    &minvnodes, 0, "Minimum number of vnodes");
static int vnlru_nowhere;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");

/* Hook for calling soft updates. */
int (*softdep_process_worklist_hook)(struct mount *);

/*
 * Macros to control when a vnode is freed and recycled.  All require
 * the vnode interlock.
 */
#define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)


/*
 * Initialize the vnode management data structures.
 */
#ifndef	MAXVNODES_MAX
#define	MAXVNODES_MAX	100000
#endif
static void
vntblinit(void *dummy __unused)
{

	/*
	 * Desiredvnodes is a function of the physical memory size and
	 * the kernel's heap size.  Specifically, desiredvnodes scales
	 * in proportion to the physical memory size until two fifths
	 * of the kernel's heap size is consumed by vnodes and vm
	 * objects.
	 */
	desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
	    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
	if (desiredvnodes > MAXVNODES_MAX) {
		if (bootverbose)
			printf("Reducing kern.maxvnodes %d -> %d\n",
			    desiredvnodes, MAXVNODES_MAX);
		desiredvnodes = MAXVNODES_MAX;
	}
	minvnodes = desiredvnodes / 4;
	mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF);
	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
	TAILQ_INIT(&vnode_free_list);
	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)


/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp, td)
	struct mount *mp;
	int flags;
	struct mtx *interlkp;
	struct thread *td;
{
	int lkflags;

	MNT_ILOCK(mp);
	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & LK_NOWAIT) {
			MNT_IUNLOCK(mp);
			return (ENOENT);
		}
		if (interlkp)
			mtx_unlock(interlkp);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		msleep(mp, MNT_MTX(mp), PVFS|PDROP, "vfs_busy", 0);
		if (interlkp)
			mtx_lock(interlkp);
		return (ENOENT);
	}
	if (interlkp)
		mtx_unlock(interlkp);
	lkflags = LK_SHARED | LK_NOPAUSE | LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp), td))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mp, td)
	struct mount *mp;
	struct thread *td;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	struct mount *mp;

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			mtx_unlock(&mountlist_mtx);
			return (mp);
		}
	}
	mtx_unlock(&mountlist_mtx);
	return ((struct mount *) 0);
}

/*
 * Check if a user can access privileged mount options.
 */
int
vfs_suser(struct mount *mp, struct thread *td)
{
	int error;

	if ((mp->mnt_flag & MNT_USER) == 0 ||
	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
		if ((error = suser(td)) != 0)
			return (error);
	}
	return (0);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
 * support 16-bit device numbers.  We end up with unique val[0]'s for the
 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_int16_t mntid_base;
	fsid_t tfsid;
	int mtype;

	mtx_lock(&mntid_mtx);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makedev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if (vfs_getvfs(&tfsid) == NULL)
			break;
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	mtx_unlock(&mntid_mtx);
}

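/*
 * Worked example of the fsid layout above (illustrative only; the
 * values are hypothetical): for a filesystem type number of 5 and a
 * mntid_base of 0x1234, the minor number handed to makedev() is
 *
 *	(5 << 24) | ((0x1234 & 0xFF00) << 8) | (0x1234 & 0xFF) = 0x05120034
 *
 * which leaves bits 8-15 clear for the major number (255).  Only the
 * low byte of mntid_base lands in the low 16 bits of val[0], which is
 * why val[0] stays unique mod 2^16 for just the first 2^8 calls, while
 * the full 32-bit val[0] is unique for the first 2^16 calls per type.
 */
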
/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(tsp)
	struct timespec *tsp;
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * This routine is called when we have too many vnodes.  It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).  Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * The buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 */
static int
vlrureclaim(struct mount *mp)
{
	struct vnode *vp;
	int done;
	int trigger;
	int usevnodes;
	int count;

	/*
	 * Calculate the trigger point, don't allow user
	 * screwups to blow us up.  This prevents us from
	 * recycling vnodes with lots of resident pages.  We
	 * aren't trying to free memory, we are trying to
	 * free vnodes.
	 */
	usevnodes = desiredvnodes;
	if (usevnodes <= 0)
		usevnodes = 1;
	trigger = cnt.v_page_count * 2 / usevnodes;

	done = 0;
	vn_start_write(NULL, &mp, V_WAIT);
	MNT_ILOCK(mp);
	count = mp->mnt_nvnodelistsize / 10 + 1;
	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);

		if (vp->v_type != VNON &&
		    vp->v_type != VBAD &&
		    VI_TRYLOCK(vp)) {
			/* critical path opt */
			if (LIST_EMPTY(&(vp)->v_cache_src) &&
			    !(vp)->v_usecount &&
			    (vp->v_object == NULL ||
			    vp->v_object->resident_page_count < trigger)) {
				struct thread *td;

				td = curthread;
				MNT_IUNLOCK(mp);
				VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE, td);
				if ((vp->v_iflag & VI_DOOMED) == 0)
					vgone(vp);
				VOP_UNLOCK(vp, 0, td);
				done++;
				MNT_ILOCK(mp);
			} else
				VI_UNLOCK(vp);
		}
		--count;
	}
	MNT_IUNLOCK(mp);
	vn_finished_write(mp);
	return done;
}

/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrureclaim() from the bowels of filesystem code has some
 * interesting deadlock problems.
 */
static struct proc *vnlruproc;
static int vnlruproc_sig;

static void
vnlru_proc(void)
{
	struct mount *mp, *nmp;
	int done;
	struct proc *p = vnlruproc;
	struct thread *td = FIRST_THREAD_IN_PROC(p);

	mtx_lock(&Giant);

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
	    SHUTDOWN_PRI_FIRST);

	for (;;) {
		kthread_suspend_check(p);
		mtx_lock(&vnode_free_list_mtx);
		if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
			vnlruproc_sig = 0;
			wakeup(&vnlruproc_sig);
			msleep(vnlruproc, &vnode_free_list_mtx,
			    PVFS|PDROP, "vlruwt", hz);
			continue;
		}
		mtx_unlock(&vnode_free_list_mtx);
		done = 0;
		mtx_lock(&mountlist_mtx);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
			done += vlrureclaim(mp);
			mtx_lock(&mountlist_mtx);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp, td);
		}
		mtx_unlock(&mountlist_mtx);
		if (done == 0) {
#if 0
			/* These messages are temporary debugging aids */
			if (vnlru_nowhere < 5)
				printf("vnlru process getting nowhere..\n");
			else if (vnlru_nowhere == 5)
				printf("vnlru process messages stopped.\n");
#endif
			vnlru_nowhere++;
			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
		}
	}
}

static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)

/*
 * Routines having to do with the management of the vnode table.
 */

/*
 * Check to see if a free vnode can be recycled.  If it can,
 * recycle it and return it with the vnode interlock held.
 */
static int
vtryrecycle(struct vnode *vp)
{
	struct thread *td = curthread;
	struct mount *vnmp;
	int error;

	ASSERT_VI_LOCKED(vp, "vtryrecycle");
	error = 0;
	/*
	 * This vnode may be found and locked via some other list, if so we
	 * can't recycle it yet.
	 */
	if (VOP_LOCK(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
		return (EWOULDBLOCK);
	/*
	 * Don't recycle if its filesystem is being suspended.
	 */
	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
		VOP_UNLOCK(vp, 0, td);
		return (EBUSY);
	}
	/*
	 * If we got this far, we need to acquire the interlock and see if
	 * anyone picked up this vnode from another list.  If not, we will
	 * mark it with DOOMED via vgonel() so that anyone who does find it
	 * will skip over it.
	 */
	VI_LOCK(vp);
	if (!VCANRECYCLE(vp)) {
		VI_UNLOCK(vp);
		error = EBUSY;
		goto done;
	}
	mtx_lock(&vnode_free_list_mtx);
	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	vp->v_iflag &= ~VI_FREE;
	mtx_unlock(&vnode_free_list_mtx);
	if ((vp->v_iflag & VI_DOOMED) == 0) {
		vp->v_iflag |= VI_DOOMED;
		vgonel(vp, td);
		VI_LOCK(vp);
	}
	/*
	 * If someone ref'd the vnode while we were cleaning, we have to
	 * free it once the last ref is dropped.
	 */
	if (vp->v_holdcnt)
		error = EBUSY;
	VI_UNLOCK(vp);
done:
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(vnmp);
	return (error);
}

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	const char *tag;
	struct mount *mp;
	struct vop_vector *vops;
	struct vnode **vpp;
{
	struct vnode *vp = NULL;
	struct vpollinfo *pollinfo = NULL;
	struct bufobj *bo;

	mtx_lock(&vnode_free_list_mtx);

	/*
	 * Try to reuse vnodes if we hit the max.  This situation only
	 * occurs in certain large-memory (2G+) situations.  We cannot
	 * attempt to directly reclaim vnodes due to nasty recursion
	 * problems.
	 */
	while (numvnodes - freevnodes > desiredvnodes) {
		if (vnlruproc_sig == 0) {
			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
			wakeup(vnlruproc);
		}
		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
		    "vlruwk", hz);
	}

	/*
	 * Attempt to reuse a vnode already on the free list, allocating
	 * a new vnode if we can't find one or if we have not reached a
	 * good minimum for good LRU performance.
	 */

	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
		int error;
		int count;

		for (count = 0; count < freevnodes; vp = NULL, count++) {
			vp = TAILQ_FIRST(&vnode_free_list);
			/*
			 * The list can be modified while the free_list_mtx
			 * has been dropped and vp could be NULL here.
			 */
			if (!vp)
				break;
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			/*
			 * Don't recycle if we can't get the interlock.
			 */
			if (!VI_TRYLOCK(vp))
				continue;
			if (!VCANRECYCLE(vp)) {
				VI_UNLOCK(vp);
				continue;
			}
			mtx_unlock(&vnode_free_list_mtx);
			error = vtryrecycle(vp);
			mtx_lock(&vnode_free_list_mtx);
			if (error == 0)
				break;
		}
	}
	if (vp) {
		freevnodes--;
		bo = &vp->v_bufobj;
		mtx_unlock(&vnode_free_list_mtx);

#ifdef INVARIANTS
		{
			if (vp->v_data)
				printf("cleaned vnode isn't, "
				    "address %p, inode %p\n",
				    vp, vp->v_data);
			if (bo->bo_numoutput)
				panic("%p: Clean vnode has pending I/O's", vp);
			if (vp->v_usecount != 0)
				panic("%p: Non-zero use count", vp);
			if (vp->v_writecount != 0)
				panic("%p: Non-zero write count", vp);
		}
#endif
		if ((pollinfo = vp->v_pollinfo) != NULL) {
			/*
			 * To avoid lock order reversals, the call to
			 * uma_zfree() must be delayed until the vnode
			 * interlock is released.
			 */
			vp->v_pollinfo = NULL;
		}
#ifdef MAC
		mac_destroy_vnode(vp);
#endif
		vp->v_iflag = 0;
		vp->v_vflag = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		bzero(&vp->v_un, sizeof vp->v_un);
		lockdestroy(vp->v_vnlock);
		lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
		VNASSERT(bo->bo_clean.bv_cnt == 0, vp,
		    ("cleanbufcnt not 0"));
		VNASSERT(bo->bo_clean.bv_root == NULL, vp,
		    ("cleanblkroot not NULL"));
		VNASSERT(bo->bo_dirty.bv_cnt == 0, vp,
		    ("dirtybufcnt not 0"));
		VNASSERT(bo->bo_dirty.bv_root == NULL, vp,
		    ("dirtyblkroot not NULL"));
	} else {
		numvnodes++;
		mtx_unlock(&vnode_free_list_mtx);

		vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
		mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
		vp->v_dd = vp;
		bo = &vp->v_bufobj;
		bo->__bo_vnode = vp;
		bo->bo_mtx = &vp->v_interlock;
		vp->v_vnlock = &vp->v_lock;
		lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
		cache_purge(vp);		/* Sets up v_id. */
		LIST_INIT(&vp->v_cache_src);
		TAILQ_INIT(&vp->v_cache_dst);
	}

	TAILQ_INIT(&bo->bo_clean.bv_hd);
	TAILQ_INIT(&bo->bo_dirty.bv_hd);
	bo->bo_ops = &buf_ops_bio;
	bo->bo_private = vp;
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	*vpp = vp;
	v_incr_usecount(vp, 1);
	vp->v_data = 0;
	if (pollinfo != NULL) {
		knlist_destroy(&pollinfo->vpi_selinfo.si_note);
		mtx_destroy(&pollinfo->vpi_lock);
		uma_zfree(vnodepoll_zone, pollinfo);
	}
#ifdef MAC
	mac_init_vnode(vp);
	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
		mac_associate_vnode_singlelabel(mp, vp);
	else if (mp == NULL)
		printf("NULL mp in getnewvnode()\n");
#endif
	delmntque(vp);
	if (mp != NULL) {
		insmntque(vp, mp);
		bo->bo_bsize = mp->mnt_stat.f_iosize;
	}

	return (0);
}

/*
 * Delete from old mount point vnode list, if on one.
 */
static void
delmntque(struct vnode *vp)
{
	struct mount *mp;

	if (vp->v_mount == NULL)
		return;
	mp = vp->v_mount;
	MNT_ILOCK(mp);
	vp->v_mount = NULL;
	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
	    ("bad mount point vnode list size"));
	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	mp->mnt_nvnodelistsize--;
	MNT_IUNLOCK(mp);
}

/*
 * Insert into list of vnodes for the new mount point, if available.
 */
static void
insmntque(struct vnode *vp, struct mount *mp)
{

	vp->v_mount = mp;
	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
	MNT_ILOCK(vp->v_mount);
	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	mp->mnt_nvnodelistsize++;
	MNT_IUNLOCK(vp->v_mount);
}

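/*
 * Sketch of how a filesystem allocation path is expected to use
 * getnewvnode() above (illustrative only; "foofs", foofs_vnodeops,
 * struct foonode and M_FOOFSNODE are hypothetical names, not defined
 * in this file):
 *
 *	struct vnode *vp;
 *	struct foonode *ip;
 *	int error;
 *
 *	error = getnewvnode("foofs", mp, &foofs_vnodeops, &vp);
 *	if (error)
 *		return (error);
 *	ip = malloc(sizeof(*ip), M_FOOFSNODE, M_WAITOK | M_ZERO);
 *	vp->v_data = ip;
 *	ip->i_vnode = vp;
 *	vp->v_type = VREG;	(set from on-disk metadata)
 *
 * getnewvnode() has already placed vp on mp's mnt_nvnodelist via
 * insmntque() when mp is non-NULL, so the caller only fills in v_data,
 * v_type and any fs-private state.
 */
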
/*
 * Flush out and invalidate all buffers associated with a bufobj
 * Called with the underlying object locked.
 */
int
bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag, int slptimeo)
{
	int error;

	BO_LOCK(bo);
	if (flags & V_SAVE) {
		error = bufobj_wwait(bo, slpflag, slptimeo);
		if (error) {
			BO_UNLOCK(bo);
			return (error);
		}
		if (bo->bo_dirty.bv_cnt > 0) {
			BO_UNLOCK(bo);
			if ((error = BO_SYNC(bo, MNT_WAIT, td)) != 0)
				return (error);
			/*
			 * XXX We could save a lock/unlock if this was only
			 * enabled under INVARIANTS
			 */
			BO_LOCK(bo);
			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
				panic("vinvalbuf: dirty bufs");
		}
	}
	/*
	 * If you alter this loop please notice that interlock is dropped and
	 * reacquired in flushbuflist.  Special care is needed to ensure that
	 * no race conditions occur from this.
	 */
	do {
		error = flushbuflist(&bo->bo_clean,
		    flags, bo, slpflag, slptimeo);
		if (error == 0)
			error = flushbuflist(&bo->bo_dirty,
			    flags, bo, slpflag, slptimeo);
		if (error != 0 && error != EAGAIN) {
			BO_UNLOCK(bo);
			return (error);
		}
	} while (error != 0);

	/*
	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
	 * have write I/O in-progress but if there is a VM object then the
	 * VM object can also have read-I/O in-progress.
	 */
	do {
		bufobj_wwait(bo, 0, 0);
		BO_UNLOCK(bo);
		if (bo->bo_object != NULL) {
			VM_OBJECT_LOCK(bo->bo_object);
			vm_object_pip_wait(bo->bo_object, "bovlbx");
			VM_OBJECT_UNLOCK(bo->bo_object);
		}
		BO_LOCK(bo);
	} while (bo->bo_numoutput > 0);
	BO_UNLOCK(bo);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	if (bo->bo_object != NULL) {
		VM_OBJECT_LOCK(bo->bo_object);
		vm_object_page_remove(bo->bo_object, 0, 0,
		    (flags & V_SAVE) ? TRUE : FALSE);
		VM_OBJECT_UNLOCK(bo->bo_object);
	}

#ifdef INVARIANTS
	BO_LOCK(bo);
	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
		panic("vinvalbuf: flush failed");
	BO_UNLOCK(bo);
#endif
	return (0);
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag, int slptimeo)
{

	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
	return (bufobj_invalbuf(&vp->v_bufobj, flags, td, slpflag, slptimeo));
}

/*
 * Flush out buffers on the specified list.
 *
 */
static int
flushbuflist(bufv, flags, bo, slpflag, slptimeo)
	struct bufv *bufv;
	int flags;
	struct bufobj *bo;
	int slpflag, slptimeo;
{
	struct buf *bp, *nbp;
	int retval, error;

	ASSERT_BO_LOCKED(bo);

	retval = 0;
	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
			continue;
		}
		retval = EAGAIN;
		error = BUF_TIMELOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
		    "flushbuf", slpflag, slptimeo);
		if (error) {
			BO_LOCK(bo);
			return (error != ENOLCK ? error : EAGAIN);
		}
		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
			BO_LOCK(bo);
			return (EAGAIN);
		}
		/*
		 * XXX Since there are no node locks for NFS, I
		 * believe there is a slight chance that a delayed
		 * write will occur while sleeping just above, so
		 * check for it.
		 */
		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
		    (flags & V_SAVE)) {
			bremfree(bp);
			bp->b_flags |= B_ASYNC;
			bwrite(bp);
			BO_LOCK(bo);
			return (EAGAIN);	/* XXX: why not loop ? */
		}
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
		bp->b_flags &= ~B_ASYNC;
		brelse(bp);
		BO_LOCK(bo);
	}
	return (retval);
}

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 */
int
vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize)
{
	struct buf *bp, *nbp;
	int anyfreed;
	int trunclbn;
	struct bufobj *bo;

	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
restart:
	VI_LOCK(vp);
	bo = &vp->v_bufobj;
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno < trunclbn)
				continue;
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    VI_MTX(vp)) == ENOLCK)
				goto restart;

			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
			anyfreed = 1;

			if (nbp != NULL &&
			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
			    (nbp->b_vp != vp) ||
			    (nbp->b_flags & B_DELWRI))) {
				goto restart;
			}
			VI_LOCK(vp);
		}

		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno < trunclbn)
				continue;
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    VI_MTX(vp)) == ENOLCK)
				goto restart;
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
			anyfreed = 1;
			if (nbp != NULL &&
			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
			    (nbp->b_vp != vp) ||
			    (nbp->b_flags & B_DELWRI) == 0)) {
				goto restart;
			}
			VI_LOCK(vp);
		}
	}

	if (length > 0) {
restartsync:
		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno > 0)
				continue;
			/*
			 * Since we hold the vnode lock this should only
			 * fail if we're racing with the buf daemon.
			 */
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    VI_MTX(vp)) == ENOLCK) {
				goto restart;
			}
			VNASSERT((bp->b_flags & B_DELWRI), vp,
			    ("buf(%p) on dirty queue without DELWRI", bp));

			bremfree(bp);
			bawrite(bp);
			VI_LOCK(vp);
			goto restartsync;
		}
	}

	bufobj_wwait(bo, 0, 0);
	VI_UNLOCK(vp);
	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * buf_splay() - splay tree core for the clean/dirty list of buffers in
 *		 a vnode.
 *
 * NOTE: We have to deal with the special case of a background bitmap
 * buffer, a situation where two buffers will have the same logical
 * block offset.  We want (1) only the foreground buffer to be accessed
 * in a lookup and (2) we must differentiate between the foreground and
 * background buffer in the splay tree algorithm because the splay
 * tree cannot normally handle multiple entities with the same 'index'.
 * We accomplish this by adding differentiating flags to the splay tree's
 * numerical domain.
 */
static
struct buf *
buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
{
	struct buf dummy;
	struct buf *lefttreemax, *righttreemin, *y;

	if (root == NULL)
		return (NULL);
	lefttreemax = righttreemin = &dummy;
	for (;;) {
		if (lblkno < root->b_lblkno ||
		    (lblkno == root->b_lblkno &&
		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
			if ((y = root->b_left) == NULL)
				break;
			if (lblkno < y->b_lblkno) {
				/* Rotate right. */
				root->b_left = y->b_right;
				y->b_right = root;
				root = y;
				if ((y = root->b_left) == NULL)
					break;
			}
			/* Link into the new root's right tree. */
			righttreemin->b_left = root;
			righttreemin = root;
		} else if (lblkno > root->b_lblkno ||
		    (lblkno == root->b_lblkno &&
		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
			if ((y = root->b_right) == NULL)
				break;
			if (lblkno > y->b_lblkno) {
				/* Rotate left. */
				root->b_right = y->b_left;
				y->b_left = root;
				root = y;
				if ((y = root->b_right) == NULL)
					break;
			}
			/* Link into the new root's left tree. */
			lefttreemax->b_right = root;
			lefttreemax = root;
		} else {
			break;
		}
		root = y;
	}
	/* Assemble the new root. */
	lefttreemax->b_right = root->b_left;
	righttreemin->b_left = root->b_right;
	root->b_left = dummy.b_right;
	root->b_right = dummy.b_left;
	return (root);
}

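/*
 * Illustration of the comparisons above (descriptive only, no new
 * behavior): buf_splay() effectively sorts on the pair
 * (b_lblkno, BX_BKGRDMARKER), compared lexicographically.  Two buffers
 * at the same logical block, one a foreground buffer and one a
 * background bitmap marker, therefore occupy distinct keys:
 *
 *	(lblkno, 0)			foreground buffer, sorts first
 *	(lblkno, BX_BKGRDMARKER)	background marker, sorts second
 *
 * Lookups such as gbincore() pass an xflags of 0 so that they land on
 * the foreground buffer.
 */
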
static void
buf_vlist_remove(struct buf *bp)
{
	struct buf *root;
	struct bufv *bv;

	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
	ASSERT_BO_LOCKED(bp->b_bufobj);
	if (bp->b_xflags & BX_VNDIRTY)
		bv = &bp->b_bufobj->bo_dirty;
	else
		bv = &bp->b_bufobj->bo_clean;
	if (bp != bv->bv_root) {
		root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
		KASSERT(root == bp, ("splay lookup failed in remove"));
	}
	if (bp->b_left == NULL) {
		root = bp->b_right;
	} else {
		root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
		root->b_right = bp->b_right;
	}
	bv->bv_root = root;
	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
	bv->bv_cnt--;
	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
}

/*
 * Add the buffer to the sorted clean or dirty block list using a
 * splay tree algorithm.
 *
 * NOTE: xflags is passed as a constant, optimizing this inline function!
 */
static void
buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
{
	struct buf *root;
	struct bufv *bv;

	ASSERT_BO_LOCKED(bo);
	bp->b_xflags |= xflags;
	if (xflags & BX_VNDIRTY)
		bv = &bo->bo_dirty;
	else
		bv = &bo->bo_clean;

	root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
	if (root == NULL) {
		bp->b_left = NULL;
		bp->b_right = NULL;
		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
	} else if (bp->b_lblkno < root->b_lblkno ||
	    (bp->b_lblkno == root->b_lblkno &&
	    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
		bp->b_left = root->b_left;
		bp->b_right = root;
		root->b_left = NULL;
		TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
	} else {
		bp->b_right = root->b_right;
		bp->b_left = root;
		root->b_right = NULL;
		TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
	}
	bv->bv_cnt++;
	bv->bv_root = bp;
}

/*
 * Lookup a buffer using the splay tree.  Note that we specifically avoid
 * shadow buffers used in background bitmap writes.
 *
 * This code isn't quite as efficient as it could be because we are
 * maintaining two sorted lists and do not know which list the block
 * resides in.
 *
 * During a "make buildworld" the desired buffer is found at one of
 * the roots more than 60% of the time.  Thus, checking both roots
 * before performing either splay eliminates unnecessary splays on the
 * first tree splayed.
 */
struct buf *
gbincore(struct bufobj *bo, daddr_t lblkno)
{
	struct buf *bp;

	ASSERT_BO_LOCKED(bo);
	if ((bp = bo->bo_clean.bv_root) != NULL &&
	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
		return (bp);
	if ((bp = bo->bo_dirty.bv_root) != NULL &&
	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
		return (bp);
	if ((bp = bo->bo_clean.bv_root) != NULL) {
		bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
			return (bp);
	}
	if ((bp = bo->bo_dirty.bv_root) != NULL) {
		bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
			return (bp);
	}
	return (NULL);
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(struct vnode *vp, struct buf *bp)
{

	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));

	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
	    ("bgetvp: bp already attached! %p", bp));

	ASSERT_VI_LOCKED(vp, "bgetvp");
	vholdl(vp);
	bp->b_vp = vp;
	bp->b_bufobj = &vp->v_bufobj;
	/*
	 * Insert onto list for new vnode.
	 */
	buf_vlist_add(bp, &vp->v_bufobj, BX_VNCLEAN);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(struct buf *bp)
{
	struct bufobj *bo;
	struct vnode *vp;

	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;		/* XXX */
	bo = bp->b_bufobj;
	BO_LOCK(bo);
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
		buf_vlist_remove(bp);
	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
		bo->bo_flag &= ~BO_ONWORKLST;
		mtx_lock(&sync_mtx);
		LIST_REMOVE(bo, bo_synclist);
		syncer_worklist_len--;
		mtx_unlock(&sync_mtx);
	}
	vdropl(vp);
	bp->b_vp = NULL;
	bp->b_bufobj = NULL;
	BO_UNLOCK(bo);
}

/*
 * Add an item to the syncer work queue.
 */
static void
vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
{
	int slot;

	ASSERT_BO_LOCKED(bo);

	mtx_lock(&sync_mtx);
	if (bo->bo_flag & BO_ONWORKLST)
		LIST_REMOVE(bo, bo_synclist);
	else {
		bo->bo_flag |= BO_ONWORKLST;
		syncer_worklist_len++;
	}

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
	mtx_unlock(&sync_mtx);
}

static int
sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
{
	int error, len;

	mtx_lock(&sync_mtx);
	len = syncer_worklist_len - sync_vnode_count;
	mtx_unlock(&sync_mtx);
	error = SYSCTL_OUT(req, &len, sizeof(len));
	return (error);
}

SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
    sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");

struct proc *updateproc;
static void sched_sync(void);
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

static int
sync_vnode(struct bufobj *bo, struct thread *td)
{
	struct vnode *vp;
	struct mount *mp;

	vp = bo->__bo_vnode;	/* XXX */
	if (VOP_ISLOCKED(vp, NULL) != 0)
		return (1);
	if (VI_TRYLOCK(vp) == 0)
		return (1);
	/*
	 * We use vhold in case the vnode does not
	 * successfully sync.  vhold prevents the vnode from
	 * going away when we unlock the sync_mtx so that
	 * we can acquire the vnode interlock.
	 */
	vholdl(vp);
	mtx_unlock(&sync_mtx);
	VI_UNLOCK(vp);
	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
		vdrop(vp);
		mtx_lock(&sync_mtx);
		return (1);
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	(void) VOP_FSYNC(vp, MNT_LAZY, td);
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(mp);
	VI_LOCK(vp);
	if ((bo->bo_flag & BO_ONWORKLST) != 0) {
		/*
		 * Put us back on the worklist.  The worklist
		 * routine will remove us from our current
		 * position and then add us back in at a later
		 * position.
		 */
		vn_syncer_add_to_worklist(bo, syncdelay);
	}
	vdropl(vp);
	VI_UNLOCK(vp);
	mtx_lock(&sync_mtx);
	return (0);
}

/*
 * System filesystem synchronizer daemon.
 */
static void
sched_sync(void)
{
	struct synclist *next;
	struct synclist *slp;
	struct bufobj *bo;
	long starttime;
	struct thread *td = FIRST_THREAD_IN_PROC(updateproc);
	static int dummychan;
	int last_work_seen;
	int net_worklist_len;
	int syncer_final_iter;
	int first_printf;
	int error;

	mtx_lock(&Giant);
	last_work_seen = 0;
	syncer_final_iter = 0;
	first_printf = 1;
	syncer_state = SYNCER_RUNNING;
	starttime = time_second;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
	    SHUTDOWN_PRI_LAST);

	for (;;) {
		mtx_lock(&sync_mtx);
		if (syncer_state == SYNCER_FINAL_DELAY &&
		    syncer_final_iter == 0) {
			mtx_unlock(&sync_mtx);
			kthread_suspend_check(td->td_proc);
			mtx_lock(&sync_mtx);
		}
		net_worklist_len = syncer_worklist_len - sync_vnode_count;
		if (syncer_state != SYNCER_RUNNING &&
		    starttime != time_second) {
			if (first_printf) {
				printf("\nSyncing disks, vnodes remaining...");
				first_printf = 0;
			}
			printf("%d ", net_worklist_len);
		}
		starttime = time_second;

		/*
		 * Push files whose dirty time has expired.  Be careful
		 * of interrupt race on slp queue.
		 *
		 * Skip over empty worklist slots when shutting down.
		 */
		do {
			slp = &syncer_workitem_pending[syncer_delayno];
			syncer_delayno += 1;
			if (syncer_delayno == syncer_maxdelay)
				syncer_delayno = 0;
			next = &syncer_workitem_pending[syncer_delayno];
			/*
			 * If the worklist has wrapped since it
			 * was emptied of all but syncer vnodes,
			 * switch to the FINAL_DELAY state and run
			 * for one more second.
			 */
			if (syncer_state == SYNCER_SHUTTING_DOWN &&
			    net_worklist_len == 0 &&
			    last_work_seen == syncer_delayno) {
				syncer_state = SYNCER_FINAL_DELAY;
				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
			}
		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
		    syncer_worklist_len > 0);

		/*
		 * Keep track of the last time there was anything
		 * on the worklist other than syncer vnodes.
		 * Return to the SHUTTING_DOWN state if any
		 * new work appears.
		 */
		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
			last_work_seen = syncer_delayno;
		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
			syncer_state = SYNCER_SHUTTING_DOWN;
		while ((bo = LIST_FIRST(slp)) != NULL) {
			error = sync_vnode(bo, td);
			if (error == 1) {
				LIST_REMOVE(bo, bo_synclist);
				LIST_INSERT_HEAD(next, bo, bo_synclist);
				continue;
			}
		}
		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
			syncer_final_iter--;
		mtx_unlock(&sync_mtx);

		/*
		 * Do soft update processing.
		 */
		if (softdep_process_worklist_hook != NULL)
			(*softdep_process_worklist_hook)(NULL);

		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process.  A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP.  Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		mtx_lock(&sync_mtx);
		if (rushjob > 0) {
			rushjob -= 1;
			mtx_unlock(&sync_mtx);
			continue;
		}
		mtx_unlock(&sync_mtx);
		/*
		 * Just sleep for a short period of time between
		 * iterations when shutting down to allow some I/O
		 * to happen.
		 *
		 * If it has taken us less than a second to process the
		 * current work, then wait.  Otherwise start right over
		 * again.  We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (syncer_state != SYNCER_RUNNING)
			tsleep(&dummychan, PPAUSE, "syncfnl",
			    hz / SYNCER_SHUTDOWN_SPEEDUP);
		else if (time_second == starttime)
			tsleep(&lbolt, PPAUSE, "syncer", 0);
	}
}

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 */
int
speedup_syncer()
{
	struct thread *td;
	int ret = 0;

	td = FIRST_THREAD_IN_PROC(updateproc);
	sleepq_remove(td, &lbolt);
	mtx_lock(&sync_mtx);
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		stat_rush_requests += 1;
		ret = 1;
	}
	mtx_unlock(&sync_mtx);
	return (ret);
}

/*
 * Tell the syncer to speed up its work and run through its work
 * list several times, then tell it to shut down.
 */
static void
syncer_shutdown(void *arg, int howto)
{
	struct thread *td;

	if (howto & RB_NOSYNC)
		return;
	td = FIRST_THREAD_IN_PROC(updateproc);
	sleepq_remove(td, &lbolt);
	mtx_lock(&sync_mtx);
	syncer_state = SYNCER_SHUTTING_DOWN;
	rushjob = 0;
	mtx_unlock(&sync_mtx);
	kproc_shutdown(arg, howto);
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(struct buf *bp)
{
	struct vnode *vp;
	struct bufobj *bo;
	int delay;

	vp = bp->b_vp;
	bo = bp->b_bufobj;
	++reassignbufcalls;

	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
	    bp, bp->b_vp, bp->b_flags);
	/*
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	 */
	if (bp->b_flags & B_PAGING)
		panic("cannot reassign paging buffer");

	/*
	 * Delete from old vnode list, if on one.
	 */
	VI_LOCK(vp);
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
		buf_vlist_remove(bp);
	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI) {
		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
			switch (vp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VCHR:
				delay = metadelay;
				break;
			default:
				delay = filedelay;
			}
			vn_syncer_add_to_worklist(bo, delay);
		}
		buf_vlist_add(bp, bo, BX_VNDIRTY);
	} else {
		buf_vlist_add(bp, bo, BX_VNCLEAN);

		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
			mtx_lock(&sync_mtx);
			LIST_REMOVE(bo, bo_synclist);
			syncer_worklist_len--;
			mtx_unlock(&sync_mtx);
			bo->bo_flag &= ~BO_ONWORKLST;
		}
	}
	VI_UNLOCK(vp);
}

static void
v_incr_usecount(struct vnode *vp, int delta)
{

	vp->v_usecount += delta;
	vp->v_holdcnt += delta;
	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
		dev_lock();
		vp->v_rdev->si_usecount += delta;
		dev_unlock();
	}
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it.  The vnode lock bit is set if the
 * vnode is being eliminated in vgone.  The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new filesystem type).
 */
int
vget(vp, flags, td)
	struct vnode *vp;
	int flags;
	struct thread *td;
{
	int error;

	error = 0;
	if ((flags & LK_INTERLOCK) == 0)
		VI_LOCK(vp);
	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking that
	 * the VI_DOOMED flag is set.
	 */
	if (vp->v_iflag & VI_DOOMED && vp->v_vxthread != td &&
	    ((flags & LK_NOWAIT) || (flags & LK_TYPE_MASK) == 0)) {
		VI_UNLOCK(vp);
		return (EBUSY);
	}
	v_incr_usecount(vp, 1);
	if (VSHOULDBUSY(vp))
		vbusy(vp);
	if ((flags & LK_TYPE_MASK) == 0) {
		VI_UNLOCK(vp);
		return (0);
	}
	if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0)
		goto drop;
	if (vp->v_iflag & VI_DOOMED && vp->v_vxthread != td) {
		VOP_UNLOCK(vp, 0, td);
		error = ENOENT;
		goto drop;
	}
	return (0);

drop:
	/*
	 * must expand vrele here because we do not want
	 * to call VOP_INACTIVE if the reference count
	 * drops back to zero since it was never really
	 * active.  We must remove it from the free list
	 * before sleeping so that multiple processes do
	 * not try to recycle it.
	 */
	VI_LOCK(vp);
	v_incr_usecount(vp, -1);
	if (VSHOULDFREE(vp))
		vfree(vp);
	else
		vlruvp(vp);
	VI_UNLOCK(vp);
	return (error);
}

/*
 * Increase the reference count of a vnode.
 */
void
vref(struct vnode *vp)
{

	VI_LOCK(vp);
	v_incr_usecount(vp, 1);
	VI_UNLOCK(vp);
}

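/*
 * Typical vget() calling pattern (illustrative only; the surrounding
 * lookup code is hypothetical and not part of this file): a caller that
 * already holds the vnode interlock passes LK_INTERLOCK so that vget()
 * consumes that lock on every return path, e.g.
 *
 *	VI_LOCK(vp);
 *	error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td);
 *	if (error != 0)
 *		(vp was doomed or could not be locked; restart the lookup)
 *
 * Without LK_INTERLOCK, vget() acquires the interlock itself before
 * bumping the use count.
 */
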
/*
 * Return reference count of a vnode.
 *
 * The results of this call are only guaranteed when some mechanism other
 * than the VI lock is used to stop other processes from gaining references
 * to the vnode.  This may be the case if the caller holds the only reference.
 * This is also useful when stale data is acceptable as race conditions may
 * be accounted for by some other means.
 */
int
vrefcnt(struct vnode *vp)
{
	int usecnt;

	VI_LOCK(vp);
	usecnt = vp->v_usecount;
	VI_UNLOCK(vp);

	return (usecnt);
}


/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct thread *td = curthread;	/* XXX */

	KASSERT(vp != NULL, ("vrele: null vp"));

	VI_LOCK(vp);

	/* Skip this v_writecount check if we're going to panic below. */
	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
	    ("vrele: missed vn_close"));

	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
	    vp->v_usecount == 1)) {
		v_incr_usecount(vp, -1);
		VI_UNLOCK(vp);

		return;
	}
	if (vp->v_usecount != 1) {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
#endif
		VI_UNLOCK(vp);
		panic("vrele: negative ref cnt");
	}
	v_incr_usecount(vp, -1);
	/*
	 * We must call VOP_INACTIVE with the node locked.  Mark
	 * as VI_DOINGINACT to avoid recursion.
	 */
	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) {
		VI_LOCK(vp);
		vinactive(vp, td);
		VOP_UNLOCK(vp, 0, td);
	} else
		VI_LOCK(vp);
	if (VSHOULDFREE(vp))
		vfree(vp);
	else
		vlruvp(vp);
	VI_UNLOCK(vp);
}

/*
 * Release an already locked vnode.  This gives the same effect as
 * unlock+vrele(), but takes less time and avoids releasing and
 * re-acquiring the lock (as vrele() acquires the lock internally).
 */
void
vput(vp)
	struct vnode *vp;
{
	struct thread *td = curthread;	/* XXX */

	KASSERT(vp != NULL, ("vput: null vp"));
	VI_LOCK(vp);
	/* Skip this v_writecount check if we're going to panic below. */
	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
	    ("vput: missed vn_close"));

	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
	    vp->v_usecount == 1)) {
		v_incr_usecount(vp, -1);
		VOP_UNLOCK(vp, LK_INTERLOCK, td);
		return;
	}

	if (vp->v_usecount == 1) {
		v_incr_usecount(vp, -1);
		vinactive(vp, td);
		VOP_UNLOCK(vp, 0, td);
		if (VSHOULDFREE(vp))
			vfree(vp);
		else
			vlruvp(vp);
		VI_UNLOCK(vp);

	} else {
#ifdef DIAGNOSTIC
		vprint("vput: negative ref count", vp);
#endif
		panic("vput: negative ref cnt");
	}
}

/*
 * Somebody doesn't want the vnode recycled.
 */
void
vhold(struct vnode *vp)
{

	VI_LOCK(vp);
	vholdl(vp);
	VI_UNLOCK(vp);
}

void
vholdl(struct vnode *vp)
{

	vp->v_holdcnt++;
	if (VSHOULDBUSY(vp))
		vbusy(vp);
}

/*
 * Note that there is one less who cares about this vnode.  vdrop() is the
 * opposite of vhold().
 */
1970 */ 1971 void 1972 vdrop(struct vnode *vp) 1973 { 1974 1975 VI_LOCK(vp); 1976 vdropl(vp); 1977 VI_UNLOCK(vp); 1978 } 1979 1980 static void 1981 vdropl(struct vnode *vp) 1982 { 1983 1984 if (vp->v_holdcnt <= 0) 1985 panic("vdrop: holdcnt %d", vp->v_holdcnt); 1986 vp->v_holdcnt--; 1987 if (VSHOULDFREE(vp)) 1988 vfree(vp); 1989 else 1990 vlruvp(vp); 1991 } 1992 1993 static void 1994 vinactive(struct vnode *vp, struct thread *td) 1995 { 1996 ASSERT_VOP_LOCKED(vp, "vinactive"); 1997 ASSERT_VI_LOCKED(vp, "vinactive"); 1998 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, 1999 ("vinactive: recursed on VI_DOINGINACT")); 2000 vp->v_iflag |= VI_DOINGINACT; 2001 VI_UNLOCK(vp); 2002 VOP_INACTIVE(vp, td); 2003 VI_LOCK(vp); 2004 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, 2005 ("vinactive: lost VI_DOINGINACT")); 2006 vp->v_iflag &= ~VI_DOINGINACT; 2007 } 2008 2009 /* 2010 * Remove any vnodes in the vnode table belonging to mount point mp. 2011 * 2012 * If FORCECLOSE is not specified, there should not be any active ones, 2013 * return error if any are found (nb: this is a user error, not a 2014 * system error). If FORCECLOSE is specified, detach any active vnodes 2015 * that are found. 2016 * 2017 * If WRITECLOSE is set, only flush out regular file vnodes open for 2018 * writing. 2019 * 2020 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 2021 * 2022 * `rootrefs' specifies the base reference count for the root vnode 2023 * of this filesystem. The root vnode is considered busy if its 2024 * v_usecount exceeds this value. On a successful return, vflush(, td) 2025 * will call vrele() on the root vnode exactly rootrefs times. 2026 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 2027 * be zero. 2028 */ 2029 #ifdef DIAGNOSTIC 2030 static int busyprt = 0; /* print out busy vnodes */ 2031 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); 2032 #endif 2033 2034 int 2035 vflush(mp, rootrefs, flags, td) 2036 struct mount *mp; 2037 int rootrefs; 2038 int flags; 2039 struct thread *td; 2040 { 2041 struct vnode *vp, *nvp, *rootvp = NULL; 2042 struct vattr vattr; 2043 int busy = 0, error; 2044 2045 if (rootrefs > 0) { 2046 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 2047 ("vflush: bad args")); 2048 /* 2049 * Get the filesystem root vnode. We can vput() it 2050 * immediately, since with rootrefs > 0, it won't go away. 2051 */ 2052 if ((error = VFS_ROOT(mp, &rootvp, td)) != 0) 2053 return (error); 2054 vput(rootvp); 2055 2056 } 2057 MNT_ILOCK(mp); 2058 loop: 2059 MNT_VNODE_FOREACH(vp, mp, nvp) { 2060 2061 VI_LOCK(vp); 2062 MNT_IUNLOCK(mp); 2063 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td); 2064 if (error) { 2065 MNT_ILOCK(mp); 2066 goto loop; 2067 } 2068 /* 2069 * Skip over a vnodes marked VV_SYSTEM. 2070 */ 2071 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 2072 VOP_UNLOCK(vp, 0, td); 2073 MNT_ILOCK(mp); 2074 continue; 2075 } 2076 /* 2077 * If WRITECLOSE is set, flush out unlinked but still open 2078 * files (even if open only for reading) and regular file 2079 * vnodes open for writing. 
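 *
 * For example: a regular file removed while a process still holds it open
 * has va_nlink == 0, so the test below does not skip it even when
 * v_writecount is zero, while a still-linked file that is open read-only
 * is left alone.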
2080 */ 2081 if (flags & WRITECLOSE) { 2082 error = VOP_GETATTR(vp, &vattr, td->td_ucred, td); 2083 VI_LOCK(vp); 2084 2085 if ((vp->v_type == VNON || 2086 (error == 0 && vattr.va_nlink > 0)) && 2087 (vp->v_writecount == 0 || vp->v_type != VREG)) { 2088 VOP_UNLOCK(vp, LK_INTERLOCK, td); 2089 MNT_ILOCK(mp); 2090 continue; 2091 } 2092 } else 2093 VI_LOCK(vp); 2094 /* 2095 * With v_usecount == 0, all we need to do is clear out the 2096 * vnode data structures and we are done. 2097 */ 2098 if (vp->v_usecount == 0) { 2099 vgonel(vp, td); 2100 VOP_UNLOCK(vp, 0, td); 2101 MNT_ILOCK(mp); 2102 continue; 2103 } 2104 /* 2105 * If FORCECLOSE is set, forcibly close the vnode. For block 2106 * or character devices, revert to an anonymous device. For 2107 * all other files, just kill them. 2108 */ 2109 if (flags & FORCECLOSE) { 2110 VNASSERT(vp->v_type != VCHR && vp->v_type != VBLK, vp, 2111 ("device VNODE %p is FORCECLOSED", vp)); 2112 vgonel(vp, td); 2113 VOP_UNLOCK(vp, 0, td); 2114 MNT_ILOCK(mp); 2115 continue; 2116 } 2117 VOP_UNLOCK(vp, 0, td); 2118 #ifdef DIAGNOSTIC 2119 if (busyprt) 2120 vprint("vflush: busy vnode", vp); 2121 #endif 2122 VI_UNLOCK(vp); 2123 MNT_ILOCK(mp); 2124 busy++; 2125 } 2126 MNT_IUNLOCK(mp); 2127 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 2128 /* 2129 * If just the root vnode is busy, and if its refcount 2130 * is equal to `rootrefs', then go ahead and kill it. 2131 */ 2132 VI_LOCK(rootvp); 2133 KASSERT(busy > 0, ("vflush: not busy")); 2134 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 2135 ("vflush: usecount %d < rootrefs %d", 2136 rootvp->v_usecount, rootrefs)); 2137 if (busy == 1 && rootvp->v_usecount == rootrefs) { 2138 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK, td); 2139 vgone(rootvp); 2140 VOP_UNLOCK(rootvp, 0, td); 2141 busy = 0; 2142 } else 2143 VI_UNLOCK(rootvp); 2144 } 2145 if (busy) 2146 return (EBUSY); 2147 for (; rootrefs > 0; rootrefs--) 2148 vrele(rootvp); 2149 return (0); 2150 } 2151 2152 /* 2153 * This moves a now (likely recyclable) vnode to the end of the 2154 * mountlist. XXX However, it is temporarily disabled until we 2155 * can clean up ffs_sync() and friends, which have loop restart 2156 * conditions which this code causes to operate O(N^2). 2157 */ 2158 static void 2159 vlruvp(struct vnode *vp) 2160 { 2161 #if 0 2162 struct mount *mp; 2163 2164 if ((mp = vp->v_mount) != NULL) { 2165 MNT_ILOCK(mp); 2166 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2167 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2168 MNT_IUNLOCK(mp); 2169 } 2170 #endif 2171 } 2172 2173 /* 2174 * Recycle an unused vnode to the front of the free list. 2175 * Release the passed interlock if the vnode will be recycled. 2176 */ 2177 int 2178 vrecycle(struct vnode *vp, struct thread *td) 2179 { 2180 2181 ASSERT_VOP_LOCKED(vp, "vrecycle"); 2182 VI_LOCK(vp); 2183 if (vp->v_usecount == 0) { 2184 vgonel(vp, td); 2185 return (1); 2186 } 2187 VI_UNLOCK(vp); 2188 return (0); 2189 } 2190 2191 /* 2192 * Eliminate all activity associated with a vnode 2193 * in preparation for reuse. 2194 */ 2195 void 2196 vgone(struct vnode *vp) 2197 { 2198 struct thread *td = curthread; /* XXX */ 2199 ASSERT_VOP_LOCKED(vp, "vgone"); 2200 2201 VI_LOCK(vp); 2202 vgonel(vp, td); 2203 } 2204 2205 /* 2206 * vgone, with the vp interlock held. 2207 */ 2208 void 2209 vgonel(struct vnode *vp, struct thread *td) 2210 { 2211 int active; 2212 int doomed; 2213 2214 ASSERT_VOP_LOCKED(vp, "vgonel"); 2215 ASSERT_VI_LOCKED(vp, "vgonel"); 2216 2217 /* 2218 * Check to see if the vnode is in use. 
If so we have to reference it 2219 * before we clean it out so that its count cannot fall to zero and 2220 * generate a race against ourselves to recycle it. 2221 */ 2222 if ((active = vp->v_usecount)) 2223 v_incr_usecount(vp, 1); 2224 2225 /* 2226 * See if we're already doomed, if so, this is coming from a 2227 * successful vtryrecycle(); 2228 */ 2229 doomed = (vp->v_iflag & VI_DOOMED); 2230 vp->v_iflag |= VI_DOOMED; 2231 vp->v_vxthread = curthread; 2232 VI_UNLOCK(vp); 2233 2234 /* 2235 * Clean out any buffers associated with the vnode. 2236 * If the flush fails, just toss the buffers. 2237 */ 2238 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 2239 (void) vn_write_suspend_wait(vp, NULL, V_WAIT); 2240 if (vinvalbuf(vp, V_SAVE, td, 0, 0) != 0) 2241 vinvalbuf(vp, 0, td, 0, 0); 2242 2243 /* 2244 * If purging an active vnode, it must be closed and 2245 * deactivated before being reclaimed. 2246 */ 2247 if (active) { 2248 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 2249 VI_LOCK(vp); 2250 if ((vp->v_iflag & VI_DOINGINACT) == 0) 2251 vinactive(vp, td); 2252 VI_UNLOCK(vp); 2253 } 2254 /* 2255 * Reclaim the vnode. 2256 */ 2257 if (VOP_RECLAIM(vp, td)) 2258 panic("vgone: cannot reclaim"); 2259 2260 VNASSERT(vp->v_object == NULL, vp, 2261 ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); 2262 2263 /* 2264 * Delete from old mount point vnode list. 2265 */ 2266 delmntque(vp); 2267 cache_purge(vp); 2268 VI_LOCK(vp); 2269 if (active) { 2270 v_incr_usecount(vp, -1); 2271 VNASSERT(vp->v_usecount >= 0, vp, ("vgone: bad ref count")); 2272 } 2273 /* 2274 * Done with purge, reset to the standard lock and 2275 * notify sleepers of the grim news. 2276 */ 2277 vp->v_vnlock = &vp->v_lock; 2278 vp->v_op = &dead_vnodeops; 2279 vp->v_tag = "none"; 2280 vp->v_type = VBAD; 2281 vp->v_vxthread = NULL; 2282 2283 /* 2284 * If it is on the freelist and not already at the head, 2285 * move it to the head of the list. The test of the 2286 * VI_DOOMED flag and the reference count of zero is because 2287 * it will be removed from the free list by getnewvnode, 2288 * but will not have its reference count incremented until 2289 * after calling vgone. If the reference count were 2290 * incremented first, vgone would (incorrectly) try to 2291 * close the previous instance of the underlying object. 2292 */ 2293 if (vp->v_holdcnt == 0 && !doomed) { 2294 mtx_lock(&vnode_free_list_mtx); 2295 if (vp->v_iflag & VI_FREE) { 2296 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 2297 } else { 2298 vp->v_iflag |= VI_FREE; 2299 freevnodes++; 2300 } 2301 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 2302 mtx_unlock(&vnode_free_list_mtx); 2303 } 2304 VI_UNLOCK(vp); 2305 } 2306 2307 /* 2308 * Calculate the total number of references to a special device. 2309 */ 2310 int 2311 vcount(vp) 2312 struct vnode *vp; 2313 { 2314 int count; 2315 2316 dev_lock(); 2317 count = vp->v_rdev->si_usecount; 2318 dev_unlock(); 2319 return (count); 2320 } 2321 2322 /* 2323 * Same as above, but using the struct cdev *as argument 2324 */ 2325 int 2326 count_dev(dev) 2327 struct cdev *dev; 2328 { 2329 int count; 2330 2331 dev_lock(); 2332 count = dev->si_usecount; 2333 dev_unlock(); 2334 return(count); 2335 } 2336 2337 /* 2338 * Print out a description of a vnode. 2339 */ 2340 static char *typename[] = 2341 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; 2342 2343 void 2344 vn_printf(struct vnode *vp, const char *fmt, ...)
2345 { 2346 va_list ap; 2347 char buf[96]; 2348 2349 va_start(ap, fmt); 2350 vprintf(fmt, ap); 2351 va_end(ap); 2352 printf("%p: ", (void *)vp); 2353 printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); 2354 printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", 2355 vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); 2356 buf[0] = '\0'; 2357 buf[1] = '\0'; 2358 if (vp->v_vflag & VV_ROOT) 2359 strcat(buf, "|VV_ROOT"); 2360 if (vp->v_vflag & VV_TEXT) 2361 strcat(buf, "|VV_TEXT"); 2362 if (vp->v_vflag & VV_SYSTEM) 2363 strcat(buf, "|VV_SYSTEM"); 2364 if (vp->v_iflag & VI_DOOMED) 2365 strcat(buf, "|VI_DOOMED"); 2366 if (vp->v_iflag & VI_FREE) 2367 strcat(buf, "|VI_FREE"); 2368 printf(" flags (%s)\n", buf + 1); 2369 if (mtx_owned(VI_MTX(vp))) 2370 printf(" VI_LOCKed"); 2371 if (vp->v_object != NULL) 2372 printf(" v_object %p ref %d pages %d\n", 2373 vp->v_object, vp->v_object->ref_count, 2374 vp->v_object->resident_page_count); 2375 printf(" "); 2376 lockmgr_printinfo(vp->v_vnlock); 2377 printf("\n"); 2378 if (vp->v_data != NULL) 2379 VOP_PRINT(vp); 2380 } 2381 2382 #ifdef DDB 2383 #include <ddb/ddb.h> 2384 /* 2385 * List all of the locked vnodes in the system. 2386 * Called when debugging the kernel. 2387 */ 2388 DB_SHOW_COMMAND(lockedvnods, lockedvnodes) 2389 { 2390 struct mount *mp, *nmp; 2391 struct vnode *vp; 2392 2393 /* 2394 * Note: because this is DDB, we can't obey the locking semantics 2395 * for these structures, which means we could catch an inconsistent 2396 * state and dereference a nasty pointer. Not much to be done 2397 * about that. 2398 */ 2399 printf("Locked vnodes\n"); 2400 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2401 nmp = TAILQ_NEXT(mp, mnt_list); 2402 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 2403 if (VOP_ISLOCKED(vp, NULL)) 2404 vprint("", vp); 2405 } 2406 nmp = TAILQ_NEXT(mp, mnt_list); 2407 } 2408 } 2409 #endif 2410 2411 /* 2412 * Fill in a struct xvfsconf based on a struct vfsconf. 2413 */ 2414 static void 2415 vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp) 2416 { 2417 2418 strcpy(xvfsp->vfc_name, vfsp->vfc_name); 2419 xvfsp->vfc_typenum = vfsp->vfc_typenum; 2420 xvfsp->vfc_refcount = vfsp->vfc_refcount; 2421 xvfsp->vfc_flags = vfsp->vfc_flags; 2422 /* 2423 * These are unused in userland, we keep them 2424 * to not break binary compatibility. 2425 */ 2426 xvfsp->vfc_vfsops = NULL; 2427 xvfsp->vfc_next = NULL; 2428 } 2429 2430 /* 2431 * Top level filesystem related information gathering. 2432 */ 2433 static int 2434 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 2435 { 2436 struct vfsconf *vfsp; 2437 struct xvfsconf xvfsp; 2438 int error; 2439 2440 error = 0; 2441 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 2442 vfsconf2x(vfsp, &xvfsp); 2443 error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp); 2444 if (error) 2445 break; 2446 } 2447 return (error); 2448 } 2449 2450 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist, 2451 "S,xvfsconf", "List of all configured filesystems"); 2452 2453 #ifndef BURN_BRIDGES 2454 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 2455 2456 static int 2457 vfs_sysctl(SYSCTL_HANDLER_ARGS) 2458 { 2459 int *name = (int *)arg1 - 1; /* XXX */ 2460 u_int namelen = arg2 + 1; /* XXX */ 2461 struct vfsconf *vfsp; 2462 struct xvfsconf xvfsp; 2463 2464 printf("WARNING: userland calling deprecated sysctl, " 2465 "please rebuild world\n"); 2466 2467 #if 1 || defined(COMPAT_PRELITE2) 2468 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC.
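 *
 * A request that names only this node (namelen == 1) is treated as the
 * pre-Lite2 VFS_VFSCONF query and handed to sysctl_ovfs_conf() below;
 * longer names select VFS_MAXTYPENUM or VFS_CONF through the switch.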
*/ 2469 if (namelen == 1) 2470 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 2471 #endif 2472 2473 switch (name[1]) { 2474 case VFS_MAXTYPENUM: 2475 if (namelen != 2) 2476 return (ENOTDIR); 2477 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 2478 case VFS_CONF: 2479 if (namelen != 3) 2480 return (ENOTDIR); /* overloaded */ 2481 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) 2482 if (vfsp->vfc_typenum == name[2]) 2483 break; 2484 if (vfsp == NULL) 2485 return (EOPNOTSUPP); 2486 vfsconf2x(vfsp, &xvfsp); 2487 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 2488 } 2489 return (EOPNOTSUPP); 2490 } 2491 2492 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, 2493 vfs_sysctl, "Generic filesystem"); 2494 2495 #if 1 || defined(COMPAT_PRELITE2) 2496 2497 static int 2498 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 2499 { 2500 int error; 2501 struct vfsconf *vfsp; 2502 struct ovfsconf ovfs; 2503 2504 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 2505 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 2506 strcpy(ovfs.vfc_name, vfsp->vfc_name); 2507 ovfs.vfc_index = vfsp->vfc_typenum; 2508 ovfs.vfc_refcount = vfsp->vfc_refcount; 2509 ovfs.vfc_flags = vfsp->vfc_flags; 2510 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 2511 if (error) 2512 return error; 2513 } 2514 return 0; 2515 } 2516 2517 #endif /* 1 || COMPAT_PRELITE2 */ 2518 #endif /* !BURN_BRIDGES */ 2519 2520 #define KINFO_VNODESLOP 10 2521 #ifdef notyet 2522 /* 2523 * Dump vnode list (via sysctl). 2524 */ 2525 /* ARGSUSED */ 2526 static int 2527 sysctl_vnode(SYSCTL_HANDLER_ARGS) 2528 { 2529 struct xvnode *xvn; 2530 struct thread *td = req->td; 2531 struct mount *mp; 2532 struct vnode *vp; 2533 int error, len, n; 2534 2535 /* 2536 * Stale numvnodes access is not fatal here. 2537 */ 2538 req->lock = 0; 2539 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; 2540 if (!req->oldptr) 2541 /* Make an estimate */ 2542 return (SYSCTL_OUT(req, 0, len)); 2543 2544 error = sysctl_wire_old_buffer(req, 0); 2545 if (error != 0) 2546 return (error); 2547 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); 2548 n = 0; 2549 mtx_lock(&mountlist_mtx); 2550 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 2551 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) 2552 continue; 2553 MNT_ILOCK(mp); 2554 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 2555 if (n == len) 2556 break; 2557 vref(vp); 2558 xvn[n].xv_size = sizeof *xvn; 2559 xvn[n].xv_vnode = vp; 2560 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field 2561 XV_COPY(usecount); 2562 XV_COPY(writecount); 2563 XV_COPY(holdcnt); 2564 XV_COPY(id); 2565 XV_COPY(mount); 2566 XV_COPY(numoutput); 2567 XV_COPY(type); 2568 #undef XV_COPY 2569 xvn[n].xv_flag = vp->v_vflag; 2570 2571 switch (vp->v_type) { 2572 case VREG: 2573 case VDIR: 2574 case VLNK: 2575 break; 2576 case VBLK: 2577 case VCHR: 2578 if (vp->v_rdev == NULL) { 2579 vrele(vp); 2580 continue; 2581 } 2582 xvn[n].xv_dev = dev2udev(vp->v_rdev); 2583 break; 2584 case VSOCK: 2585 xvn[n].xv_socket = vp->v_socket; 2586 break; 2587 case VFIFO: 2588 xvn[n].xv_fifo = vp->v_fifoinfo; 2589 break; 2590 case VNON: 2591 case VBAD: 2592 default: 2593 /* shouldn't happen? 
*/ 2594 vrele(vp); 2595 continue; 2596 } 2597 vrele(vp); 2598 ++n; 2599 } 2600 MNT_IUNLOCK(mp); 2601 mtx_lock(&mountlist_mtx); 2602 vfs_unbusy(mp, td); 2603 if (n == len) 2604 break; 2605 } 2606 mtx_unlock(&mountlist_mtx); 2607 2608 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); 2609 free(xvn, M_TEMP); 2610 return (error); 2611 } 2612 2613 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 2614 0, 0, sysctl_vnode, "S,xvnode", ""); 2615 #endif 2616 2617 /* 2618 * Unmount all filesystems. The list is traversed in reverse order 2619 * of mounting to avoid dependencies. 2620 */ 2621 void 2622 vfs_unmountall() 2623 { 2624 struct mount *mp; 2625 struct thread *td; 2626 int error; 2627 2628 if (curthread != NULL) 2629 td = curthread; 2630 else 2631 td = FIRST_THREAD_IN_PROC(initproc); /* XXX XXX proc0? */ 2632 /* 2633 * Since this only runs when rebooting, it is not interlocked. 2634 */ 2635 while(!TAILQ_EMPTY(&mountlist)) { 2636 mp = TAILQ_LAST(&mountlist, mntlist); 2637 error = dounmount(mp, MNT_FORCE, td); 2638 if (error) { 2639 TAILQ_REMOVE(&mountlist, mp, mnt_list); 2640 printf("unmount of %s failed (", 2641 mp->mnt_stat.f_mntonname); 2642 if (error == EBUSY) 2643 printf("BUSY)\n"); 2644 else 2645 printf("%d)\n", error); 2646 } else { 2647 /* The unmount has removed mp from the mountlist */ 2648 } 2649 } 2650 } 2651 2652 /* 2653 * perform msync on all vnodes under a mount point 2654 * the mount point must be locked. 2655 */ 2656 void 2657 vfs_msync(struct mount *mp, int flags) 2658 { 2659 struct vnode *vp, *nvp; 2660 struct vm_object *obj; 2661 int tries; 2662 2663 tries = 5; 2664 MNT_ILOCK(mp); 2665 loop: 2666 TAILQ_FOREACH_SAFE(vp, &mp->mnt_nvnodelist, v_nmntvnodes, nvp) { 2667 if (vp->v_mount != mp) { 2668 if (--tries > 0) 2669 goto loop; 2670 break; 2671 } 2672 2673 VI_LOCK(vp); 2674 if ((vp->v_iflag & VI_OBJDIRTY) && 2675 (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) { 2676 MNT_IUNLOCK(mp); 2677 if (!vget(vp, 2678 LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, 2679 curthread)) { 2680 if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ 2681 vput(vp); 2682 MNT_ILOCK(mp); 2683 continue; 2684 } 2685 2686 obj = vp->v_object; 2687 if (obj != NULL) { 2688 VM_OBJECT_LOCK(obj); 2689 vm_object_page_clean(obj, 0, 0, 2690 flags == MNT_WAIT ? 2691 OBJPC_SYNC : OBJPC_NOSYNC); 2692 VM_OBJECT_UNLOCK(obj); 2693 } 2694 vput(vp); 2695 } 2696 MNT_ILOCK(mp); 2697 if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) { 2698 if (--tries > 0) 2699 goto loop; 2700 break; 2701 } 2702 } else 2703 VI_UNLOCK(vp); 2704 } 2705 MNT_IUNLOCK(mp); 2706 } 2707 2708 /* 2709 * Mark a vnode as free, putting it up for recycling. 2710 */ 2711 static void 2712 vfree(struct vnode *vp) 2713 { 2714 2715 ASSERT_VI_LOCKED(vp, "vfree"); 2716 mtx_lock(&vnode_free_list_mtx); 2717 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free")); 2718 VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't")); 2719 if (vp->v_iflag & (VI_AGE|VI_DOOMED)) { 2720 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 2721 } else { 2722 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 2723 } 2724 freevnodes++; 2725 mtx_unlock(&vnode_free_list_mtx); 2726 vp->v_iflag &= ~(VI_AGE|VI_DOOMED); 2727 vp->v_iflag |= VI_FREE; 2728 } 2729 2730 /* 2731 * Opposite of vfree() - mark a vnode as in use. 
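 *
 * Illustration (hypothetical sequence, not from the original code): a vnode
 * bounces between the two states with the interlock held, as the assertions
 * in both functions require:
 *
 *	vfree(vp);	sets VI_FREE, puts vp on vnode_free_list
 *	vbusy(vp);	clears VI_FREE, takes vp back off the list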
2732 */ 2733 static void 2734 vbusy(struct vnode *vp) 2735 { 2736 2737 ASSERT_VI_LOCKED(vp, "vbusy"); 2738 VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free")); 2739 2740 mtx_lock(&vnode_free_list_mtx); 2741 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 2742 freevnodes--; 2743 mtx_unlock(&vnode_free_list_mtx); 2744 2745 vp->v_iflag &= ~(VI_FREE|VI_AGE); 2746 } 2747 2748 /* 2749 * Initalize per-vnode helper structure to hold poll-related state. 2750 */ 2751 void 2752 v_addpollinfo(struct vnode *vp) 2753 { 2754 struct vpollinfo *vi; 2755 2756 vi = uma_zalloc(vnodepoll_zone, M_WAITOK); 2757 if (vp->v_pollinfo != NULL) { 2758 uma_zfree(vnodepoll_zone, vi); 2759 return; 2760 } 2761 vp->v_pollinfo = vi; 2762 mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); 2763 knlist_init(&vp->v_pollinfo->vpi_selinfo.si_note, 2764 &vp->v_pollinfo->vpi_lock); 2765 } 2766 2767 /* 2768 * Record a process's interest in events which might happen to 2769 * a vnode. Because poll uses the historic select-style interface 2770 * internally, this routine serves as both the ``check for any 2771 * pending events'' and the ``record my interest in future events'' 2772 * functions. (These are done together, while the lock is held, 2773 * to avoid race conditions.) 2774 */ 2775 int 2776 vn_pollrecord(vp, td, events) 2777 struct vnode *vp; 2778 struct thread *td; 2779 short events; 2780 { 2781 2782 if (vp->v_pollinfo == NULL) 2783 v_addpollinfo(vp); 2784 mtx_lock(&vp->v_pollinfo->vpi_lock); 2785 if (vp->v_pollinfo->vpi_revents & events) { 2786 /* 2787 * This leaves events we are not interested 2788 * in available for the other process which 2789 * which presumably had requested them 2790 * (otherwise they would never have been 2791 * recorded). 2792 */ 2793 events &= vp->v_pollinfo->vpi_revents; 2794 vp->v_pollinfo->vpi_revents &= ~events; 2795 2796 mtx_unlock(&vp->v_pollinfo->vpi_lock); 2797 return events; 2798 } 2799 vp->v_pollinfo->vpi_events |= events; 2800 selrecord(td, &vp->v_pollinfo->vpi_selinfo); 2801 mtx_unlock(&vp->v_pollinfo->vpi_lock); 2802 return 0; 2803 } 2804 2805 /* 2806 * Routine to create and manage a filesystem syncer vnode. 2807 */ 2808 #define sync_close ((int (*)(struct vop_close_args *))nullop) 2809 static int sync_fsync(struct vop_fsync_args *); 2810 static int sync_inactive(struct vop_inactive_args *); 2811 static int sync_reclaim(struct vop_reclaim_args *); 2812 2813 static struct vop_vector sync_vnodeops = { 2814 .vop_bypass = VOP_EOPNOTSUPP, 2815 .vop_close = sync_close, /* close */ 2816 .vop_fsync = sync_fsync, /* fsync */ 2817 .vop_inactive = sync_inactive, /* inactive */ 2818 .vop_reclaim = sync_reclaim, /* reclaim */ 2819 .vop_lock = vop_stdlock, /* lock */ 2820 .vop_unlock = vop_stdunlock, /* unlock */ 2821 .vop_islocked = vop_stdislocked, /* islocked */ 2822 }; 2823 2824 /* 2825 * Create a new filesystem syncer vnode for the specified mount point. 2826 */ 2827 int 2828 vfs_allocate_syncvnode(mp) 2829 struct mount *mp; 2830 { 2831 struct vnode *vp; 2832 static long start, incr, next; 2833 int error; 2834 2835 /* Allocate a new vnode */ 2836 if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) { 2837 mp->mnt_syncer = NULL; 2838 return (error); 2839 } 2840 vp->v_type = VNON; 2841 /* 2842 * Place the vnode onto the syncer worklist. We attempt to 2843 * scatter them about on the list so that they will go off 2844 * at evenly distributed times even if all the filesystems 2845 * are mounted at once. 
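 *
 * Illustration (assuming syncer_maxdelay is 32): successive mounts are
 * assigned next = 16, 8, 24, 4, 12, 20, 28, 2, ..., the stride halving each
 * time the end of the range is passed, so new syncer vnodes interleave
 * between the slots already taken; the delay actually used below is
 * next % syncdelay.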
2846 */ 2847 next += incr; 2848 if (next == 0 || next > syncer_maxdelay) { 2849 start /= 2; 2850 incr /= 2; 2851 if (start == 0) { 2852 start = syncer_maxdelay / 2; 2853 incr = syncer_maxdelay; 2854 } 2855 next = start; 2856 } 2857 VI_LOCK(vp); 2858 vn_syncer_add_to_worklist(&vp->v_bufobj, 2859 syncdelay > 0 ? next % syncdelay : 0); 2860 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ 2861 mtx_lock(&sync_mtx); 2862 sync_vnode_count++; 2863 mtx_unlock(&sync_mtx); 2864 VI_UNLOCK(vp); 2865 mp->mnt_syncer = vp; 2866 return (0); 2867 } 2868 2869 /* 2870 * Do a lazy sync of the filesystem. 2871 */ 2872 static int 2873 sync_fsync(ap) 2874 struct vop_fsync_args /* { 2875 struct vnode *a_vp; 2876 struct ucred *a_cred; 2877 int a_waitfor; 2878 struct thread *a_td; 2879 } */ *ap; 2880 { 2881 struct vnode *syncvp = ap->a_vp; 2882 struct mount *mp = syncvp->v_mount; 2883 struct thread *td = ap->a_td; 2884 int error, asyncflag; 2885 struct bufobj *bo; 2886 2887 /* 2888 * We only need to do something if this is a lazy evaluation. 2889 */ 2890 if (ap->a_waitfor != MNT_LAZY) 2891 return (0); 2892 2893 /* 2894 * Move ourselves to the back of the sync list. 2895 */ 2896 bo = &syncvp->v_bufobj; 2897 BO_LOCK(bo); 2898 vn_syncer_add_to_worklist(bo, syncdelay); 2899 BO_UNLOCK(bo); 2900 2901 /* 2902 * Walk the list of vnodes pushing all that are dirty and 2903 * not already on the sync list. 2904 */ 2905 mtx_lock(&mountlist_mtx); 2906 if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) { 2907 mtx_unlock(&mountlist_mtx); 2908 return (0); 2909 } 2910 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { 2911 vfs_unbusy(mp, td); 2912 return (0); 2913 } 2914 asyncflag = mp->mnt_flag & MNT_ASYNC; 2915 mp->mnt_flag &= ~MNT_ASYNC; 2916 vfs_msync(mp, MNT_NOWAIT); 2917 error = VFS_SYNC(mp, MNT_LAZY, td); 2918 if (asyncflag) 2919 mp->mnt_flag |= MNT_ASYNC; 2920 vn_finished_write(mp); 2921 vfs_unbusy(mp, td); 2922 return (error); 2923 } 2924 2925 /* 2926 * The syncer vnode is no referenced. 2927 */ 2928 static int 2929 sync_inactive(ap) 2930 struct vop_inactive_args /* { 2931 struct vnode *a_vp; 2932 struct thread *a_td; 2933 } */ *ap; 2934 { 2935 2936 vgone(ap->a_vp); 2937 return (0); 2938 } 2939 2940 /* 2941 * The syncer vnode is no longer needed and is being decommissioned. 2942 * 2943 * Modifications to the worklist must be protected by sync_mtx. 
2944 */ 2945 static int 2946 sync_reclaim(ap) 2947 struct vop_reclaim_args /* { 2948 struct vnode *a_vp; 2949 } */ *ap; 2950 { 2951 struct vnode *vp = ap->a_vp; 2952 struct bufobj *bo; 2953 2954 VI_LOCK(vp); 2955 bo = &vp->v_bufobj; 2956 vp->v_mount->mnt_syncer = NULL; 2957 if (bo->bo_flag & BO_ONWORKLST) { 2958 mtx_lock(&sync_mtx); 2959 LIST_REMOVE(bo, bo_synclist); 2960 syncer_worklist_len--; 2961 sync_vnode_count--; 2962 mtx_unlock(&sync_mtx); 2963 bo->bo_flag &= ~BO_ONWORKLST; 2964 } 2965 VI_UNLOCK(vp); 2966 2967 return (0); 2968 } 2969 2970 /* 2971 * Check if vnode represents a disk device 2972 */ 2973 int 2974 vn_isdisk(vp, errp) 2975 struct vnode *vp; 2976 int *errp; 2977 { 2978 int error; 2979 2980 error = 0; 2981 dev_lock(); 2982 if (vp->v_type != VCHR) 2983 error = ENOTBLK; 2984 else if (vp->v_rdev == NULL) 2985 error = ENXIO; 2986 else if (vp->v_rdev->si_devsw == NULL) 2987 error = ENXIO; 2988 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 2989 error = ENOTBLK; 2990 dev_unlock(); 2991 if (errp != NULL) 2992 *errp = error; 2993 return (error == 0); 2994 } 2995 2996 /* 2997 * Free data allocated by namei(); see namei(9) for details. 2998 */ 2999 void 3000 NDFREE(ndp, flags) 3001 struct nameidata *ndp; 3002 const u_int flags; 3003 { 3004 3005 if (!(flags & NDF_NO_FREE_PNBUF) && 3006 (ndp->ni_cnd.cn_flags & HASBUF)) { 3007 uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); 3008 ndp->ni_cnd.cn_flags &= ~HASBUF; 3009 } 3010 if (!(flags & NDF_NO_DVP_UNLOCK) && 3011 (ndp->ni_cnd.cn_flags & LOCKPARENT) && 3012 ndp->ni_dvp != ndp->ni_vp) 3013 VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread); 3014 if (!(flags & NDF_NO_DVP_RELE) && 3015 (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { 3016 vrele(ndp->ni_dvp); 3017 ndp->ni_dvp = NULL; 3018 } 3019 if (!(flags & NDF_NO_VP_UNLOCK) && 3020 (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) 3021 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread); 3022 if (!(flags & NDF_NO_VP_RELE) && 3023 ndp->ni_vp) { 3024 vrele(ndp->ni_vp); 3025 ndp->ni_vp = NULL; 3026 } 3027 if (!(flags & NDF_NO_STARTDIR_RELE) && 3028 (ndp->ni_cnd.cn_flags & SAVESTART)) { 3029 vrele(ndp->ni_startdir); 3030 ndp->ni_startdir = NULL; 3031 } 3032 } 3033 3034 /* 3035 * Common filesystem object access control check routine. Accepts a 3036 * vnode's type, "mode", uid and gid, requested access mode, credentials, 3037 * and optional call-by-reference privused argument allowing vaccess() 3038 * to indicate to the caller whether privilege was used to satisfy the 3039 * request (obsoleted). Returns 0 on success, or an errno on failure. 3040 */ 3041 int 3042 vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused) 3043 enum vtype type; 3044 mode_t file_mode; 3045 uid_t file_uid; 3046 gid_t file_gid; 3047 mode_t acc_mode; 3048 struct ucred *cred; 3049 int *privused; 3050 { 3051 mode_t dac_granted; 3052 #ifdef CAPABILITIES 3053 mode_t cap_granted; 3054 #endif 3055 3056 /* 3057 * Look for a normal, non-privileged way to access the file/directory 3058 * as requested. If it exists, go with that. 3059 */ 3060 3061 if (privused != NULL) 3062 *privused = 0; 3063 3064 dac_granted = 0; 3065 3066 /* Check the owner. 
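 * The owner's mode bits map onto the V* rights below as
 *
 *	S_IXUSR -> VEXEC, S_IRUSR -> VREAD, S_IWUSR -> VWRITE | VAPPEND
 *
 * plus VADMIN simply for being the owner; the request succeeds without
 * privilege when (acc_mode & dac_granted) == acc_mode.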
*/ 3067 if (cred->cr_uid == file_uid) { 3068 dac_granted |= VADMIN; 3069 if (file_mode & S_IXUSR) 3070 dac_granted |= VEXEC; 3071 if (file_mode & S_IRUSR) 3072 dac_granted |= VREAD; 3073 if (file_mode & S_IWUSR) 3074 dac_granted |= (VWRITE | VAPPEND); 3075 3076 if ((acc_mode & dac_granted) == acc_mode) 3077 return (0); 3078 3079 goto privcheck; 3080 } 3081 3082 /* Otherwise, check the groups (first match) */ 3083 if (groupmember(file_gid, cred)) { 3084 if (file_mode & S_IXGRP) 3085 dac_granted |= VEXEC; 3086 if (file_mode & S_IRGRP) 3087 dac_granted |= VREAD; 3088 if (file_mode & S_IWGRP) 3089 dac_granted |= (VWRITE | VAPPEND); 3090 3091 if ((acc_mode & dac_granted) == acc_mode) 3092 return (0); 3093 3094 goto privcheck; 3095 } 3096 3097 /* Otherwise, check everyone else. */ 3098 if (file_mode & S_IXOTH) 3099 dac_granted |= VEXEC; 3100 if (file_mode & S_IROTH) 3101 dac_granted |= VREAD; 3102 if (file_mode & S_IWOTH) 3103 dac_granted |= (VWRITE | VAPPEND); 3104 if ((acc_mode & dac_granted) == acc_mode) 3105 return (0); 3106 3107 privcheck: 3108 if (!suser_cred(cred, SUSER_ALLOWJAIL)) { 3109 /* XXX audit: privilege used */ 3110 if (privused != NULL) 3111 *privused = 1; 3112 return (0); 3113 } 3114 3115 #ifdef CAPABILITIES 3116 /* 3117 * Build a capability mask to determine if the set of capabilities 3118 * satisfies the requirements when combined with the granted mask 3119 * from above. 3120 * For each capability, if the capability is required, bitwise 3121 * or the request type onto the cap_granted mask. 3122 */ 3123 cap_granted = 0; 3124 3125 if (type == VDIR) { 3126 /* 3127 * For directories, use CAP_DAC_READ_SEARCH to satisfy 3128 * VEXEC requests, instead of CAP_DAC_EXECUTE. 3129 */ 3130 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3131 !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL)) 3132 cap_granted |= VEXEC; 3133 } else { 3134 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3135 !cap_check(cred, NULL, CAP_DAC_EXECUTE, SUSER_ALLOWJAIL)) 3136 cap_granted |= VEXEC; 3137 } 3138 3139 if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && 3140 !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL)) 3141 cap_granted |= VREAD; 3142 3143 if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && 3144 !cap_check(cred, NULL, CAP_DAC_WRITE, SUSER_ALLOWJAIL)) 3145 cap_granted |= (VWRITE | VAPPEND); 3146 3147 if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && 3148 !cap_check(cred, NULL, CAP_FOWNER, SUSER_ALLOWJAIL)) 3149 cap_granted |= VADMIN; 3150 3151 if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) { 3152 /* XXX audit: privilege used */ 3153 if (privused != NULL) 3154 *privused = 1; 3155 return (0); 3156 } 3157 #endif 3158 3159 return ((acc_mode & VADMIN) ? EPERM : EACCES); 3160 } 3161 3162 /* 3163 * Credential check based on process requesting service, and per-attribute 3164 * permissions. 3165 */ 3166 int 3167 extattr_check_cred(struct vnode *vp, int attrnamespace, 3168 struct ucred *cred, struct thread *td, int access) 3169 { 3170 3171 /* 3172 * Kernel-invoked always succeeds. 3173 */ 3174 if (cred == NOCRED) 3175 return (0); 3176 3177 /* 3178 * Do not allow privileged processes in jail to directly 3179 * manipulate system attributes. 3180 * 3181 * XXX What capability should apply here? 3182 * Probably CAP_SYS_SETFFLAG. 
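 *
 * Summary of the cases handled below: EXTATTR_NAMESPACE_SYSTEM requires
 * unjailed superuser credentials, EXTATTR_NAMESPACE_USER falls back to an
 * ordinary VOP_ACCESS() check, and any other namespace is rejected with
 * EPERM.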
3183 */ 3184 switch (attrnamespace) { 3185 case EXTATTR_NAMESPACE_SYSTEM: 3186 /* Potentially should be: return (EPERM); */ 3187 return (suser_cred(cred, 0)); 3188 case EXTATTR_NAMESPACE_USER: 3189 return (VOP_ACCESS(vp, access, cred, td)); 3190 default: 3191 return (EPERM); 3192 } 3193 } 3194 3195 #ifdef DEBUG_VFS_LOCKS 3196 /* 3197 * This only exists to supress warnings from unlocked specfs accesses. It is 3198 * no longer ok to have an unlocked VFS. 3199 */ 3200 #define IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD) 3201 3202 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 3203 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, ""); 3204 3205 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 3206 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, ""); 3207 3208 int vfs_badlock_print = 1; /* Print lock violations. */ 3209 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, ""); 3210 3211 #ifdef KDB 3212 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ 3213 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, ""); 3214 #endif 3215 3216 static void 3217 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 3218 { 3219 3220 #ifdef KDB 3221 if (vfs_badlock_backtrace) 3222 kdb_backtrace(); 3223 #endif 3224 if (vfs_badlock_print) 3225 printf("%s: %p %s\n", str, (void *)vp, msg); 3226 if (vfs_badlock_ddb) 3227 kdb_enter("lock violation"); 3228 } 3229 3230 void 3231 assert_vi_locked(struct vnode *vp, const char *str) 3232 { 3233 3234 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 3235 vfs_badlock("interlock is not locked but should be", str, vp); 3236 } 3237 3238 void 3239 assert_vi_unlocked(struct vnode *vp, const char *str) 3240 { 3241 3242 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 3243 vfs_badlock("interlock is locked but should not be", str, vp); 3244 } 3245 3246 void 3247 assert_vop_locked(struct vnode *vp, const char *str) 3248 { 3249 3250 if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, NULL) == 0) 3251 vfs_badlock("is not locked but should be", str, vp); 3252 } 3253 3254 void 3255 assert_vop_unlocked(struct vnode *vp, const char *str) 3256 { 3257 3258 if (vp && !IGNORE_LOCK(vp) && 3259 VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE) 3260 vfs_badlock("is locked but should not be", str, vp); 3261 } 3262 3263 #if 0 3264 void 3265 assert_vop_elocked(struct vnode *vp, const char *str) 3266 { 3267 3268 if (vp && !IGNORE_LOCK(vp) && 3269 VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE) 3270 vfs_badlock("is not exclusive locked but should be", str, vp); 3271 } 3272 3273 void 3274 assert_vop_elocked_other(struct vnode *vp, const char *str) 3275 { 3276 3277 if (vp && !IGNORE_LOCK(vp) && 3278 VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER) 3279 vfs_badlock("is not exclusive locked by another thread", 3280 str, vp); 3281 } 3282 3283 void 3284 assert_vop_slocked(struct vnode *vp, const char *str) 3285 { 3286 3287 if (vp && !IGNORE_LOCK(vp) && 3288 VOP_ISLOCKED(vp, curthread) != LK_SHARED) 3289 vfs_badlock("is not locked shared but should be", str, vp); 3290 } 3291 #endif /* 0 */ 3292 3293 void 3294 vop_rename_pre(void *ap) 3295 { 3296 struct vop_rename_args *a = ap; 3297 3298 if (a->a_tvp) 3299 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 3300 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 3301 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 3302 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 3303 3304 /* 
Check the source (from). */ 3305 if (a->a_tdvp != a->a_fdvp) 3306 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 3307 if (a->a_tvp != a->a_fvp) 3308 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: tvp locked"); 3309 3310 /* Check the target. */ 3311 if (a->a_tvp) 3312 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 3313 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 3314 } 3315 3316 void 3317 vop_strategy_pre(void *ap) 3318 { 3319 struct vop_strategy_args *a; 3320 struct buf *bp; 3321 3322 a = ap; 3323 bp = a->a_bp; 3324 3325 /* 3326 * Cluster ops lock their component buffers but not the IO container. 3327 */ 3328 if ((bp->b_flags & B_CLUSTER) != 0) 3329 return; 3330 3331 if (BUF_REFCNT(bp) < 1) { 3332 if (vfs_badlock_print) 3333 printf( 3334 "VOP_STRATEGY: bp is not locked but should be\n"); 3335 if (vfs_badlock_ddb) 3336 kdb_enter("lock violation"); 3337 } 3338 } 3339 3340 void 3341 vop_lookup_pre(void *ap) 3342 { 3343 struct vop_lookup_args *a; 3344 struct vnode *dvp; 3345 3346 a = ap; 3347 dvp = a->a_dvp; 3348 ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); 3349 ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP"); 3350 } 3351 3352 void 3353 vop_lookup_post(void *ap, int rc) 3354 { 3355 struct vop_lookup_args *a; 3356 struct componentname *cnp; 3357 struct vnode *dvp; 3358 struct vnode *vp; 3359 int flags; 3360 3361 a = ap; 3362 dvp = a->a_dvp; 3363 cnp = a->a_cnp; 3364 vp = *(a->a_vpp); 3365 flags = cnp->cn_flags; 3366 3367 ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); 3368 3369 /* 3370 * If this is the last path component for this lookup and LOCKPARENT 3371 * is set, OR if there is an error the directory has to be locked. 3372 */ 3373 if ((flags & LOCKPARENT) && (flags & ISLASTCN)) 3374 ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (LOCKPARENT)"); 3375 else if (rc != 0) 3376 ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (error)"); 3377 else if (dvp != vp) 3378 ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (dvp)"); 3379 if (flags & PDIRUNLOCK) 3380 ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (PDIRUNLOCK)"); 3381 } 3382 3383 void 3384 vop_lock_pre(void *ap) 3385 { 3386 struct vop_lock_args *a = ap; 3387 3388 if ((a->a_flags & LK_INTERLOCK) == 0) 3389 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 3390 else 3391 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 3392 } 3393 3394 void 3395 vop_lock_post(void *ap, int rc) 3396 { 3397 struct vop_lock_args *a = ap; 3398 3399 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 3400 if (rc == 0) 3401 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 3402 } 3403 3404 void 3405 vop_unlock_pre(void *ap) 3406 { 3407 struct vop_unlock_args *a = ap; 3408 3409 if (a->a_flags & LK_INTERLOCK) 3410 ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); 3411 ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); 3412 } 3413 3414 void 3415 vop_unlock_post(void *ap, int rc) 3416 { 3417 struct vop_unlock_args *a = ap; 3418 3419 if (a->a_flags & LK_INTERLOCK) 3420 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); 3421 } 3422 #endif /* DEBUG_VFS_LOCKS */ 3423 3424 static struct knlist fs_knlist; 3425 3426 static void 3427 vfs_event_init(void *arg) 3428 { 3429 knlist_init(&fs_knlist, NULL); 3430 } 3431 /* XXX - correct order? 
*/ 3432 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 3433 3434 void 3435 vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused) 3436 { 3437 3438 KNOTE_UNLOCKED(&fs_knlist, event); 3439 } 3440 3441 static int filt_fsattach(struct knote *kn); 3442 static void filt_fsdetach(struct knote *kn); 3443 static int filt_fsevent(struct knote *kn, long hint); 3444 3445 struct filterops fs_filtops = 3446 { 0, filt_fsattach, filt_fsdetach, filt_fsevent }; 3447 3448 static int 3449 filt_fsattach(struct knote *kn) 3450 { 3451 3452 kn->kn_flags |= EV_CLEAR; 3453 knlist_add(&fs_knlist, kn, 0); 3454 return (0); 3455 } 3456 3457 static void 3458 filt_fsdetach(struct knote *kn) 3459 { 3460 3461 knlist_remove(&fs_knlist, kn, 0); 3462 } 3463 3464 static int 3465 filt_fsevent(struct knote *kn, long hint) 3466 { 3467 3468 kn->kn_fflags |= hint; 3469 return (kn->kn_fflags != 0); 3470 } 3471 3472 static int 3473 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 3474 { 3475 struct vfsidctl vc; 3476 int error; 3477 struct mount *mp; 3478 3479 error = SYSCTL_IN(req, &vc, sizeof(vc)); 3480 if (error) 3481 return (error); 3482 if (vc.vc_vers != VFS_CTL_VERS1) 3483 return (EINVAL); 3484 mp = vfs_getvfs(&vc.vc_fsid); 3485 if (mp == NULL) 3486 return (ENOENT); 3487 /* ensure that a specific sysctl goes to the right filesystem. */ 3488 if (strcmp(vc.vc_fstypename, "*") != 0 && 3489 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 3490 return (EINVAL); 3491 } 3492 VCTLTOREQ(&vc, req); 3493 return (VFS_SYSCTL(mp, vc.vc_op, req)); 3494 } 3495 3496 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR, 3497 NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); 3498 3499 /* 3500 * Function to initialize a va_filerev field sensibly. 3501 * XXX: Wouldn't a random number make a lot more sense ?? 3502 */ 3503 u_quad_t 3504 init_va_filerev(void) 3505 { 3506 struct bintime bt; 3507 3508 getbinuptime(&bt); 3509 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 3510 } 3511
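
/*
 * Note on init_va_filerev() above: the returned value packs the
 * boot-relative bintime as
 *
 *	filerev = (bt.sec << 32) | (bt.frac >> 32)
 *
 * i.e. whole seconds in the upper 32 bits and the most significant half of
 * the fraction in the lower 32 bits, giving a monotonically increasing
 * 64-bit starting point for va_filerev.
 */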