/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 */

/*
 * External virtual filesystem routines
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/extattr.h>
#include <sys/fcntl.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/reboot.h>
#include <sys/sleepqueue.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/stdarg.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#include <vm/uma.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

static void	delmntque(struct vnode *vp);
static void	insmntque(struct vnode *vp, struct mount *mp);
static void	vlruvp(struct vnode *vp);
static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
		    int slpflag, int slptimeo);
static void	syncer_shutdown(void *arg, int howto);
static int	vtryrecycle(struct vnode *vp);
static void	vx_lock(struct vnode *vp);
static void	vx_unlock(struct vnode *vp);
static void	vbusy(struct vnode *vp);
static void	vdropl(struct vnode *vp);
static void	vholdl(struct vnode *);

/*
 * Enable Giant pushdown based on whether or not the vm is mpsafe in this
 * build.  Without mpsafevm the buffer cache cannot run Giant free.
 */
#if defined(__alpha__) || defined(__amd64__) || defined(__i386__)
int mpsafe_vfs = 1;
#else
int mpsafe_vfs;
#endif
TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs);
SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0,
    "MPSAFE VFS");

/*
 * Number of vnodes in existence.  Increased whenever getnewvnode()
 * allocates a new vnode, never decreased.
 */
static unsigned long	numvnodes;

SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

/*
 * List of vnodes that are ready for recycling.
 */
static TAILQ_HEAD(freelst, vnode) vnode_free_list;

/*
 * Minimum number of free vnodes.  If there are fewer free vnodes than
 * this, getnewvnode() will return a newly allocated vnode.
 */
static u_long wantfreevnodes = 25;
SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
/* Number of vnodes in the free list. */
static u_long freevnodes;
SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");

/*
 * Various variables used for debugging the new implementation of
 * reassignbuf().
 * XXX these are probably of (very) limited utility now.
 */
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
static int nameileafonly;
SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");

/*
 * Cache for the mount type id assigned to NFS.  This is used for
 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
 */
int	nfs_mount_type = -1;

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/*
 * Lock for any access to the following:
 *	vnode_free_list
 *	numvnodes
 *	freevnodes
 */
static struct mtx vnode_free_list_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
static uma_zone_t vnodepoll_zone;

/* Set to 1 to print out reclaim of active vnodes */
int	prtactive;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, the block devices that
 * filesystems are mounted on are delayed only about half the time that
 * file data is delayed.  Similarly, directory updates are more critical,
 * so they are only delayed about a third of the time that file data is
 * delayed.  Thus, there are SYNCER_MAXDELAY queues that are processed
 * round-robin at a rate of one each second (driven off the filesystem
 * syncer process).  The syncer_delayno variable indicates the next queue
 * that is to be processed.  Items that need to be processed soon are
 * placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
static int syncer_delayno;
static long syncer_mask;
LIST_HEAD(synclist, bufobj);
static struct synclist *syncer_workitem_pending;
/*
 * The sync_mtx protects:
 *	bo->bo_synclist
 *	sync_vnode_count
 *	syncer_delayno
 *	syncer_state
 *	syncer_workitem_pending
 *	syncer_worklist_len
 *	rushjob
 */
static struct mtx sync_mtx;

#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
static int syncdelay = 30;		/* max time to delay syncing data */
static int filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
static int dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
static int metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");

/*
 * When shutting down the syncer, run it at four times normal speed.
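 *
 * As an illustration (SYNCER_SHUTDOWN_SPEEDUP is a compile-time constant,
 * not a tunable): during the final-delay phase, sched_sync() below sleeps
 * for hz / SYNCER_SHUTDOWN_SPEEDUP ticks per iteration instead of using
 * the usual one-second lbolt pacing, so the remaining worklist slots are
 * drained roughly four times faster.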
 */
#define SYNCER_SHUTDOWN_SPEEDUP		4
static int sync_vnode_count;
static int syncer_worklist_len;
static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
    syncer_state;

/*
 * Number of vnodes we want to exist at any one time.  This is mostly used
 * to size hash tables in vnode-related code.  It is normally not used in
 * getnewvnode(), as wantfreevnodes is normally nonzero.
 *
 * XXX desiredvnodes is historical cruft and should not exist.
 */
int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");
static int minvnodes;
SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
    &minvnodes, 0, "Minimum number of vnodes");
static int vnlru_nowhere;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");

/* Hook for calling soft updates. */
int (*softdep_process_worklist_hook)(struct mount *);

/*
 * Initialize the vnode management data structures.
 */
#ifndef	MAXVNODES_MAX
#define	MAXVNODES_MAX	100000
#endif
static void
vntblinit(void *dummy __unused)
{

	/*
	 * Desiredvnodes is a function of the physical memory size and
	 * the kernel's heap size.  Specifically, desiredvnodes scales
	 * in proportion to the physical memory size until two fifths
	 * of the kernel's heap size is consumed by vnodes and vm
	 * objects.
	 */
	desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
	    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
	if (desiredvnodes > MAXVNODES_MAX) {
		if (bootverbose)
			printf("Reducing kern.maxvnodes %d -> %d\n",
			    desiredvnodes, MAXVNODES_MAX);
		desiredvnodes = MAXVNODES_MAX;
	}
	minvnodes = desiredvnodes / 4;
	mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF);
	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
	TAILQ_INIT(&vnode_free_list);
	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)


/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp, td)
	struct mount *mp;
	int flags;
	struct mtx *interlkp;
	struct thread *td;
{
	int lkflags;

	MNT_ILOCK(mp);
	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & LK_NOWAIT) {
			MNT_IUNLOCK(mp);
			return (ENOENT);
		}
		if (interlkp)
			mtx_unlock(interlkp);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		msleep(mp, MNT_MTX(mp), PVFS|PDROP, "vfs_busy", 0);
		if (interlkp)
			mtx_lock(interlkp);
		return (ENOENT);
	}
	if (interlkp)
		mtx_unlock(interlkp);
	lkflags = LK_SHARED | LK_NOPAUSE | LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp), td))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mp, td)
	struct mount *mp;
	struct thread *td;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	struct mount *mp;

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			mtx_unlock(&mountlist_mtx);
			return (mp);
		}
	}
	mtx_unlock(&mountlist_mtx);
	return ((struct mount *) 0);
}

/*
 * Check if a user can access privileged mount options.
 */
int
vfs_suser(struct mount *mp, struct thread *td)
{
	int error;

	if ((mp->mnt_flag & MNT_USER) == 0 ||
	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
		if ((error = suser(td)) != 0)
			return (error);
	}
	return (0);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators
 * only support 16-bit device numbers.  We end up with unique val[0]'s for
 * the first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_int16_t mntid_base;
	fsid_t tfsid;
	int mtype;

	mtx_lock(&mntid_mtx);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makedev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if (vfs_getvfs(&tfsid) == NULL)
			break;
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	mtx_unlock(&mntid_mtx);
}

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "");

/*
 * Get a current timestamp.
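 *
 * A sketch of typical use by a filesystem when it stamps times; the inode
 * field named here is illustrative only, not an interface defined in this
 * file:
 *
 *	struct timespec ts;
 *
 *	vfs_timestamp(&ts);
 *	ip->i_mtime = ts.tv_sec;	(hypothetical inode member)
 *
 * The precision actually recorded follows vfs.timestamp_precision above.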
 */
void
vfs_timestamp(tsp)
	struct timespec *tsp;
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * This routine is called when we have too many vnodes.  It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).  Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * The buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 */
static int
vlrureclaim(struct mount *mp)
{
	struct vnode *vp;
	int done;
	int trigger;
	int usevnodes;
	int count;

	/*
	 * Calculate the trigger point, don't allow user
	 * screwups to blow us up.  This prevents us from
	 * recycling vnodes with lots of resident pages.  We
	 * aren't trying to free memory, we are trying to
	 * free vnodes.
	 */
	usevnodes = desiredvnodes;
	if (usevnodes <= 0)
		usevnodes = 1;
	trigger = cnt.v_page_count * 2 / usevnodes;

	done = 0;
	MNT_ILOCK(mp);
	count = mp->mnt_nvnodelistsize / 10 + 1;
	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);

		if (vp->v_type != VNON &&
		    vp->v_type != VBAD &&
		    VI_TRYLOCK(vp)) {
			if (VMIGHTFREE(vp) &&		/* critical path opt */
			    (vp->v_object == NULL ||
			    vp->v_object->resident_page_count < trigger)) {
				MNT_IUNLOCK(mp);
				vgonel(vp, curthread);
				done++;
				MNT_ILOCK(mp);
			} else
				VI_UNLOCK(vp);
		}
		--count;
	}
	MNT_IUNLOCK(mp);
	return done;
}

/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrureclaim() from the bowels of filesystem code has some
 * interesting deadlock problems.
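 *
 * Instead of reclaiming inline, getnewvnode() below sets vnlruproc_sig,
 * wakes this process and sleeps on &vnlruproc_sig until the vnode count
 * has been pushed back under the limit.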
 */
static struct proc *vnlruproc;
static int vnlruproc_sig;

static void
vnlru_proc(void)
{
	struct mount *mp, *nmp;
	int done;
	struct proc *p = vnlruproc;
	struct thread *td = FIRST_THREAD_IN_PROC(p);

	mtx_lock(&Giant);

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
	    SHUTDOWN_PRI_FIRST);

	for (;;) {
		kthread_suspend_check(p);
		mtx_lock(&vnode_free_list_mtx);
		if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
			vnlruproc_sig = 0;
			wakeup(&vnlruproc_sig);
			msleep(vnlruproc, &vnode_free_list_mtx,
			    PVFS|PDROP, "vlruwt", hz);
			continue;
		}
		mtx_unlock(&vnode_free_list_mtx);
		done = 0;
		mtx_lock(&mountlist_mtx);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
			done += vlrureclaim(mp);
			mtx_lock(&mountlist_mtx);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp, td);
		}
		mtx_unlock(&mountlist_mtx);
		if (done == 0) {
#if 0
			/* These messages are temporary debugging aids */
			if (vnlru_nowhere < 5)
				printf("vnlru process getting nowhere..\n");
			else if (vnlru_nowhere == 5)
				printf("vnlru process messages stopped.\n");
#endif
			vnlru_nowhere++;
			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
		}
	}
}

static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)


/*
 * Routines having to do with the management of the vnode table.
 */

/*
 * Check to see if a free vnode can be recycled.  If it can,
 * recycle it and return it with the vnode interlock held.
 */
static int
vtryrecycle(struct vnode *vp)
{
	struct thread *td = curthread;
	vm_object_t object;
	struct mount *vnmp;
	int error;

	/* Don't recycle if we can't get the interlock */
	if (!VI_TRYLOCK(vp))
		return (EWOULDBLOCK);
	if (!VCANRECYCLE(vp)) {
		VI_UNLOCK(vp);
		return (EBUSY);
	}
	/*
	 * This vnode may be found and locked via some other list, if so we
	 * can't recycle it yet.
	 */
	if (vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
		return (EWOULDBLOCK);
	/*
	 * Don't recycle if its filesystem is being suspended.
	 */
	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
		VOP_UNLOCK(vp, 0, td);
		return (EBUSY);
	}

	/*
	 * Don't recycle if we still have cached pages.
	 */
	object = vp->v_object;
	if (object != NULL) {
		VM_OBJECT_LOCK(object);
		if (object->resident_page_count ||
		    object->ref_count) {
			VM_OBJECT_UNLOCK(object);
			error = EBUSY;
			goto done;
		}
		VM_OBJECT_UNLOCK(object);
	}
	if (LIST_FIRST(&vp->v_cache_src)) {
		/*
		 * note: nameileafonly sysctl is temporary,
		 * for debugging only, and will eventually be
		 * removed.
		 */
		if (nameileafonly > 0) {
			/*
			 * Do not reuse namei-cached directory
			 * vnodes that have cached
			 * subdirectories.
			 */
			if (cache_leaf_test(vp) < 0) {
				error = EISDIR;
				goto done;
			}
		} else if (nameileafonly < 0 ||
			    vmiodirenable == 0) {
			/*
			 * Do not reuse namei-cached directory
			 * vnodes if nameileafonly is -1 or
			 * if VMIO backing for directories is
			 * turned off (otherwise we reuse them
			 * too quickly).
			 */
			error = EBUSY;
			goto done;
		}
	}
	/*
	 * If we got this far, we need to acquire the interlock and see if
	 * anyone picked up this vnode from another list.  If not, we will
	 * mark it with XLOCK via vgonel() so that anyone who does find it
	 * will skip over it.
	 */
	VI_LOCK(vp);
	if (!VCANRECYCLE(vp)) {
		VI_UNLOCK(vp);
		error = EBUSY;
		goto done;
	}
	mtx_lock(&vnode_free_list_mtx);
	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	vp->v_iflag &= ~VI_FREE;
	mtx_unlock(&vnode_free_list_mtx);
	vp->v_iflag |= VI_DOOMED;
	if ((vp->v_type != VBAD) || (vp->v_data != NULL)) {
		VOP_UNLOCK(vp, 0, td);
		vgonel(vp, td);
	} else
		VOP_UNLOCK(vp, LK_INTERLOCK, td);
	vn_finished_write(vnmp);
	return (0);
done:
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(vnmp);
	return (error);
}

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	const char *tag;
	struct mount *mp;
	struct vop_vector *vops;
	struct vnode **vpp;
{
	struct vnode *vp = NULL;
	struct vpollinfo *pollinfo = NULL;
	struct bufobj *bo;

	mtx_lock(&vnode_free_list_mtx);

	/*
	 * Try to reuse vnodes if we hit the max.  This situation only
	 * occurs in certain large-memory (2G+) situations.  We cannot
	 * attempt to directly reclaim vnodes due to nasty recursion
	 * problems.
	 */
	while (numvnodes - freevnodes > desiredvnodes) {
		if (vnlruproc_sig == 0) {
			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
			wakeup(vnlruproc);
		}
		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
		    "vlruwk", hz);
	}

	/*
	 * Attempt to reuse a vnode already on the free list, allocating
	 * a new vnode if we can't find one or if we have not reached a
	 * good minimum for good LRU performance.
	 */

	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
		int error;
		int count;

		for (count = 0; count < freevnodes; count++) {
			vp = TAILQ_FIRST(&vnode_free_list);
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			mtx_unlock(&vnode_free_list_mtx);
			error = vtryrecycle(vp);
			mtx_lock(&vnode_free_list_mtx);
			if (error == 0)
				break;
			vp = NULL;
		}
	}
	if (vp) {
		freevnodes--;
		bo = &vp->v_bufobj;
		mtx_unlock(&vnode_free_list_mtx);

#ifdef INVARIANTS
		{
			if (vp->v_data)
				printf("cleaned vnode isn't, "
				    "address %p, inode %p\n",
				    vp, vp->v_data);
			if (bo->bo_numoutput)
				panic("%p: Clean vnode has pending I/O's", vp);
			if (vp->v_usecount != 0)
				panic("%p: Non-zero use count", vp);
			if (vp->v_writecount != 0)
				panic("%p: Non-zero write count", vp);
		}
#endif
		if ((pollinfo = vp->v_pollinfo) != NULL) {
			/*
			 * To avoid lock order reversals, the call to
			 * uma_zfree() must be delayed until the vnode
			 * interlock is released.
			 */
			vp->v_pollinfo = NULL;
		}
#ifdef MAC
		mac_destroy_vnode(vp);
#endif
		vp->v_iflag = 0;
		vp->v_vflag = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		bzero(&vp->v_un, sizeof vp->v_un);
		lockdestroy(vp->v_vnlock);
		lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
		VNASSERT(bo->bo_clean.bv_cnt == 0, vp,
		    ("cleanbufcnt not 0"));
		VNASSERT(bo->bo_clean.bv_root == NULL, vp,
		    ("cleanblkroot not NULL"));
		VNASSERT(bo->bo_dirty.bv_cnt == 0, vp,
		    ("dirtybufcnt not 0"));
		VNASSERT(bo->bo_dirty.bv_root == NULL, vp,
		    ("dirtyblkroot not NULL"));
	} else {
		numvnodes++;
		mtx_unlock(&vnode_free_list_mtx);

		vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
		mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
		vp->v_dd = vp;
		bo = &vp->v_bufobj;
		bo->__bo_vnode = vp;
		bo->bo_mtx = &vp->v_interlock;
		vp->v_vnlock = &vp->v_lock;
		lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
		cache_purge(vp);		/* Sets up v_id. */
		LIST_INIT(&vp->v_cache_src);
		TAILQ_INIT(&vp->v_cache_dst);
	}

	TAILQ_INIT(&bo->bo_clean.bv_hd);
	TAILQ_INIT(&bo->bo_dirty.bv_hd);
	bo->bo_ops = &buf_ops_bio;
	bo->bo_private = vp;
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	if (pollinfo != NULL) {
		knlist_destroy(&pollinfo->vpi_selinfo.si_note);
		mtx_destroy(&pollinfo->vpi_lock);
		uma_zfree(vnodepoll_zone, pollinfo);
	}
#ifdef MAC
	mac_init_vnode(vp);
	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
		mac_associate_vnode_singlelabel(mp, vp);
	else if (mp == NULL)
		printf("NULL mp in getnewvnode()\n");
#endif
	delmntque(vp);
	if (mp != NULL) {
		insmntque(vp, mp);
		bo->bo_bsize = mp->mnt_stat.f_iosize;
	}

	return (0);
}

/*
 * Delete from old mount point vnode list, if on one.
 */
static void
delmntque(struct vnode *vp)
{
	struct mount *mp;

	if (vp->v_mount == NULL)
		return;
	mp = vp->v_mount;
	MNT_ILOCK(mp);
	vp->v_mount = NULL;
	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
	    ("bad mount point vnode list size"));
	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	mp->mnt_nvnodelistsize--;
	MNT_IUNLOCK(mp);
}

/*
 * Insert into list of vnodes for the new mount point, if available.
 */
static void
insmntque(struct vnode *vp, struct mount *mp)
{

	vp->v_mount = mp;
	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
	MNT_ILOCK(vp->v_mount);
	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	mp->mnt_nvnodelistsize++;
	MNT_IUNLOCK(vp->v_mount);
}

/*
 * Flush out and invalidate all buffers associated with a bufobj.
 * Called with the underlying object locked.
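 *
 * The flags are interpreted as in flushbuflist() below: V_SAVE asks for
 * dirty buffers to be written out before they are invalidated, while
 * V_ALT / V_NORMAL restrict the flush to buffers with / without
 * BX_ALTDATA set.  One plausible call, shown only as a sketch:
 *
 *	error = vinvalbuf(vp, V_SAVE, curthread, 0, 0);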
 */
int
bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag, int slptimeo)
{
	int error;

	BO_LOCK(bo);
	if (flags & V_SAVE) {
		error = bufobj_wwait(bo, slpflag, slptimeo);
		if (error) {
			BO_UNLOCK(bo);
			return (error);
		}
		if (bo->bo_dirty.bv_cnt > 0) {
			BO_UNLOCK(bo);
			if ((error = BO_SYNC(bo, MNT_WAIT, td)) != 0)
				return (error);
			/*
			 * XXX We could save a lock/unlock if this was only
			 * enabled under INVARIANTS
			 */
			BO_LOCK(bo);
			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
				panic("vinvalbuf: dirty bufs");
		}
	}
	/*
	 * If you alter this loop please notice that interlock is dropped and
	 * reacquired in flushbuflist.  Special care is needed to ensure that
	 * no race conditions occur from this.
	 */
	do {
		error = flushbuflist(&bo->bo_clean,
		    flags, bo, slpflag, slptimeo);
		if (error == 0)
			error = flushbuflist(&bo->bo_dirty,
			    flags, bo, slpflag, slptimeo);
		if (error != 0 && error != EAGAIN) {
			BO_UNLOCK(bo);
			return (error);
		}
	} while (error != 0);

	/*
	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
	 * have write I/O in-progress but if there is a VM object then the
	 * VM object can also have read-I/O in-progress.
	 */
	do {
		bufobj_wwait(bo, 0, 0);
		BO_UNLOCK(bo);
		if (bo->bo_object != NULL) {
			VM_OBJECT_LOCK(bo->bo_object);
			vm_object_pip_wait(bo->bo_object, "bovlbx");
			VM_OBJECT_UNLOCK(bo->bo_object);
		}
		BO_LOCK(bo);
	} while (bo->bo_numoutput > 0);
	BO_UNLOCK(bo);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	if (bo->bo_object != NULL) {
		VM_OBJECT_LOCK(bo->bo_object);
		vm_object_page_remove(bo->bo_object, 0, 0,
		    (flags & V_SAVE) ? TRUE : FALSE);
		VM_OBJECT_UNLOCK(bo->bo_object);
	}

#ifdef INVARIANTS
	BO_LOCK(bo);
	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
		panic("vinvalbuf: flush failed");
	BO_UNLOCK(bo);
#endif
	return (0);
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag, int slptimeo)
{

	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
	return (bufobj_invalbuf(&vp->v_bufobj, flags, td, slpflag, slptimeo));
}

/*
 * Flush out buffers on the specified list.
 *
 */
static int
flushbuflist(bufv, flags, bo, slpflag, slptimeo)
	struct bufv *bufv;
	int flags;
	struct bufobj *bo;
	int slpflag, slptimeo;
{
	struct buf *bp, *nbp;
	int retval, error;

	ASSERT_BO_LOCKED(bo);

	retval = 0;
	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
			continue;
		}
		retval = EAGAIN;
		error = BUF_TIMELOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
		    "flushbuf", slpflag, slptimeo);
		if (error) {
			BO_LOCK(bo);
			return (error != ENOLCK ? error : EAGAIN);
		}
		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
			BO_LOCK(bo);
			return (EAGAIN);
		}
		/*
		 * XXX Since there are no node locks for NFS, I
		 * believe there is a slight chance that a delayed
		 * write will occur while sleeping just above, so
		 * check for it.
		 */
		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
		    (flags & V_SAVE)) {
			bremfree(bp);
			bp->b_flags |= B_ASYNC;
			bwrite(bp);
			BO_LOCK(bo);
			return (EAGAIN);	/* XXX: why not loop ? */
		}
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
		bp->b_flags &= ~B_ASYNC;
		brelse(bp);
		BO_LOCK(bo);
	}
	return (retval);
}

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 */
int
vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize)
{
	struct buf *bp, *nbp;
	int anyfreed;
	int trunclbn;
	struct bufobj *bo;

	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
restart:
	VI_LOCK(vp);
	bo = &vp->v_bufobj;
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno < trunclbn)
				continue;
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    VI_MTX(vp)) == ENOLCK)
				goto restart;

			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
			anyfreed = 1;

			if (nbp != NULL &&
			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
			    (nbp->b_vp != vp) ||
			    (nbp->b_flags & B_DELWRI))) {
				goto restart;
			}
			VI_LOCK(vp);
		}

		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno < trunclbn)
				continue;
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    VI_MTX(vp)) == ENOLCK)
				goto restart;
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
			anyfreed = 1;
			if (nbp != NULL &&
			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
			    (nbp->b_vp != vp) ||
			    (nbp->b_flags & B_DELWRI) == 0)) {
				goto restart;
			}
			VI_LOCK(vp);
		}
	}

	if (length > 0) {
restartsync:
		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno > 0)
				continue;
			/*
			 * Since we hold the vnode lock this should only
			 * fail if we're racing with the buf daemon.
			 */
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    VI_MTX(vp)) == ENOLCK) {
				goto restart;
			}
			VNASSERT((bp->b_flags & B_DELWRI), vp,
			    ("buf(%p) on dirty queue without DELWRI", bp));

			bremfree(bp);
			bawrite(bp);
			VI_LOCK(vp);
			goto restartsync;
		}
	}

	bufobj_wwait(bo, 0, 0);
	VI_UNLOCK(vp);
	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * buf_splay() - splay tree core for the clean/dirty list of buffers in
 *		 a vnode.
 *
 * NOTE: We have to deal with the special case of a background bitmap
 * buffer, a situation where two buffers will have the same logical
 * block offset.  We want (1) only the foreground buffer to be accessed
 * in a lookup, and (2) to differentiate between the foreground and
 * background buffer in the splay tree algorithm, because the splay
 * tree cannot normally handle multiple entities with the same 'index'.
 * We accomplish this by adding differentiating flags to the splay tree's
 * numerical domain.
 */
static
struct buf *
buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
{
	struct buf dummy;
	struct buf *lefttreemax, *righttreemin, *y;

	if (root == NULL)
		return (NULL);
	lefttreemax = righttreemin = &dummy;
	for (;;) {
		if (lblkno < root->b_lblkno ||
		    (lblkno == root->b_lblkno &&
		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
			if ((y = root->b_left) == NULL)
				break;
			if (lblkno < y->b_lblkno) {
				/* Rotate right. */
				root->b_left = y->b_right;
				y->b_right = root;
				root = y;
				if ((y = root->b_left) == NULL)
					break;
			}
			/* Link into the new root's right tree. */
			righttreemin->b_left = root;
			righttreemin = root;
		} else if (lblkno > root->b_lblkno ||
		    (lblkno == root->b_lblkno &&
		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
			if ((y = root->b_right) == NULL)
				break;
			if (lblkno > y->b_lblkno) {
				/* Rotate left. */
				root->b_right = y->b_left;
				y->b_left = root;
				root = y;
				if ((y = root->b_right) == NULL)
					break;
			}
			/* Link into the new root's left tree. */
			lefttreemax->b_right = root;
			lefttreemax = root;
		} else {
			break;
		}
		root = y;
	}
	/* Assemble the new root. */
	lefttreemax->b_right = root->b_left;
	righttreemin->b_left = root->b_right;
	root->b_left = dummy.b_right;
	root->b_right = dummy.b_left;
	return (root);
}

static void
buf_vlist_remove(struct buf *bp)
{
	struct buf *root;
	struct bufv *bv;

	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
	ASSERT_BO_LOCKED(bp->b_bufobj);
	if (bp->b_xflags & BX_VNDIRTY)
		bv = &bp->b_bufobj->bo_dirty;
	else
		bv = &bp->b_bufobj->bo_clean;
	if (bp != bv->bv_root) {
		root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
		KASSERT(root == bp, ("splay lookup failed in remove"));
	}
	if (bp->b_left == NULL) {
		root = bp->b_right;
	} else {
		root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
		root->b_right = bp->b_right;
	}
	bv->bv_root = root;
	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
	bv->bv_cnt--;
	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
}

/*
 * Add the buffer to the sorted clean or dirty block list using a
 * splay tree algorithm.
 *
 * NOTE: xflags is passed as a constant, optimizing this inline function!
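 *
 * After buf_splay() the nearest existing buffer sits at the root, so the
 * new buffer is linked in as the new root, with the old root becoming
 * either its left or right child and its neighbor in the TAILQ.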
 */
static void
buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
{
	struct buf *root;
	struct bufv *bv;

	ASSERT_BO_LOCKED(bo);
	bp->b_xflags |= xflags;
	if (xflags & BX_VNDIRTY)
		bv = &bo->bo_dirty;
	else
		bv = &bo->bo_clean;

	root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
	if (root == NULL) {
		bp->b_left = NULL;
		bp->b_right = NULL;
		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
	} else if (bp->b_lblkno < root->b_lblkno ||
	    (bp->b_lblkno == root->b_lblkno &&
	    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
		bp->b_left = root->b_left;
		bp->b_right = root;
		root->b_left = NULL;
		TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
	} else {
		bp->b_right = root->b_right;
		bp->b_left = root;
		root->b_right = NULL;
		TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
	}
	bv->bv_cnt++;
	bv->bv_root = bp;
}

/*
 * Lookup a buffer using the splay tree.  Note that we specifically avoid
 * shadow buffers used in background bitmap writes.
 *
 * This code isn't quite as efficient as it could be because we are
 * maintaining two sorted lists and do not know which list the block
 * resides in.
 *
 * During a "make buildworld" the desired buffer is found at one of
 * the roots more than 60% of the time.  Thus, checking both roots
 * before performing either splay eliminates unnecessary splays on the
 * first tree splayed.
 */
struct buf *
gbincore(struct bufobj *bo, daddr_t lblkno)
{
	struct buf *bp;

	ASSERT_BO_LOCKED(bo);
	if ((bp = bo->bo_clean.bv_root) != NULL &&
	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
		return (bp);
	if ((bp = bo->bo_dirty.bv_root) != NULL &&
	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
		return (bp);
	if ((bp = bo->bo_clean.bv_root) != NULL) {
		bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
			return (bp);
	}
	if ((bp = bo->bo_dirty.bv_root) != NULL) {
		bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
			return (bp);
	}
	return (NULL);
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(struct vnode *vp, struct buf *bp)
{

	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));

	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
	    ("bgetvp: bp already attached! %p", bp));

	ASSERT_VI_LOCKED(vp, "bgetvp");
	vholdl(vp);
	bp->b_vp = vp;
	bp->b_bufobj = &vp->v_bufobj;
	/*
	 * Insert onto list for new vnode.
	 */
	buf_vlist_add(bp, &vp->v_bufobj, BX_VNCLEAN);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(struct buf *bp)
{
	struct bufobj *bo;
	struct vnode *vp;

	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;		/* XXX */
	bo = bp->b_bufobj;
	BO_LOCK(bo);
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
		buf_vlist_remove(bp);
	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
		bo->bo_flag &= ~BO_ONWORKLST;
		mtx_lock(&sync_mtx);
		LIST_REMOVE(bo, bo_synclist);
		syncer_worklist_len--;
		mtx_unlock(&sync_mtx);
	}
	vdropl(vp);
	bp->b_vp = NULL;
	bp->b_bufobj = NULL;
	BO_UNLOCK(bo);
}

/*
 * Add an item to the syncer work queue.
 */
static void
vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
{
	int slot;

	ASSERT_BO_LOCKED(bo);

	mtx_lock(&sync_mtx);
	if (bo->bo_flag & BO_ONWORKLST)
		LIST_REMOVE(bo, bo_synclist);
	else {
		bo->bo_flag |= BO_ONWORKLST;
		syncer_worklist_len++;
	}

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
	mtx_unlock(&sync_mtx);
}

static int
sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
{
	int error, len;

	mtx_lock(&sync_mtx);
	len = syncer_worklist_len - sync_vnode_count;
	mtx_unlock(&sync_mtx);
	error = SYSCTL_OUT(req, &len, sizeof(len));
	return (error);
}

SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
    sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");

struct proc *updateproc;
static void sched_sync(void);
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

static int
sync_vnode(struct bufobj *bo, struct thread *td)
{
	struct vnode *vp;
	struct mount *mp;

	vp = bo->__bo_vnode;	/* XXX */
	if (VOP_ISLOCKED(vp, NULL) != 0)
		return (1);
	if (VI_TRYLOCK(vp) == 0)
		return (1);
	/*
	 * We use vhold in case the vnode does not
	 * successfully sync.  vhold prevents the vnode from
	 * going away when we unlock the sync_mtx so that
	 * we can acquire the vnode interlock.
	 */
	vholdl(vp);
	mtx_unlock(&sync_mtx);
	VI_UNLOCK(vp);
	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
		vdrop(vp);
		mtx_lock(&sync_mtx);
		return (1);
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	(void) VOP_FSYNC(vp, MNT_LAZY, td);
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(mp);
	VI_LOCK(vp);
	if ((bo->bo_flag & BO_ONWORKLST) != 0) {
		/*
		 * Put us back on the worklist.  The worklist
		 * routine will remove us from our current
		 * position and then add us back in at a later
		 * position.
		 */
		vn_syncer_add_to_worklist(bo, syncdelay);
	}
	vdropl(vp);
	VI_UNLOCK(vp);
	mtx_lock(&sync_mtx);
	return (0);
}

/*
 * System filesystem synchronizer daemon.
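 *
 * The daemon is a small state machine: it normally runs in SYNCER_RUNNING,
 * syncer_shutdown() moves it to SYNCER_SHUTTING_DOWN, and once the worklist
 * has been reduced to syncer vnodes only it finishes in SYNCER_FINAL_DELAY
 * for SYNCER_SHUTDOWN_SPEEDUP iterations before suspending.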
 */
static void
sched_sync(void)
{
	struct synclist *next;
	struct synclist *slp;
	struct bufobj *bo;
	long starttime;
	struct thread *td = FIRST_THREAD_IN_PROC(updateproc);
	static int dummychan;
	int last_work_seen;
	int net_worklist_len;
	int syncer_final_iter;
	int first_printf;
	int error;

	mtx_lock(&Giant);
	last_work_seen = 0;
	syncer_final_iter = 0;
	first_printf = 1;
	syncer_state = SYNCER_RUNNING;
	starttime = time_second;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
	    SHUTDOWN_PRI_LAST);

	for (;;) {
		mtx_lock(&sync_mtx);
		if (syncer_state == SYNCER_FINAL_DELAY &&
		    syncer_final_iter == 0) {
			mtx_unlock(&sync_mtx);
			kthread_suspend_check(td->td_proc);
			mtx_lock(&sync_mtx);
		}
		net_worklist_len = syncer_worklist_len - sync_vnode_count;
		if (syncer_state != SYNCER_RUNNING &&
		    starttime != time_second) {
			if (first_printf) {
				printf("\nSyncing disks, vnodes remaining...");
				first_printf = 0;
			}
			printf("%d ", net_worklist_len);
		}
		starttime = time_second;

		/*
		 * Push files whose dirty time has expired.  Be careful
		 * of interrupt race on slp queue.
		 *
		 * Skip over empty worklist slots when shutting down.
		 */
		do {
			slp = &syncer_workitem_pending[syncer_delayno];
			syncer_delayno += 1;
			if (syncer_delayno == syncer_maxdelay)
				syncer_delayno = 0;
			next = &syncer_workitem_pending[syncer_delayno];
			/*
			 * If the worklist has wrapped since it
			 * was emptied of all but syncer vnodes,
			 * switch to the FINAL_DELAY state and run
			 * for one more second.
			 */
			if (syncer_state == SYNCER_SHUTTING_DOWN &&
			    net_worklist_len == 0 &&
			    last_work_seen == syncer_delayno) {
				syncer_state = SYNCER_FINAL_DELAY;
				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
			}
		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
		    syncer_worklist_len > 0);

		/*
		 * Keep track of the last time there was anything
		 * on the worklist other than syncer vnodes.
		 * Return to the SHUTTING_DOWN state if any
		 * new work appears.
		 */
		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
			last_work_seen = syncer_delayno;
		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
			syncer_state = SYNCER_SHUTTING_DOWN;
		while ((bo = LIST_FIRST(slp)) != NULL) {
			error = sync_vnode(bo, td);
			if (error == 1) {
				LIST_REMOVE(bo, bo_synclist);
				LIST_INSERT_HEAD(next, bo, bo_synclist);
				continue;
			}
		}
		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
			syncer_final_iter--;
		mtx_unlock(&sync_mtx);

		/*
		 * Do soft update processing.
		 */
		if (softdep_process_worklist_hook != NULL)
			(*softdep_process_worklist_hook)(NULL);

		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process.  A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP.  Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
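		 *
		 * rushjob is bumped (by at most syncdelay / 2 in total) via
		 * speedup_syncer() below; each extra count makes this loop
		 * process one additional worklist slot without sleeping.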
		 */
		mtx_lock(&sync_mtx);
		if (rushjob > 0) {
			rushjob -= 1;
			mtx_unlock(&sync_mtx);
			continue;
		}
		mtx_unlock(&sync_mtx);
		/*
		 * Just sleep for a short period of time between
		 * iterations when shutting down to allow some I/O
		 * to happen.
		 *
		 * If it has taken us less than a second to process the
		 * current work, then wait.  Otherwise start right over
		 * again.  We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (syncer_state != SYNCER_RUNNING)
			tsleep(&dummychan, PPAUSE, "syncfnl",
			    hz / SYNCER_SHUTDOWN_SPEEDUP);
		else if (time_second == starttime)
			tsleep(&lbolt, PPAUSE, "syncer", 0);
	}
}

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 */
int
speedup_syncer()
{
	struct thread *td;
	int ret = 0;

	td = FIRST_THREAD_IN_PROC(updateproc);
	sleepq_remove(td, &lbolt);
	mtx_lock(&sync_mtx);
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		stat_rush_requests += 1;
		ret = 1;
	}
	mtx_unlock(&sync_mtx);
	return (ret);
}

/*
 * Tell the syncer to speed up its work and run through its work
 * list several times, then tell it to shut down.
 */
static void
syncer_shutdown(void *arg, int howto)
{
	struct thread *td;

	if (howto & RB_NOSYNC)
		return;
	td = FIRST_THREAD_IN_PROC(updateproc);
	sleepq_remove(td, &lbolt);
	mtx_lock(&sync_mtx);
	syncer_state = SYNCER_SHUTTING_DOWN;
	rushjob = 0;
	mtx_unlock(&sync_mtx);
	kproc_shutdown(arg, howto);
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(struct buf *bp)
{
	struct vnode *vp;
	struct bufobj *bo;
	int delay;

	vp = bp->b_vp;
	bo = bp->b_bufobj;
	++reassignbufcalls;

	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
	    bp, bp->b_vp, bp->b_flags);
	/*
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	 */
	if (bp->b_flags & B_PAGING)
		panic("cannot reassign paging buffer");

	/*
	 * Delete from old vnode list, if on one.
	 */
	VI_LOCK(vp);
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
		buf_vlist_remove(bp);
	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
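	 *
	 * Dirty buffers also place the bufobj on the syncer worklist, with
	 * directories (dirdelay) and metadata on device vnodes (metadelay)
	 * given shorter delays than regular file data (filedelay).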
	 */
	if (bp->b_flags & B_DELWRI) {
		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
			switch (vp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VCHR:
				delay = metadelay;
				break;
			default:
				delay = filedelay;
			}
			vn_syncer_add_to_worklist(bo, delay);
		}
		buf_vlist_add(bp, bo, BX_VNDIRTY);
	} else {
		buf_vlist_add(bp, bo, BX_VNCLEAN);

		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
			mtx_lock(&sync_mtx);
			LIST_REMOVE(bo, bo_synclist);
			syncer_worklist_len--;
			mtx_unlock(&sync_mtx);
			bo->bo_flag &= ~BO_ONWORKLST;
		}
	}
	VI_UNLOCK(vp);
}

static void
v_incr_usecount(struct vnode *vp, int delta)
{

	vp->v_usecount += delta;
	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
		dev_lock();
		vp->v_rdev->si_usecount += delta;
		dev_unlock();
	}
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it.  The vnode lock bit is set if the
 * vnode is being eliminated in vgone.  The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new filesystem type).
 */
int
vget(vp, flags, td)
	struct vnode *vp;
	int flags;
	struct thread *td;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking that
	 * the VI_XLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0)
		VI_LOCK(vp);
	if (vp->v_iflag & VI_XLOCK && vp->v_vxthread != curthread) {
		if ((flags & LK_NOWAIT) == 0) {
			vx_waitl(vp);
			VI_UNLOCK(vp);
			return (ENOENT);
		}
		VI_UNLOCK(vp);
		return (EBUSY);
	}

	v_incr_usecount(vp, 1);

	if (VSHOULDBUSY(vp))
		vbusy(vp);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active.  We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			VI_LOCK(vp);
			v_incr_usecount(vp, -1);
			if (VSHOULDFREE(vp))
				vfree(vp);
			else
				vlruvp(vp);
			VI_UNLOCK(vp);
		}
		return (error);
	}
	VI_UNLOCK(vp);
	return (0);
}

/*
 * Increase the reference count of a vnode.
 */
void
vref(struct vnode *vp)
{

	VI_LOCK(vp);
	v_incr_usecount(vp, 1);
	VI_UNLOCK(vp);
}

/*
 * Return reference count of a vnode.
 *
 * The results of this call are only guaranteed when some mechanism other
 * than the VI lock is used to stop other processes from gaining references
 * to the vnode.  This may be the case if the caller holds the only reference.
 * This is also useful when stale data is acceptable as race conditions may
 * be accounted for by some other means.
 */
int
vrefcnt(struct vnode *vp)
{
	int usecnt;

	VI_LOCK(vp);
	usecnt = vp->v_usecount;
	VI_UNLOCK(vp);

	return (usecnt);
}


/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct thread *td = curthread;	/* XXX */

	KASSERT(vp != NULL, ("vrele: null vp"));

	VI_LOCK(vp);

	/* Skip this v_writecount check if we're going to panic below. */
	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
	    ("vrele: missed vn_close"));

	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
	    vp->v_usecount == 1)) {
		v_incr_usecount(vp, -1);
		VI_UNLOCK(vp);

		return;
	}

	if (vp->v_usecount == 1) {
		v_incr_usecount(vp, -1);
		/*
		 * We must call VOP_INACTIVE with the node locked.  Mark
		 * as VI_DOINGINACT to avoid recursion.
		 */
		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) {
			VI_LOCK(vp);
			VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
			    ("vrele: recursed on VI_DOINGINACT"));
			vp->v_iflag |= VI_DOINGINACT;
			VI_UNLOCK(vp);
			VOP_INACTIVE(vp, td);
			VI_LOCK(vp);
			VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
			    ("vrele: lost VI_DOINGINACT"));
			vp->v_iflag &= ~VI_DOINGINACT;
		} else
			VI_LOCK(vp);
		if (VSHOULDFREE(vp))
			vfree(vp);
		else
			vlruvp(vp);
		VI_UNLOCK(vp);

	} else {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
#endif
		VI_UNLOCK(vp);
		panic("vrele: negative ref cnt");
	}
}

/*
 * Release an already locked vnode.  This gives the same effects as
 * unlock+vrele(), but takes less time and avoids releasing and
 * re-acquiring the lock (as vrele() acquires the lock internally.)
 */
void
vput(vp)
	struct vnode *vp;
{
	struct thread *td = curthread;	/* XXX */

	KASSERT(vp != NULL, ("vput: null vp"));
	VI_LOCK(vp);
	/* Skip this v_writecount check if we're going to panic below. */
	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
	    ("vput: missed vn_close"));

	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
	    vp->v_usecount == 1)) {
		v_incr_usecount(vp, -1);
		VOP_UNLOCK(vp, LK_INTERLOCK, td);
		return;
	}

	if (vp->v_usecount == 1) {
		v_incr_usecount(vp, -1);
		/*
		 * We must call VOP_INACTIVE with the node locked, so
		 * we just need to release the vnode mutex.  Mark as
		 * VI_DOINGINACT to avoid recursion.
		 */
		VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
		    ("vput: recursed on VI_DOINGINACT"));
		vp->v_iflag |= VI_DOINGINACT;
		VI_UNLOCK(vp);
		VOP_INACTIVE(vp, td);
		VI_LOCK(vp);
		VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
		    ("vput: lost VI_DOINGINACT"));
		vp->v_iflag &= ~VI_DOINGINACT;
		if (VSHOULDFREE(vp))
			vfree(vp);
		else
			vlruvp(vp);
		VI_UNLOCK(vp);

	} else {
#ifdef DIAGNOSTIC
		vprint("vput: negative ref count", vp);
#endif
		panic("vput: negative ref cnt");
	}
}

/*
 * Somebody doesn't want the vnode recycled.
 */
void
vhold(struct vnode *vp)
{

	VI_LOCK(vp);
	vholdl(vp);
	VI_UNLOCK(vp);
}

static void
vholdl(struct vnode *vp)
{

	vp->v_holdcnt++;
	if (VSHOULDBUSY(vp))
		vbusy(vp);
}

/*
 * Note that there is one less who cares about this vnode.  vdrop() is the
 * opposite of vhold().
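 *
 * A representative pairing in this file: bgetvp() calls vholdl() when a
 * buffer is attached to a vnode and brelvp() calls vdropl() when it is
 * detached, so a vnode that still has buffers keeps a nonzero hold count.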
2004 */ 2005 void 2006 vdrop(struct vnode *vp) 2007 { 2008 2009 VI_LOCK(vp); 2010 vdropl(vp); 2011 VI_UNLOCK(vp); 2012 } 2013 2014 static void 2015 vdropl(struct vnode *vp) 2016 { 2017 2018 if (vp->v_holdcnt <= 0) 2019 panic("vdrop: holdcnt"); 2020 vp->v_holdcnt--; 2021 if (VSHOULDFREE(vp)) 2022 vfree(vp); 2023 else 2024 vlruvp(vp); 2025 } 2026 2027 /* 2028 * Remove any vnodes in the vnode table belonging to mount point mp. 2029 * 2030 * If FORCECLOSE is not specified, there should not be any active ones, 2031 * return error if any are found (nb: this is a user error, not a 2032 * system error). If FORCECLOSE is specified, detach any active vnodes 2033 * that are found. 2034 * 2035 * If WRITECLOSE is set, only flush out regular file vnodes open for 2036 * writing. 2037 * 2038 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 2039 * 2040 * `rootrefs' specifies the base reference count for the root vnode 2041 * of this filesystem. The root vnode is considered busy if its 2042 * v_usecount exceeds this value. On a successful return, vflush(, td) 2043 * will call vrele() on the root vnode exactly rootrefs times. 2044 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 2045 * be zero. 2046 */ 2047 #ifdef DIAGNOSTIC 2048 static int busyprt = 0; /* print out busy vnodes */ 2049 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); 2050 #endif 2051 2052 int 2053 vflush(mp, rootrefs, flags, td) 2054 struct mount *mp; 2055 int rootrefs; 2056 int flags; 2057 struct thread *td; 2058 { 2059 struct vnode *vp, *nvp, *rootvp = NULL; 2060 struct vattr vattr; 2061 int busy = 0, error; 2062 2063 if (rootrefs > 0) { 2064 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 2065 ("vflush: bad args")); 2066 /* 2067 * Get the filesystem root vnode. We can vput() it 2068 * immediately, since with rootrefs > 0, it won't go away. 2069 */ 2070 if ((error = VFS_ROOT(mp, &rootvp, td)) != 0) 2071 return (error); 2072 vput(rootvp); 2073 2074 } 2075 MNT_ILOCK(mp); 2076 loop: 2077 MNT_VNODE_FOREACH(vp, mp, nvp) { 2078 2079 VI_LOCK(vp); 2080 MNT_IUNLOCK(mp); 2081 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td); 2082 if (error) { 2083 MNT_ILOCK(mp); 2084 goto loop; 2085 } 2086 /* 2087 * Skip over a vnodes marked VV_SYSTEM. 2088 */ 2089 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 2090 VOP_UNLOCK(vp, 0, td); 2091 MNT_ILOCK(mp); 2092 continue; 2093 } 2094 /* 2095 * If WRITECLOSE is set, flush out unlinked but still open 2096 * files (even if open only for reading) and regular file 2097 * vnodes open for writing. 2098 */ 2099 if (flags & WRITECLOSE) { 2100 error = VOP_GETATTR(vp, &vattr, td->td_ucred, td); 2101 VI_LOCK(vp); 2102 2103 if ((vp->v_type == VNON || 2104 (error == 0 && vattr.va_nlink > 0)) && 2105 (vp->v_writecount == 0 || vp->v_type != VREG)) { 2106 VOP_UNLOCK(vp, LK_INTERLOCK, td); 2107 MNT_ILOCK(mp); 2108 continue; 2109 } 2110 } else 2111 VI_LOCK(vp); 2112 2113 VOP_UNLOCK(vp, 0, td); 2114 2115 /* 2116 * With v_usecount == 0, all we need to do is clear out the 2117 * vnode data structures and we are done. 2118 */ 2119 if (vp->v_usecount == 0) { 2120 vgonel(vp, td); 2121 MNT_ILOCK(mp); 2122 continue; 2123 } 2124 2125 /* 2126 * If FORCECLOSE is set, forcibly close the vnode. For block 2127 * or character devices, revert to an anonymous device. For 2128 * all other files, just kill them. 
2129 */ 2130 if (flags & FORCECLOSE) { 2131 VNASSERT(vp->v_type != VCHR && vp->v_type != VBLK, vp, 2132 ("device VNODE %p is FORCECLOSED", vp)); 2133 vgonel(vp, td); 2134 MNT_ILOCK(mp); 2135 continue; 2136 } 2137 #ifdef DIAGNOSTIC 2138 if (busyprt) 2139 vprint("vflush: busy vnode", vp); 2140 #endif 2141 VI_UNLOCK(vp); 2142 MNT_ILOCK(mp); 2143 busy++; 2144 } 2145 MNT_IUNLOCK(mp); 2146 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 2147 /* 2148 * If just the root vnode is busy, and if its refcount 2149 * is equal to `rootrefs', then go ahead and kill it. 2150 */ 2151 VI_LOCK(rootvp); 2152 KASSERT(busy > 0, ("vflush: not busy")); 2153 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 2154 ("vflush: usecount %d < rootrefs %d", 2155 rootvp->v_usecount, rootrefs)); 2156 if (busy == 1 && rootvp->v_usecount == rootrefs) { 2157 vgonel(rootvp, td); 2158 busy = 0; 2159 } else 2160 VI_UNLOCK(rootvp); 2161 } 2162 if (busy) 2163 return (EBUSY); 2164 for (; rootrefs > 0; rootrefs--) 2165 vrele(rootvp); 2166 return (0); 2167 } 2168 2169 /* 2170 * This moves a now (likely recyclable) vnode to the end of the 2171 * mountlist. XXX However, it is temporarily disabled until we 2172 * can clean up ffs_sync() and friends, which have loop restart 2173 * conditions which this code causes to operate O(N^2). 2174 */ 2175 static void 2176 vlruvp(struct vnode *vp) 2177 { 2178 #if 0 2179 struct mount *mp; 2180 2181 if ((mp = vp->v_mount) != NULL) { 2182 MNT_ILOCK(mp); 2183 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2184 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2185 MNT_IUNLOCK(mp); 2186 } 2187 #endif 2188 } 2189 2190 static void 2191 vx_lock(struct vnode *vp) 2192 { 2193 2194 ASSERT_VI_LOCKED(vp, "vx_lock"); 2195 2196 /* 2197 * Prevent the vnode from being recycled or brought into use while we 2198 * clean it out. 2199 */ 2200 if (vp->v_iflag & VI_XLOCK) 2201 panic("vx_lock: deadlock"); 2202 vp->v_iflag |= VI_XLOCK; 2203 vp->v_vxthread = curthread; 2204 } 2205 2206 static void 2207 vx_unlock(struct vnode *vp) 2208 { 2209 ASSERT_VI_LOCKED(vp, "vx_unlock"); 2210 vp->v_iflag &= ~VI_XLOCK; 2211 vp->v_vxthread = NULL; 2212 if (vp->v_iflag & VI_XWANT) { 2213 vp->v_iflag &= ~VI_XWANT; 2214 wakeup(vp); 2215 } 2216 } 2217 2218 int 2219 vx_wait(struct vnode *vp) 2220 { 2221 int locked; 2222 2223 ASSERT_VI_UNLOCKED(vp, "vx_wait"); 2224 VI_LOCK(vp); 2225 locked = vx_waitl(vp); 2226 VI_UNLOCK(vp); 2227 return (locked); 2228 } 2229 2230 int 2231 vx_waitl(struct vnode *vp) 2232 { 2233 int locked = 0; 2234 2235 ASSERT_VI_LOCKED(vp, "vx_wait"); 2236 while (vp->v_iflag & VI_XLOCK) { 2237 locked = 1; 2238 vp->v_iflag |= VI_XWANT; 2239 msleep(vp, VI_MTX(vp), PINOD, "vxwait", 0); 2240 } 2241 return (locked); 2242 } 2243 2244 /* 2245 * Recycle an unused vnode to the front of the free list. 2246 * Release the passed interlock if the vnode will be recycled. 2247 */ 2248 int 2249 vrecycle(struct vnode *vp, struct thread *td) 2250 { 2251 2252 VI_LOCK(vp); 2253 if (vp->v_usecount == 0) { 2254 vgonel(vp, td); 2255 return (1); 2256 } 2257 VI_UNLOCK(vp); 2258 return (0); 2259 } 2260 2261 /* 2262 * Eliminate all activity associated with a vnode 2263 * in preparation for reuse. 2264 */ 2265 void 2266 vgone(struct vnode *vp) 2267 { 2268 struct thread *td = curthread; /* XXX */ 2269 2270 VI_LOCK(vp); 2271 vgonel(vp, td); 2272 } 2273 2274 /* 2275 * vgone, with the vp interlock held. 
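 *
 * (Editor's illustrative note, not part of the original source: vgonel()
 * is the workhorse behind vgone(), vrecycle() and the forced cleanout in
 * vflush() above.  A filesystem typically requests recycling from its own
 * VOP_INACTIVE routine once it knows the backing object is dead; a
 * hypothetical example, where examplefs_node and n_links are made-up
 * names:
 *
 *	static int
 *	examplefs_inactive(struct vop_inactive_args *ap)
 *	{
 *		struct examplefs_node *np = ap->a_vp->v_data;
 *
 *		if (np->n_links == 0)
 *			vrecycle(ap->a_vp, ap->a_td);
 *		return (0);
 *	}
 *
 * vrecycle() only acts when the use count is already zero, which holds
 * here because VOP_INACTIVE runs as the last reference is dropped.)
 *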
2276 */ 2277 void 2278 vgonel(struct vnode *vp, struct thread *td) 2279 { 2280 int active; 2281 2282 /* 2283 * If a vgone (or vclean) is already in progress, 2284 * wait until it is done and return. 2285 */ 2286 ASSERT_VI_LOCKED(vp, "vgonel"); 2287 if (vx_waitl(vp)) { 2288 VI_UNLOCK(vp); 2289 return; 2290 } 2291 2292 vx_lock(vp); 2293 2294 /* 2295 * Check to see if the vnode is in use. If so we have to reference it 2296 * before we clean it out so that its count cannot fall to zero and 2297 * generate a race against ourselves to recycle it. 2298 */ 2299 if ((active = vp->v_usecount)) 2300 v_incr_usecount(vp, 1); 2301 2302 /* 2303 * Even if the count is zero, the VOP_INACTIVE routine may still 2304 * have the object locked while it cleans it out. The VOP_LOCK 2305 * ensures that the VOP_INACTIVE routine is done with its work. 2306 * For active vnodes, it ensures that no other activity can 2307 * occur while the underlying object is being cleaned out. 2308 */ 2309 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td); 2310 2311 /* 2312 * Clean out any buffers associated with the vnode. 2313 * If the flush fails, just toss the buffers. 2314 */ 2315 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 2316 (void) vn_write_suspend_wait(vp, NULL, V_WAIT); 2317 if (vinvalbuf(vp, V_SAVE, td, 0, 0) != 0) 2318 vinvalbuf(vp, 0, td, 0, 0); 2319 2320 /* 2321 * Any other processes trying to obtain this lock must first 2322 * wait for VXLOCK to clear, then call the new lock operation. 2323 */ 2324 VOP_UNLOCK(vp, 0, td); 2325 2326 /* 2327 * If purging an active vnode, it must be closed and 2328 * deactivated before being reclaimed. Note that the 2329 * VOP_INACTIVE will unlock the vnode. 2330 */ 2331 if (active) { 2332 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 2333 VI_LOCK(vp); 2334 if ((vp->v_iflag & VI_DOINGINACT) == 0) { 2335 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, 2336 ("vclean: recursed on VI_DOINGINACT")); 2337 vp->v_iflag |= VI_DOINGINACT; 2338 VI_UNLOCK(vp); 2339 if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0) 2340 panic("vclean: cannot relock."); 2341 VOP_INACTIVE(vp, td); 2342 VI_LOCK(vp); 2343 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, 2344 ("vclean: lost VI_DOINGINACT")); 2345 vp->v_iflag &= ~VI_DOINGINACT; 2346 } 2347 VI_UNLOCK(vp); 2348 } 2349 /* 2350 * Reclaim the vnode. 2351 */ 2352 if (VOP_RECLAIM(vp, td)) 2353 panic("vclean: cannot reclaim"); 2354 2355 VNASSERT(vp->v_object == NULL, vp, 2356 ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); 2357 2358 if (active) { 2359 /* 2360 * Inline copy of vrele() since VOP_INACTIVE 2361 * has already been called. 2362 */ 2363 VI_LOCK(vp); 2364 v_incr_usecount(vp, -1); 2365 if (vp->v_usecount <= 0) { 2366 #ifdef INVARIANTS 2367 if (vp->v_usecount < 0 || vp->v_writecount != 0) { 2368 vprint("vclean: bad ref count", vp); 2369 panic("vclean: ref cnt"); 2370 } 2371 #endif 2372 if (VSHOULDFREE(vp)) 2373 vfree(vp); 2374 } 2375 VI_UNLOCK(vp); 2376 } 2377 /* 2378 * Delete from old mount point vnode list. 2379 */ 2380 delmntque(vp); 2381 cache_purge(vp); 2382 VI_LOCK(vp); 2383 if (VSHOULDFREE(vp)) 2384 vfree(vp); 2385 2386 /* 2387 * Done with purge, reset to the standard lock and 2388 * notify sleepers of the grim news. 2389 */ 2390 vp->v_vnlock = &vp->v_lock; 2391 vp->v_op = &dead_vnodeops; 2392 vp->v_tag = "none"; 2393 2394 VI_UNLOCK(vp); 2395 2396 /* 2397 * If special device, remove it from special device alias list 2398 * if it is on one.
2399 */ 2400 VI_LOCK(vp); 2401 2402 /* 2403 * If it is on the freelist and not already at the head, 2404 * move it to the head of the list. The test of the 2405 * VI_DOOMED flag and the reference count of zero is because 2406 * it will be removed from the free list by getnewvnode, 2407 * but will not have its reference count incremented until 2408 * after calling vgone. If the reference count were 2409 * incremented first, vgone would (incorrectly) try to 2410 * close the previous instance of the underlying object. 2411 */ 2412 if (vp->v_usecount == 0 && !(vp->v_iflag & VI_DOOMED)) { 2413 mtx_lock(&vnode_free_list_mtx); 2414 if (vp->v_iflag & VI_FREE) { 2415 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 2416 } else { 2417 vp->v_iflag |= VI_FREE; 2418 freevnodes++; 2419 } 2420 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 2421 mtx_unlock(&vnode_free_list_mtx); 2422 } 2423 2424 vp->v_type = VBAD; 2425 vx_unlock(vp); 2426 VI_UNLOCK(vp); 2427 } 2428 2429 /* 2430 * Calculate the total number of references to a special device. 2431 */ 2432 int 2433 vcount(vp) 2434 struct vnode *vp; 2435 { 2436 int count; 2437 2438 dev_lock(); 2439 count = vp->v_rdev->si_usecount; 2440 dev_unlock(); 2441 return (count); 2442 } 2443 2444 /* 2445 * Same as above, but using the struct cdev * as argument 2446 */ 2447 int 2448 count_dev(dev) 2449 struct cdev *dev; 2450 { 2451 int count; 2452 2453 dev_lock(); 2454 count = dev->si_usecount; 2455 dev_unlock(); 2456 return(count); 2457 } 2458 2459 /* 2460 * Print out a description of a vnode. 2461 */ 2462 static char *typename[] = 2463 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; 2464 2465 void 2466 vn_printf(struct vnode *vp, const char *fmt, ...) 2467 { 2468 va_list ap; 2469 char buf[96]; 2470 2471 va_start(ap, fmt); 2472 vprintf(fmt, ap); 2473 va_end(ap); 2474 printf("%p: ", (void *)vp); 2475 printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); 2476 printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", 2477 vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); 2478 buf[0] = '\0'; 2479 buf[1] = '\0'; 2480 if (vp->v_vflag & VV_ROOT) 2481 strcat(buf, "|VV_ROOT"); 2482 if (vp->v_vflag & VV_TEXT) 2483 strcat(buf, "|VV_TEXT"); 2484 if (vp->v_vflag & VV_SYSTEM) 2485 strcat(buf, "|VV_SYSTEM"); 2486 if (vp->v_iflag & VI_XLOCK) 2487 strcat(buf, "|VI_XLOCK"); 2488 if (vp->v_iflag & VI_XWANT) 2489 strcat(buf, "|VI_XWANT"); 2490 if (vp->v_iflag & VI_DOOMED) 2491 strcat(buf, "|VI_DOOMED"); 2492 if (vp->v_iflag & VI_FREE) 2493 strcat(buf, "|VI_FREE"); 2494 printf(" flags (%s)\n", buf + 1); 2495 if (mtx_owned(VI_MTX(vp))) 2496 printf(" VI_LOCKed"); 2497 if (vp->v_object != NULL) 2498 printf(" v_object %p\n", vp->v_object); 2499 printf(" "); 2500 lockmgr_printinfo(vp->v_vnlock); 2501 printf("\n"); 2502 if (vp->v_data != NULL) 2503 VOP_PRINT(vp); 2504 } 2505 2506 #ifdef DDB 2507 #include <ddb/ddb.h> 2508 /* 2509 * List all of the locked vnodes in the system. 2510 * Called when debugging the kernel. 2511 */ 2512 DB_SHOW_COMMAND(lockedvnods, lockedvnodes) 2513 { 2514 struct mount *mp, *nmp; 2515 struct vnode *vp; 2516 2517 /* 2518 * Note: because this is DDB, we can't obey the locking semantics 2519 * for these structures, which means we could catch an inconsistent 2520 * state and dereference a nasty pointer. Not much to be done 2521 * about that.
2522 */ 2523 printf("Locked vnodes\n"); 2524 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2525 nmp = TAILQ_NEXT(mp, mnt_list); 2526 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 2527 if (VOP_ISLOCKED(vp, NULL)) 2528 vprint("", vp); 2529 } 2530 nmp = TAILQ_NEXT(mp, mnt_list); 2531 } 2532 } 2533 #endif 2534 2535 /* 2536 * Fill in a struct xvfsconf based on a struct vfsconf. 2537 */ 2538 static void 2539 vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp) 2540 { 2541 2542 strcpy(xvfsp->vfc_name, vfsp->vfc_name); 2543 xvfsp->vfc_typenum = vfsp->vfc_typenum; 2544 xvfsp->vfc_refcount = vfsp->vfc_refcount; 2545 xvfsp->vfc_flags = vfsp->vfc_flags; 2546 /* 2547 * These are unused in userland, we keep them 2548 * to not break binary compatibility. 2549 */ 2550 xvfsp->vfc_vfsops = NULL; 2551 xvfsp->vfc_next = NULL; 2552 } 2553 2554 /* 2555 * Top level filesystem related information gathering. 2556 */ 2557 static int 2558 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 2559 { 2560 struct vfsconf *vfsp; 2561 struct xvfsconf xvfsp; 2562 int error; 2563 2564 error = 0; 2565 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 2566 vfsconf2x(vfsp, &xvfsp); 2567 error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp); 2568 if (error) 2569 break; 2570 } 2571 return (error); 2572 } 2573 2574 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist, 2575 "S,xvfsconf", "List of all configured filesystems"); 2576 2577 #ifndef BURN_BRIDGES 2578 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 2579 2580 static int 2581 vfs_sysctl(SYSCTL_HANDLER_ARGS) 2582 { 2583 int *name = (int *)arg1 - 1; /* XXX */ 2584 u_int namelen = arg2 + 1; /* XXX */ 2585 struct vfsconf *vfsp; 2586 struct xvfsconf xvfsp; 2587 2588 printf("WARNING: userland calling deprecated sysctl, " 2589 "please rebuild world\n"); 2590 2591 #if 1 || defined(COMPAT_PRELITE2) 2592 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ 2593 if (namelen == 1) 2594 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 2595 #endif 2596 2597 switch (name[1]) { 2598 case VFS_MAXTYPENUM: 2599 if (namelen != 2) 2600 return (ENOTDIR); 2601 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 2602 case VFS_CONF: 2603 if (namelen != 3) 2604 return (ENOTDIR); /* overloaded */ 2605 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) 2606 if (vfsp->vfc_typenum == name[2]) 2607 break; 2608 if (vfsp == NULL) 2609 return (EOPNOTSUPP); 2610 vfsconf2x(vfsp, &xvfsp); 2611 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 2612 } 2613 return (EOPNOTSUPP); 2614 } 2615 2616 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, 2617 vfs_sysctl, "Generic filesystem"); 2618 2619 #if 1 || defined(COMPAT_PRELITE2) 2620 2621 static int 2622 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 2623 { 2624 int error; 2625 struct vfsconf *vfsp; 2626 struct ovfsconf ovfs; 2627 2628 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 2629 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 2630 strcpy(ovfs.vfc_name, vfsp->vfc_name); 2631 ovfs.vfc_index = vfsp->vfc_typenum; 2632 ovfs.vfc_refcount = vfsp->vfc_refcount; 2633 ovfs.vfc_flags = vfsp->vfc_flags; 2634 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 2635 if (error) 2636 return error; 2637 } 2638 return 0; 2639 } 2640 2641 #endif /* 1 || COMPAT_PRELITE2 */ 2642 #endif /* !BURN_BRIDGES */ 2643 2644 #define KINFO_VNODESLOP 10 2645 #ifdef notyet 2646 /* 2647 * Dump vnode list (via sysctl). 
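 *
 * (Editor's aside, not part of the original source: of the sysctl
 * handlers in this area, vfs.conflist above is the one userland is
 * expected to use to enumerate configured filesystem types; it returns a
 * packed array of struct xvfsconf, much as getvfsbyname(3) consumes it.
 * A minimal, hypothetical userland reader:
 *
 *	#include <sys/param.h>
 *	#include <sys/mount.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct xvfsconf *xvfsp;
 *		size_t buflen, i;
 *
 *		if (sysctlbyname("vfs.conflist", NULL, &buflen, NULL, 0) != 0)
 *			return (1);
 *		xvfsp = malloc(buflen);
 *		if (xvfsp == NULL ||
 *		    sysctlbyname("vfs.conflist", xvfsp, &buflen, NULL, 0) != 0)
 *			return (1);
 *		for (i = 0; i < buflen / sizeof(*xvfsp); i++)
 *			printf("%s type %d refcount %d\n", xvfsp[i].vfc_name,
 *			    xvfsp[i].vfc_typenum, xvfsp[i].vfc_refcount);
 *		free(xvfsp);
 *		return (0);
 *	}
 *
 * The deprecated vfs_sysctl()/sysctl_ovfs_conf() handlers above remain
 * only for compatibility with old userland, as their printed warning
 * notes.)
 *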
2648 */ 2649 /* ARGSUSED */ 2650 static int 2651 sysctl_vnode(SYSCTL_HANDLER_ARGS) 2652 { 2653 struct xvnode *xvn; 2654 struct thread *td = req->td; 2655 struct mount *mp; 2656 struct vnode *vp; 2657 int error, len, n; 2658 2659 /* 2660 * Stale numvnodes access is not fatal here. 2661 */ 2662 req->lock = 0; 2663 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; 2664 if (!req->oldptr) 2665 /* Make an estimate */ 2666 return (SYSCTL_OUT(req, 0, len)); 2667 2668 error = sysctl_wire_old_buffer(req, 0); 2669 if (error != 0) 2670 return (error); 2671 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); 2672 n = 0; 2673 mtx_lock(&mountlist_mtx); 2674 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 2675 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) 2676 continue; 2677 MNT_ILOCK(mp); 2678 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 2679 if (n == len) 2680 break; 2681 vref(vp); 2682 xvn[n].xv_size = sizeof *xvn; 2683 xvn[n].xv_vnode = vp; 2684 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field 2685 XV_COPY(usecount); 2686 XV_COPY(writecount); 2687 XV_COPY(holdcnt); 2688 XV_COPY(id); 2689 XV_COPY(mount); 2690 XV_COPY(numoutput); 2691 XV_COPY(type); 2692 #undef XV_COPY 2693 xvn[n].xv_flag = vp->v_vflag; 2694 2695 switch (vp->v_type) { 2696 case VREG: 2697 case VDIR: 2698 case VLNK: 2699 break; 2700 case VBLK: 2701 case VCHR: 2702 if (vp->v_rdev == NULL) { 2703 vrele(vp); 2704 continue; 2705 } 2706 xvn[n].xv_dev = dev2udev(vp->v_rdev); 2707 break; 2708 case VSOCK: 2709 xvn[n].xv_socket = vp->v_socket; 2710 break; 2711 case VFIFO: 2712 xvn[n].xv_fifo = vp->v_fifoinfo; 2713 break; 2714 case VNON: 2715 case VBAD: 2716 default: 2717 /* shouldn't happen? */ 2718 vrele(vp); 2719 continue; 2720 } 2721 vrele(vp); 2722 ++n; 2723 } 2724 MNT_IUNLOCK(mp); 2725 mtx_lock(&mountlist_mtx); 2726 vfs_unbusy(mp, td); 2727 if (n == len) 2728 break; 2729 } 2730 mtx_unlock(&mountlist_mtx); 2731 2732 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); 2733 free(xvn, M_TEMP); 2734 return (error); 2735 } 2736 2737 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 2738 0, 0, sysctl_vnode, "S,xvnode", ""); 2739 #endif 2740 2741 /* 2742 * Unmount all filesystems. The list is traversed in reverse order 2743 * of mounting to avoid dependencies. 2744 */ 2745 void 2746 vfs_unmountall() 2747 { 2748 struct mount *mp; 2749 struct thread *td; 2750 int error; 2751 2752 if (curthread != NULL) 2753 td = curthread; 2754 else 2755 td = FIRST_THREAD_IN_PROC(initproc); /* XXX XXX proc0? */ 2756 /* 2757 * Since this only runs when rebooting, it is not interlocked. 2758 */ 2759 while(!TAILQ_EMPTY(&mountlist)) { 2760 mp = TAILQ_LAST(&mountlist, mntlist); 2761 error = dounmount(mp, MNT_FORCE, td); 2762 if (error) { 2763 TAILQ_REMOVE(&mountlist, mp, mnt_list); 2764 printf("unmount of %s failed (", 2765 mp->mnt_stat.f_mntonname); 2766 if (error == EBUSY) 2767 printf("BUSY)\n"); 2768 else 2769 printf("%d)\n", error); 2770 } else { 2771 /* The unmount has removed mp from the mountlist */ 2772 } 2773 } 2774 } 2775 2776 /* 2777 * perform msync on all vnodes under a mount point 2778 * the mount point must be locked. 
2779 */ 2780 void 2781 vfs_msync(struct mount *mp, int flags) 2782 { 2783 struct vnode *vp, *nvp; 2784 struct vm_object *obj; 2785 int tries; 2786 2787 tries = 5; 2788 MNT_ILOCK(mp); 2789 loop: 2790 TAILQ_FOREACH_SAFE(vp, &mp->mnt_nvnodelist, v_nmntvnodes, nvp) { 2791 if (vp->v_mount != mp) { 2792 if (--tries > 0) 2793 goto loop; 2794 break; 2795 } 2796 2797 VI_LOCK(vp); 2798 if (vp->v_iflag & VI_XLOCK) { 2799 VI_UNLOCK(vp); 2800 continue; 2801 } 2802 2803 if ((vp->v_iflag & VI_OBJDIRTY) && 2804 (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) { 2805 MNT_IUNLOCK(mp); 2806 if (!vget(vp, 2807 LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, 2808 curthread)) { 2809 if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ 2810 vput(vp); 2811 MNT_ILOCK(mp); 2812 continue; 2813 } 2814 2815 obj = vp->v_object; 2816 if (obj != NULL) { 2817 VM_OBJECT_LOCK(obj); 2818 vm_object_page_clean(obj, 0, 0, 2819 flags == MNT_WAIT ? 2820 OBJPC_SYNC : OBJPC_NOSYNC); 2821 VM_OBJECT_UNLOCK(obj); 2822 } 2823 vput(vp); 2824 } 2825 MNT_ILOCK(mp); 2826 if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) { 2827 if (--tries > 0) 2828 goto loop; 2829 break; 2830 } 2831 } else 2832 VI_UNLOCK(vp); 2833 } 2834 MNT_IUNLOCK(mp); 2835 } 2836 2837 /* 2838 * Mark a vnode as free, putting it up for recycling. 2839 */ 2840 void 2841 vfree(struct vnode *vp) 2842 { 2843 2844 ASSERT_VI_LOCKED(vp, "vfree"); 2845 mtx_lock(&vnode_free_list_mtx); 2846 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free")); 2847 if (vp->v_iflag & VI_AGE) { 2848 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 2849 } else { 2850 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 2851 } 2852 freevnodes++; 2853 mtx_unlock(&vnode_free_list_mtx); 2854 vp->v_iflag &= ~VI_AGE; 2855 vp->v_iflag |= VI_FREE; 2856 } 2857 2858 /* 2859 * Opposite of vfree() - mark a vnode as in use. 2860 */ 2861 static void 2862 vbusy(struct vnode *vp) 2863 { 2864 2865 ASSERT_VI_LOCKED(vp, "vbusy"); 2866 VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free")); 2867 2868 mtx_lock(&vnode_free_list_mtx); 2869 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 2870 freevnodes--; 2871 mtx_unlock(&vnode_free_list_mtx); 2872 2873 vp->v_iflag &= ~(VI_FREE|VI_AGE); 2874 } 2875 2876 /* 2877 * Initalize per-vnode helper structure to hold poll-related state. 2878 */ 2879 void 2880 v_addpollinfo(struct vnode *vp) 2881 { 2882 struct vpollinfo *vi; 2883 2884 vi = uma_zalloc(vnodepoll_zone, M_WAITOK); 2885 if (vp->v_pollinfo != NULL) { 2886 uma_zfree(vnodepoll_zone, vi); 2887 return; 2888 } 2889 vp->v_pollinfo = vi; 2890 mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); 2891 knlist_init(&vp->v_pollinfo->vpi_selinfo.si_note, 2892 &vp->v_pollinfo->vpi_lock); 2893 } 2894 2895 /* 2896 * Record a process's interest in events which might happen to 2897 * a vnode. Because poll uses the historic select-style interface 2898 * internally, this routine serves as both the ``check for any 2899 * pending events'' and the ``record my interest in future events'' 2900 * functions. (These are done together, while the lock is held, 2901 * to avoid race conditions.) 
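 *
 * (Editor's illustrative sketch, not part of the original source:
 * vn_pollrecord() is meant to be called from a filesystem's VOP_POLL
 * entry point.  A hypothetical implementation that reports ordinary
 * readiness itself but defers the extended event classes to the vnode
 * poll machinery, in the same shape as the stock vop_stdpoll() handler:
 *
 *	static int
 *	examplefs_poll(struct vop_poll_args *ap)
 *	{
 *
 *		if (ap->a_events & ~POLLSTANDARD)
 *			return (vn_pollrecord(ap->a_vp, ap->a_td,
 *			    ap->a_events));
 *		return (ap->a_events &
 *		    (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
 *	}
 *
 * vn_pollrecord() then either hands back the pending revents or records
 * the caller with selrecord(), as described above.)
 *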
2902 */ 2903 int 2904 vn_pollrecord(vp, td, events) 2905 struct vnode *vp; 2906 struct thread *td; 2907 short events; 2908 { 2909 2910 if (vp->v_pollinfo == NULL) 2911 v_addpollinfo(vp); 2912 mtx_lock(&vp->v_pollinfo->vpi_lock); 2913 if (vp->v_pollinfo->vpi_revents & events) { 2914 /* 2915 * This leaves events we are not interested 2916 * in available for the other process which 2917 * which presumably had requested them 2918 * (otherwise they would never have been 2919 * recorded). 2920 */ 2921 events &= vp->v_pollinfo->vpi_revents; 2922 vp->v_pollinfo->vpi_revents &= ~events; 2923 2924 mtx_unlock(&vp->v_pollinfo->vpi_lock); 2925 return events; 2926 } 2927 vp->v_pollinfo->vpi_events |= events; 2928 selrecord(td, &vp->v_pollinfo->vpi_selinfo); 2929 mtx_unlock(&vp->v_pollinfo->vpi_lock); 2930 return 0; 2931 } 2932 2933 /* 2934 * Routine to create and manage a filesystem syncer vnode. 2935 */ 2936 #define sync_close ((int (*)(struct vop_close_args *))nullop) 2937 static int sync_fsync(struct vop_fsync_args *); 2938 static int sync_inactive(struct vop_inactive_args *); 2939 static int sync_reclaim(struct vop_reclaim_args *); 2940 2941 static struct vop_vector sync_vnodeops = { 2942 .vop_bypass = VOP_EOPNOTSUPP, 2943 .vop_close = sync_close, /* close */ 2944 .vop_fsync = sync_fsync, /* fsync */ 2945 .vop_inactive = sync_inactive, /* inactive */ 2946 .vop_reclaim = sync_reclaim, /* reclaim */ 2947 .vop_lock = vop_stdlock, /* lock */ 2948 .vop_unlock = vop_stdunlock, /* unlock */ 2949 .vop_islocked = vop_stdislocked, /* islocked */ 2950 }; 2951 2952 /* 2953 * Create a new filesystem syncer vnode for the specified mount point. 2954 */ 2955 int 2956 vfs_allocate_syncvnode(mp) 2957 struct mount *mp; 2958 { 2959 struct vnode *vp; 2960 static long start, incr, next; 2961 int error; 2962 2963 /* Allocate a new vnode */ 2964 if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) { 2965 mp->mnt_syncer = NULL; 2966 return (error); 2967 } 2968 vp->v_type = VNON; 2969 /* 2970 * Place the vnode onto the syncer worklist. We attempt to 2971 * scatter them about on the list so that they will go off 2972 * at evenly distributed times even if all the filesystems 2973 * are mounted at once. 2974 */ 2975 next += incr; 2976 if (next == 0 || next > syncer_maxdelay) { 2977 start /= 2; 2978 incr /= 2; 2979 if (start == 0) { 2980 start = syncer_maxdelay / 2; 2981 incr = syncer_maxdelay; 2982 } 2983 next = start; 2984 } 2985 VI_LOCK(vp); 2986 vn_syncer_add_to_worklist(&vp->v_bufobj, 2987 syncdelay > 0 ? next % syncdelay : 0); 2988 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ 2989 mtx_lock(&sync_mtx); 2990 sync_vnode_count++; 2991 mtx_unlock(&sync_mtx); 2992 VI_UNLOCK(vp); 2993 mp->mnt_syncer = vp; 2994 return (0); 2995 } 2996 2997 /* 2998 * Do a lazy sync of the filesystem. 2999 */ 3000 static int 3001 sync_fsync(ap) 3002 struct vop_fsync_args /* { 3003 struct vnode *a_vp; 3004 struct ucred *a_cred; 3005 int a_waitfor; 3006 struct thread *a_td; 3007 } */ *ap; 3008 { 3009 struct vnode *syncvp = ap->a_vp; 3010 struct mount *mp = syncvp->v_mount; 3011 struct thread *td = ap->a_td; 3012 int error, asyncflag; 3013 struct bufobj *bo; 3014 3015 /* 3016 * We only need to do something if this is a lazy evaluation. 3017 */ 3018 if (ap->a_waitfor != MNT_LAZY) 3019 return (0); 3020 3021 /* 3022 * Move ourselves to the back of the sync list. 
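 * (Editor's note, not part of the original source: re-adding the syncer
 * vnode here pushes it syncdelay seconds ahead on the syncer worklist, so
 * each mount point's lazy sync fires roughly once per syncdelay period
 * rather than on every pass of the syncer.)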
3023 */ 3024 bo = &syncvp->v_bufobj; 3025 BO_LOCK(bo); 3026 vn_syncer_add_to_worklist(bo, syncdelay); 3027 BO_UNLOCK(bo); 3028 3029 /* 3030 * Walk the list of vnodes pushing all that are dirty and 3031 * not already on the sync list. 3032 */ 3033 mtx_lock(&mountlist_mtx); 3034 if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) { 3035 mtx_unlock(&mountlist_mtx); 3036 return (0); 3037 } 3038 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { 3039 vfs_unbusy(mp, td); 3040 return (0); 3041 } 3042 asyncflag = mp->mnt_flag & MNT_ASYNC; 3043 mp->mnt_flag &= ~MNT_ASYNC; 3044 vfs_msync(mp, MNT_NOWAIT); 3045 error = VFS_SYNC(mp, MNT_LAZY, td); 3046 if (asyncflag) 3047 mp->mnt_flag |= MNT_ASYNC; 3048 vn_finished_write(mp); 3049 vfs_unbusy(mp, td); 3050 return (error); 3051 } 3052 3053 /* 3054 * The syncer vnode is no longer referenced. 3055 */ 3056 static int 3057 sync_inactive(ap) 3058 struct vop_inactive_args /* { 3059 struct vnode *a_vp; 3060 struct thread *a_td; 3061 } */ *ap; 3062 { 3063 3064 VOP_UNLOCK(ap->a_vp, 0, ap->a_td); 3065 vgone(ap->a_vp); 3066 return (0); 3067 } 3068 3069 /* 3070 * The syncer vnode is no longer needed and is being decommissioned. 3071 * 3072 * Modifications to the worklist must be protected by sync_mtx. 3073 */ 3074 static int 3075 sync_reclaim(ap) 3076 struct vop_reclaim_args /* { 3077 struct vnode *a_vp; 3078 } */ *ap; 3079 { 3080 struct vnode *vp = ap->a_vp; 3081 struct bufobj *bo; 3082 3083 VI_LOCK(vp); 3084 bo = &vp->v_bufobj; 3085 vp->v_mount->mnt_syncer = NULL; 3086 if (bo->bo_flag & BO_ONWORKLST) { 3087 mtx_lock(&sync_mtx); 3088 LIST_REMOVE(bo, bo_synclist); 3089 syncer_worklist_len--; 3090 sync_vnode_count--; 3091 mtx_unlock(&sync_mtx); 3092 bo->bo_flag &= ~BO_ONWORKLST; 3093 } 3094 VI_UNLOCK(vp); 3095 3096 return (0); 3097 } 3098 3099 /* 3100 * Check if vnode represents a disk device 3101 */ 3102 int 3103 vn_isdisk(vp, errp) 3104 struct vnode *vp; 3105 int *errp; 3106 { 3107 int error; 3108 3109 error = 0; 3110 dev_lock(); 3111 if (vp->v_type != VCHR) 3112 error = ENOTBLK; 3113 else if (vp->v_rdev == NULL) 3114 error = ENXIO; 3115 else if (vp->v_rdev->si_devsw == NULL) 3116 error = ENXIO; 3117 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 3118 error = ENOTBLK; 3119 dev_unlock(); 3120 if (errp != NULL) 3121 *errp = error; 3122 return (error == 0); 3123 } 3124 3125 /* 3126 * Free data allocated by namei(); see namei(9) for details. 3127 */ 3128 void 3129 NDFREE(ndp, flags) 3130 struct nameidata *ndp; 3131 const u_int flags; 3132 { 3133 3134 if (!(flags & NDF_NO_FREE_PNBUF) && 3135 (ndp->ni_cnd.cn_flags & HASBUF)) { 3136 uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); 3137 ndp->ni_cnd.cn_flags &= ~HASBUF; 3138 } 3139 if (!(flags & NDF_NO_DVP_UNLOCK) && 3140 (ndp->ni_cnd.cn_flags & LOCKPARENT) && 3141 ndp->ni_dvp != ndp->ni_vp) 3142 VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread); 3143 if (!(flags & NDF_NO_DVP_RELE) && 3144 (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { 3145 vrele(ndp->ni_dvp); 3146 ndp->ni_dvp = NULL; 3147 } 3148 if (!(flags & NDF_NO_VP_UNLOCK) && 3149 (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) 3150 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread); 3151 if (!(flags & NDF_NO_VP_RELE) && 3152 ndp->ni_vp) { 3153 vrele(ndp->ni_vp); 3154 ndp->ni_vp = NULL; 3155 } 3156 if (!(flags & NDF_NO_STARTDIR_RELE) && 3157 (ndp->ni_cnd.cn_flags & SAVESTART)) { 3158 vrele(ndp->ni_startdir); 3159 ndp->ni_startdir = NULL; 3160 } 3161 } 3162 3163 /* 3164 * Common filesystem object access control check routine.
Accepts a 3165 * vnode's type, "mode", uid and gid, requested access mode, credentials, 3166 * and optional call-by-reference privused argument allowing vaccess() 3167 * to indicate to the caller whether privilege was used to satisfy the 3168 * request (obsoleted). Returns 0 on success, or an errno on failure. 3169 */ 3170 int 3171 vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused) 3172 enum vtype type; 3173 mode_t file_mode; 3174 uid_t file_uid; 3175 gid_t file_gid; 3176 mode_t acc_mode; 3177 struct ucred *cred; 3178 int *privused; 3179 { 3180 mode_t dac_granted; 3181 #ifdef CAPABILITIES 3182 mode_t cap_granted; 3183 #endif 3184 3185 /* 3186 * Look for a normal, non-privileged way to access the file/directory 3187 * as requested. If it exists, go with that. 3188 */ 3189 3190 if (privused != NULL) 3191 *privused = 0; 3192 3193 dac_granted = 0; 3194 3195 /* Check the owner. */ 3196 if (cred->cr_uid == file_uid) { 3197 dac_granted |= VADMIN; 3198 if (file_mode & S_IXUSR) 3199 dac_granted |= VEXEC; 3200 if (file_mode & S_IRUSR) 3201 dac_granted |= VREAD; 3202 if (file_mode & S_IWUSR) 3203 dac_granted |= (VWRITE | VAPPEND); 3204 3205 if ((acc_mode & dac_granted) == acc_mode) 3206 return (0); 3207 3208 goto privcheck; 3209 } 3210 3211 /* Otherwise, check the groups (first match) */ 3212 if (groupmember(file_gid, cred)) { 3213 if (file_mode & S_IXGRP) 3214 dac_granted |= VEXEC; 3215 if (file_mode & S_IRGRP) 3216 dac_granted |= VREAD; 3217 if (file_mode & S_IWGRP) 3218 dac_granted |= (VWRITE | VAPPEND); 3219 3220 if ((acc_mode & dac_granted) == acc_mode) 3221 return (0); 3222 3223 goto privcheck; 3224 } 3225 3226 /* Otherwise, check everyone else. */ 3227 if (file_mode & S_IXOTH) 3228 dac_granted |= VEXEC; 3229 if (file_mode & S_IROTH) 3230 dac_granted |= VREAD; 3231 if (file_mode & S_IWOTH) 3232 dac_granted |= (VWRITE | VAPPEND); 3233 if ((acc_mode & dac_granted) == acc_mode) 3234 return (0); 3235 3236 privcheck: 3237 if (!suser_cred(cred, SUSER_ALLOWJAIL)) { 3238 /* XXX audit: privilege used */ 3239 if (privused != NULL) 3240 *privused = 1; 3241 return (0); 3242 } 3243 3244 #ifdef CAPABILITIES 3245 /* 3246 * Build a capability mask to determine if the set of capabilities 3247 * satisfies the requirements when combined with the granted mask 3248 * from above. 3249 * For each capability, if the capability is required, bitwise 3250 * or the request type onto the cap_granted mask. 3251 */ 3252 cap_granted = 0; 3253 3254 if (type == VDIR) { 3255 /* 3256 * For directories, use CAP_DAC_READ_SEARCH to satisfy 3257 * VEXEC requests, instead of CAP_DAC_EXECUTE. 
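 *
 * (Editor's worked example, not part of the original source: with
 * file_mode 0754 the DAC pass above grants the owner
 * VADMIN|VEXEC|VREAD|VWRITE|VAPPEND, a group member VEXEC|VREAD, and
 * everyone else VREAD only.  A group member requesting VWRITE therefore
 * falls through to privcheck: suser_cred() satisfies it for the
 * superuser, and in a kernel built with options CAPABILITIES the request
 * would instead need CAP_DAC_WRITE to turn on VWRITE|VAPPEND in
 * cap_granted below.)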
3258 */ 3259 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3260 !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL)) 3261 cap_granted |= VEXEC; 3262 } else { 3263 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3264 !cap_check(cred, NULL, CAP_DAC_EXECUTE, SUSER_ALLOWJAIL)) 3265 cap_granted |= VEXEC; 3266 } 3267 3268 if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && 3269 !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL)) 3270 cap_granted |= VREAD; 3271 3272 if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && 3273 !cap_check(cred, NULL, CAP_DAC_WRITE, SUSER_ALLOWJAIL)) 3274 cap_granted |= (VWRITE | VAPPEND); 3275 3276 if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && 3277 !cap_check(cred, NULL, CAP_FOWNER, SUSER_ALLOWJAIL)) 3278 cap_granted |= VADMIN; 3279 3280 if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) { 3281 /* XXX audit: privilege used */ 3282 if (privused != NULL) 3283 *privused = 1; 3284 return (0); 3285 } 3286 #endif 3287 3288 return ((acc_mode & VADMIN) ? EPERM : EACCES); 3289 } 3290 3291 /* 3292 * Credential check based on process requesting service, and per-attribute 3293 * permissions. 3294 */ 3295 int 3296 extattr_check_cred(struct vnode *vp, int attrnamespace, 3297 struct ucred *cred, struct thread *td, int access) 3298 { 3299 3300 /* 3301 * Kernel-invoked always succeeds. 3302 */ 3303 if (cred == NOCRED) 3304 return (0); 3305 3306 /* 3307 * Do not allow privileged processes in jail to directly 3308 * manipulate system attributes. 3309 * 3310 * XXX What capability should apply here? 3311 * Probably CAP_SYS_SETFFLAG. 3312 */ 3313 switch (attrnamespace) { 3314 case EXTATTR_NAMESPACE_SYSTEM: 3315 /* Potentially should be: return (EPERM); */ 3316 return (suser_cred(cred, 0)); 3317 case EXTATTR_NAMESPACE_USER: 3318 return (VOP_ACCESS(vp, access, cred, td)); 3319 default: 3320 return (EPERM); 3321 } 3322 } 3323 3324 #ifdef DEBUG_VFS_LOCKS 3325 /* 3326 * This only exists to supress warnings from unlocked specfs accesses. It is 3327 * no longer ok to have an unlocked VFS. 3328 */ 3329 #define IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD) 3330 3331 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 3332 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, ""); 3333 3334 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 3335 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, ""); 3336 3337 int vfs_badlock_print = 1; /* Print lock violations. */ 3338 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, ""); 3339 3340 #ifdef KDB 3341 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. 
*/ 3342 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, ""); 3343 #endif 3344 3345 static void 3346 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 3347 { 3348 3349 #ifdef KDB 3350 if (vfs_badlock_backtrace) 3351 kdb_backtrace(); 3352 #endif 3353 if (vfs_badlock_print) 3354 printf("%s: %p %s\n", str, (void *)vp, msg); 3355 if (vfs_badlock_ddb) 3356 kdb_enter("lock violation"); 3357 } 3358 3359 void 3360 assert_vi_locked(struct vnode *vp, const char *str) 3361 { 3362 3363 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 3364 vfs_badlock("interlock is not locked but should be", str, vp); 3365 } 3366 3367 void 3368 assert_vi_unlocked(struct vnode *vp, const char *str) 3369 { 3370 3371 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 3372 vfs_badlock("interlock is locked but should not be", str, vp); 3373 } 3374 3375 void 3376 assert_vop_locked(struct vnode *vp, const char *str) 3377 { 3378 3379 if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, NULL) == 0 && 3380 !((vp->v_iflag & VI_XLOCK) && vp->v_vxthread == curthread)) 3381 vfs_badlock("is not locked but should be", str, vp); 3382 } 3383 3384 void 3385 assert_vop_unlocked(struct vnode *vp, const char *str) 3386 { 3387 3388 if (vp && !IGNORE_LOCK(vp) && 3389 VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE) 3390 vfs_badlock("is locked but should not be", str, vp); 3391 } 3392 3393 #if 0 3394 void 3395 assert_vop_elocked(struct vnode *vp, const char *str) 3396 { 3397 3398 if (vp && !IGNORE_LOCK(vp) && 3399 VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE) 3400 vfs_badlock("is not exclusive locked but should be", str, vp); 3401 } 3402 3403 void 3404 assert_vop_elocked_other(struct vnode *vp, const char *str) 3405 { 3406 3407 if (vp && !IGNORE_LOCK(vp) && 3408 VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER) 3409 vfs_badlock("is not exclusive locked by another thread", 3410 str, vp); 3411 } 3412 3413 void 3414 assert_vop_slocked(struct vnode *vp, const char *str) 3415 { 3416 3417 if (vp && !IGNORE_LOCK(vp) && 3418 VOP_ISLOCKED(vp, curthread) != LK_SHARED) 3419 vfs_badlock("is not locked shared but should be", str, vp); 3420 } 3421 #endif /* 0 */ 3422 3423 void 3424 vop_rename_pre(void *ap) 3425 { 3426 struct vop_rename_args *a = ap; 3427 3428 if (a->a_tvp) 3429 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 3430 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 3431 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 3432 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 3433 3434 /* Check the source (from). */ 3435 if (a->a_tdvp != a->a_fdvp) 3436 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 3437 if (a->a_tvp != a->a_fvp) 3438 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: tvp locked"); 3439 3440 /* Check the target. */ 3441 if (a->a_tvp) 3442 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 3443 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 3444 } 3445 3446 void 3447 vop_strategy_pre(void *ap) 3448 { 3449 struct vop_strategy_args *a; 3450 struct buf *bp; 3451 3452 a = ap; 3453 bp = a->a_bp; 3454 3455 /* 3456 * Cluster ops lock their component buffers but not the IO container. 
3457 */ 3458 if ((bp->b_flags & B_CLUSTER) != 0) 3459 return; 3460 3461 if (BUF_REFCNT(bp) < 1) { 3462 if (vfs_badlock_print) 3463 printf( 3464 "VOP_STRATEGY: bp is not locked but should be\n"); 3465 if (vfs_badlock_ddb) 3466 kdb_enter("lock violation"); 3467 } 3468 } 3469 3470 void 3471 vop_lookup_pre(void *ap) 3472 { 3473 struct vop_lookup_args *a; 3474 struct vnode *dvp; 3475 3476 a = ap; 3477 dvp = a->a_dvp; 3478 ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); 3479 ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP"); 3480 } 3481 3482 void 3483 vop_lookup_post(void *ap, int rc) 3484 { 3485 struct vop_lookup_args *a; 3486 struct componentname *cnp; 3487 struct vnode *dvp; 3488 struct vnode *vp; 3489 int flags; 3490 3491 a = ap; 3492 dvp = a->a_dvp; 3493 cnp = a->a_cnp; 3494 vp = *(a->a_vpp); 3495 flags = cnp->cn_flags; 3496 3497 ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); 3498 3499 /* 3500 * If this is the last path component for this lookup and LOCKPARENT 3501 * is set, OR if there is an error the directory has to be locked. 3502 */ 3503 if ((flags & LOCKPARENT) && (flags & ISLASTCN)) 3504 ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (LOCKPARENT)"); 3505 else if (rc != 0) 3506 ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (error)"); 3507 else if (dvp != vp) 3508 ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (dvp)"); 3509 if (flags & PDIRUNLOCK) 3510 ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (PDIRUNLOCK)"); 3511 } 3512 3513 void 3514 vop_lock_pre(void *ap) 3515 { 3516 struct vop_lock_args *a = ap; 3517 3518 if (a->a_vp->v_iflag & VI_XLOCK && 3519 a->a_vp->v_vxthread != curthread) { 3520 vprint("vop_lock_pre:", a->a_vp); 3521 panic("vop_lock_pre: locked while xlock held.\n"); 3522 } 3523 if ((a->a_flags & LK_INTERLOCK) == 0) 3524 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 3525 else 3526 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 3527 } 3528 3529 void 3530 vop_lock_post(void *ap, int rc) 3531 { 3532 struct vop_lock_args *a = ap; 3533 3534 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 3535 if (rc == 0) 3536 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 3537 } 3538 3539 void 3540 vop_unlock_pre(void *ap) 3541 { 3542 struct vop_unlock_args *a = ap; 3543 3544 if (a->a_flags & LK_INTERLOCK) 3545 ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); 3546 ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); 3547 } 3548 3549 void 3550 vop_unlock_post(void *ap, int rc) 3551 { 3552 struct vop_unlock_args *a = ap; 3553 3554 if (a->a_flags & LK_INTERLOCK) 3555 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); 3556 } 3557 #endif /* DEBUG_VFS_LOCKS */ 3558 3559 static struct knlist fs_knlist; 3560 3561 static void 3562 vfs_event_init(void *arg) 3563 { 3564 knlist_init(&fs_knlist, NULL); 3565 } 3566 /* XXX - correct order? 
*/ 3567 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 3568 3569 void 3570 vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused) 3571 { 3572 3573 KNOTE_UNLOCKED(&fs_knlist, event); 3574 } 3575 3576 static int filt_fsattach(struct knote *kn); 3577 static void filt_fsdetach(struct knote *kn); 3578 static int filt_fsevent(struct knote *kn, long hint); 3579 3580 struct filterops fs_filtops = 3581 { 0, filt_fsattach, filt_fsdetach, filt_fsevent }; 3582 3583 static int 3584 filt_fsattach(struct knote *kn) 3585 { 3586 3587 kn->kn_flags |= EV_CLEAR; 3588 knlist_add(&fs_knlist, kn, 0); 3589 return (0); 3590 } 3591 3592 static void 3593 filt_fsdetach(struct knote *kn) 3594 { 3595 3596 knlist_remove(&fs_knlist, kn, 0); 3597 } 3598 3599 static int 3600 filt_fsevent(struct knote *kn, long hint) 3601 { 3602 3603 kn->kn_fflags |= hint; 3604 return (kn->kn_fflags != 0); 3605 } 3606 3607 static int 3608 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 3609 { 3610 struct vfsidctl vc; 3611 int error; 3612 struct mount *mp; 3613 3614 error = SYSCTL_IN(req, &vc, sizeof(vc)); 3615 if (error) 3616 return (error); 3617 if (vc.vc_vers != VFS_CTL_VERS1) 3618 return (EINVAL); 3619 mp = vfs_getvfs(&vc.vc_fsid); 3620 if (mp == NULL) 3621 return (ENOENT); 3622 /* ensure that a specific sysctl goes to the right filesystem. */ 3623 if (strcmp(vc.vc_fstypename, "*") != 0 && 3624 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 3625 return (EINVAL); 3626 } 3627 VCTLTOREQ(&vc, req); 3628 return (VFS_SYSCTL(mp, vc.vc_op, req)); 3629 } 3630 3631 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR, 3632 NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); 3633 3634 /* 3635 * Function to initialize a va_filerev field sensibly. 3636 * XXX: Wouldn't a random number make a lot more sense ?? 3637 */ 3638 u_quad_t 3639 init_va_filerev(void) 3640 { 3641 struct bintime bt; 3642 3643 getbinuptime(&bt); 3644 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 3645 } 3646
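/*
 * Editor's illustrative sketch, not part of the original source: the
 * fs_knlist/fs_filtops machinery above backs the EVFILT_FS kevent filter,
 * which lets userland watch for the mount-related events posted through
 * vfs_event_signal().  A minimal, hypothetical consumer:
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *	#include <sys/time.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct kevent ev;
 *		int kq;
 *
 *		kq = kqueue();
 *		if (kq < 0)
 *			return (1);
 *		EV_SET(&ev, 0, EVFILT_FS, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *		if (kevent(kq, &ev, 1, NULL, 0, NULL) < 0)
 *			return (1);
 *		for (;;) {
 *			if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)
 *				printf("fs event, fflags %#x\n", ev.fflags);
 *		}
 *	}
 *
 * The fflags delivered are the VQ_* bits passed to vfs_event_signal() and
 * accumulated by filt_fsevent() above.
 */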