1 /*- 2 * Copyright (c) 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 4. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 35 */ 36 37 /* 38 * External virtual filesystem routines 39 */ 40 41 #include <sys/cdefs.h> 42 __FBSDID("$FreeBSD$"); 43 44 #include "opt_ddb.h" 45 #include "opt_watchdog.h" 46 47 #include <sys/param.h> 48 #include <sys/systm.h> 49 #include <sys/bio.h> 50 #include <sys/buf.h> 51 #include <sys/condvar.h> 52 #include <sys/conf.h> 53 #include <sys/dirent.h> 54 #include <sys/event.h> 55 #include <sys/eventhandler.h> 56 #include <sys/extattr.h> 57 #include <sys/file.h> 58 #include <sys/fcntl.h> 59 #include <sys/jail.h> 60 #include <sys/kdb.h> 61 #include <sys/kernel.h> 62 #include <sys/kthread.h> 63 #include <sys/lockf.h> 64 #include <sys/malloc.h> 65 #include <sys/mount.h> 66 #include <sys/namei.h> 67 #include <sys/priv.h> 68 #include <sys/reboot.h> 69 #include <sys/sched.h> 70 #include <sys/sleepqueue.h> 71 #include <sys/stat.h> 72 #include <sys/sysctl.h> 73 #include <sys/syslog.h> 74 #include <sys/vmmeter.h> 75 #include <sys/vnode.h> 76 #include <sys/watchdog.h> 77 78 #include <machine/stdarg.h> 79 80 #include <security/mac/mac_framework.h> 81 82 #include <vm/vm.h> 83 #include <vm/vm_object.h> 84 #include <vm/vm_extern.h> 85 #include <vm/pmap.h> 86 #include <vm/vm_map.h> 87 #include <vm/vm_page.h> 88 #include <vm/vm_kern.h> 89 #include <vm/uma.h> 90 91 #ifdef DDB 92 #include <ddb/ddb.h> 93 #endif 94 95 #define WI_MPSAFEQ 0 96 #define WI_GIANTQ 1 97 98 static void delmntque(struct vnode *vp); 99 static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, 100 int slpflag, int slptimeo); 101 static void syncer_shutdown(void *arg, int howto); 102 static int vtryrecycle(struct vnode *vp); 103 static void v_incr_usecount(struct vnode *); 104 static void v_decr_usecount(struct vnode *); 105 static void v_decr_useonly(struct vnode *); 106 static void v_upgrade_usecount(struct vnode *); 107 static void vnlru_free(int); 108 static void vgonel(struct vnode *); 109 static void vfs_knllock(void *arg); 110 static void vfs_knlunlock(void *arg); 111 static void vfs_knl_assert_locked(void *arg); 112 static void vfs_knl_assert_unlocked(void *arg); 113 static void destroy_vpollinfo(struct vpollinfo *vi); 114 115 /* 116 * Number of vnodes in existence. Increased whenever getnewvnode() 117 * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode. 118 */ 119 static unsigned long numvnodes; 120 121 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, 122 "Number of vnodes in existence"); 123 124 /* 125 * Conversion tables for conversion from vnode types to inode formats 126 * and back. 127 */ 128 enum vtype iftovt_tab[16] = { 129 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 130 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, 131 }; 132 int vttoif_tab[10] = { 133 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 134 S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT 135 }; 136 137 /* 138 * List of vnodes that are ready for recycling. 139 */ 140 static TAILQ_HEAD(freelst, vnode) vnode_free_list; 141 142 /* 143 * Free vnode target. Free vnodes may simply be files which have been stat'd 144 * but not read. This is somewhat common, and a small cache of such files 145 * should be kept to avoid recreation costs. 146 */ 147 static u_long wantfreevnodes; 148 SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); 149 /* Number of vnodes in the free list. 
 */
static u_long freevnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
    "Number of vnodes in the free list");

static int vlru_allow_cache_src;
SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
    &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");

/*
 * Various variables used for debugging the new implementation of
 * reassignbuf().
 * XXX these are probably of (very) limited utility now.
 */
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
    "Number of calls to reassignbuf");

/*
 * Cache for the mount type id assigned to NFS.  This is used for
 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
 */
int	nfs_mount_type = -1;

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/*
 * Lock for any access to the following:
 *	vnode_free_list
 *	numvnodes
 *	freevnodes
 */
static struct mtx vnode_free_list_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
static uma_zone_t vnodepoll_zone;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, metadata updates are
 * delayed only about half the time that file data is delayed.
 * Similarly, directory updates are more critical, so are only delayed
 * about a third the time that file data is delayed.  Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syncer process).  The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
static int syncer_delayno;
static long syncer_mask;
LIST_HEAD(synclist, bufobj);
static struct synclist *syncer_workitem_pending[2];
/*
 * The sync_mtx protects:
 *	bo->bo_synclist
 *	sync_vnode_count
 *	syncer_delayno
 *	syncer_state
 *	syncer_workitem_pending
 *	syncer_worklist_len
 *	rushjob
 */
static struct mtx sync_mtx;
static struct cv sync_wakeup;

#define SYNCER_MAXDELAY	32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
static int syncdelay = 30;		/* max time to delay syncing data */
static int filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
    "Time to delay syncing files (in seconds)");
static int dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
    "Time to delay syncing directories (in seconds)");
static int metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
    "Time to delay syncing metadata (in seconds)");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
    "Number of times I/O speeded up (rush requests)");

/*
 * When shutting down the syncer, run it at four times normal speed.
 */
#define SYNCER_SHUTDOWN_SPEEDUP	4
static int sync_vnode_count;
static int syncer_worklist_len;
static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
    syncer_state;

/*
 * Number of vnodes we want to exist at any one time.  This is mostly used
 * to size hash tables in vnode-related code.  It is normally not used in
 * getnewvnode(), as wantfreevnodes is normally nonzero.
 *
 * XXX desiredvnodes is historical cruft and should not exist.
 */
int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");
SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
    &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
static int vnlru_nowhere;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");

/*
 * Macros to control when a vnode is freed and recycled.  All require
 * the vnode interlock.
 */
#define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)


/*
 * Initialize the vnode management data structures.
 *
 * Reevaluate the following cap on the number of vnodes after the physical
 * memory size exceeds 512GB.  In the limit, as the physical memory size
 * grows, the ratio of physical pages to vnodes approaches sixteen to one.
290 */ 291 #ifndef MAXVNODES_MAX 292 #define MAXVNODES_MAX (512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16)) 293 #endif 294 static void 295 vntblinit(void *dummy __unused) 296 { 297 int physvnodes, virtvnodes; 298 299 /* 300 * Desiredvnodes is a function of the physical memory size and the 301 * kernel's heap size. Generally speaking, it scales with the 302 * physical memory size. The ratio of desiredvnodes to physical pages 303 * is one to four until desiredvnodes exceeds 98,304. Thereafter, the 304 * marginal ratio of desiredvnodes to physical pages is one to 305 * sixteen. However, desiredvnodes is limited by the kernel's heap 306 * size. The memory required by desiredvnodes vnodes and vm objects 307 * may not exceed one seventh of the kernel's heap size. 308 */ 309 physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4, 310 cnt.v_page_count) / 16; 311 virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) + 312 sizeof(struct vnode))); 313 desiredvnodes = min(physvnodes, virtvnodes); 314 if (desiredvnodes > MAXVNODES_MAX) { 315 if (bootverbose) 316 printf("Reducing kern.maxvnodes %d -> %d\n", 317 desiredvnodes, MAXVNODES_MAX); 318 desiredvnodes = MAXVNODES_MAX; 319 } 320 wantfreevnodes = desiredvnodes / 4; 321 mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); 322 TAILQ_INIT(&vnode_free_list); 323 mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); 324 vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, 325 NULL, NULL, UMA_ALIGN_PTR, 0); 326 vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), 327 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 328 /* 329 * Initialize the filesystem syncer. 330 */ 331 syncer_workitem_pending[WI_MPSAFEQ] = hashinit(syncer_maxdelay, M_VNODE, 332 &syncer_mask); 333 syncer_workitem_pending[WI_GIANTQ] = hashinit(syncer_maxdelay, M_VNODE, 334 &syncer_mask); 335 syncer_maxdelay = syncer_mask + 1; 336 mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); 337 cv_init(&sync_wakeup, "syncer"); 338 } 339 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); 340 341 342 /* 343 * Mark a mount point as busy. Used to synchronize access and to delay 344 * unmounting. Eventually, mountlist_mtx is not released on failure. 345 * 346 * vfs_busy() is a custom lock, it can block the caller. 347 * vfs_busy() only sleeps if the unmount is active on the mount point. 348 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any 349 * vnode belonging to mp. 350 * 351 * Lookup uses vfs_busy() to traverse mount points. 352 * root fs var fs 353 * / vnode lock A / vnode lock (/var) D 354 * /var vnode lock B /log vnode lock(/var/log) E 355 * vfs_busy lock C vfs_busy lock F 356 * 357 * Within each file system, the lock order is C->A->B and F->D->E. 358 * 359 * When traversing across mounts, the system follows that lock order: 360 * 361 * C->A->B 362 * | 363 * +->F->D->E 364 * 365 * The lookup() process for namei("/var") illustrates the process: 366 * VOP_LOOKUP() obtains B while A is held 367 * vfs_busy() obtains a shared lock on F while A and B are held 368 * vput() releases lock on B 369 * vput() releases lock on A 370 * VFS_ROOT() obtains lock on D while shared lock on F is held 371 * vfs_unbusy() releases shared lock on F 372 * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. 373 * Attempt to lock A (instead of vp_crossmp) while D is held would 374 * violate the global order, causing deadlocks. 375 * 376 * dounmount() locks B while F is drained. 
 */
int
vfs_busy(struct mount *mp, int flags)
{

	MPASS((flags & ~MBF_MASK) == 0);
	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);

	MNT_ILOCK(mp);
	MNT_REF(mp);
	/*
	 * If the mount point is currently being unmounted, sleep until the
	 * mount point's fate is decided.  If the thread doing the unmounting
	 * fails, it will clear the MNTK_UNMOUNT flag before waking us up,
	 * indicating that this mount point has survived the unmount attempt
	 * and vfs_busy should retry.  Otherwise the unmounter thread will set
	 * the MNTK_REFEXPIRE flag in addition to MNTK_UNMOUNT, indicating
	 * that the mount point is about to be really destroyed.  vfs_busy
	 * needs to release its reference on the mount point in this case and
	 * return with ENOENT, telling the caller that the mount point it
	 * tried to busy is no longer valid.
	 */
	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
			MNT_REL(mp);
			MNT_IUNLOCK(mp);
			CTR1(KTR_VFS, "%s: failed busying before sleeping",
			    __func__);
			return (ENOENT);
		}
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
		if (flags & MBF_MNTLSTLOCK)
			mtx_lock(&mountlist_mtx);
		MNT_ILOCK(mp);
	}
	if (flags & MBF_MNTLSTLOCK)
		mtx_unlock(&mountlist_mtx);
	mp->mnt_lockref++;
	MNT_IUNLOCK(mp);
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(struct mount *mp)
{

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
	MNT_ILOCK(mp);
	MNT_REL(mp);
	KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
	mp->mnt_lockref--;
	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
		mp->mnt_kern_flag &= ~MNTK_DRAINING;
		wakeup(&mp->mnt_lockref);
	}
	MNT_IUNLOCK(mp);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	struct mount *mp;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			vfs_ref(mp);
			mtx_unlock(&mountlist_mtx);
			return (mp);
		}
	}
	mtx_unlock(&mountlist_mtx);
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	return ((struct mount *) 0);
}

/*
 * Lookup a mount point by filesystem identifier, busying it before
 * returning.
 */
struct mount *
vfs_busyfs(fsid_t *fsid)
{
	struct mount *mp;
	int error;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			error = vfs_busy(mp, MBF_MNTLSTLOCK);
			if (error) {
				mtx_unlock(&mountlist_mtx);
				return (NULL);
			}
			return (mp);
		}
	}
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	mtx_unlock(&mountlist_mtx);
	return ((struct mount *) 0);
}
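
/*
 * Illustrative sketch (not part of the original file): the usual way to
 * walk the mount list uses vfs_busy()/vfs_unbusy() so that a mount point
 * cannot be unmounted out from under the caller; vnlru_proc() below uses
 * the same pattern:
 *
 *	mtx_lock(&mountlist_mtx);
 *	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 *		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
 *			nmp = TAILQ_NEXT(mp, mnt_list);
 *			continue;
 *		}
 *		// ... operate on mp while it is busied ...
 *		mtx_lock(&mountlist_mtx);
 *		nmp = TAILQ_NEXT(mp, mnt_list);
 *		vfs_unbusy(mp);
 *	}
 *	mtx_unlock(&mountlist_mtx);
 *
 * MBF_MNTLSTLOCK tells vfs_busy() that mountlist_mtx is held; on success it
 * is released, so the caller retakes it before advancing to the next mount.
 */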
/*
 * Check if a user can access privileged mount options.
 */
int
vfs_suser(struct mount *mp, struct thread *td)
{
	int error;

	/*
	 * If the thread is jailed, but this is not a jail-friendly file
	 * system, deny immediately.
	 */
	if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
		return (EPERM);

	/*
	 * If the file system was mounted outside the jail of the calling
	 * thread, deny immediately.
	 */
	if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
		return (EPERM);

	/*
	 * If the file system supports delegated administration, we don't
	 * check for the PRIV_VFS_MOUNT_OWNER privilege - it will be better
	 * verified by the file system itself.
	 * If this is not the user that did the original mount, we check for
	 * the PRIV_VFS_MOUNT_OWNER privilege.
	 */
	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
			return (error);
	}
	return (0);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
 * support 16-bit device numbers.  We end up with unique val[0]'s for the
 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
 */
void
vfs_getnewfsid(struct mount *mp)
{
	static uint16_t mntid_base;
	struct mount *nmp;
	fsid_t tfsid;
	int mtype;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
	mtx_lock(&mntid_mtx);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makedev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
			break;
		vfs_rel(nmp);
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	mtx_unlock(&mntid_mtx);
}

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "File timestamp precision (0: seconds, "
    "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
    "3+: sec + ns (max. precision))");
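
/*
 * Illustrative usage (not part of the original file): the knob above is
 * exposed as the vfs.timestamp_precision sysctl, so for example
 *
 *	sysctl vfs.timestamp_precision=3
 *
 * selects full-precision (TSP_NSEC) timestamps for subsequent file
 * timestamp updates made through vfs_timestamp() below.
 */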
/*
 * Get a current timestamp.
 */
void
vfs_timestamp(struct timespec *tsp)
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(struct vattr *vap)
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * This routine is called when we have too many vnodes.  It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).  Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * The buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 */
static int
vlrureclaim(struct mount *mp)
{
	struct vnode *vp;
	int done;
	int trigger;
	int usevnodes;
	int count;

	/*
	 * Calculate the trigger point, don't allow user
	 * screwups to blow us up.  This prevents us from
	 * recycling vnodes with lots of resident pages.  We
	 * aren't trying to free memory, we are trying to
	 * free vnodes.
	 */
	usevnodes = desiredvnodes;
	if (usevnodes <= 0)
		usevnodes = 1;
	trigger = cnt.v_page_count * 2 / usevnodes;
	done = 0;
	vn_start_write(NULL, &mp, V_WAIT);
	MNT_ILOCK(mp);
	count = mp->mnt_nvnodelistsize / 10 + 1;
	while (count != 0) {
		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
		while (vp != NULL && vp->v_type == VMARKER)
			vp = TAILQ_NEXT(vp, v_nmntvnodes);
		if (vp == NULL)
			break;
		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		--count;
		if (!VI_TRYLOCK(vp))
			goto next_iter;
		/*
		 * If it's been deconstructed already, it's still
		 * referenced, or it exceeds the trigger, skip it.
		 */
		if (vp->v_usecount ||
		    (!vlru_allow_cache_src &&
			!LIST_EMPTY(&(vp)->v_cache_src)) ||
		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
		    vp->v_object->resident_page_count > trigger)) {
			VI_UNLOCK(vp);
			goto next_iter;
		}
		MNT_IUNLOCK(mp);
		vholdl(vp);
		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
			vdrop(vp);
			goto next_iter_mntunlocked;
		}
		VI_LOCK(vp);
		/*
		 * v_usecount may have been bumped after VOP_LOCK() dropped
		 * the vnode interlock and before it was locked again.
		 *
		 * It is not necessary to recheck VI_DOOMED because it can
		 * only be set by another thread that holds both the vnode
		 * lock and vnode interlock.  If another thread has the
		 * vnode lock before we get to VOP_LOCK() and obtains the
		 * vnode interlock after VOP_LOCK() drops the vnode
		 * interlock, the other thread will be unable to drop the
		 * vnode lock before our VOP_LOCK() call fails.
		 */
		if (vp->v_usecount ||
		    (!vlru_allow_cache_src &&
			!LIST_EMPTY(&(vp)->v_cache_src)) ||
		    (vp->v_object != NULL &&
		    vp->v_object->resident_page_count > trigger)) {
			VOP_UNLOCK(vp, LK_INTERLOCK);
			goto next_iter_mntunlocked;
		}
		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
		vgonel(vp);
		VOP_UNLOCK(vp, 0);
		vdropl(vp);
		done++;
next_iter_mntunlocked:
		if (!should_yield())
			goto relock_mnt;
		goto yield;
next_iter:
		if (!should_yield())
			continue;
		MNT_IUNLOCK(mp);
yield:
		kern_yield(PRI_UNCHANGED);
relock_mnt:
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	vn_finished_write(mp);
	return done;
}

/*
 * Attempt to keep the free list at wantfreevnodes length.
 */
static void
vnlru_free(int count)
{
	struct vnode *vp;
	int vfslocked;

	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
	for (; count > 0; count--) {
		vp = TAILQ_FIRST(&vnode_free_list);
		/*
		 * The list can be modified while the free_list_mtx
		 * has been dropped and vp could be NULL here.
		 */
		if (!vp)
			break;
		VNASSERT(vp->v_op != NULL, vp,
		    ("vnlru_free: vnode already reclaimed."));
		KASSERT((vp->v_iflag & VI_FREE) != 0,
		    ("Removing vnode not on freelist"));
		KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
		    ("Mangling active vnode"));
		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
		/*
		 * Don't recycle if we can't get the interlock.
		 */
		if (!VI_TRYLOCK(vp)) {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
			continue;
		}
		VNASSERT(VCANRECYCLE(vp), vp,
		    ("vp inconsistent on freelist"));
		freevnodes--;
		vp->v_iflag &= ~VI_FREE;
		vholdl(vp);
		mtx_unlock(&vnode_free_list_mtx);
		VI_UNLOCK(vp);
		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
		vtryrecycle(vp);
		VFS_UNLOCK_GIANT(vfslocked);
		/*
		 * If the recycle succeeded this vdrop will actually free
		 * the vnode.  If not it will simply place it back on
		 * the free list.
		 */
		vdrop(vp);
		mtx_lock(&vnode_free_list_mtx);
	}
}
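
/*
 * Illustrative sketch (not part of the original file): vnlru_free() is
 * entered and exited with vnode_free_list_mtx held, so a caller that only
 * wants to trim the free list looks roughly like:
 *
 *	mtx_lock(&vnode_free_list_mtx);
 *	if (freevnodes > wantfreevnodes)
 *		vnlru_free(freevnodes - wantfreevnodes);
 *	mtx_unlock(&vnode_free_list_mtx);
 *
 * This approximates the callers in vnlru_proc() and getnewvnode() below.
 */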
/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrureclaim() from the bowels of filesystem code has some
 * interesting deadlock problems.
 */
static struct proc *vnlruproc;
static int vnlruproc_sig;

static void
vnlru_proc(void)
{
	struct mount *mp, *nmp;
	int done, vfslocked;
	struct proc *p = vnlruproc;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
	    SHUTDOWN_PRI_FIRST);

	for (;;) {
		kproc_suspend_check(p);
		mtx_lock(&vnode_free_list_mtx);
		if (freevnodes > wantfreevnodes)
			vnlru_free(freevnodes - wantfreevnodes);
		if (numvnodes <= desiredvnodes * 9 / 10) {
			vnlruproc_sig = 0;
			wakeup(&vnlruproc_sig);
			msleep(vnlruproc, &vnode_free_list_mtx,
			    PVFS|PDROP, "vlruwt", hz);
			continue;
		}
		mtx_unlock(&vnode_free_list_mtx);
		done = 0;
		mtx_lock(&mountlist_mtx);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
			vfslocked = VFS_LOCK_GIANT(mp);
			done += vlrureclaim(mp);
			VFS_UNLOCK_GIANT(vfslocked);
			mtx_lock(&mountlist_mtx);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp);
		}
		mtx_unlock(&mountlist_mtx);
		if (done == 0) {
#if 0
			/* These messages are temporary debugging aids */
			if (vnlru_nowhere < 5)
				printf("vnlru process getting nowhere..\n");
			else if (vnlru_nowhere == 5)
				printf("vnlru process messages stopped.\n");
#endif
			vnlru_nowhere++;
			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
		} else
			kern_yield(PRI_UNCHANGED);
	}
}

static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
    &vnlru_kp);

/*
 * Routines having to do with the management of the vnode table.
 */

/*
 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
 * before we actually vgone().  This function must be called with the vnode
 * held to prevent the vnode from being returned to the free list midway
 * through vgone().
 */
static int
vtryrecycle(struct vnode *vp)
{
	struct mount *vnmp;

	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	VNASSERT(vp->v_holdcnt, vp,
	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
	/*
	 * This vnode may be found and locked via some other list, if so we
	 * can't recycle it yet.
	 */
	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, vp %p lock is already held",
		    __func__, vp);
		return (EWOULDBLOCK);
	}
	/*
	 * Don't recycle if its filesystem is being suspended.
	 */
	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
		VOP_UNLOCK(vp, 0);
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, cannot start the write for %p",
		    __func__, vp);
		return (EBUSY);
	}
	/*
	 * If we got this far, we need to acquire the interlock and see if
	 * anyone picked up this vnode from another list.  If not, we will
	 * mark it with DOOMED via vgonel() so that anyone who does find it
	 * will skip over it.
	 */
	VI_LOCK(vp);
	if (vp->v_usecount) {
		VOP_UNLOCK(vp, LK_INTERLOCK);
		vn_finished_write(vnmp);
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, %p is already referenced",
		    __func__, vp);
		return (EBUSY);
	}
	if ((vp->v_iflag & VI_DOOMED) == 0)
		vgonel(vp);
	VOP_UNLOCK(vp, LK_INTERLOCK);
	vn_finished_write(vnmp);
	return (0);
}
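
/*
 * Illustrative sketch (not part of the original file): a filesystem's
 * VFS_VGET() implementation typically pairs getnewvnode() with insmntque(),
 * locking the new vnode before it is put on the mount's vnode list.  The
 * "myfs" names below are hypothetical:
 *
 *	error = getnewvnode("myfs", mp, &myfs_vnodeops, &vp);
 *	if (error != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = insmntque(vp, mp);	// on failure vp is vgone()d and vput()
 *	if (error != 0)
 *		return (error);
 *
 * See insmntque_stddtr() below for the cleanup applied when insmntque()
 * fails.
 */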
/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
    struct vnode **vpp)
{
	struct vnode *vp = NULL;
	struct bufobj *bo;

	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
	mtx_lock(&vnode_free_list_mtx);
	/*
	 * Lend our context to reclaim vnodes if they've exceeded the max.
	 */
	if (freevnodes > wantfreevnodes)
		vnlru_free(1);
	/*
	 * Wait for available vnodes.
	 */
	if (numvnodes > desiredvnodes) {
		if (mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)) {
			/*
			 * The file system is being suspended, we cannot risk a
			 * deadlock here, so allocate a new vnode anyway.
			 */
			if (freevnodes > wantfreevnodes)
				vnlru_free(freevnodes - wantfreevnodes);
			goto alloc;
		}
		if (vnlruproc_sig == 0) {
			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
			wakeup(vnlruproc);
		}
		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
		    "vlruwk", hz);
#if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
		if (numvnodes > desiredvnodes) {
			mtx_unlock(&vnode_free_list_mtx);
			return (ENFILE);
		}
#endif
	}
alloc:
	numvnodes++;
	mtx_unlock(&vnode_free_list_mtx);
	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
	/*
	 * Setup locks.
	 */
	vp->v_vnlock = &vp->v_lock;
	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
	/*
	 * By default, don't allow shared locks unless filesystems
	 * opt-in.
	 */
	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
	/*
	 * Initialize bufobj.
	 */
	bo = &vp->v_bufobj;
	bo->__bo_vnode = vp;
	mtx_init(BO_MTX(bo), "bufobj interlock", NULL, MTX_DEF);
	bo->bo_ops = &buf_ops_bio;
	bo->bo_private = vp;
	TAILQ_INIT(&bo->bo_clean.bv_hd);
	TAILQ_INIT(&bo->bo_dirty.bv_hd);
	/*
	 * Initialize namecache.
	 */
	LIST_INIT(&vp->v_cache_src);
	TAILQ_INIT(&vp->v_cache_dst);
	/*
	 * Finalize various vnode identity bits.
	 */
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	v_incr_usecount(vp);
	vp->v_data = NULL;
#ifdef MAC
	mac_vnode_init(vp);
	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
		mac_vnode_associate_singlelabel(mp, vp);
	else if (mp == NULL && vops != &dead_vnodeops)
		printf("NULL mp in getnewvnode()\n");
#endif
	if (mp != NULL) {
		bo->bo_bsize = mp->mnt_stat.f_iosize;
		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
			vp->v_vflag |= VV_NOKNOTE;
	}
	rangelock_init(&vp->v_rl);

	*vpp = vp;
	return (0);
}

/*
 * Delete from old mount point vnode list, if on one.
1036 */ 1037 static void 1038 delmntque(struct vnode *vp) 1039 { 1040 struct mount *mp; 1041 int active; 1042 1043 mp = vp->v_mount; 1044 if (mp == NULL) 1045 return; 1046 MNT_ILOCK(mp); 1047 VI_LOCK(vp); 1048 KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize, 1049 ("Active vnode list size %d > Vnode list size %d", 1050 mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize)); 1051 active = vp->v_iflag & VI_ACTIVE; 1052 vp->v_iflag &= ~VI_ACTIVE; 1053 if (active) { 1054 mtx_lock(&vnode_free_list_mtx); 1055 TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); 1056 mp->mnt_activevnodelistsize--; 1057 mtx_unlock(&vnode_free_list_mtx); 1058 } 1059 vp->v_mount = NULL; 1060 VI_UNLOCK(vp); 1061 VNASSERT(mp->mnt_nvnodelistsize > 0, vp, 1062 ("bad mount point vnode list size")); 1063 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1064 mp->mnt_nvnodelistsize--; 1065 MNT_REL(mp); 1066 MNT_IUNLOCK(mp); 1067 } 1068 1069 static void 1070 insmntque_stddtr(struct vnode *vp, void *dtr_arg) 1071 { 1072 1073 vp->v_data = NULL; 1074 vp->v_op = &dead_vnodeops; 1075 /* XXX non mp-safe fs may still call insmntque with vnode 1076 unlocked */ 1077 if (!VOP_ISLOCKED(vp)) 1078 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1079 vgone(vp); 1080 vput(vp); 1081 } 1082 1083 /* 1084 * Insert into list of vnodes for the new mount point, if available. 1085 */ 1086 int 1087 insmntque1(struct vnode *vp, struct mount *mp, 1088 void (*dtr)(struct vnode *, void *), void *dtr_arg) 1089 { 1090 int locked; 1091 1092 KASSERT(vp->v_mount == NULL, 1093 ("insmntque: vnode already on per mount vnode list")); 1094 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); 1095 #ifdef DEBUG_VFS_LOCKS 1096 if (!VFS_NEEDSGIANT(mp)) 1097 ASSERT_VOP_ELOCKED(vp, 1098 "insmntque: mp-safe fs and non-locked vp"); 1099 #endif 1100 /* 1101 * We acquire the vnode interlock early to ensure that the 1102 * vnode cannot be recycled by another process releasing a 1103 * holdcnt on it before we get it on both the vnode list 1104 * and the active vnode list. The mount mutex protects only 1105 * manipulation of the vnode list and the vnode freelist 1106 * mutex protects only manipulation of the active vnode list. 1107 * Hence the need to hold the vnode interlock throughout. 
1108 */ 1109 MNT_ILOCK(mp); 1110 VI_LOCK(vp); 1111 if ((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 && 1112 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || 1113 mp->mnt_nvnodelistsize == 0)) { 1114 locked = VOP_ISLOCKED(vp); 1115 if (!locked || (locked == LK_EXCLUSIVE && 1116 (vp->v_vflag & VV_FORCEINSMQ) == 0)) { 1117 VI_UNLOCK(vp); 1118 MNT_IUNLOCK(mp); 1119 if (dtr != NULL) 1120 dtr(vp, dtr_arg); 1121 return (EBUSY); 1122 } 1123 } 1124 vp->v_mount = mp; 1125 MNT_REF(mp); 1126 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1127 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, 1128 ("neg mount point vnode list size")); 1129 mp->mnt_nvnodelistsize++; 1130 KASSERT((vp->v_iflag & VI_ACTIVE) == 0, 1131 ("Activating already active vnode")); 1132 vp->v_iflag |= VI_ACTIVE; 1133 mtx_lock(&vnode_free_list_mtx); 1134 TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); 1135 mp->mnt_activevnodelistsize++; 1136 mtx_unlock(&vnode_free_list_mtx); 1137 VI_UNLOCK(vp); 1138 MNT_IUNLOCK(mp); 1139 return (0); 1140 } 1141 1142 int 1143 insmntque(struct vnode *vp, struct mount *mp) 1144 { 1145 1146 return (insmntque1(vp, mp, insmntque_stddtr, NULL)); 1147 } 1148 1149 /* 1150 * Flush out and invalidate all buffers associated with a bufobj 1151 * Called with the underlying object locked. 1152 */ 1153 int 1154 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) 1155 { 1156 int error; 1157 1158 BO_LOCK(bo); 1159 if (flags & V_SAVE) { 1160 error = bufobj_wwait(bo, slpflag, slptimeo); 1161 if (error) { 1162 BO_UNLOCK(bo); 1163 return (error); 1164 } 1165 if (bo->bo_dirty.bv_cnt > 0) { 1166 BO_UNLOCK(bo); 1167 if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) 1168 return (error); 1169 /* 1170 * XXX We could save a lock/unlock if this was only 1171 * enabled under INVARIANTS 1172 */ 1173 BO_LOCK(bo); 1174 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 1175 panic("vinvalbuf: dirty bufs"); 1176 } 1177 } 1178 /* 1179 * If you alter this loop please notice that interlock is dropped and 1180 * reacquired in flushbuflist. Special care is needed to ensure that 1181 * no race conditions occur from this. 1182 */ 1183 do { 1184 error = flushbuflist(&bo->bo_clean, 1185 flags, bo, slpflag, slptimeo); 1186 if (error == 0 && !(flags & V_CLEANONLY)) 1187 error = flushbuflist(&bo->bo_dirty, 1188 flags, bo, slpflag, slptimeo); 1189 if (error != 0 && error != EAGAIN) { 1190 BO_UNLOCK(bo); 1191 return (error); 1192 } 1193 } while (error != 0); 1194 1195 /* 1196 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 1197 * have write I/O in-progress but if there is a VM object then the 1198 * VM object can also have read-I/O in-progress. 1199 */ 1200 do { 1201 bufobj_wwait(bo, 0, 0); 1202 BO_UNLOCK(bo); 1203 if (bo->bo_object != NULL) { 1204 VM_OBJECT_LOCK(bo->bo_object); 1205 vm_object_pip_wait(bo->bo_object, "bovlbx"); 1206 VM_OBJECT_UNLOCK(bo->bo_object); 1207 } 1208 BO_LOCK(bo); 1209 } while (bo->bo_numoutput > 0); 1210 BO_UNLOCK(bo); 1211 1212 /* 1213 * Destroy the copy in the VM cache, too. 1214 */ 1215 if (bo->bo_object != NULL && 1216 (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) { 1217 VM_OBJECT_LOCK(bo->bo_object); 1218 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? 
1219 OBJPR_CLEANONLY : 0); 1220 VM_OBJECT_UNLOCK(bo->bo_object); 1221 } 1222 1223 #ifdef INVARIANTS 1224 BO_LOCK(bo); 1225 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 && 1226 (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) 1227 panic("vinvalbuf: flush failed"); 1228 BO_UNLOCK(bo); 1229 #endif 1230 return (0); 1231 } 1232 1233 /* 1234 * Flush out and invalidate all buffers associated with a vnode. 1235 * Called with the underlying object locked. 1236 */ 1237 int 1238 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 1239 { 1240 1241 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 1242 ASSERT_VOP_LOCKED(vp, "vinvalbuf"); 1243 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); 1244 } 1245 1246 /* 1247 * Flush out buffers on the specified list. 1248 * 1249 */ 1250 static int 1251 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, 1252 int slptimeo) 1253 { 1254 struct buf *bp, *nbp; 1255 int retval, error; 1256 daddr_t lblkno; 1257 b_xflags_t xflags; 1258 1259 ASSERT_BO_LOCKED(bo); 1260 1261 retval = 0; 1262 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { 1263 if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) || 1264 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) { 1265 continue; 1266 } 1267 lblkno = 0; 1268 xflags = 0; 1269 if (nbp != NULL) { 1270 lblkno = nbp->b_lblkno; 1271 xflags = nbp->b_xflags & 1272 (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN); 1273 } 1274 retval = EAGAIN; 1275 error = BUF_TIMELOCK(bp, 1276 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo), 1277 "flushbuf", slpflag, slptimeo); 1278 if (error) { 1279 BO_LOCK(bo); 1280 return (error != ENOLCK ? error : EAGAIN); 1281 } 1282 KASSERT(bp->b_bufobj == bo, 1283 ("bp %p wrong b_bufobj %p should be %p", 1284 bp, bp->b_bufobj, bo)); 1285 if (bp->b_bufobj != bo) { /* XXX: necessary ? */ 1286 BUF_UNLOCK(bp); 1287 BO_LOCK(bo); 1288 return (EAGAIN); 1289 } 1290 /* 1291 * XXX Since there are no node locks for NFS, I 1292 * believe there is a slight chance that a delayed 1293 * write will occur while sleeping just above, so 1294 * check for it. 1295 */ 1296 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 1297 (flags & V_SAVE)) { 1298 BO_LOCK(bo); 1299 bremfree(bp); 1300 BO_UNLOCK(bo); 1301 bp->b_flags |= B_ASYNC; 1302 bwrite(bp); 1303 BO_LOCK(bo); 1304 return (EAGAIN); /* XXX: why not loop ? */ 1305 } 1306 BO_LOCK(bo); 1307 bremfree(bp); 1308 BO_UNLOCK(bo); 1309 bp->b_flags |= (B_INVAL | B_RELBUF); 1310 bp->b_flags &= ~B_ASYNC; 1311 brelse(bp); 1312 BO_LOCK(bo); 1313 if (nbp != NULL && 1314 (nbp->b_bufobj != bo || 1315 nbp->b_lblkno != lblkno || 1316 (nbp->b_xflags & 1317 (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags)) 1318 break; /* nbp invalid */ 1319 } 1320 return (retval); 1321 } 1322 1323 /* 1324 * Truncate a file's buffer and pages to a specified length. This 1325 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 1326 * sync activity. 1327 */ 1328 int 1329 vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize) 1330 { 1331 struct buf *bp, *nbp; 1332 int anyfreed; 1333 int trunclbn; 1334 struct bufobj *bo; 1335 1336 CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__, 1337 vp, cred, blksize, (uintmax_t)length); 1338 1339 /* 1340 * Round up to the *next* lbn. 
1341 */ 1342 trunclbn = (length + blksize - 1) / blksize; 1343 1344 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 1345 restart: 1346 bo = &vp->v_bufobj; 1347 BO_LOCK(bo); 1348 anyfreed = 1; 1349 for (;anyfreed;) { 1350 anyfreed = 0; 1351 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { 1352 if (bp->b_lblkno < trunclbn) 1353 continue; 1354 if (BUF_LOCK(bp, 1355 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 1356 BO_MTX(bo)) == ENOLCK) 1357 goto restart; 1358 1359 BO_LOCK(bo); 1360 bremfree(bp); 1361 BO_UNLOCK(bo); 1362 bp->b_flags |= (B_INVAL | B_RELBUF); 1363 bp->b_flags &= ~B_ASYNC; 1364 brelse(bp); 1365 anyfreed = 1; 1366 1367 BO_LOCK(bo); 1368 if (nbp != NULL && 1369 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 1370 (nbp->b_vp != vp) || 1371 (nbp->b_flags & B_DELWRI))) { 1372 BO_UNLOCK(bo); 1373 goto restart; 1374 } 1375 } 1376 1377 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 1378 if (bp->b_lblkno < trunclbn) 1379 continue; 1380 if (BUF_LOCK(bp, 1381 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 1382 BO_MTX(bo)) == ENOLCK) 1383 goto restart; 1384 BO_LOCK(bo); 1385 bremfree(bp); 1386 BO_UNLOCK(bo); 1387 bp->b_flags |= (B_INVAL | B_RELBUF); 1388 bp->b_flags &= ~B_ASYNC; 1389 brelse(bp); 1390 anyfreed = 1; 1391 1392 BO_LOCK(bo); 1393 if (nbp != NULL && 1394 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 1395 (nbp->b_vp != vp) || 1396 (nbp->b_flags & B_DELWRI) == 0)) { 1397 BO_UNLOCK(bo); 1398 goto restart; 1399 } 1400 } 1401 } 1402 1403 if (length > 0) { 1404 restartsync: 1405 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 1406 if (bp->b_lblkno > 0) 1407 continue; 1408 /* 1409 * Since we hold the vnode lock this should only 1410 * fail if we're racing with the buf daemon. 1411 */ 1412 if (BUF_LOCK(bp, 1413 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 1414 BO_MTX(bo)) == ENOLCK) { 1415 goto restart; 1416 } 1417 VNASSERT((bp->b_flags & B_DELWRI), vp, 1418 ("buf(%p) on dirty queue without DELWRI", bp)); 1419 1420 BO_LOCK(bo); 1421 bremfree(bp); 1422 BO_UNLOCK(bo); 1423 bawrite(bp); 1424 BO_LOCK(bo); 1425 goto restartsync; 1426 } 1427 } 1428 1429 bufobj_wwait(bo, 0, 0); 1430 BO_UNLOCK(bo); 1431 vnode_pager_setsize(vp, length); 1432 1433 return (0); 1434 } 1435 1436 /* 1437 * buf_splay() - splay tree core for the clean/dirty list of buffers in 1438 * a vnode. 1439 * 1440 * NOTE: We have to deal with the special case of a background bitmap 1441 * buffer, a situation where two buffers will have the same logical 1442 * block offset. We want (1) only the foreground buffer to be accessed 1443 * in a lookup and (2) must differentiate between the foreground and 1444 * background buffer in the splay tree algorithm because the splay 1445 * tree cannot normally handle multiple entities with the same 'index'. 1446 * We accomplish this by adding differentiating flags to the splay tree's 1447 * numerical domain. 1448 */ 1449 static 1450 struct buf * 1451 buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root) 1452 { 1453 struct buf dummy; 1454 struct buf *lefttreemax, *righttreemin, *y; 1455 1456 if (root == NULL) 1457 return (NULL); 1458 lefttreemax = righttreemin = &dummy; 1459 for (;;) { 1460 if (lblkno < root->b_lblkno || 1461 (lblkno == root->b_lblkno && 1462 (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) { 1463 if ((y = root->b_left) == NULL) 1464 break; 1465 if (lblkno < y->b_lblkno) { 1466 /* Rotate right. 
*/ 1467 root->b_left = y->b_right; 1468 y->b_right = root; 1469 root = y; 1470 if ((y = root->b_left) == NULL) 1471 break; 1472 } 1473 /* Link into the new root's right tree. */ 1474 righttreemin->b_left = root; 1475 righttreemin = root; 1476 } else if (lblkno > root->b_lblkno || 1477 (lblkno == root->b_lblkno && 1478 (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) { 1479 if ((y = root->b_right) == NULL) 1480 break; 1481 if (lblkno > y->b_lblkno) { 1482 /* Rotate left. */ 1483 root->b_right = y->b_left; 1484 y->b_left = root; 1485 root = y; 1486 if ((y = root->b_right) == NULL) 1487 break; 1488 } 1489 /* Link into the new root's left tree. */ 1490 lefttreemax->b_right = root; 1491 lefttreemax = root; 1492 } else { 1493 break; 1494 } 1495 root = y; 1496 } 1497 /* Assemble the new root. */ 1498 lefttreemax->b_right = root->b_left; 1499 righttreemin->b_left = root->b_right; 1500 root->b_left = dummy.b_right; 1501 root->b_right = dummy.b_left; 1502 return (root); 1503 } 1504 1505 static void 1506 buf_vlist_remove(struct buf *bp) 1507 { 1508 struct buf *root; 1509 struct bufv *bv; 1510 1511 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 1512 ASSERT_BO_LOCKED(bp->b_bufobj); 1513 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != 1514 (BX_VNDIRTY|BX_VNCLEAN), 1515 ("buf_vlist_remove: Buf %p is on two lists", bp)); 1516 if (bp->b_xflags & BX_VNDIRTY) 1517 bv = &bp->b_bufobj->bo_dirty; 1518 else 1519 bv = &bp->b_bufobj->bo_clean; 1520 if (bp != bv->bv_root) { 1521 root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root); 1522 KASSERT(root == bp, ("splay lookup failed in remove")); 1523 } 1524 if (bp->b_left == NULL) { 1525 root = bp->b_right; 1526 } else { 1527 root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left); 1528 root->b_right = bp->b_right; 1529 } 1530 bv->bv_root = root; 1531 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); 1532 bv->bv_cnt--; 1533 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 1534 } 1535 1536 /* 1537 * Add the buffer to the sorted clean or dirty block list using a 1538 * splay tree algorithm. 1539 * 1540 * NOTE: xflags is passed as a constant, optimizing this inline function! 1541 */ 1542 static void 1543 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 1544 { 1545 struct buf *root; 1546 struct bufv *bv; 1547 1548 ASSERT_BO_LOCKED(bo); 1549 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, 1550 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); 1551 bp->b_xflags |= xflags; 1552 if (xflags & BX_VNDIRTY) 1553 bv = &bo->bo_dirty; 1554 else 1555 bv = &bo->bo_clean; 1556 1557 root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root); 1558 if (root == NULL) { 1559 bp->b_left = NULL; 1560 bp->b_right = NULL; 1561 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); 1562 } else if (bp->b_lblkno < root->b_lblkno || 1563 (bp->b_lblkno == root->b_lblkno && 1564 (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) { 1565 bp->b_left = root->b_left; 1566 bp->b_right = root; 1567 root->b_left = NULL; 1568 TAILQ_INSERT_BEFORE(root, bp, b_bobufs); 1569 } else { 1570 bp->b_right = root->b_right; 1571 bp->b_left = root; 1572 root->b_right = NULL; 1573 TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs); 1574 } 1575 bv->bv_cnt++; 1576 bv->bv_root = bp; 1577 } 1578 1579 /* 1580 * Lookup a buffer using the splay tree. Note that we specifically avoid 1581 * shadow buffers used in background bitmap writes. 
1582 * 1583 * This code isn't quite efficient as it could be because we are maintaining 1584 * two sorted lists and do not know which list the block resides in. 1585 * 1586 * During a "make buildworld" the desired buffer is found at one of 1587 * the roots more than 60% of the time. Thus, checking both roots 1588 * before performing either splay eliminates unnecessary splays on the 1589 * first tree splayed. 1590 */ 1591 struct buf * 1592 gbincore(struct bufobj *bo, daddr_t lblkno) 1593 { 1594 struct buf *bp; 1595 1596 ASSERT_BO_LOCKED(bo); 1597 if ((bp = bo->bo_clean.bv_root) != NULL && 1598 bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) 1599 return (bp); 1600 if ((bp = bo->bo_dirty.bv_root) != NULL && 1601 bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) 1602 return (bp); 1603 if ((bp = bo->bo_clean.bv_root) != NULL) { 1604 bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp); 1605 if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) 1606 return (bp); 1607 } 1608 if ((bp = bo->bo_dirty.bv_root) != NULL) { 1609 bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp); 1610 if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) 1611 return (bp); 1612 } 1613 return (NULL); 1614 } 1615 1616 /* 1617 * Associate a buffer with a vnode. 1618 */ 1619 void 1620 bgetvp(struct vnode *vp, struct buf *bp) 1621 { 1622 struct bufobj *bo; 1623 1624 bo = &vp->v_bufobj; 1625 ASSERT_BO_LOCKED(bo); 1626 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 1627 1628 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 1629 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 1630 ("bgetvp: bp already attached! %p", bp)); 1631 1632 vhold(vp); 1633 if (VFS_NEEDSGIANT(vp->v_mount) || bo->bo_flag & BO_NEEDSGIANT) 1634 bp->b_flags |= B_NEEDSGIANT; 1635 bp->b_vp = vp; 1636 bp->b_bufobj = bo; 1637 /* 1638 * Insert onto list for new vnode. 1639 */ 1640 buf_vlist_add(bp, bo, BX_VNCLEAN); 1641 } 1642 1643 /* 1644 * Disassociate a buffer from a vnode. 1645 */ 1646 void 1647 brelvp(struct buf *bp) 1648 { 1649 struct bufobj *bo; 1650 struct vnode *vp; 1651 1652 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 1653 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 1654 1655 /* 1656 * Delete from old vnode list, if on one. 1657 */ 1658 vp = bp->b_vp; /* XXX */ 1659 bo = bp->b_bufobj; 1660 BO_LOCK(bo); 1661 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 1662 buf_vlist_remove(bp); 1663 else 1664 panic("brelvp: Buffer %p not on queue.", bp); 1665 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 1666 bo->bo_flag &= ~BO_ONWORKLST; 1667 mtx_lock(&sync_mtx); 1668 LIST_REMOVE(bo, bo_synclist); 1669 syncer_worklist_len--; 1670 mtx_unlock(&sync_mtx); 1671 } 1672 bp->b_flags &= ~B_NEEDSGIANT; 1673 bp->b_vp = NULL; 1674 bp->b_bufobj = NULL; 1675 BO_UNLOCK(bo); 1676 vdrop(vp); 1677 } 1678 1679 /* 1680 * Add an item to the syncer work queue. 1681 */ 1682 static void 1683 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 1684 { 1685 int queue, slot; 1686 1687 ASSERT_BO_LOCKED(bo); 1688 1689 mtx_lock(&sync_mtx); 1690 if (bo->bo_flag & BO_ONWORKLST) 1691 LIST_REMOVE(bo, bo_synclist); 1692 else { 1693 bo->bo_flag |= BO_ONWORKLST; 1694 syncer_worklist_len++; 1695 } 1696 1697 if (delay > syncer_maxdelay - 2) 1698 delay = syncer_maxdelay - 2; 1699 slot = (syncer_delayno + delay) & syncer_mask; 1700 1701 queue = VFS_NEEDSGIANT(bo->__bo_vnode->v_mount) ? 
WI_GIANTQ : 1702 WI_MPSAFEQ; 1703 LIST_INSERT_HEAD(&syncer_workitem_pending[queue][slot], bo, 1704 bo_synclist); 1705 mtx_unlock(&sync_mtx); 1706 } 1707 1708 static int 1709 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 1710 { 1711 int error, len; 1712 1713 mtx_lock(&sync_mtx); 1714 len = syncer_worklist_len - sync_vnode_count; 1715 mtx_unlock(&sync_mtx); 1716 error = SYSCTL_OUT(req, &len, sizeof(len)); 1717 return (error); 1718 } 1719 1720 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, 1721 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 1722 1723 static struct proc *updateproc; 1724 static void sched_sync(void); 1725 static struct kproc_desc up_kp = { 1726 "syncer", 1727 sched_sync, 1728 &updateproc 1729 }; 1730 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 1731 1732 static int 1733 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 1734 { 1735 struct vnode *vp; 1736 struct mount *mp; 1737 1738 *bo = LIST_FIRST(slp); 1739 if (*bo == NULL) 1740 return (0); 1741 vp = (*bo)->__bo_vnode; /* XXX */ 1742 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 1743 return (1); 1744 /* 1745 * We use vhold in case the vnode does not 1746 * successfully sync. vhold prevents the vnode from 1747 * going away when we unlock the sync_mtx so that 1748 * we can acquire the vnode interlock. 1749 */ 1750 vholdl(vp); 1751 mtx_unlock(&sync_mtx); 1752 VI_UNLOCK(vp); 1753 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 1754 vdrop(vp); 1755 mtx_lock(&sync_mtx); 1756 return (*bo == LIST_FIRST(slp)); 1757 } 1758 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1759 (void) VOP_FSYNC(vp, MNT_LAZY, td); 1760 VOP_UNLOCK(vp, 0); 1761 vn_finished_write(mp); 1762 BO_LOCK(*bo); 1763 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 1764 /* 1765 * Put us back on the worklist. The worklist 1766 * routine will remove us from our current 1767 * position and then add us back in at a later 1768 * position. 1769 */ 1770 vn_syncer_add_to_worklist(*bo, syncdelay); 1771 } 1772 BO_UNLOCK(*bo); 1773 vdrop(vp); 1774 mtx_lock(&sync_mtx); 1775 return (0); 1776 } 1777 1778 /* 1779 * System filesystem synchronizer daemon. 1780 */ 1781 static void 1782 sched_sync(void) 1783 { 1784 struct synclist *gnext, *next; 1785 struct synclist *gslp, *slp; 1786 struct bufobj *bo; 1787 long starttime; 1788 struct thread *td = curthread; 1789 int last_work_seen; 1790 int net_worklist_len; 1791 int syncer_final_iter; 1792 int first_printf; 1793 int error; 1794 1795 last_work_seen = 0; 1796 syncer_final_iter = 0; 1797 first_printf = 1; 1798 syncer_state = SYNCER_RUNNING; 1799 starttime = time_uptime; 1800 td->td_pflags |= TDP_NORUNNINGBUF; 1801 1802 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, 1803 SHUTDOWN_PRI_LAST); 1804 1805 mtx_lock(&sync_mtx); 1806 for (;;) { 1807 if (syncer_state == SYNCER_FINAL_DELAY && 1808 syncer_final_iter == 0) { 1809 mtx_unlock(&sync_mtx); 1810 kproc_suspend_check(td->td_proc); 1811 mtx_lock(&sync_mtx); 1812 } 1813 net_worklist_len = syncer_worklist_len - sync_vnode_count; 1814 if (syncer_state != SYNCER_RUNNING && 1815 starttime != time_uptime) { 1816 if (first_printf) { 1817 printf("\nSyncing disks, vnodes remaining..."); 1818 first_printf = 0; 1819 } 1820 printf("%d ", net_worklist_len); 1821 } 1822 starttime = time_uptime; 1823 1824 /* 1825 * Push files whose dirty time has expired. Be careful 1826 * of interrupt race on slp queue. 1827 * 1828 * Skip over empty worklist slots when shutting down. 
1829 */ 1830 do { 1831 slp = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno]; 1832 gslp = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno]; 1833 syncer_delayno += 1; 1834 if (syncer_delayno == syncer_maxdelay) 1835 syncer_delayno = 0; 1836 next = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno]; 1837 gnext = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno]; 1838 /* 1839 * If the worklist has wrapped since the 1840 * it was emptied of all but syncer vnodes, 1841 * switch to the FINAL_DELAY state and run 1842 * for one more second. 1843 */ 1844 if (syncer_state == SYNCER_SHUTTING_DOWN && 1845 net_worklist_len == 0 && 1846 last_work_seen == syncer_delayno) { 1847 syncer_state = SYNCER_FINAL_DELAY; 1848 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; 1849 } 1850 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && 1851 LIST_EMPTY(gslp) && syncer_worklist_len > 0); 1852 1853 /* 1854 * Keep track of the last time there was anything 1855 * on the worklist other than syncer vnodes. 1856 * Return to the SHUTTING_DOWN state if any 1857 * new work appears. 1858 */ 1859 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) 1860 last_work_seen = syncer_delayno; 1861 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) 1862 syncer_state = SYNCER_SHUTTING_DOWN; 1863 while (!LIST_EMPTY(slp)) { 1864 error = sync_vnode(slp, &bo, td); 1865 if (error == 1) { 1866 LIST_REMOVE(bo, bo_synclist); 1867 LIST_INSERT_HEAD(next, bo, bo_synclist); 1868 continue; 1869 } 1870 1871 if (first_printf == 0) 1872 wdog_kern_pat(WD_LASTVAL); 1873 1874 } 1875 if (!LIST_EMPTY(gslp)) { 1876 mtx_unlock(&sync_mtx); 1877 mtx_lock(&Giant); 1878 mtx_lock(&sync_mtx); 1879 while (!LIST_EMPTY(gslp)) { 1880 error = sync_vnode(gslp, &bo, td); 1881 if (error == 1) { 1882 LIST_REMOVE(bo, bo_synclist); 1883 LIST_INSERT_HEAD(gnext, bo, 1884 bo_synclist); 1885 continue; 1886 } 1887 } 1888 mtx_unlock(&Giant); 1889 } 1890 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) 1891 syncer_final_iter--; 1892 /* 1893 * The variable rushjob allows the kernel to speed up the 1894 * processing of the filesystem syncer process. A rushjob 1895 * value of N tells the filesystem syncer to process the next 1896 * N seconds worth of work on its queue ASAP. Currently rushjob 1897 * is used by the soft update code to speed up the filesystem 1898 * syncer process when the incore state is getting so far 1899 * ahead of the disk that the kernel memory pool is being 1900 * threatened with exhaustion. 1901 */ 1902 if (rushjob > 0) { 1903 rushjob -= 1; 1904 continue; 1905 } 1906 /* 1907 * Just sleep for a short period of time between 1908 * iterations when shutting down to allow some I/O 1909 * to happen. 1910 * 1911 * If it has taken us less than a second to process the 1912 * current work, then wait. Otherwise start right over 1913 * again. We can still lose time if any single round 1914 * takes more than two seconds, but it does not really 1915 * matter as we are just trying to generally pace the 1916 * filesystem activity. 1917 */ 1918 if (syncer_state != SYNCER_RUNNING || 1919 time_uptime == starttime) { 1920 thread_lock(td); 1921 sched_prio(td, PPAUSE); 1922 thread_unlock(td); 1923 } 1924 if (syncer_state != SYNCER_RUNNING) 1925 cv_timedwait(&sync_wakeup, &sync_mtx, 1926 hz / SYNCER_SHUTDOWN_SPEEDUP); 1927 else if (time_uptime == starttime) 1928 cv_timedwait(&sync_wakeup, &sync_mtx, hz); 1929 } 1930 } 1931 1932 /* 1933 * Request the syncer daemon to speed up its work. 
1934 * We never push it to speed up more than half of its 1935 * normal turn time, otherwise it could take over the cpu. 1936 */ 1937 int 1938 speedup_syncer(void) 1939 { 1940 int ret = 0; 1941 1942 mtx_lock(&sync_mtx); 1943 if (rushjob < syncdelay / 2) { 1944 rushjob += 1; 1945 stat_rush_requests += 1; 1946 ret = 1; 1947 } 1948 mtx_unlock(&sync_mtx); 1949 cv_broadcast(&sync_wakeup); 1950 return (ret); 1951 } 1952 1953 /* 1954 * Tell the syncer to speed up its work and run though its work 1955 * list several times, then tell it to shut down. 1956 */ 1957 static void 1958 syncer_shutdown(void *arg, int howto) 1959 { 1960 1961 if (howto & RB_NOSYNC) 1962 return; 1963 mtx_lock(&sync_mtx); 1964 syncer_state = SYNCER_SHUTTING_DOWN; 1965 rushjob = 0; 1966 mtx_unlock(&sync_mtx); 1967 cv_broadcast(&sync_wakeup); 1968 kproc_shutdown(arg, howto); 1969 } 1970 1971 /* 1972 * Reassign a buffer from one vnode to another. 1973 * Used to assign file specific control information 1974 * (indirect blocks) to the vnode to which they belong. 1975 */ 1976 void 1977 reassignbuf(struct buf *bp) 1978 { 1979 struct vnode *vp; 1980 struct bufobj *bo; 1981 int delay; 1982 #ifdef INVARIANTS 1983 struct bufv *bv; 1984 #endif 1985 1986 vp = bp->b_vp; 1987 bo = bp->b_bufobj; 1988 ++reassignbufcalls; 1989 1990 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", 1991 bp, bp->b_vp, bp->b_flags); 1992 /* 1993 * B_PAGING flagged buffers cannot be reassigned because their vp 1994 * is not fully linked in. 1995 */ 1996 if (bp->b_flags & B_PAGING) 1997 panic("cannot reassign paging buffer"); 1998 1999 /* 2000 * Delete from old vnode list, if on one. 2001 */ 2002 BO_LOCK(bo); 2003 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2004 buf_vlist_remove(bp); 2005 else 2006 panic("reassignbuf: Buffer %p not on queue.", bp); 2007 /* 2008 * If dirty, put on list of dirty buffers; otherwise insert onto list 2009 * of clean buffers. 2010 */ 2011 if (bp->b_flags & B_DELWRI) { 2012 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 2013 switch (vp->v_type) { 2014 case VDIR: 2015 delay = dirdelay; 2016 break; 2017 case VCHR: 2018 delay = metadelay; 2019 break; 2020 default: 2021 delay = filedelay; 2022 } 2023 vn_syncer_add_to_worklist(bo, delay); 2024 } 2025 buf_vlist_add(bp, bo, BX_VNDIRTY); 2026 } else { 2027 buf_vlist_add(bp, bo, BX_VNCLEAN); 2028 2029 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2030 mtx_lock(&sync_mtx); 2031 LIST_REMOVE(bo, bo_synclist); 2032 syncer_worklist_len--; 2033 mtx_unlock(&sync_mtx); 2034 bo->bo_flag &= ~BO_ONWORKLST; 2035 } 2036 } 2037 #ifdef INVARIANTS 2038 bv = &bo->bo_clean; 2039 bp = TAILQ_FIRST(&bv->bv_hd); 2040 KASSERT(bp == NULL || bp->b_bufobj == bo, 2041 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2042 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2043 KASSERT(bp == NULL || bp->b_bufobj == bo, 2044 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2045 bv = &bo->bo_dirty; 2046 bp = TAILQ_FIRST(&bv->bv_hd); 2047 KASSERT(bp == NULL || bp->b_bufobj == bo, 2048 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2049 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2050 KASSERT(bp == NULL || bp->b_bufobj == bo, 2051 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2052 #endif 2053 BO_UNLOCK(bo); 2054 } 2055 2056 /* 2057 * Increment the use and hold counts on the vnode, taking care to reference 2058 * the driver's usecount if this is a chardev. The vholdl() will remove 2059 * the vnode from the free list if it is presently free. 
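 * A minimal caller sketch (this is precisely what vref() below does):
 *
 *	VI_LOCK(vp);
 *	v_incr_usecount(vp);
 *	VI_UNLOCK(vp);
 *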
Requires the 2060 * vnode interlock and returns with it held. 2061 */ 2062 static void 2063 v_incr_usecount(struct vnode *vp) 2064 { 2065 2066 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2067 vp->v_usecount++; 2068 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2069 dev_lock(); 2070 vp->v_rdev->si_usecount++; 2071 dev_unlock(); 2072 } 2073 vholdl(vp); 2074 } 2075 2076 /* 2077 * Turn a holdcnt into a use+holdcnt such that only one call to 2078 * v_decr_usecount is needed. 2079 */ 2080 static void 2081 v_upgrade_usecount(struct vnode *vp) 2082 { 2083 2084 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2085 vp->v_usecount++; 2086 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2087 dev_lock(); 2088 vp->v_rdev->si_usecount++; 2089 dev_unlock(); 2090 } 2091 } 2092 2093 /* 2094 * Decrement the vnode use and hold count along with the driver's usecount 2095 * if this is a chardev. The vdropl() below releases the vnode interlock 2096 * as it may free the vnode. 2097 */ 2098 static void 2099 v_decr_usecount(struct vnode *vp) 2100 { 2101 2102 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2103 VNASSERT(vp->v_usecount > 0, vp, 2104 ("v_decr_usecount: negative usecount")); 2105 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2106 vp->v_usecount--; 2107 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2108 dev_lock(); 2109 vp->v_rdev->si_usecount--; 2110 dev_unlock(); 2111 } 2112 vdropl(vp); 2113 } 2114 2115 /* 2116 * Decrement only the use count and driver use count. This is intended to 2117 * be paired with a follow on vdropl() to release the remaining hold count. 2118 * In this way we may vgone() a vnode with a 0 usecount without risk of 2119 * having it end up on a free list because the hold count is kept above 0. 2120 */ 2121 static void 2122 v_decr_useonly(struct vnode *vp) 2123 { 2124 2125 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2126 VNASSERT(vp->v_usecount > 0, vp, 2127 ("v_decr_useonly: negative usecount")); 2128 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2129 vp->v_usecount--; 2130 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2131 dev_lock(); 2132 vp->v_rdev->si_usecount--; 2133 dev_unlock(); 2134 } 2135 } 2136 2137 /* 2138 * Grab a particular vnode from the free list, increment its 2139 * reference count and lock it. VI_DOOMED is set if the vnode 2140 * is being destroyed. Only callers who specify LK_RETRY will 2141 * see doomed vnodes. If inactive processing was delayed in 2142 * vput try to do it here. 2143 */ 2144 int 2145 vget(struct vnode *vp, int flags, struct thread *td) 2146 { 2147 int error; 2148 2149 error = 0; 2150 VFS_ASSERT_GIANT(vp->v_mount); 2151 VNASSERT((flags & LK_TYPE_MASK) != 0, vp, 2152 ("vget: invalid lock operation")); 2153 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 2154 2155 if ((flags & LK_INTERLOCK) == 0) 2156 VI_LOCK(vp); 2157 vholdl(vp); 2158 if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) { 2159 vdrop(vp); 2160 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, 2161 vp); 2162 return (error); 2163 } 2164 if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) 2165 panic("vget: vn_lock failed to return ENOENT\n"); 2166 VI_LOCK(vp); 2167 /* Upgrade our holdcnt to a usecount. */ 2168 v_upgrade_usecount(vp); 2169 /* 2170 * We don't guarantee that any particular close will 2171 * trigger inactive processing so just make a best effort 2172 * here at preventing a reference to a removed file. If 2173 * we don't succeed no harm is done. 
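	 * (Concretely, the code below only performs the deferred
	 * inactive when the vnode came back exclusively locked and the
	 * caller did not pass LK_NOWAIT; in either case VI_OWEINACT is
	 * cleared.)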
2174 */ 2175 if (vp->v_iflag & VI_OWEINACT) { 2176 if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE && 2177 (flags & LK_NOWAIT) == 0) 2178 vinactive(vp, td); 2179 vp->v_iflag &= ~VI_OWEINACT; 2180 } 2181 VI_UNLOCK(vp); 2182 return (0); 2183 } 2184 2185 /* 2186 * Increase the reference count of a vnode. 2187 */ 2188 void 2189 vref(struct vnode *vp) 2190 { 2191 2192 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2193 VI_LOCK(vp); 2194 v_incr_usecount(vp); 2195 VI_UNLOCK(vp); 2196 } 2197 2198 /* 2199 * Return reference count of a vnode. 2200 * 2201 * The results of this call are only guaranteed when some mechanism other 2202 * than the VI lock is used to stop other processes from gaining references 2203 * to the vnode. This may be the case if the caller holds the only reference. 2204 * This is also useful when stale data is acceptable as race conditions may 2205 * be accounted for by some other means. 2206 */ 2207 int 2208 vrefcnt(struct vnode *vp) 2209 { 2210 int usecnt; 2211 2212 VI_LOCK(vp); 2213 usecnt = vp->v_usecount; 2214 VI_UNLOCK(vp); 2215 2216 return (usecnt); 2217 } 2218 2219 #define VPUTX_VRELE 1 2220 #define VPUTX_VPUT 2 2221 #define VPUTX_VUNREF 3 2222 2223 static void 2224 vputx(struct vnode *vp, int func) 2225 { 2226 int error; 2227 2228 KASSERT(vp != NULL, ("vputx: null vp")); 2229 if (func == VPUTX_VUNREF) 2230 ASSERT_VOP_LOCKED(vp, "vunref"); 2231 else if (func == VPUTX_VPUT) 2232 ASSERT_VOP_LOCKED(vp, "vput"); 2233 else 2234 KASSERT(func == VPUTX_VRELE, ("vputx: wrong func")); 2235 VFS_ASSERT_GIANT(vp->v_mount); 2236 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2237 VI_LOCK(vp); 2238 2239 /* Skip this v_writecount check if we're going to panic below. */ 2240 VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, 2241 ("vputx: missed vn_close")); 2242 error = 0; 2243 2244 if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && 2245 vp->v_usecount == 1)) { 2246 if (func == VPUTX_VPUT) 2247 VOP_UNLOCK(vp, 0); 2248 v_decr_usecount(vp); 2249 return; 2250 } 2251 2252 if (vp->v_usecount != 1) { 2253 vprint("vputx: negative ref count", vp); 2254 panic("vputx: negative ref cnt"); 2255 } 2256 CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp); 2257 /* 2258 * We want to hold the vnode until the inactive finishes to 2259 * prevent vgone() races. We drop the use count here and the 2260 * hold count below when we're done. 2261 */ 2262 v_decr_useonly(vp); 2263 /* 2264 * We must call VOP_INACTIVE with the node locked. Mark 2265 * as VI_DOINGINACT to avoid recursion. 2266 */ 2267 vp->v_iflag |= VI_OWEINACT; 2268 switch (func) { 2269 case VPUTX_VRELE: 2270 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 2271 VI_LOCK(vp); 2272 break; 2273 case VPUTX_VPUT: 2274 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 2275 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | 2276 LK_NOWAIT); 2277 VI_LOCK(vp); 2278 } 2279 break; 2280 case VPUTX_VUNREF: 2281 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 2282 error = EBUSY; 2283 break; 2284 } 2285 if (vp->v_usecount > 0) 2286 vp->v_iflag &= ~VI_OWEINACT; 2287 if (error == 0) { 2288 if (vp->v_iflag & VI_OWEINACT) 2289 vinactive(vp, curthread); 2290 if (func != VPUTX_VUNREF) 2291 VOP_UNLOCK(vp, 0); 2292 } 2293 vdropl(vp); 2294 } 2295 2296 /* 2297 * Vnode put/release. 2298 * If count drops to zero, call inactive routine and return to freelist. 2299 */ 2300 void 2301 vrele(struct vnode *vp) 2302 { 2303 2304 vputx(vp, VPUTX_VRELE); 2305 } 2306 2307 /* 2308 * Release an already locked vnode. 
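 * A hypothetical caller, sketched:
 *
 *	if (vget(vp, LK_EXCLUSIVE, curthread) == 0) {
 *		... operate on the locked, referenced vnode ...
 *		vput(vp);	releases the reference and the lock
 *	}
 *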
This give the same effects as 2309 * unlock+vrele(), but takes less time and avoids releasing and 2310 * re-aquiring the lock (as vrele() acquires the lock internally.) 2311 */ 2312 void 2313 vput(struct vnode *vp) 2314 { 2315 2316 vputx(vp, VPUTX_VPUT); 2317 } 2318 2319 /* 2320 * Release an exclusively locked vnode. Do not unlock the vnode lock. 2321 */ 2322 void 2323 vunref(struct vnode *vp) 2324 { 2325 2326 vputx(vp, VPUTX_VUNREF); 2327 } 2328 2329 /* 2330 * Somebody doesn't want the vnode recycled. 2331 */ 2332 void 2333 vhold(struct vnode *vp) 2334 { 2335 2336 VI_LOCK(vp); 2337 vholdl(vp); 2338 VI_UNLOCK(vp); 2339 } 2340 2341 /* 2342 * Increase the hold count and activate if this is the first reference. 2343 */ 2344 void 2345 vholdl(struct vnode *vp) 2346 { 2347 struct mount *mp; 2348 2349 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2350 vp->v_holdcnt++; 2351 if (!VSHOULDBUSY(vp)) 2352 return; 2353 ASSERT_VI_LOCKED(vp, "vholdl"); 2354 VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free")); 2355 VNASSERT(vp->v_op != NULL, vp, ("vholdl: vnode already reclaimed.")); 2356 /* 2357 * Remove a vnode from the free list, mark it as in use, 2358 * and put it on the active list. 2359 */ 2360 mtx_lock(&vnode_free_list_mtx); 2361 TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); 2362 freevnodes--; 2363 vp->v_iflag &= ~(VI_FREE|VI_AGE); 2364 KASSERT((vp->v_iflag & VI_ACTIVE) == 0, 2365 ("Activating already active vnode")); 2366 vp->v_iflag |= VI_ACTIVE; 2367 mp = vp->v_mount; 2368 TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); 2369 mp->mnt_activevnodelistsize++; 2370 mtx_unlock(&vnode_free_list_mtx); 2371 } 2372 2373 /* 2374 * Note that there is one less who cares about this vnode. 2375 * vdrop() is the opposite of vhold(). 2376 */ 2377 void 2378 vdrop(struct vnode *vp) 2379 { 2380 2381 VI_LOCK(vp); 2382 vdropl(vp); 2383 } 2384 2385 /* 2386 * Drop the hold count of the vnode. If this is the last reference to 2387 * the vnode we place it on the free list unless it has been vgone'd 2388 * (marked VI_DOOMED) in which case we will free it. 2389 */ 2390 void 2391 vdropl(struct vnode *vp) 2392 { 2393 struct bufobj *bo; 2394 struct mount *mp; 2395 int active; 2396 2397 ASSERT_VI_LOCKED(vp, "vdropl"); 2398 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2399 if (vp->v_holdcnt <= 0) 2400 panic("vdrop: holdcnt %d", vp->v_holdcnt); 2401 vp->v_holdcnt--; 2402 if (vp->v_holdcnt > 0) { 2403 VI_UNLOCK(vp); 2404 return; 2405 } 2406 if ((vp->v_iflag & VI_DOOMED) == 0) { 2407 /* 2408 * Mark a vnode as free: remove it from its active list 2409 * and put it up for recycling on the freelist. 2410 */ 2411 VNASSERT(vp->v_op != NULL, vp, 2412 ("vdropl: vnode already reclaimed.")); 2413 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 2414 ("vnode already free")); 2415 VNASSERT(VSHOULDFREE(vp), vp, 2416 ("vdropl: freeing when we shouldn't")); 2417 active = vp->v_iflag & VI_ACTIVE; 2418 vp->v_iflag &= ~VI_ACTIVE; 2419 mp = vp->v_mount; 2420 mtx_lock(&vnode_free_list_mtx); 2421 if (active) { 2422 TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, 2423 v_actfreelist); 2424 mp->mnt_activevnodelistsize--; 2425 } 2426 if (vp->v_iflag & VI_AGE) { 2427 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_actfreelist); 2428 } else { 2429 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist); 2430 } 2431 freevnodes++; 2432 vp->v_iflag &= ~VI_AGE; 2433 vp->v_iflag |= VI_FREE; 2434 mtx_unlock(&vnode_free_list_mtx); 2435 VI_UNLOCK(vp); 2436 return; 2437 } 2438 /* 2439 * The vnode has been marked for destruction, so free it. 
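	 * (At this point the hold, use and write counts are all zero and
	 * both buffer lists are empty, as the assertions below check, so
	 * the vnode can be torn down and returned to vnode_zone.)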
2440 */ 2441 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); 2442 mtx_lock(&vnode_free_list_mtx); 2443 numvnodes--; 2444 mtx_unlock(&vnode_free_list_mtx); 2445 bo = &vp->v_bufobj; 2446 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 2447 ("cleaned vnode still on the free list.")); 2448 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); 2449 VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); 2450 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); 2451 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); 2452 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); 2453 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); 2454 VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL")); 2455 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); 2456 VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL")); 2457 VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); 2458 VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); 2459 VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); 2460 VI_UNLOCK(vp); 2461 #ifdef MAC 2462 mac_vnode_destroy(vp); 2463 #endif 2464 if (vp->v_pollinfo != NULL) 2465 destroy_vpollinfo(vp->v_pollinfo); 2466 #ifdef INVARIANTS 2467 /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ 2468 vp->v_op = NULL; 2469 #endif 2470 rangelock_destroy(&vp->v_rl); 2471 lockdestroy(vp->v_vnlock); 2472 mtx_destroy(&vp->v_interlock); 2473 mtx_destroy(BO_MTX(bo)); 2474 uma_zfree(vnode_zone, vp); 2475 } 2476 2477 /* 2478 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 2479 * flags. DOINGINACT prevents us from recursing in calls to vinactive. 2480 * OWEINACT tracks whether a vnode missed a call to inactive due to a 2481 * failed lock upgrade. 2482 */ 2483 void 2484 vinactive(struct vnode *vp, struct thread *td) 2485 { 2486 struct vm_object *obj; 2487 2488 ASSERT_VOP_ELOCKED(vp, "vinactive"); 2489 ASSERT_VI_LOCKED(vp, "vinactive"); 2490 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, 2491 ("vinactive: recursed on VI_DOINGINACT")); 2492 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2493 vp->v_iflag |= VI_DOINGINACT; 2494 vp->v_iflag &= ~VI_OWEINACT; 2495 VI_UNLOCK(vp); 2496 /* 2497 * Before moving off the active list, we must be sure that any 2498 * modified pages are on the vnode's dirty list since these will 2499 * no longer be checked once the vnode is on the inactive list. 2500 */ 2501 obj = vp->v_object; 2502 if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { 2503 VM_OBJECT_LOCK(obj); 2504 vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC); 2505 VM_OBJECT_UNLOCK(obj); 2506 } 2507 VOP_INACTIVE(vp, td); 2508 VI_LOCK(vp); 2509 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, 2510 ("vinactive: lost VI_DOINGINACT")); 2511 vp->v_iflag &= ~VI_DOINGINACT; 2512 } 2513 2514 /* 2515 * Remove any vnodes in the vnode table belonging to mount point mp. 2516 * 2517 * If FORCECLOSE is not specified, there should not be any active ones, 2518 * return error if any are found (nb: this is a user error, not a 2519 * system error). If FORCECLOSE is specified, detach any active vnodes 2520 * that are found. 2521 * 2522 * If WRITECLOSE is set, only flush out regular file vnodes open for 2523 * writing. 2524 * 2525 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 2526 * 2527 * `rootrefs' specifies the base reference count for the root vnode 2528 * of this filesystem. 
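 * (For instance, a caller that obtained the root vnode once and still
 * holds that single reference would pass rootrefs == 1 and have that
 * reference released for it on success.)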
The root vnode is considered busy if its 2529 * v_usecount exceeds this value. On a successful return, vflush(, td) 2530 * will call vrele() on the root vnode exactly rootrefs times. 2531 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 2532 * be zero. 2533 */ 2534 #ifdef DIAGNOSTIC 2535 static int busyprt = 0; /* print out busy vnodes */ 2536 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); 2537 #endif 2538 2539 int 2540 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) 2541 { 2542 struct vnode *vp, *mvp, *rootvp = NULL; 2543 struct vattr vattr; 2544 int busy = 0, error; 2545 2546 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, 2547 rootrefs, flags); 2548 if (rootrefs > 0) { 2549 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 2550 ("vflush: bad args")); 2551 /* 2552 * Get the filesystem root vnode. We can vput() it 2553 * immediately, since with rootrefs > 0, it won't go away. 2554 */ 2555 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { 2556 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", 2557 __func__, error); 2558 return (error); 2559 } 2560 vput(rootvp); 2561 } 2562 loop: 2563 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2564 vholdl(vp); 2565 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); 2566 if (error) { 2567 vdrop(vp); 2568 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 2569 goto loop; 2570 } 2571 /* 2572 * Skip over a vnodes marked VV_SYSTEM. 2573 */ 2574 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 2575 VOP_UNLOCK(vp, 0); 2576 vdrop(vp); 2577 continue; 2578 } 2579 /* 2580 * If WRITECLOSE is set, flush out unlinked but still open 2581 * files (even if open only for reading) and regular file 2582 * vnodes open for writing. 2583 */ 2584 if (flags & WRITECLOSE) { 2585 if (vp->v_object != NULL) { 2586 VM_OBJECT_LOCK(vp->v_object); 2587 vm_object_page_clean(vp->v_object, 0, 0, 0); 2588 VM_OBJECT_UNLOCK(vp->v_object); 2589 } 2590 error = VOP_FSYNC(vp, MNT_WAIT, td); 2591 if (error != 0) { 2592 VOP_UNLOCK(vp, 0); 2593 vdrop(vp); 2594 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 2595 return (error); 2596 } 2597 error = VOP_GETATTR(vp, &vattr, td->td_ucred); 2598 VI_LOCK(vp); 2599 2600 if ((vp->v_type == VNON || 2601 (error == 0 && vattr.va_nlink > 0)) && 2602 (vp->v_writecount == 0 || vp->v_type != VREG)) { 2603 VOP_UNLOCK(vp, 0); 2604 vdropl(vp); 2605 continue; 2606 } 2607 } else 2608 VI_LOCK(vp); 2609 /* 2610 * With v_usecount == 0, all we need to do is clear out the 2611 * vnode data structures and we are done. 2612 * 2613 * If FORCECLOSE is set, forcibly close the vnode. 2614 */ 2615 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 2616 VNASSERT(vp->v_usecount == 0 || 2617 (vp->v_type != VCHR && vp->v_type != VBLK), vp, 2618 ("device VNODE %p is FORCECLOSED", vp)); 2619 vgonel(vp); 2620 } else { 2621 busy++; 2622 #ifdef DIAGNOSTIC 2623 if (busyprt) 2624 vprint("vflush: busy vnode", vp); 2625 #endif 2626 } 2627 VOP_UNLOCK(vp, 0); 2628 vdropl(vp); 2629 } 2630 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 2631 /* 2632 * If just the root vnode is busy, and if its refcount 2633 * is equal to `rootrefs', then go ahead and kill it. 
2634 */ 2635 VI_LOCK(rootvp); 2636 KASSERT(busy > 0, ("vflush: not busy")); 2637 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 2638 ("vflush: usecount %d < rootrefs %d", 2639 rootvp->v_usecount, rootrefs)); 2640 if (busy == 1 && rootvp->v_usecount == rootrefs) { 2641 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 2642 vgone(rootvp); 2643 VOP_UNLOCK(rootvp, 0); 2644 busy = 0; 2645 } else 2646 VI_UNLOCK(rootvp); 2647 } 2648 if (busy) { 2649 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 2650 busy); 2651 return (EBUSY); 2652 } 2653 for (; rootrefs > 0; rootrefs--) 2654 vrele(rootvp); 2655 return (0); 2656 } 2657 2658 /* 2659 * Recycle an unused vnode to the front of the free list. 2660 */ 2661 int 2662 vrecycle(struct vnode *vp) 2663 { 2664 int recycled; 2665 2666 ASSERT_VOP_ELOCKED(vp, "vrecycle"); 2667 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2668 recycled = 0; 2669 VI_LOCK(vp); 2670 if (vp->v_usecount == 0) { 2671 recycled = 1; 2672 vgonel(vp); 2673 } 2674 VI_UNLOCK(vp); 2675 return (recycled); 2676 } 2677 2678 /* 2679 * Eliminate all activity associated with a vnode 2680 * in preparation for reuse. 2681 */ 2682 void 2683 vgone(struct vnode *vp) 2684 { 2685 VI_LOCK(vp); 2686 vgonel(vp); 2687 VI_UNLOCK(vp); 2688 } 2689 2690 /* 2691 * vgone, with the vp interlock held. 2692 */ 2693 void 2694 vgonel(struct vnode *vp) 2695 { 2696 struct thread *td; 2697 int oweinact; 2698 int active; 2699 struct mount *mp; 2700 2701 ASSERT_VOP_ELOCKED(vp, "vgonel"); 2702 ASSERT_VI_LOCKED(vp, "vgonel"); 2703 VNASSERT(vp->v_holdcnt, vp, 2704 ("vgonel: vp %p has no reference.", vp)); 2705 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2706 td = curthread; 2707 2708 /* 2709 * Don't vgonel if we're already doomed. 2710 */ 2711 if (vp->v_iflag & VI_DOOMED) 2712 return; 2713 vp->v_iflag |= VI_DOOMED; 2714 /* 2715 * Check to see if the vnode is in use. If so, we have to call 2716 * VOP_CLOSE() and VOP_INACTIVE(). 2717 */ 2718 active = vp->v_usecount; 2719 oweinact = (vp->v_iflag & VI_OWEINACT); 2720 VI_UNLOCK(vp); 2721 /* 2722 * Clean out any buffers associated with the vnode. 2723 * If the flush fails, just toss the buffers. 2724 */ 2725 mp = NULL; 2726 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 2727 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 2728 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) 2729 vinvalbuf(vp, 0, 0, 0); 2730 2731 /* 2732 * If purging an active vnode, it must be closed and 2733 * deactivated before being reclaimed. 2734 */ 2735 if (active) 2736 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 2737 if (oweinact || active) { 2738 VI_LOCK(vp); 2739 if ((vp->v_iflag & VI_DOINGINACT) == 0) 2740 vinactive(vp, td); 2741 VI_UNLOCK(vp); 2742 } 2743 if (vp->v_type == VSOCK) 2744 vfs_unp_reclaim(vp); 2745 /* 2746 * Reclaim the vnode. 2747 */ 2748 if (VOP_RECLAIM(vp, td)) 2749 panic("vgone: cannot reclaim"); 2750 if (mp != NULL) 2751 vn_finished_secondary_write(mp); 2752 VNASSERT(vp->v_object == NULL, vp, 2753 ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); 2754 /* 2755 * Clear the advisory locks and wake up waiting threads. 2756 */ 2757 (void)VOP_ADVLOCKPURGE(vp); 2758 /* 2759 * Delete from old mount point vnode list. 2760 */ 2761 delmntque(vp); 2762 cache_purge(vp); 2763 /* 2764 * Done with purge, reset to the standard lock and invalidate 2765 * the vnode. 2766 */ 2767 VI_LOCK(vp); 2768 vp->v_vnlock = &vp->v_lock; 2769 vp->v_op = &dead_vnodeops; 2770 vp->v_tag = "none"; 2771 vp->v_type = VBAD; 2772 } 2773 2774 /* 2775 * Calculate the total number of references to a special device. 
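 * The count lives in the struct cdev (si_usecount), so it covers every
 * vnode aliasing the same device; count_dev() below reports the same
 * figure when given the cdev directly.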
2776 */ 2777 int 2778 vcount(struct vnode *vp) 2779 { 2780 int count; 2781 2782 dev_lock(); 2783 count = vp->v_rdev->si_usecount; 2784 dev_unlock(); 2785 return (count); 2786 } 2787 2788 /* 2789 * Same as above, but using the struct cdev *as argument 2790 */ 2791 int 2792 count_dev(struct cdev *dev) 2793 { 2794 int count; 2795 2796 dev_lock(); 2797 count = dev->si_usecount; 2798 dev_unlock(); 2799 return(count); 2800 } 2801 2802 /* 2803 * Print out a description of a vnode. 2804 */ 2805 static char *typename[] = 2806 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", 2807 "VMARKER"}; 2808 2809 void 2810 vn_printf(struct vnode *vp, const char *fmt, ...) 2811 { 2812 va_list ap; 2813 char buf[256], buf2[16]; 2814 u_long flags; 2815 2816 va_start(ap, fmt); 2817 vprintf(fmt, ap); 2818 va_end(ap); 2819 printf("%p: ", (void *)vp); 2820 printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); 2821 printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", 2822 vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); 2823 buf[0] = '\0'; 2824 buf[1] = '\0'; 2825 if (vp->v_vflag & VV_ROOT) 2826 strlcat(buf, "|VV_ROOT", sizeof(buf)); 2827 if (vp->v_vflag & VV_ISTTY) 2828 strlcat(buf, "|VV_ISTTY", sizeof(buf)); 2829 if (vp->v_vflag & VV_NOSYNC) 2830 strlcat(buf, "|VV_NOSYNC", sizeof(buf)); 2831 if (vp->v_vflag & VV_CACHEDLABEL) 2832 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); 2833 if (vp->v_vflag & VV_TEXT) 2834 strlcat(buf, "|VV_TEXT", sizeof(buf)); 2835 if (vp->v_vflag & VV_COPYONWRITE) 2836 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); 2837 if (vp->v_vflag & VV_SYSTEM) 2838 strlcat(buf, "|VV_SYSTEM", sizeof(buf)); 2839 if (vp->v_vflag & VV_PROCDEP) 2840 strlcat(buf, "|VV_PROCDEP", sizeof(buf)); 2841 if (vp->v_vflag & VV_NOKNOTE) 2842 strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); 2843 if (vp->v_vflag & VV_DELETED) 2844 strlcat(buf, "|VV_DELETED", sizeof(buf)); 2845 if (vp->v_vflag & VV_MD) 2846 strlcat(buf, "|VV_MD", sizeof(buf)); 2847 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | 2848 VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | 2849 VV_NOKNOTE | VV_DELETED | VV_MD); 2850 if (flags != 0) { 2851 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); 2852 strlcat(buf, buf2, sizeof(buf)); 2853 } 2854 if (vp->v_iflag & VI_MOUNT) 2855 strlcat(buf, "|VI_MOUNT", sizeof(buf)); 2856 if (vp->v_iflag & VI_AGE) 2857 strlcat(buf, "|VI_AGE", sizeof(buf)); 2858 if (vp->v_iflag & VI_DOOMED) 2859 strlcat(buf, "|VI_DOOMED", sizeof(buf)); 2860 if (vp->v_iflag & VI_FREE) 2861 strlcat(buf, "|VI_FREE", sizeof(buf)); 2862 if (vp->v_iflag & VI_DOINGINACT) 2863 strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); 2864 if (vp->v_iflag & VI_OWEINACT) 2865 strlcat(buf, "|VI_OWEINACT", sizeof(buf)); 2866 flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE | 2867 VI_DOINGINACT | VI_OWEINACT); 2868 if (flags != 0) { 2869 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); 2870 strlcat(buf, buf2, sizeof(buf)); 2871 } 2872 printf(" flags (%s)\n", buf + 1); 2873 if (mtx_owned(VI_MTX(vp))) 2874 printf(" VI_LOCKed"); 2875 if (vp->v_object != NULL) 2876 printf(" v_object %p ref %d pages %d\n", 2877 vp->v_object, vp->v_object->ref_count, 2878 vp->v_object->resident_page_count); 2879 printf(" "); 2880 lockmgr_printinfo(vp->v_vnlock); 2881 if (vp->v_data != NULL) 2882 VOP_PRINT(vp); 2883 } 2884 2885 #ifdef DDB 2886 /* 2887 * List all of the locked vnodes in the system. 2888 * Called when debugging the kernel. 
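 * Invoked from the DDB prompt as "show lockedvnods".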
2889 */ 2890 DB_SHOW_COMMAND(lockedvnods, lockedvnodes) 2891 { 2892 struct mount *mp, *nmp; 2893 struct vnode *vp; 2894 2895 /* 2896 * Note: because this is DDB, we can't obey the locking semantics 2897 * for these structures, which means we could catch an inconsistent 2898 * state and dereference a nasty pointer. Not much to be done 2899 * about that. 2900 */ 2901 db_printf("Locked vnodes\n"); 2902 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2903 nmp = TAILQ_NEXT(mp, mnt_list); 2904 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 2905 if (vp->v_type != VMARKER && 2906 VOP_ISLOCKED(vp)) 2907 vprint("", vp); 2908 } 2909 nmp = TAILQ_NEXT(mp, mnt_list); 2910 } 2911 } 2912 2913 /* 2914 * Show details about the given vnode. 2915 */ 2916 DB_SHOW_COMMAND(vnode, db_show_vnode) 2917 { 2918 struct vnode *vp; 2919 2920 if (!have_addr) 2921 return; 2922 vp = (struct vnode *)addr; 2923 vn_printf(vp, "vnode "); 2924 } 2925 2926 /* 2927 * Show details about the given mount point. 2928 */ 2929 DB_SHOW_COMMAND(mount, db_show_mount) 2930 { 2931 struct mount *mp; 2932 struct vfsopt *opt; 2933 struct statfs *sp; 2934 struct vnode *vp; 2935 char buf[512]; 2936 uint64_t mflags; 2937 u_int flags; 2938 2939 if (!have_addr) { 2940 /* No address given, print short info about all mount points. */ 2941 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 2942 db_printf("%p %s on %s (%s)\n", mp, 2943 mp->mnt_stat.f_mntfromname, 2944 mp->mnt_stat.f_mntonname, 2945 mp->mnt_stat.f_fstypename); 2946 if (db_pager_quit) 2947 break; 2948 } 2949 db_printf("\nMore info: show mount <addr>\n"); 2950 return; 2951 } 2952 2953 mp = (struct mount *)addr; 2954 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 2955 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 2956 2957 buf[0] = '\0'; 2958 mflags = mp->mnt_flag; 2959 #define MNT_FLAG(flag) do { \ 2960 if (mflags & (flag)) { \ 2961 if (buf[0] != '\0') \ 2962 strlcat(buf, ", ", sizeof(buf)); \ 2963 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 2964 mflags &= ~(flag); \ 2965 } \ 2966 } while (0) 2967 MNT_FLAG(MNT_RDONLY); 2968 MNT_FLAG(MNT_SYNCHRONOUS); 2969 MNT_FLAG(MNT_NOEXEC); 2970 MNT_FLAG(MNT_NOSUID); 2971 MNT_FLAG(MNT_UNION); 2972 MNT_FLAG(MNT_ASYNC); 2973 MNT_FLAG(MNT_SUIDDIR); 2974 MNT_FLAG(MNT_SOFTDEP); 2975 MNT_FLAG(MNT_SUJ); 2976 MNT_FLAG(MNT_NOSYMFOLLOW); 2977 MNT_FLAG(MNT_GJOURNAL); 2978 MNT_FLAG(MNT_MULTILABEL); 2979 MNT_FLAG(MNT_ACLS); 2980 MNT_FLAG(MNT_NOATIME); 2981 MNT_FLAG(MNT_NOCLUSTERR); 2982 MNT_FLAG(MNT_NOCLUSTERW); 2983 MNT_FLAG(MNT_NFS4ACLS); 2984 MNT_FLAG(MNT_EXRDONLY); 2985 MNT_FLAG(MNT_EXPORTED); 2986 MNT_FLAG(MNT_DEFEXPORTED); 2987 MNT_FLAG(MNT_EXPORTANON); 2988 MNT_FLAG(MNT_EXKERB); 2989 MNT_FLAG(MNT_EXPUBLIC); 2990 MNT_FLAG(MNT_LOCAL); 2991 MNT_FLAG(MNT_QUOTA); 2992 MNT_FLAG(MNT_ROOTFS); 2993 MNT_FLAG(MNT_USER); 2994 MNT_FLAG(MNT_IGNORE); 2995 MNT_FLAG(MNT_UPDATE); 2996 MNT_FLAG(MNT_DELEXPORT); 2997 MNT_FLAG(MNT_RELOAD); 2998 MNT_FLAG(MNT_FORCE); 2999 MNT_FLAG(MNT_SNAPSHOT); 3000 MNT_FLAG(MNT_BYFSID); 3001 #undef MNT_FLAG 3002 if (mflags != 0) { 3003 if (buf[0] != '\0') 3004 strlcat(buf, ", ", sizeof(buf)); 3005 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 3006 "0x%016jx", mflags); 3007 } 3008 db_printf(" mnt_flag = %s\n", buf); 3009 3010 buf[0] = '\0'; 3011 flags = mp->mnt_kern_flag; 3012 #define MNT_KERN_FLAG(flag) do { \ 3013 if (flags & (flag)) { \ 3014 if (buf[0] != '\0') \ 3015 strlcat(buf, ", ", sizeof(buf)); \ 3016 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 3017 flags &= ~(flag); \ 3018 } \ 3019 } while (0) 3020 
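	/*
	 * As with MNT_FLAG() above, (#flag) + 5 skips the "MNTK_" prefix,
	 * so e.g. MNT_KERN_FLAG(MNTK_ASYNC) appends just "ASYNC".
	 */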
MNT_KERN_FLAG(MNTK_UNMOUNTF); 3021 MNT_KERN_FLAG(MNTK_ASYNC); 3022 MNT_KERN_FLAG(MNTK_SOFTDEP); 3023 MNT_KERN_FLAG(MNTK_NOINSMNTQ); 3024 MNT_KERN_FLAG(MNTK_DRAINING); 3025 MNT_KERN_FLAG(MNTK_REFEXPIRE); 3026 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 3027 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 3028 MNT_KERN_FLAG(MNTK_NOASYNC); 3029 MNT_KERN_FLAG(MNTK_UNMOUNT); 3030 MNT_KERN_FLAG(MNTK_MWAIT); 3031 MNT_KERN_FLAG(MNTK_SUSPEND); 3032 MNT_KERN_FLAG(MNTK_SUSPEND2); 3033 MNT_KERN_FLAG(MNTK_SUSPENDED); 3034 MNT_KERN_FLAG(MNTK_MPSAFE); 3035 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 3036 MNT_KERN_FLAG(MNTK_NOKNOTE); 3037 #undef MNT_KERN_FLAG 3038 if (flags != 0) { 3039 if (buf[0] != '\0') 3040 strlcat(buf, ", ", sizeof(buf)); 3041 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 3042 "0x%08x", flags); 3043 } 3044 db_printf(" mnt_kern_flag = %s\n", buf); 3045 3046 db_printf(" mnt_opt = "); 3047 opt = TAILQ_FIRST(mp->mnt_opt); 3048 if (opt != NULL) { 3049 db_printf("%s", opt->name); 3050 opt = TAILQ_NEXT(opt, link); 3051 while (opt != NULL) { 3052 db_printf(", %s", opt->name); 3053 opt = TAILQ_NEXT(opt, link); 3054 } 3055 } 3056 db_printf("\n"); 3057 3058 sp = &mp->mnt_stat; 3059 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " 3060 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " 3061 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " 3062 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", 3063 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, 3064 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, 3065 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, 3066 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, 3067 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, 3068 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 3069 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, 3070 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); 3071 3072 db_printf(" mnt_cred = { uid=%u ruid=%u", 3073 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); 3074 if (jailed(mp->mnt_cred)) 3075 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); 3076 db_printf(" }\n"); 3077 db_printf(" mnt_ref = %d\n", mp->mnt_ref); 3078 db_printf(" mnt_gen = %d\n", mp->mnt_gen); 3079 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); 3080 db_printf(" mnt_activevnodelistsize = %d\n", 3081 mp->mnt_activevnodelistsize); 3082 db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount); 3083 db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); 3084 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 3085 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 3086 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 3087 db_printf(" mnt_secondary_accwrites = %d\n", 3088 mp->mnt_secondary_accwrites); 3089 db_printf(" mnt_gjprovider = %s\n", 3090 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); 3091 3092 db_printf("\n\nList of active vnodes\n"); 3093 TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) { 3094 if (vp->v_type != VMARKER) { 3095 vn_printf(vp, "vnode "); 3096 if (db_pager_quit) 3097 break; 3098 } 3099 } 3100 db_printf("\n\nList of inactive vnodes\n"); 3101 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3102 if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) { 3103 vn_printf(vp, "vnode "); 3104 if (db_pager_quit) 3105 break; 3106 } 3107 } 3108 } 3109 #endif /* DDB */ 3110 3111 /* 3112 * Fill in a struct xvfsconf based on a struct vfsconf. 
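 * Used by the sysctl handlers below to export the list of configured
 * filesystems to userland.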
3113 */ 3114 static void 3115 vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp) 3116 { 3117 3118 strcpy(xvfsp->vfc_name, vfsp->vfc_name); 3119 xvfsp->vfc_typenum = vfsp->vfc_typenum; 3120 xvfsp->vfc_refcount = vfsp->vfc_refcount; 3121 xvfsp->vfc_flags = vfsp->vfc_flags; 3122 /* 3123 * These are unused in userland, we keep them 3124 * to not break binary compatibility. 3125 */ 3126 xvfsp->vfc_vfsops = NULL; 3127 xvfsp->vfc_next = NULL; 3128 } 3129 3130 /* 3131 * Top level filesystem related information gathering. 3132 */ 3133 static int 3134 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 3135 { 3136 struct vfsconf *vfsp; 3137 struct xvfsconf xvfsp; 3138 int error; 3139 3140 error = 0; 3141 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 3142 bzero(&xvfsp, sizeof(xvfsp)); 3143 vfsconf2x(vfsp, &xvfsp); 3144 error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp); 3145 if (error) 3146 break; 3147 } 3148 return (error); 3149 } 3150 3151 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD, 3152 NULL, 0, sysctl_vfs_conflist, 3153 "S,xvfsconf", "List of all configured filesystems"); 3154 3155 #ifndef BURN_BRIDGES 3156 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 3157 3158 static int 3159 vfs_sysctl(SYSCTL_HANDLER_ARGS) 3160 { 3161 int *name = (int *)arg1 - 1; /* XXX */ 3162 u_int namelen = arg2 + 1; /* XXX */ 3163 struct vfsconf *vfsp; 3164 struct xvfsconf xvfsp; 3165 3166 log(LOG_WARNING, "userland calling deprecated sysctl, " 3167 "please rebuild world\n"); 3168 3169 #if 1 || defined(COMPAT_PRELITE2) 3170 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ 3171 if (namelen == 1) 3172 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 3173 #endif 3174 3175 switch (name[1]) { 3176 case VFS_MAXTYPENUM: 3177 if (namelen != 2) 3178 return (ENOTDIR); 3179 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 3180 case VFS_CONF: 3181 if (namelen != 3) 3182 return (ENOTDIR); /* overloaded */ 3183 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) 3184 if (vfsp->vfc_typenum == name[2]) 3185 break; 3186 if (vfsp == NULL) 3187 return (EOPNOTSUPP); 3188 bzero(&xvfsp, sizeof(xvfsp)); 3189 vfsconf2x(vfsp, &xvfsp); 3190 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 3191 } 3192 return (EOPNOTSUPP); 3193 } 3194 3195 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, 3196 vfs_sysctl, "Generic filesystem"); 3197 3198 #if 1 || defined(COMPAT_PRELITE2) 3199 3200 static int 3201 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 3202 { 3203 int error; 3204 struct vfsconf *vfsp; 3205 struct ovfsconf ovfs; 3206 3207 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 3208 bzero(&ovfs, sizeof(ovfs)); 3209 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 3210 strcpy(ovfs.vfc_name, vfsp->vfc_name); 3211 ovfs.vfc_index = vfsp->vfc_typenum; 3212 ovfs.vfc_refcount = vfsp->vfc_refcount; 3213 ovfs.vfc_flags = vfsp->vfc_flags; 3214 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 3215 if (error) 3216 return error; 3217 } 3218 return 0; 3219 } 3220 3221 #endif /* 1 || COMPAT_PRELITE2 */ 3222 #endif /* !BURN_BRIDGES */ 3223 3224 #define KINFO_VNODESLOP 10 3225 #ifdef notyet 3226 /* 3227 * Dump vnode list (via sysctl). 3228 */ 3229 /* ARGSUSED */ 3230 static int 3231 sysctl_vnode(SYSCTL_HANDLER_ARGS) 3232 { 3233 struct xvnode *xvn; 3234 struct mount *mp; 3235 struct vnode *vp; 3236 int error, len, n; 3237 3238 /* 3239 * Stale numvnodes access is not fatal here. 
3240 */ 3241 req->lock = 0; 3242 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; 3243 if (!req->oldptr) 3244 /* Make an estimate */ 3245 return (SYSCTL_OUT(req, 0, len)); 3246 3247 error = sysctl_wire_old_buffer(req, 0); 3248 if (error != 0) 3249 return (error); 3250 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); 3251 n = 0; 3252 mtx_lock(&mountlist_mtx); 3253 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3254 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 3255 continue; 3256 MNT_ILOCK(mp); 3257 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3258 if (n == len) 3259 break; 3260 vref(vp); 3261 xvn[n].xv_size = sizeof *xvn; 3262 xvn[n].xv_vnode = vp; 3263 xvn[n].xv_id = 0; /* XXX compat */ 3264 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field 3265 XV_COPY(usecount); 3266 XV_COPY(writecount); 3267 XV_COPY(holdcnt); 3268 XV_COPY(mount); 3269 XV_COPY(numoutput); 3270 XV_COPY(type); 3271 #undef XV_COPY 3272 xvn[n].xv_flag = vp->v_vflag; 3273 3274 switch (vp->v_type) { 3275 case VREG: 3276 case VDIR: 3277 case VLNK: 3278 break; 3279 case VBLK: 3280 case VCHR: 3281 if (vp->v_rdev == NULL) { 3282 vrele(vp); 3283 continue; 3284 } 3285 xvn[n].xv_dev = dev2udev(vp->v_rdev); 3286 break; 3287 case VSOCK: 3288 xvn[n].xv_socket = vp->v_socket; 3289 break; 3290 case VFIFO: 3291 xvn[n].xv_fifo = vp->v_fifoinfo; 3292 break; 3293 case VNON: 3294 case VBAD: 3295 default: 3296 /* shouldn't happen? */ 3297 vrele(vp); 3298 continue; 3299 } 3300 vrele(vp); 3301 ++n; 3302 } 3303 MNT_IUNLOCK(mp); 3304 mtx_lock(&mountlist_mtx); 3305 vfs_unbusy(mp); 3306 if (n == len) 3307 break; 3308 } 3309 mtx_unlock(&mountlist_mtx); 3310 3311 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); 3312 free(xvn, M_TEMP); 3313 return (error); 3314 } 3315 3316 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 3317 0, 0, sysctl_vnode, "S,xvnode", ""); 3318 #endif 3319 3320 /* 3321 * Unmount all filesystems. The list is traversed in reverse order 3322 * of mounting to avoid dependencies. 3323 */ 3324 void 3325 vfs_unmountall(void) 3326 { 3327 struct mount *mp; 3328 struct thread *td; 3329 int error; 3330 3331 KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread")); 3332 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); 3333 td = curthread; 3334 3335 /* 3336 * Since this only runs when rebooting, it is not interlocked. 3337 */ 3338 while(!TAILQ_EMPTY(&mountlist)) { 3339 mp = TAILQ_LAST(&mountlist, mntlist); 3340 error = dounmount(mp, MNT_FORCE, td); 3341 if (error) { 3342 TAILQ_REMOVE(&mountlist, mp, mnt_list); 3343 /* 3344 * XXX: Due to the way in which we mount the root 3345 * file system off of devfs, devfs will generate a 3346 * "busy" warning when we try to unmount it before 3347 * the root. Don't print a warning as a result in 3348 * order to avoid false positive errors that may 3349 * cause needless upset. 3350 */ 3351 if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) { 3352 printf("unmount of %s failed (", 3353 mp->mnt_stat.f_mntonname); 3354 if (error == EBUSY) 3355 printf("BUSY)\n"); 3356 else 3357 printf("%d)\n", error); 3358 } 3359 } else { 3360 /* The unmount has removed mp from the mountlist */ 3361 } 3362 } 3363 } 3364 3365 /* 3366 * perform msync on all vnodes under a mount point 3367 * the mount point must be locked. 
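 * Called, for example, by sync_fsync() below with MNT_NOWAIT just
 * before VFS_SYNC() when the syncer does a lazy sync of a filesystem.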
3368 */ 3369 void 3370 vfs_msync(struct mount *mp, int flags) 3371 { 3372 struct vnode *vp, *mvp; 3373 struct vm_object *obj; 3374 3375 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 3376 MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { 3377 obj = vp->v_object; 3378 if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 && 3379 (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) { 3380 if (!vget(vp, 3381 LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, 3382 curthread)) { 3383 if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ 3384 vput(vp); 3385 continue; 3386 } 3387 3388 obj = vp->v_object; 3389 if (obj != NULL) { 3390 VM_OBJECT_LOCK(obj); 3391 vm_object_page_clean(obj, 0, 0, 3392 flags == MNT_WAIT ? 3393 OBJPC_SYNC : OBJPC_NOSYNC); 3394 VM_OBJECT_UNLOCK(obj); 3395 } 3396 vput(vp); 3397 } 3398 } else 3399 VI_UNLOCK(vp); 3400 } 3401 } 3402 3403 static void 3404 destroy_vpollinfo(struct vpollinfo *vi) 3405 { 3406 seldrain(&vi->vpi_selinfo); 3407 knlist_destroy(&vi->vpi_selinfo.si_note); 3408 mtx_destroy(&vi->vpi_lock); 3409 uma_zfree(vnodepoll_zone, vi); 3410 } 3411 3412 /* 3413 * Initalize per-vnode helper structure to hold poll-related state. 3414 */ 3415 void 3416 v_addpollinfo(struct vnode *vp) 3417 { 3418 struct vpollinfo *vi; 3419 3420 if (vp->v_pollinfo != NULL) 3421 return; 3422 vi = uma_zalloc(vnodepoll_zone, M_WAITOK); 3423 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); 3424 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, 3425 vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked); 3426 VI_LOCK(vp); 3427 if (vp->v_pollinfo != NULL) { 3428 VI_UNLOCK(vp); 3429 destroy_vpollinfo(vi); 3430 return; 3431 } 3432 vp->v_pollinfo = vi; 3433 VI_UNLOCK(vp); 3434 } 3435 3436 /* 3437 * Record a process's interest in events which might happen to 3438 * a vnode. Because poll uses the historic select-style interface 3439 * internally, this routine serves as both the ``check for any 3440 * pending events'' and the ``record my interest in future events'' 3441 * functions. (These are done together, while the lock is held, 3442 * to avoid race conditions.) 3443 */ 3444 int 3445 vn_pollrecord(struct vnode *vp, struct thread *td, int events) 3446 { 3447 3448 v_addpollinfo(vp); 3449 mtx_lock(&vp->v_pollinfo->vpi_lock); 3450 if (vp->v_pollinfo->vpi_revents & events) { 3451 /* 3452 * This leaves events we are not interested 3453 * in available for the other process which 3454 * which presumably had requested them 3455 * (otherwise they would never have been 3456 * recorded). 3457 */ 3458 events &= vp->v_pollinfo->vpi_revents; 3459 vp->v_pollinfo->vpi_revents &= ~events; 3460 3461 mtx_unlock(&vp->v_pollinfo->vpi_lock); 3462 return (events); 3463 } 3464 vp->v_pollinfo->vpi_events |= events; 3465 selrecord(td, &vp->v_pollinfo->vpi_selinfo); 3466 mtx_unlock(&vp->v_pollinfo->vpi_lock); 3467 return (0); 3468 } 3469 3470 /* 3471 * Routine to create and manage a filesystem syncer vnode. 
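 * Each mount normally gets one such vnode (mp->mnt_syncer), created by
 * vfs_allocate_syncvnode() below.  When the syncer thread in
 * sched_sync() issues VOP_FSYNC(vp, MNT_LAZY, td) on it, the request
 * lands in sync_fsync(), which lazily flushes the whole mount.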
3472 */ 3473 #define sync_close ((int (*)(struct vop_close_args *))nullop) 3474 static int sync_fsync(struct vop_fsync_args *); 3475 static int sync_inactive(struct vop_inactive_args *); 3476 static int sync_reclaim(struct vop_reclaim_args *); 3477 3478 static struct vop_vector sync_vnodeops = { 3479 .vop_bypass = VOP_EOPNOTSUPP, 3480 .vop_close = sync_close, /* close */ 3481 .vop_fsync = sync_fsync, /* fsync */ 3482 .vop_inactive = sync_inactive, /* inactive */ 3483 .vop_reclaim = sync_reclaim, /* reclaim */ 3484 .vop_lock1 = vop_stdlock, /* lock */ 3485 .vop_unlock = vop_stdunlock, /* unlock */ 3486 .vop_islocked = vop_stdislocked, /* islocked */ 3487 }; 3488 3489 /* 3490 * Create a new filesystem syncer vnode for the specified mount point. 3491 */ 3492 void 3493 vfs_allocate_syncvnode(struct mount *mp) 3494 { 3495 struct vnode *vp; 3496 struct bufobj *bo; 3497 static long start, incr, next; 3498 int error; 3499 3500 /* Allocate a new vnode */ 3501 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); 3502 if (error != 0) 3503 panic("vfs_allocate_syncvnode: getnewvnode() failed"); 3504 vp->v_type = VNON; 3505 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 3506 vp->v_vflag |= VV_FORCEINSMQ; 3507 error = insmntque(vp, mp); 3508 if (error != 0) 3509 panic("vfs_allocate_syncvnode: insmntque() failed"); 3510 vp->v_vflag &= ~VV_FORCEINSMQ; 3511 VOP_UNLOCK(vp, 0); 3512 /* 3513 * Place the vnode onto the syncer worklist. We attempt to 3514 * scatter them about on the list so that they will go off 3515 * at evenly distributed times even if all the filesystems 3516 * are mounted at once. 3517 */ 3518 next += incr; 3519 if (next == 0 || next > syncer_maxdelay) { 3520 start /= 2; 3521 incr /= 2; 3522 if (start == 0) { 3523 start = syncer_maxdelay / 2; 3524 incr = syncer_maxdelay; 3525 } 3526 next = start; 3527 } 3528 bo = &vp->v_bufobj; 3529 BO_LOCK(bo); 3530 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); 3531 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ 3532 mtx_lock(&sync_mtx); 3533 sync_vnode_count++; 3534 if (mp->mnt_syncer == NULL) { 3535 mp->mnt_syncer = vp; 3536 vp = NULL; 3537 } 3538 mtx_unlock(&sync_mtx); 3539 BO_UNLOCK(bo); 3540 if (vp != NULL) { 3541 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 3542 vgone(vp); 3543 vput(vp); 3544 } 3545 } 3546 3547 void 3548 vfs_deallocate_syncvnode(struct mount *mp) 3549 { 3550 struct vnode *vp; 3551 3552 mtx_lock(&sync_mtx); 3553 vp = mp->mnt_syncer; 3554 if (vp != NULL) 3555 mp->mnt_syncer = NULL; 3556 mtx_unlock(&sync_mtx); 3557 if (vp != NULL) 3558 vrele(vp); 3559 } 3560 3561 /* 3562 * Do a lazy sync of the filesystem. 3563 */ 3564 static int 3565 sync_fsync(struct vop_fsync_args *ap) 3566 { 3567 struct vnode *syncvp = ap->a_vp; 3568 struct mount *mp = syncvp->v_mount; 3569 int error, save; 3570 struct bufobj *bo; 3571 3572 /* 3573 * We only need to do something if this is a lazy evaluation. 3574 */ 3575 if (ap->a_waitfor != MNT_LAZY) 3576 return (0); 3577 3578 /* 3579 * Move ourselves to the back of the sync list. 3580 */ 3581 bo = &syncvp->v_bufobj; 3582 BO_LOCK(bo); 3583 vn_syncer_add_to_worklist(bo, syncdelay); 3584 BO_UNLOCK(bo); 3585 3586 /* 3587 * Walk the list of vnodes pushing all that are dirty and 3588 * not already on the sync list. 
3589 */ 3590 mtx_lock(&mountlist_mtx); 3591 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) { 3592 mtx_unlock(&mountlist_mtx); 3593 return (0); 3594 } 3595 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { 3596 vfs_unbusy(mp); 3597 return (0); 3598 } 3599 save = curthread_pflags_set(TDP_SYNCIO); 3600 vfs_msync(mp, MNT_NOWAIT); 3601 error = VFS_SYNC(mp, MNT_LAZY); 3602 curthread_pflags_restore(save); 3603 vn_finished_write(mp); 3604 vfs_unbusy(mp); 3605 return (error); 3606 } 3607 3608 /* 3609 * The syncer vnode is no referenced. 3610 */ 3611 static int 3612 sync_inactive(struct vop_inactive_args *ap) 3613 { 3614 3615 vgone(ap->a_vp); 3616 return (0); 3617 } 3618 3619 /* 3620 * The syncer vnode is no longer needed and is being decommissioned. 3621 * 3622 * Modifications to the worklist must be protected by sync_mtx. 3623 */ 3624 static int 3625 sync_reclaim(struct vop_reclaim_args *ap) 3626 { 3627 struct vnode *vp = ap->a_vp; 3628 struct bufobj *bo; 3629 3630 bo = &vp->v_bufobj; 3631 BO_LOCK(bo); 3632 mtx_lock(&sync_mtx); 3633 if (vp->v_mount->mnt_syncer == vp) 3634 vp->v_mount->mnt_syncer = NULL; 3635 if (bo->bo_flag & BO_ONWORKLST) { 3636 LIST_REMOVE(bo, bo_synclist); 3637 syncer_worklist_len--; 3638 sync_vnode_count--; 3639 bo->bo_flag &= ~BO_ONWORKLST; 3640 } 3641 mtx_unlock(&sync_mtx); 3642 BO_UNLOCK(bo); 3643 3644 return (0); 3645 } 3646 3647 /* 3648 * Check if vnode represents a disk device 3649 */ 3650 int 3651 vn_isdisk(struct vnode *vp, int *errp) 3652 { 3653 int error; 3654 3655 error = 0; 3656 dev_lock(); 3657 if (vp->v_type != VCHR) 3658 error = ENOTBLK; 3659 else if (vp->v_rdev == NULL) 3660 error = ENXIO; 3661 else if (vp->v_rdev->si_devsw == NULL) 3662 error = ENXIO; 3663 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 3664 error = ENOTBLK; 3665 dev_unlock(); 3666 if (errp != NULL) 3667 *errp = error; 3668 return (error == 0); 3669 } 3670 3671 /* 3672 * Common filesystem object access control check routine. Accepts a 3673 * vnode's type, "mode", uid and gid, requested access mode, credentials, 3674 * and optional call-by-reference privused argument allowing vaccess() 3675 * to indicate to the caller whether privilege was used to satisfy the 3676 * request (obsoleted). Returns 0 on success, or an errno on failure. 3677 */ 3678 int 3679 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 3680 accmode_t accmode, struct ucred *cred, int *privused) 3681 { 3682 accmode_t dac_granted; 3683 accmode_t priv_granted; 3684 3685 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 3686 ("invalid bit in accmode")); 3687 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 3688 ("VAPPEND without VWRITE")); 3689 3690 /* 3691 * Look for a normal, non-privileged way to access the file/directory 3692 * as requested. If it exists, go with that. 3693 */ 3694 3695 if (privused != NULL) 3696 *privused = 0; 3697 3698 dac_granted = 0; 3699 3700 /* Check the owner. 
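	 * For a 0644 file, for example, the owner collects
	 * VADMIN | VREAD | VWRITE | VAPPEND here, so an owner's VWRITE
	 * request is granted below without consulting privilege.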
*/ 3701 if (cred->cr_uid == file_uid) { 3702 dac_granted |= VADMIN; 3703 if (file_mode & S_IXUSR) 3704 dac_granted |= VEXEC; 3705 if (file_mode & S_IRUSR) 3706 dac_granted |= VREAD; 3707 if (file_mode & S_IWUSR) 3708 dac_granted |= (VWRITE | VAPPEND); 3709 3710 if ((accmode & dac_granted) == accmode) 3711 return (0); 3712 3713 goto privcheck; 3714 } 3715 3716 /* Otherwise, check the groups (first match) */ 3717 if (groupmember(file_gid, cred)) { 3718 if (file_mode & S_IXGRP) 3719 dac_granted |= VEXEC; 3720 if (file_mode & S_IRGRP) 3721 dac_granted |= VREAD; 3722 if (file_mode & S_IWGRP) 3723 dac_granted |= (VWRITE | VAPPEND); 3724 3725 if ((accmode & dac_granted) == accmode) 3726 return (0); 3727 3728 goto privcheck; 3729 } 3730 3731 /* Otherwise, check everyone else. */ 3732 if (file_mode & S_IXOTH) 3733 dac_granted |= VEXEC; 3734 if (file_mode & S_IROTH) 3735 dac_granted |= VREAD; 3736 if (file_mode & S_IWOTH) 3737 dac_granted |= (VWRITE | VAPPEND); 3738 if ((accmode & dac_granted) == accmode) 3739 return (0); 3740 3741 privcheck: 3742 /* 3743 * Build a privilege mask to determine if the set of privileges 3744 * satisfies the requirements when combined with the granted mask 3745 * from above. For each privilege, if the privilege is required, 3746 * bitwise or the request type onto the priv_granted mask. 3747 */ 3748 priv_granted = 0; 3749 3750 if (type == VDIR) { 3751 /* 3752 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 3753 * requests, instead of PRIV_VFS_EXEC. 3754 */ 3755 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3756 !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) 3757 priv_granted |= VEXEC; 3758 } else { 3759 /* 3760 * Ensure that at least one execute bit is on. Otherwise, 3761 * a privileged user will always succeed, and we don't want 3762 * this to happen unless the file really is executable. 3763 */ 3764 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3765 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 3766 !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) 3767 priv_granted |= VEXEC; 3768 } 3769 3770 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 3771 !priv_check_cred(cred, PRIV_VFS_READ, 0)) 3772 priv_granted |= VREAD; 3773 3774 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 3775 !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) 3776 priv_granted |= (VWRITE | VAPPEND); 3777 3778 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 3779 !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) 3780 priv_granted |= VADMIN; 3781 3782 if ((accmode & (priv_granted | dac_granted)) == accmode) { 3783 /* XXX audit: privilege used */ 3784 if (privused != NULL) 3785 *privused = 1; 3786 return (0); 3787 } 3788 3789 return ((accmode & VADMIN) ? EPERM : EACCES); 3790 } 3791 3792 /* 3793 * Credential check based on process requesting service, and per-attribute 3794 * permissions. 3795 */ 3796 int 3797 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 3798 struct thread *td, accmode_t accmode) 3799 { 3800 3801 /* 3802 * Kernel-invoked always succeeds. 3803 */ 3804 if (cred == NOCRED) 3805 return (0); 3806 3807 /* 3808 * Do not allow privileged processes in jail to directly manipulate 3809 * system attributes. 
3810 */ 3811 switch (attrnamespace) { 3812 case EXTATTR_NAMESPACE_SYSTEM: 3813 /* Potentially should be: return (EPERM); */ 3814 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0)); 3815 case EXTATTR_NAMESPACE_USER: 3816 return (VOP_ACCESS(vp, accmode, cred, td)); 3817 default: 3818 return (EPERM); 3819 } 3820 } 3821 3822 #ifdef DEBUG_VFS_LOCKS 3823 /* 3824 * This only exists to supress warnings from unlocked specfs accesses. It is 3825 * no longer ok to have an unlocked VFS. 3826 */ 3827 #define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \ 3828 (vp)->v_type == VCHR || (vp)->v_type == VBAD) 3829 3830 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 3831 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, 3832 "Drop into debugger on lock violation"); 3833 3834 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 3835 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 3836 0, "Check for interlock across VOPs"); 3837 3838 int vfs_badlock_print = 1; /* Print lock violations. */ 3839 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 3840 0, "Print lock violations"); 3841 3842 #ifdef KDB 3843 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ 3844 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, 3845 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); 3846 #endif 3847 3848 static void 3849 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 3850 { 3851 3852 #ifdef KDB 3853 if (vfs_badlock_backtrace) 3854 kdb_backtrace(); 3855 #endif 3856 if (vfs_badlock_print) 3857 printf("%s: %p %s\n", str, (void *)vp, msg); 3858 if (vfs_badlock_ddb) 3859 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 3860 } 3861 3862 void 3863 assert_vi_locked(struct vnode *vp, const char *str) 3864 { 3865 3866 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 3867 vfs_badlock("interlock is not locked but should be", str, vp); 3868 } 3869 3870 void 3871 assert_vi_unlocked(struct vnode *vp, const char *str) 3872 { 3873 3874 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 3875 vfs_badlock("interlock is locked but should not be", str, vp); 3876 } 3877 3878 void 3879 assert_vop_locked(struct vnode *vp, const char *str) 3880 { 3881 3882 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == 0) 3883 vfs_badlock("is not locked but should be", str, vp); 3884 } 3885 3886 void 3887 assert_vop_unlocked(struct vnode *vp, const char *str) 3888 { 3889 3890 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 3891 vfs_badlock("is locked but should not be", str, vp); 3892 } 3893 3894 void 3895 assert_vop_elocked(struct vnode *vp, const char *str) 3896 { 3897 3898 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 3899 vfs_badlock("is not exclusive locked but should be", str, vp); 3900 } 3901 3902 #if 0 3903 void 3904 assert_vop_elocked_other(struct vnode *vp, const char *str) 3905 { 3906 3907 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER) 3908 vfs_badlock("is not exclusive locked by another thread", 3909 str, vp); 3910 } 3911 3912 void 3913 assert_vop_slocked(struct vnode *vp, const char *str) 3914 { 3915 3916 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED) 3917 vfs_badlock("is not locked shared but should be", str, vp); 3918 } 3919 #endif /* 0 */ 3920 #endif /* DEBUG_VFS_LOCKS */ 3921 3922 void 3923 vop_rename_fail(struct vop_rename_args *ap) 3924 { 3925 3926 if (ap->a_tvp != NULL) 3927 vput(ap->a_tvp); 3928 if (ap->a_tdvp == 
#ifdef DEBUG_VFS_LOCKS
/*
 * This only exists to suppress warnings from unlocked specfs accesses.  It is
 * no longer ok to have an unlocked VFS.
 */
#define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||            \
        (vp)->v_type == VCHR || (vp)->v_type == VBAD)

int vfs_badlock_ddb = 1;        /* Drop into debugger on violation. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
    "Drop into debugger on lock violation");

int vfs_badlock_mutex = 1;      /* Check for interlock across VOPs. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
    0, "Check for interlock across VOPs");

int vfs_badlock_print = 1;      /* Print lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
    0, "Print lock violations");

#ifdef KDB
int vfs_badlock_backtrace = 1;  /* Print backtrace at lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
    &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
#endif

static void
vfs_badlock(const char *msg, const char *str, struct vnode *vp)
{

#ifdef KDB
        if (vfs_badlock_backtrace)
                kdb_backtrace();
#endif
        if (vfs_badlock_print)
                printf("%s: %p %s\n", str, (void *)vp, msg);
        if (vfs_badlock_ddb)
                kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
}

void
assert_vi_locked(struct vnode *vp, const char *str)
{

        if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
                vfs_badlock("interlock is not locked but should be", str, vp);
}

void
assert_vi_unlocked(struct vnode *vp, const char *str)
{

        if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
                vfs_badlock("interlock is locked but should not be", str, vp);
}

void
assert_vop_locked(struct vnode *vp, const char *str)
{

        if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == 0)
                vfs_badlock("is not locked but should be", str, vp);
}

void
assert_vop_unlocked(struct vnode *vp, const char *str)
{

        if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
                vfs_badlock("is locked but should not be", str, vp);
}

void
assert_vop_elocked(struct vnode *vp, const char *str)
{

        if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
                vfs_badlock("is not exclusive locked but should be", str, vp);
}

#if 0
void
assert_vop_elocked_other(struct vnode *vp, const char *str)
{

        if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER)
                vfs_badlock("is not exclusive locked by another thread",
                    str, vp);
}

void
assert_vop_slocked(struct vnode *vp, const char *str)
{

        if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED)
                vfs_badlock("is not locked shared but should be", str, vp);
}
#endif /* 0 */
#endif /* DEBUG_VFS_LOCKS */

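/*
 * Illustrative sketch (hypothetical function): filesystem code documents
 * and enforces its locking contract with the ASSERT_VOP_*()/ASSERT_VI_*()
 * macros from sys/vnode.h, which expand to the assert_*() helpers above
 * under DEBUG_VFS_LOCKS and to nothing otherwise.
 */
#if 0
static int
xxfs_truncate(struct vnode *vp, off_t length)
{

        /* The caller must hold the vnode lock exclusively ... */
        ASSERT_VOP_ELOCKED(vp, "xxfs_truncate");
        /* ... and must not hold the interlock across a sleeping operation. */
        ASSERT_VI_UNLOCKED(vp, "xxfs_truncate");
        /* ... actual truncation work ... */
        return (0);
}
#endif
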
void
vop_rename_fail(struct vop_rename_args *ap)
{

        if (ap->a_tvp != NULL)
                vput(ap->a_tvp);
        if (ap->a_tdvp == ap->a_tvp)
                vrele(ap->a_tdvp);
        else
                vput(ap->a_tdvp);
        vrele(ap->a_fdvp);
        vrele(ap->a_fvp);
}

void
vop_rename_pre(void *ap)
{
        struct vop_rename_args *a = ap;

#ifdef DEBUG_VFS_LOCKS
        if (a->a_tvp)
                ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
        ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
        ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
        ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");

        /* Check the source (from). */
        if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
            (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
                ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
        if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
                ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");

        /* Check the target. */
        if (a->a_tvp)
                ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
        ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
#endif
        if (a->a_tdvp != a->a_fdvp)
                vhold(a->a_fdvp);
        if (a->a_tvp != a->a_fvp)
                vhold(a->a_fvp);
        vhold(a->a_tdvp);
        if (a->a_tvp)
                vhold(a->a_tvp);
}

void
vop_strategy_pre(void *ap)
{
#ifdef DEBUG_VFS_LOCKS
        struct vop_strategy_args *a;
        struct buf *bp;

        a = ap;
        bp = a->a_bp;

        /*
         * Cluster ops lock their component buffers but not the IO container.
         */
        if ((bp->b_flags & B_CLUSTER) != 0)
                return;

        if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
                if (vfs_badlock_print)
                        printf(
                            "VOP_STRATEGY: bp is not locked but should be\n");
                if (vfs_badlock_ddb)
                        kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
        }
#endif
}

void
vop_lookup_pre(void *ap)
{
#ifdef DEBUG_VFS_LOCKS
        struct vop_lookup_args *a;
        struct vnode *dvp;

        a = ap;
        dvp = a->a_dvp;
        ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
        ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
#endif
}

void
vop_lookup_post(void *ap, int rc)
{
#ifdef DEBUG_VFS_LOCKS
        struct vop_lookup_args *a;
        struct vnode *dvp;
        struct vnode *vp;

        a = ap;
        dvp = a->a_dvp;
        vp = *(a->a_vpp);

        ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
        ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");

        if (!rc)
                ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)");
#endif
}

void
vop_lock_pre(void *ap)
{
#ifdef DEBUG_VFS_LOCKS
        struct vop_lock1_args *a = ap;

        if ((a->a_flags & LK_INTERLOCK) == 0)
                ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
        else
                ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
#endif
}

void
vop_lock_post(void *ap, int rc)
{
#ifdef DEBUG_VFS_LOCKS
        struct vop_lock1_args *a = ap;

        ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
        if (rc == 0)
                ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
#endif
}

void
vop_unlock_pre(void *ap)
{
#ifdef DEBUG_VFS_LOCKS
        struct vop_unlock_args *a = ap;

        if (a->a_flags & LK_INTERLOCK)
                ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
        ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
#endif
}

void
vop_unlock_post(void *ap, int rc)
{
#ifdef DEBUG_VFS_LOCKS
        struct vop_unlock_args *a = ap;

        if (a->a_flags & LK_INTERLOCK)
                ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
#endif
}

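/*
 * The vop_*_pre()/vop_*_post() hooks in this file are not called by
 * filesystems directly; they are invoked from the VOP dispatch wrappers
 * generated from vnode_if.src.  A heavily simplified sketch of that
 * generated glue (the real code also handles bypass routines and
 * debugging probes) looks roughly like this:
 */
#if 0
int
VOP_LOCK1_APV(struct vop_vector *vop, struct vop_lock1_args *a)
{
        int rc;

        vop_lock_pre(a);                /* assertions before the call */
        rc = vop->vop_lock1(a);
        vop_lock_post(a, rc);           /* assertions after the call */
        return (rc);
}
#endif
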
void
vop_create_post(void *ap, int rc)
{
        struct vop_create_args *a = ap;

        if (!rc)
                VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
}

void
vop_deleteextattr_post(void *ap, int rc)
{
        struct vop_deleteextattr_args *a = ap;

        if (!rc)
                VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
}

void
vop_link_post(void *ap, int rc)
{
        struct vop_link_args *a = ap;

        if (!rc) {
                VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
                VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
        }
}

void
vop_mkdir_post(void *ap, int rc)
{
        struct vop_mkdir_args *a = ap;

        if (!rc)
                VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
}

void
vop_mknod_post(void *ap, int rc)
{
        struct vop_mknod_args *a = ap;

        if (!rc)
                VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
}

void
vop_remove_post(void *ap, int rc)
{
        struct vop_remove_args *a = ap;

        if (!rc) {
                VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
                VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
        }
}

void
vop_rename_post(void *ap, int rc)
{
        struct vop_rename_args *a = ap;

        if (!rc) {
                VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
                VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
                VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
                if (a->a_tvp)
                        VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
        }
        if (a->a_tdvp != a->a_fdvp)
                vdrop(a->a_fdvp);
        if (a->a_tvp != a->a_fvp)
                vdrop(a->a_fvp);
        vdrop(a->a_tdvp);
        if (a->a_tvp)
                vdrop(a->a_tvp);
}

void
vop_rmdir_post(void *ap, int rc)
{
        struct vop_rmdir_args *a = ap;

        if (!rc) {
                VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
                VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
        }
}

void
vop_setattr_post(void *ap, int rc)
{
        struct vop_setattr_args *a = ap;

        if (!rc)
                VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
}

void
vop_setextattr_post(void *ap, int rc)
{
        struct vop_setextattr_args *a = ap;

        if (!rc)
                VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
}

void
vop_symlink_post(void *ap, int rc)
{
        struct vop_symlink_args *a = ap;

        if (!rc)
                VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
}

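/*
 * The VFS_KNOTE_*() calls in the post hooks above are what ultimately
 * surface as EVFILT_VNODE events to userland.  A minimal userland
 * consumer sketch (not kernel code, shown only for orientation):
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <err.h>
#include <fcntl.h>

static int
watch_vnode(const char *path)
{
        struct kevent kev;
        int fd, kq;

        if ((kq = kqueue()) == -1)
                err(1, "kqueue");
        if ((fd = open(path, O_RDONLY)) == -1)
                err(1, "open");
        EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
            NOTE_WRITE | NOTE_DELETE | NOTE_RENAME | NOTE_ATTRIB, 0, NULL);
        if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
                err(1, "kevent");
        /* Later, kevent(kq, NULL, 0, &kev, 1, NULL) reports fired notes. */
        return (kq);
}
#endif
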
static struct knlist fs_knlist;

static void
vfs_event_init(void *arg)
{
        knlist_init_mtx(&fs_knlist, NULL);
}
/* XXX - correct order? */
SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);

void
vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
{

        KNOTE_UNLOCKED(&fs_knlist, event);
}

static int      filt_fsattach(struct knote *kn);
static void     filt_fsdetach(struct knote *kn);
static int      filt_fsevent(struct knote *kn, long hint);

struct filterops fs_filtops = {
        .f_isfd = 0,
        .f_attach = filt_fsattach,
        .f_detach = filt_fsdetach,
        .f_event = filt_fsevent
};

static int
filt_fsattach(struct knote *kn)
{

        kn->kn_flags |= EV_CLEAR;
        knlist_add(&fs_knlist, kn, 0);
        return (0);
}

static void
filt_fsdetach(struct knote *kn)
{

        knlist_remove(&fs_knlist, kn, 0);
}

static int
filt_fsevent(struct knote *kn, long hint)
{

        kn->kn_fflags |= hint;
        return (kn->kn_fflags != 0);
}

static int
sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
{
        struct vfsidctl vc;
        int error;
        struct mount *mp;

        error = SYSCTL_IN(req, &vc, sizeof(vc));
        if (error)
                return (error);
        if (vc.vc_vers != VFS_CTL_VERS1)
                return (EINVAL);
        mp = vfs_getvfs(&vc.vc_fsid);
        if (mp == NULL)
                return (ENOENT);
        /* ensure that a specific sysctl goes to the right filesystem. */
        if (strcmp(vc.vc_fstypename, "*") != 0 &&
            strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
                vfs_rel(mp);
                return (EINVAL);
        }
        VCTLTOREQ(&vc, req);
        error = VFS_SYSCTL(mp, vc.vc_op, req);
        vfs_rel(mp);
        return (error);
}

SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
    NULL, 0, sysctl_vfs_ctl, "",
    "Sysctl by fsid");

/*
 * Function to initialize a va_filerev field sensibly.
 * XXX: Wouldn't a random number make a lot more sense ??
 */
u_quad_t
init_va_filerev(void)
{
        struct bintime bt;

        getbinuptime(&bt);
        return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
}

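/*
 * Illustrative sketch (hypothetical "xxfs" fields): a filesystem that does
 * not persist a file revision on disk can seed an in-memory counter from
 * init_va_filerev() when the node is created, bump it on every change, and
 * report it from VOP_GETATTR() via va_filerev.
 */
#if 0
static void
xxfs_node_init(struct xxnode *xp)
{

        xp->xx_filerev = init_va_filerev();     /* seeded once at creation */
}

static void
xxfs_report_filerev(struct xxnode *xp, struct vattr *vap)
{

        vap->va_filerev = xp->xx_filerev;       /* bumped on each change */
}
#endif
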
static int      filt_vfsread(struct knote *kn, long hint);
static int      filt_vfswrite(struct knote *kn, long hint);
static int      filt_vfsvnode(struct knote *kn, long hint);
static void     filt_vfsdetach(struct knote *kn);
static struct filterops vfsread_filtops = {
        .f_isfd = 1,
        .f_detach = filt_vfsdetach,
        .f_event = filt_vfsread
};
static struct filterops vfswrite_filtops = {
        .f_isfd = 1,
        .f_detach = filt_vfsdetach,
        .f_event = filt_vfswrite
};
static struct filterops vfsvnode_filtops = {
        .f_isfd = 1,
        .f_detach = filt_vfsdetach,
        .f_event = filt_vfsvnode
};

static void
vfs_knllock(void *arg)
{
        struct vnode *vp = arg;

        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
}

static void
vfs_knlunlock(void *arg)
{
        struct vnode *vp = arg;

        VOP_UNLOCK(vp, 0);
}

static void
vfs_knl_assert_locked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
        struct vnode *vp = arg;

        ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
#endif
}

static void
vfs_knl_assert_unlocked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
        struct vnode *vp = arg;

        ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
#endif
}

int
vfs_kqfilter(struct vop_kqfilter_args *ap)
{
        struct vnode *vp = ap->a_vp;
        struct knote *kn = ap->a_kn;
        struct knlist *knl;

        switch (kn->kn_filter) {
        case EVFILT_READ:
                kn->kn_fop = &vfsread_filtops;
                break;
        case EVFILT_WRITE:
                kn->kn_fop = &vfswrite_filtops;
                break;
        case EVFILT_VNODE:
                kn->kn_fop = &vfsvnode_filtops;
                break;
        default:
                return (EINVAL);
        }

        kn->kn_hook = (caddr_t)vp;

        v_addpollinfo(vp);
        if (vp->v_pollinfo == NULL)
                return (ENOMEM);
        knl = &vp->v_pollinfo->vpi_selinfo.si_note;
        knlist_add(knl, kn, 0);

        return (0);
}

/*
 * Detach knote from vnode
 */
static void
filt_vfsdetach(struct knote *kn)
{
        struct vnode *vp = (struct vnode *)kn->kn_hook;

        KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
        knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
}

/*ARGSUSED*/
static int
filt_vfsread(struct knote *kn, long hint)
{
        struct vnode *vp = (struct vnode *)kn->kn_hook;
        struct vattr va;
        int res;

        /*
         * filesystem is gone, so set the EOF flag and schedule
         * the knote for deletion.
         */
        if (hint == NOTE_REVOKE) {
                VI_LOCK(vp);
                kn->kn_flags |= (EV_EOF | EV_ONESHOT);
                VI_UNLOCK(vp);
                return (1);
        }

        if (VOP_GETATTR(vp, &va, curthread->td_ucred))
                return (0);

        VI_LOCK(vp);
        kn->kn_data = va.va_size - kn->kn_fp->f_offset;
        res = (kn->kn_data != 0);
        VI_UNLOCK(vp);
        return (res);
}

/*ARGSUSED*/
static int
filt_vfswrite(struct knote *kn, long hint)
{
        struct vnode *vp = (struct vnode *)kn->kn_hook;

        VI_LOCK(vp);

        /*
         * filesystem is gone, so set the EOF flag and schedule
         * the knote for deletion.
         */
        if (hint == NOTE_REVOKE)
                kn->kn_flags |= (EV_EOF | EV_ONESHOT);

        kn->kn_data = 0;
        VI_UNLOCK(vp);
        return (1);
}

static int
filt_vfsvnode(struct knote *kn, long hint)
{
        struct vnode *vp = (struct vnode *)kn->kn_hook;
        int res;

        VI_LOCK(vp);
        if (kn->kn_sfflags & hint)
                kn->kn_fflags |= hint;
        if (hint == NOTE_REVOKE) {
                kn->kn_flags |= EV_EOF;
                VI_UNLOCK(vp);
                return (1);
        }
        res = (kn->kn_fflags != 0);
        VI_UNLOCK(vp);
        return (res);
}

4557 */ 4558 4559 MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); 4560 4561 struct vnode * 4562 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) 4563 { 4564 struct vnode *vp; 4565 4566 if (should_yield()) 4567 kern_yield(PRI_UNCHANGED); 4568 MNT_ILOCK(mp); 4569 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 4570 vp = TAILQ_NEXT(*mvp, v_nmntvnodes); 4571 while (vp != NULL && (vp->v_type == VMARKER || 4572 (vp->v_iflag & VI_DOOMED) != 0)) 4573 vp = TAILQ_NEXT(vp, v_nmntvnodes); 4574 4575 /* Check if we are done */ 4576 if (vp == NULL) { 4577 __mnt_vnode_markerfree_all(mvp, mp); 4578 /* MNT_IUNLOCK(mp); -- done in above function */ 4579 mtx_assert(MNT_MTX(mp), MA_NOTOWNED); 4580 return (NULL); 4581 } 4582 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 4583 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 4584 VI_LOCK(vp); 4585 MNT_IUNLOCK(mp); 4586 return (vp); 4587 } 4588 4589 struct vnode * 4590 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) 4591 { 4592 struct vnode *vp; 4593 4594 *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 4595 MNT_ILOCK(mp); 4596 MNT_REF(mp); 4597 (*mvp)->v_type = VMARKER; 4598 4599 vp = TAILQ_FIRST(&mp->mnt_nvnodelist); 4600 while (vp != NULL && (vp->v_type == VMARKER || 4601 (vp->v_iflag & VI_DOOMED) != 0)) 4602 vp = TAILQ_NEXT(vp, v_nmntvnodes); 4603 4604 /* Check if we are done */ 4605 if (vp == NULL) { 4606 MNT_REL(mp); 4607 MNT_IUNLOCK(mp); 4608 free(*mvp, M_VNODE_MARKER); 4609 *mvp = NULL; 4610 return (NULL); 4611 } 4612 (*mvp)->v_mount = mp; 4613 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 4614 VI_LOCK(vp); 4615 MNT_IUNLOCK(mp); 4616 return (vp); 4617 } 4618 4619 4620 void 4621 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) 4622 { 4623 4624 if (*mvp == NULL) { 4625 MNT_IUNLOCK(mp); 4626 return; 4627 } 4628 4629 mtx_assert(MNT_MTX(mp), MA_OWNED); 4630 4631 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 4632 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 4633 MNT_REL(mp); 4634 MNT_IUNLOCK(mp); 4635 free(*mvp, M_VNODE_MARKER); 4636 *mvp = NULL; 4637 } 4638 4639 /* 4640 * These are helper functions for filesystems to traverse their 4641 * active vnodes. 
/*
 * These are helper functions for filesystems to traverse all
 * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
 *
 * This interface replaces MNT_VNODE_FOREACH.
 */

MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");

struct vnode *
__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
{
        struct vnode *vp;

        if (should_yield())
                kern_yield(PRI_UNCHANGED);
        MNT_ILOCK(mp);
        KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
        vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
        while (vp != NULL && (vp->v_type == VMARKER ||
            (vp->v_iflag & VI_DOOMED) != 0))
                vp = TAILQ_NEXT(vp, v_nmntvnodes);

        /* Check if we are done */
        if (vp == NULL) {
                __mnt_vnode_markerfree_all(mvp, mp);
                /* MNT_IUNLOCK(mp); -- done in above function */
                mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
                return (NULL);
        }
        TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
        TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
        VI_LOCK(vp);
        MNT_IUNLOCK(mp);
        return (vp);
}

struct vnode *
__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
{
        struct vnode *vp;

        *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
        MNT_ILOCK(mp);
        MNT_REF(mp);
        (*mvp)->v_type = VMARKER;

        vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
        while (vp != NULL && (vp->v_type == VMARKER ||
            (vp->v_iflag & VI_DOOMED) != 0))
                vp = TAILQ_NEXT(vp, v_nmntvnodes);

        /* Check if we are done */
        if (vp == NULL) {
                MNT_REL(mp);
                MNT_IUNLOCK(mp);
                free(*mvp, M_VNODE_MARKER);
                *mvp = NULL;
                return (NULL);
        }
        (*mvp)->v_mount = mp;
        TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
        VI_LOCK(vp);
        MNT_IUNLOCK(mp);
        return (vp);
}

void
__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
{

        if (*mvp == NULL) {
                MNT_IUNLOCK(mp);
                return;
        }

        mtx_assert(MNT_MTX(mp), MA_OWNED);

        KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
        TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
        MNT_REL(mp);
        MNT_IUNLOCK(mp);
        free(*mvp, M_VNODE_MARKER);
        *mvp = NULL;
}

/*
 * These are helper functions for filesystems to traverse their
 * active vnodes.  See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
 */
struct vnode *
__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
{
        struct vnode *vp, *nvp;

        if (should_yield())
                kern_yield(PRI_UNCHANGED);
        MNT_ILOCK(mp);
        KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
        vp = TAILQ_NEXT(*mvp, v_actfreelist);
        while (vp != NULL) {
                VI_LOCK(vp);
                if (vp->v_mount == mp && vp->v_type != VMARKER &&
                    (vp->v_iflag & VI_DOOMED) == 0)
                        break;
                nvp = TAILQ_NEXT(vp, v_actfreelist);
                VI_UNLOCK(vp);
                vp = nvp;
        }

        /* Check if we are done */
        if (vp == NULL) {
                __mnt_vnode_markerfree_active(mvp, mp);
                /* MNT_IUNLOCK(mp); -- done in above function */
                mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
                return (NULL);
        }
        mtx_lock(&vnode_free_list_mtx);
        TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
        TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
        mtx_unlock(&vnode_free_list_mtx);
        MNT_IUNLOCK(mp);
        return (vp);
}

struct vnode *
__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
{
        struct vnode *vp, *nvp;

        *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
        MNT_ILOCK(mp);
        MNT_REF(mp);
        (*mvp)->v_type = VMARKER;

        /* The marker is not yet on the list; start from the head. */
        vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
        while (vp != NULL) {
                VI_LOCK(vp);
                if (vp->v_mount == mp && vp->v_type != VMARKER &&
                    (vp->v_iflag & VI_DOOMED) == 0)
                        break;
                nvp = TAILQ_NEXT(vp, v_actfreelist);
                VI_UNLOCK(vp);
                vp = nvp;
        }

        /* Check if we are done */
        if (vp == NULL) {
                MNT_REL(mp);
                MNT_IUNLOCK(mp);
                free(*mvp, M_VNODE_MARKER);
                *mvp = NULL;
                return (NULL);
        }
        (*mvp)->v_mount = mp;
        mtx_lock(&vnode_free_list_mtx);
        TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
        mtx_unlock(&vnode_free_list_mtx);
        MNT_IUNLOCK(mp);
        return (vp);
}

void
__mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
{

        if (*mvp == NULL) {
                MNT_IUNLOCK(mp);
                return;
        }

        mtx_assert(MNT_MTX(mp), MA_OWNED);

        KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
        mtx_lock(&vnode_free_list_mtx);
        TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
        mtx_unlock(&vnode_free_list_mtx);
        MNT_REL(mp);
        MNT_IUNLOCK(mp);
        free(*mvp, M_VNODE_MARKER);
        *mvp = NULL;
}

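/*
 * Illustrative sketch (hypothetical "xxfs_count_regular"): filesystems
 * normally use the helpers above through the MNT_VNODE_FOREACH_ALL() macro
 * in sys/mount.h rather than calling them directly.  Each iteration returns
 * the vnode with its interlock held; the loop body must release it before
 * continuing, and an early exit should go through
 * MNT_VNODE_FOREACH_ALL_ABORT().
 */
#if 0
static int
xxfs_count_regular(struct mount *mp)
{
        struct vnode *vp, *mvp;
        int count;

        count = 0;
        MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
                /*
                 * A real consumer would typically vget() the vnode with
                 * LK_INTERLOCK here (consuming the interlock) before doing
                 * work; this sketch only peeks at it and drops the lock.
                 */
                if (vp->v_type == VREG)
                        count++;
                VI_UNLOCK(vp);
        }
        return (count);
}
#endif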