1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993 5 * The Regents of the University of California. All rights reserved. 6 * (c) UNIX System Laboratories, Inc. 7 * All or some portions of this file are derived from material licensed 8 * to the University of California by American Telephone and Telegraph 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 10 * the permission of UNIX System Laboratories, Inc. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 
35 * 36 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 37 */ 38 39 /* 40 * External virtual filesystem routines 41 */ 42 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 #include "opt_ddb.h" 47 #include "opt_watchdog.h" 48 49 #include <sys/param.h> 50 #include <sys/systm.h> 51 #include <sys/bio.h> 52 #include <sys/buf.h> 53 #include <sys/capsicum.h> 54 #include <sys/condvar.h> 55 #include <sys/conf.h> 56 #include <sys/counter.h> 57 #include <sys/dirent.h> 58 #include <sys/event.h> 59 #include <sys/eventhandler.h> 60 #include <sys/extattr.h> 61 #include <sys/file.h> 62 #include <sys/fcntl.h> 63 #include <sys/jail.h> 64 #include <sys/kdb.h> 65 #include <sys/kernel.h> 66 #include <sys/kthread.h> 67 #include <sys/ktr.h> 68 #include <sys/lockf.h> 69 #include <sys/malloc.h> 70 #include <sys/mount.h> 71 #include <sys/namei.h> 72 #include <sys/pctrie.h> 73 #include <sys/priv.h> 74 #include <sys/reboot.h> 75 #include <sys/refcount.h> 76 #include <sys/rwlock.h> 77 #include <sys/sched.h> 78 #include <sys/sleepqueue.h> 79 #include <sys/smr.h> 80 #include <sys/smp.h> 81 #include <sys/stat.h> 82 #include <sys/sysctl.h> 83 #include <sys/syslog.h> 84 #include <sys/vmmeter.h> 85 #include <sys/vnode.h> 86 #include <sys/watchdog.h> 87 88 #include <machine/stdarg.h> 89 90 #include <security/mac/mac_framework.h> 91 92 #include <vm/vm.h> 93 #include <vm/vm_object.h> 94 #include <vm/vm_extern.h> 95 #include <vm/pmap.h> 96 #include <vm/vm_map.h> 97 #include <vm/vm_page.h> 98 #include <vm/vm_kern.h> 99 #include <vm/uma.h> 100 101 #ifdef DDB 102 #include <ddb/ddb.h> 103 #endif 104 105 static void delmntque(struct vnode *vp); 106 static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, 107 int slpflag, int slptimeo); 108 static void syncer_shutdown(void *arg, int howto); 109 static int vtryrecycle(struct vnode *vp); 110 static void v_init_counters(struct vnode *); 111 static void v_incr_devcount(struct vnode *); 112 static void v_decr_devcount(struct vnode *); 113 static void vgonel(struct vnode *); 114 static void vfs_knllock(void *arg); 115 static void vfs_knlunlock(void *arg); 116 static void vfs_knl_assert_locked(void *arg); 117 static void vfs_knl_assert_unlocked(void *arg); 118 static void destroy_vpollinfo(struct vpollinfo *vi); 119 static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 120 daddr_t startlbn, daddr_t endlbn); 121 static void vnlru_recalc(void); 122 123 /* 124 * These fences are intended for cases where some synchronization is 125 * needed between access of v_iflags and lockless vnode refcount (v_holdcnt 126 * and v_usecount) updates. Access to v_iflags is generally synchronized 127 * by the interlock, but we have some internal assertions that check vnode 128 * flags without acquiring the lock. Thus, these fences are INVARIANTS-only 129 * for now. 130 */ 131 #ifdef INVARIANTS 132 #define VNODE_REFCOUNT_FENCE_ACQ() atomic_thread_fence_acq() 133 #define VNODE_REFCOUNT_FENCE_REL() atomic_thread_fence_rel() 134 #else 135 #define VNODE_REFCOUNT_FENCE_ACQ() 136 #define VNODE_REFCOUNT_FENCE_REL() 137 #endif 138 139 /* 140 * Number of vnodes in existence. Increased whenever getnewvnode() 141 * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode. 
142 */ 143 static u_long __exclusive_cache_line numvnodes; 144 145 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, 146 "Number of vnodes in existence"); 147 148 static counter_u64_t vnodes_created; 149 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, 150 "Number of vnodes created by getnewvnode"); 151 152 /* 153 * Conversion tables for conversion from vnode types to inode formats 154 * and back. 155 */ 156 enum vtype iftovt_tab[16] = { 157 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 158 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON 159 }; 160 int vttoif_tab[10] = { 161 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 162 S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT 163 }; 164 165 /* 166 * List of allocates vnodes in the system. 167 */ 168 static TAILQ_HEAD(freelst, vnode) vnode_list; 169 static struct vnode *vnode_list_free_marker; 170 static struct vnode *vnode_list_reclaim_marker; 171 172 /* 173 * "Free" vnode target. Free vnodes are rarely completely free, but are 174 * just ones that are cheap to recycle. Usually they are for files which 175 * have been stat'd but not read; these usually have inode and namecache 176 * data attached to them. This target is the preferred minimum size of a 177 * sub-cache consisting mostly of such files. The system balances the size 178 * of this sub-cache with its complement to try to prevent either from 179 * thrashing while the other is relatively inactive. The targets express 180 * a preference for the best balance. 181 * 182 * "Above" this target there are 2 further targets (watermarks) related 183 * to recyling of free vnodes. In the best-operating case, the cache is 184 * exactly full, the free list has size between vlowat and vhiwat above the 185 * free target, and recycling from it and normal use maintains this state. 186 * Sometimes the free list is below vlowat or even empty, but this state 187 * is even better for immediate use provided the cache is not full. 188 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free 189 * ones) to reach one of these states. The watermarks are currently hard- 190 * coded as 4% and 9% of the available space higher. These and the default 191 * of 25% for wantfreevnodes are too large if the memory size is large. 192 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim 193 * whenever vnlru_proc() becomes active. 
194 */ 195 static long wantfreevnodes; 196 static long __exclusive_cache_line freevnodes; 197 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, 198 &freevnodes, 0, "Number of \"free\" vnodes"); 199 static long freevnodes_old; 200 201 static counter_u64_t recycles_count; 202 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 203 "Number of vnodes recycled to meet vnode cache targets"); 204 205 static counter_u64_t recycles_free_count; 206 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count, 207 "Number of free vnodes recycled to meet vnode cache targets"); 208 209 static counter_u64_t deferred_inact; 210 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact, 211 "Number of times inactive processing was deferred"); 212 213 /* To keep more than one thread at a time from running vfs_getnewfsid */ 214 static struct mtx mntid_mtx; 215 216 /* 217 * Lock for any access to the following: 218 * vnode_list 219 * numvnodes 220 * freevnodes 221 */ 222 static struct mtx __exclusive_cache_line vnode_list_mtx; 223 224 /* Publicly exported FS */ 225 struct nfs_public nfs_pub; 226 227 static uma_zone_t buf_trie_zone; 228 static smr_t buf_trie_smr; 229 230 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ 231 static uma_zone_t vnode_zone; 232 static uma_zone_t vnodepoll_zone; 233 234 __read_frequently smr_t vfs_smr; 235 236 /* 237 * The workitem queue. 238 * 239 * It is useful to delay writes of file data and filesystem metadata 240 * for tens of seconds so that quickly created and deleted files need 241 * not waste disk bandwidth being created and removed. To realize this, 242 * we append vnodes to a "workitem" queue. When running with a soft 243 * updates implementation, most pending metadata dependencies should 244 * not wait for more than a few seconds. Thus, mounted on block devices 245 * are delayed only about a half the time that file data is delayed. 246 * Similarly, directory updates are more critical, so are only delayed 247 * about a third the time that file data is delayed. Thus, there are 248 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of 249 * one each second (driven off the filesystem syncer process). The 250 * syncer_delayno variable indicates the next queue that is to be processed. 
251 * Items that need to be processed soon are placed in this queue: 252 * 253 * syncer_workitem_pending[syncer_delayno] 254 * 255 * A delay of fifteen seconds is done by placing the request fifteen 256 * entries later in the queue: 257 * 258 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] 259 * 260 */ 261 static int syncer_delayno; 262 static long syncer_mask; 263 LIST_HEAD(synclist, bufobj); 264 static struct synclist *syncer_workitem_pending; 265 /* 266 * The sync_mtx protects: 267 * bo->bo_synclist 268 * sync_vnode_count 269 * syncer_delayno 270 * syncer_state 271 * syncer_workitem_pending 272 * syncer_worklist_len 273 * rushjob 274 */ 275 static struct mtx sync_mtx; 276 static struct cv sync_wakeup; 277 278 #define SYNCER_MAXDELAY 32 279 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ 280 static int syncdelay = 30; /* max time to delay syncing data */ 281 static int filedelay = 30; /* time to delay syncing files */ 282 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, 283 "Time to delay syncing files (in seconds)"); 284 static int dirdelay = 29; /* time to delay syncing directories */ 285 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, 286 "Time to delay syncing directories (in seconds)"); 287 static int metadelay = 28; /* time to delay syncing metadata */ 288 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, 289 "Time to delay syncing metadata (in seconds)"); 290 static int rushjob; /* number of slots to run ASAP */ 291 static int stat_rush_requests; /* number of times I/O speeded up */ 292 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, 293 "Number of times I/O speeded up (rush requests)"); 294 295 #define VDBATCH_SIZE 8 296 struct vdbatch { 297 u_int index; 298 long freevnodes; 299 struct mtx lock; 300 struct vnode *tab[VDBATCH_SIZE]; 301 }; 302 DPCPU_DEFINE_STATIC(struct vdbatch, vd); 303 304 static void vdbatch_dequeue(struct vnode *vp); 305 306 /* 307 * When shutting down the syncer, run it at four times normal speed. 308 */ 309 #define SYNCER_SHUTDOWN_SPEEDUP 4 310 static int sync_vnode_count; 311 static int syncer_worklist_len; 312 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } 313 syncer_state; 314 315 /* Target for maximum number of vnodes. */ 316 u_long desiredvnodes; 317 static u_long gapvnodes; /* gap between wanted and desired */ 318 static u_long vhiwat; /* enough extras after expansion */ 319 static u_long vlowat; /* minimal extras before expansion */ 320 static u_long vstir; /* nonzero to stir non-free vnodes */ 321 static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ 322 323 static u_long vnlru_read_freevnodes(void); 324 325 /* 326 * Note that no attempt is made to sanitize these parameters. 327 */ 328 static int 329 sysctl_maxvnodes(SYSCTL_HANDLER_ARGS) 330 { 331 u_long val; 332 int error; 333 334 val = desiredvnodes; 335 error = sysctl_handle_long(oidp, &val, 0, req); 336 if (error != 0 || req->newptr == NULL) 337 return (error); 338 339 if (val == desiredvnodes) 340 return (0); 341 mtx_lock(&vnode_list_mtx); 342 desiredvnodes = val; 343 wantfreevnodes = desiredvnodes / 4; 344 vnlru_recalc(); 345 mtx_unlock(&vnode_list_mtx); 346 /* 347 * XXX There is no protection against multiple threads changing 348 * desiredvnodes at the same time. Locking above only helps vnlru and 349 * getnewvnode. 
350 */ 351 vfs_hash_changesize(desiredvnodes); 352 cache_changesize(desiredvnodes); 353 return (0); 354 } 355 356 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, 357 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, 358 "LU", "Target for maximum number of vnodes"); 359 360 static int 361 sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS) 362 { 363 u_long val; 364 int error; 365 366 val = wantfreevnodes; 367 error = sysctl_handle_long(oidp, &val, 0, req); 368 if (error != 0 || req->newptr == NULL) 369 return (error); 370 371 if (val == wantfreevnodes) 372 return (0); 373 mtx_lock(&vnode_list_mtx); 374 wantfreevnodes = val; 375 vnlru_recalc(); 376 mtx_unlock(&vnode_list_mtx); 377 return (0); 378 } 379 380 SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes, 381 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, 382 "LU", "Target for minimum number of \"free\" vnodes"); 383 384 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, 385 &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); 386 static int vnlru_nowhere; 387 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, 388 &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); 389 390 static int 391 sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) 392 { 393 struct vnode *vp; 394 struct nameidata nd; 395 char *buf; 396 unsigned long ndflags; 397 int error; 398 399 if (req->newptr == NULL) 400 return (EINVAL); 401 if (req->newlen >= PATH_MAX) 402 return (E2BIG); 403 404 buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); 405 error = SYSCTL_IN(req, buf, req->newlen); 406 if (error != 0) 407 goto out; 408 409 buf[req->newlen] = '\0'; 410 411 ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | NOCACHE | SAVENAME; 412 NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread); 413 if ((error = namei(&nd)) != 0) 414 goto out; 415 vp = nd.ni_vp; 416 417 if (VN_IS_DOOMED(vp)) { 418 /* 419 * This vnode is being recycled. Return != 0 to let the caller 420 * know that the sysctl had no effect. Return EAGAIN because a 421 * subsequent call will likely succeed (since namei will create 422 * a new vnode if necessary) 423 */ 424 error = EAGAIN; 425 goto putvnode; 426 } 427 428 counter_u64_add(recycles_count, 1); 429 vgone(vp); 430 putvnode: 431 NDFREE(&nd, 0); 432 out: 433 free(buf, M_TEMP); 434 return (error); 435 } 436 437 static int 438 sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) 439 { 440 struct thread *td = curthread; 441 struct vnode *vp; 442 struct file *fp; 443 int error; 444 int fd; 445 446 if (req->newptr == NULL) 447 return (EBADF); 448 449 error = sysctl_handle_int(oidp, &fd, 0, req); 450 if (error != 0) 451 return (error); 452 error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); 453 if (error != 0) 454 return (error); 455 vp = fp->f_vnode; 456 457 error = vn_lock(vp, LK_EXCLUSIVE); 458 if (error != 0) 459 goto drop; 460 461 counter_u64_add(recycles_count, 1); 462 vgone(vp); 463 VOP_UNLOCK(vp); 464 drop: 465 fdrop(fp, td); 466 return (error); 467 } 468 469 SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, 470 CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 471 sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); 472 SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, 473 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 474 sysctl_ftry_reclaim_vnode, "I", 475 "Try to reclaim a vnode by its file descriptor"); 476 477 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ 478 static int vnsz2log; 479 480 /* 481 * Support for the bufobj clean & dirty pctrie. 
482 */ 483 static void * 484 buf_trie_alloc(struct pctrie *ptree) 485 { 486 return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT)); 487 } 488 489 static void 490 buf_trie_free(struct pctrie *ptree, void *node) 491 { 492 uma_zfree_smr(buf_trie_zone, node); 493 } 494 PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free, 495 buf_trie_smr); 496 497 /* 498 * Initialize the vnode management data structures. 499 * 500 * Reevaluate the following cap on the number of vnodes after the physical 501 * memory size exceeds 512GB. In the limit, as the physical memory size 502 * grows, the ratio of the memory size in KB to vnodes approaches 64:1. 503 */ 504 #ifndef MAXVNODES_MAX 505 #define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */ 506 #endif 507 508 static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); 509 510 static struct vnode * 511 vn_alloc_marker(struct mount *mp) 512 { 513 struct vnode *vp; 514 515 vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 516 vp->v_type = VMARKER; 517 vp->v_mount = mp; 518 519 return (vp); 520 } 521 522 static void 523 vn_free_marker(struct vnode *vp) 524 { 525 526 MPASS(vp->v_type == VMARKER); 527 free(vp, M_VNODE_MARKER); 528 } 529 530 /* 531 * Initialize a vnode as it first enters the zone. 532 */ 533 static int 534 vnode_init(void *mem, int size, int flags) 535 { 536 struct vnode *vp; 537 538 vp = mem; 539 bzero(vp, size); 540 /* 541 * Setup locks. 542 */ 543 vp->v_vnlock = &vp->v_lock; 544 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); 545 /* 546 * By default, don't allow shared locks unless filesystems opt-in. 547 */ 548 lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, 549 LK_NOSHARE | LK_IS_VNODE); 550 /* 551 * Initialize bufobj. 552 */ 553 bufobj_init(&vp->v_bufobj, vp); 554 /* 555 * Initialize namecache. 556 */ 557 cache_vnode_init(vp); 558 /* 559 * Initialize rangelocks. 560 */ 561 rangelock_init(&vp->v_rl); 562 563 vp->v_dbatchcpu = NOCPU; 564 565 mtx_lock(&vnode_list_mtx); 566 TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist); 567 mtx_unlock(&vnode_list_mtx); 568 return (0); 569 } 570 571 /* 572 * Free a vnode when it is cleared from the zone. 573 */ 574 static void 575 vnode_fini(void *mem, int size) 576 { 577 struct vnode *vp; 578 struct bufobj *bo; 579 580 vp = mem; 581 vdbatch_dequeue(vp); 582 mtx_lock(&vnode_list_mtx); 583 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 584 mtx_unlock(&vnode_list_mtx); 585 rangelock_destroy(&vp->v_rl); 586 lockdestroy(vp->v_vnlock); 587 mtx_destroy(&vp->v_interlock); 588 bo = &vp->v_bufobj; 589 rw_destroy(BO_LOCKPTR(bo)); 590 } 591 592 /* 593 * Provide the size of NFS nclnode and NFS fh for calculation of the 594 * vnode memory consumption. The size is specified directly to 595 * eliminate dependency on NFS-private header. 596 * 597 * Other filesystems may use bigger or smaller (like UFS and ZFS) 598 * private inode data, but the NFS-based estimation is ample enough. 599 * Still, we care about differences in the size between 64- and 32-bit 600 * platforms. 601 * 602 * Namecache structure size is heuristically 603 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. 
604 */ 605 #ifdef _LP64 606 #define NFS_NCLNODE_SZ (528 + 64) 607 #define NC_SZ 148 608 #else 609 #define NFS_NCLNODE_SZ (360 + 32) 610 #define NC_SZ 92 611 #endif 612 613 static void 614 vntblinit(void *dummy __unused) 615 { 616 struct vdbatch *vd; 617 int cpu, physvnodes, virtvnodes; 618 u_int i; 619 620 /* 621 * Desiredvnodes is a function of the physical memory size and the 622 * kernel's heap size. Generally speaking, it scales with the 623 * physical memory size. The ratio of desiredvnodes to the physical 624 * memory size is 1:16 until desiredvnodes exceeds 98,304. 625 * Thereafter, the 626 * marginal ratio of desiredvnodes to the physical memory size is 627 * 1:64. However, desiredvnodes is limited by the kernel's heap 628 * size. The memory required by desiredvnodes vnodes and vm objects 629 * must not exceed 1/10th of the kernel's heap size. 630 */ 631 physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 632 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; 633 virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + 634 sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); 635 desiredvnodes = min(physvnodes, virtvnodes); 636 if (desiredvnodes > MAXVNODES_MAX) { 637 if (bootverbose) 638 printf("Reducing kern.maxvnodes %lu -> %lu\n", 639 desiredvnodes, MAXVNODES_MAX); 640 desiredvnodes = MAXVNODES_MAX; 641 } 642 wantfreevnodes = desiredvnodes / 4; 643 mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); 644 TAILQ_INIT(&vnode_list); 645 mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); 646 /* 647 * The lock is taken to appease WITNESS. 648 */ 649 mtx_lock(&vnode_list_mtx); 650 vnlru_recalc(); 651 mtx_unlock(&vnode_list_mtx); 652 vnode_list_free_marker = vn_alloc_marker(NULL); 653 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); 654 vnode_list_reclaim_marker = vn_alloc_marker(NULL); 655 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); 656 vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, 657 vnode_init, vnode_fini, UMA_ALIGN_PTR, 0); 658 uma_zone_set_smr(vnode_zone, vfs_smr); 659 vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), 660 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 661 /* 662 * Preallocate enough nodes to support one-per buf so that 663 * we can not fail an insert. reassignbuf() callers can not 664 * tolerate the insertion failure. 665 */ 666 buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), 667 NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, 668 UMA_ZONE_NOFREE | UMA_ZONE_SMR); 669 buf_trie_smr = uma_zone_get_smr(buf_trie_zone); 670 uma_prealloc(buf_trie_zone, nbuf); 671 672 vnodes_created = counter_u64_alloc(M_WAITOK); 673 recycles_count = counter_u64_alloc(M_WAITOK); 674 recycles_free_count = counter_u64_alloc(M_WAITOK); 675 deferred_inact = counter_u64_alloc(M_WAITOK); 676 677 /* 678 * Initialize the filesystem syncer. 679 */ 680 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, 681 &syncer_mask); 682 syncer_maxdelay = syncer_mask + 1; 683 mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); 684 cv_init(&sync_wakeup, "syncer"); 685 for (i = 1; i <= sizeof(struct vnode); i <<= 1) 686 vnsz2log++; 687 vnsz2log--; 688 689 CPU_FOREACH(cpu) { 690 vd = DPCPU_ID_PTR((cpu), vd); 691 bzero(vd, sizeof(*vd)); 692 mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF); 693 } 694 } 695 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); 696 697 /* 698 * Mark a mount point as busy. Used to synchronize access and to delay 699 * unmounting. 
Eventually, mountlist_mtx is not released on failure. 700 * 701 * vfs_busy() is a custom lock, it can block the caller. 702 * vfs_busy() only sleeps if the unmount is active on the mount point. 703 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any 704 * vnode belonging to mp. 705 * 706 * Lookup uses vfs_busy() to traverse mount points. 707 * root fs var fs 708 * / vnode lock A / vnode lock (/var) D 709 * /var vnode lock B /log vnode lock(/var/log) E 710 * vfs_busy lock C vfs_busy lock F 711 * 712 * Within each file system, the lock order is C->A->B and F->D->E. 713 * 714 * When traversing across mounts, the system follows that lock order: 715 * 716 * C->A->B 717 * | 718 * +->F->D->E 719 * 720 * The lookup() process for namei("/var") illustrates the process: 721 * VOP_LOOKUP() obtains B while A is held 722 * vfs_busy() obtains a shared lock on F while A and B are held 723 * vput() releases lock on B 724 * vput() releases lock on A 725 * VFS_ROOT() obtains lock on D while shared lock on F is held 726 * vfs_unbusy() releases shared lock on F 727 * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. 728 * Attempt to lock A (instead of vp_crossmp) while D is held would 729 * violate the global order, causing deadlocks. 730 * 731 * dounmount() locks B while F is drained. 732 */ 733 int 734 vfs_busy(struct mount *mp, int flags) 735 { 736 737 MPASS((flags & ~MBF_MASK) == 0); 738 CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); 739 740 if (vfs_op_thread_enter(mp)) { 741 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 742 MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0); 743 MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0); 744 vfs_mp_count_add_pcpu(mp, ref, 1); 745 vfs_mp_count_add_pcpu(mp, lockref, 1); 746 vfs_op_thread_exit(mp); 747 if (flags & MBF_MNTLSTLOCK) 748 mtx_unlock(&mountlist_mtx); 749 return (0); 750 } 751 752 MNT_ILOCK(mp); 753 vfs_assert_mount_counters(mp); 754 MNT_REF(mp); 755 /* 756 * If mount point is currently being unmounted, sleep until the 757 * mount point fate is decided. If thread doing the unmounting fails, 758 * it will clear MNTK_UNMOUNT flag before waking us up, indicating 759 * that this mount point has survived the unmount attempt and vfs_busy 760 * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE 761 * flag in addition to MNTK_UNMOUNT, indicating that mount point is 762 * about to be really destroyed. vfs_busy needs to release its 763 * reference on the mount point in this case and return with ENOENT, 764 * telling the caller that mount mount it tried to busy is no longer 765 * valid. 766 */ 767 while (mp->mnt_kern_flag & MNTK_UNMOUNT) { 768 if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { 769 MNT_REL(mp); 770 MNT_IUNLOCK(mp); 771 CTR1(KTR_VFS, "%s: failed busying before sleeping", 772 __func__); 773 return (ENOENT); 774 } 775 if (flags & MBF_MNTLSTLOCK) 776 mtx_unlock(&mountlist_mtx); 777 mp->mnt_kern_flag |= MNTK_MWAIT; 778 msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); 779 if (flags & MBF_MNTLSTLOCK) 780 mtx_lock(&mountlist_mtx); 781 MNT_ILOCK(mp); 782 } 783 if (flags & MBF_MNTLSTLOCK) 784 mtx_unlock(&mountlist_mtx); 785 mp->mnt_lockref++; 786 MNT_IUNLOCK(mp); 787 return (0); 788 } 789 790 /* 791 * Free a busy filesystem. 
792 */ 793 void 794 vfs_unbusy(struct mount *mp) 795 { 796 int c; 797 798 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 799 800 if (vfs_op_thread_enter(mp)) { 801 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 802 vfs_mp_count_sub_pcpu(mp, lockref, 1); 803 vfs_mp_count_sub_pcpu(mp, ref, 1); 804 vfs_op_thread_exit(mp); 805 return; 806 } 807 808 MNT_ILOCK(mp); 809 vfs_assert_mount_counters(mp); 810 MNT_REL(mp); 811 c = --mp->mnt_lockref; 812 if (mp->mnt_vfs_ops == 0) { 813 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 814 MNT_IUNLOCK(mp); 815 return; 816 } 817 if (c < 0) 818 vfs_dump_mount_counters(mp); 819 if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { 820 MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); 821 CTR1(KTR_VFS, "%s: waking up waiters", __func__); 822 mp->mnt_kern_flag &= ~MNTK_DRAINING; 823 wakeup(&mp->mnt_lockref); 824 } 825 MNT_IUNLOCK(mp); 826 } 827 828 /* 829 * Lookup a mount point by filesystem identifier. 830 */ 831 struct mount * 832 vfs_getvfs(fsid_t *fsid) 833 { 834 struct mount *mp; 835 836 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 837 mtx_lock(&mountlist_mtx); 838 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 839 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { 840 vfs_ref(mp); 841 mtx_unlock(&mountlist_mtx); 842 return (mp); 843 } 844 } 845 mtx_unlock(&mountlist_mtx); 846 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 847 return ((struct mount *) 0); 848 } 849 850 /* 851 * Lookup a mount point by filesystem identifier, busying it before 852 * returning. 853 * 854 * To avoid congestion on mountlist_mtx, implement simple direct-mapped 855 * cache for popular filesystem identifiers. The cache is lockess, using 856 * the fact that struct mount's are never freed. In worst case we may 857 * get pointer to unmounted or even different filesystem, so we have to 858 * check what we got, and go slow way if so. 859 */ 860 struct mount * 861 vfs_busyfs(fsid_t *fsid) 862 { 863 #define FSID_CACHE_SIZE 256 864 typedef struct mount * volatile vmp_t; 865 static vmp_t cache[FSID_CACHE_SIZE]; 866 struct mount *mp; 867 int error; 868 uint32_t hash; 869 870 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 871 hash = fsid->val[0] ^ fsid->val[1]; 872 hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); 873 mp = cache[hash]; 874 if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0) 875 goto slow; 876 if (vfs_busy(mp, 0) != 0) { 877 cache[hash] = NULL; 878 goto slow; 879 } 880 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) 881 return (mp); 882 else 883 vfs_unbusy(mp); 884 885 slow: 886 mtx_lock(&mountlist_mtx); 887 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 888 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { 889 error = vfs_busy(mp, MBF_MNTLSTLOCK); 890 if (error) { 891 cache[hash] = NULL; 892 mtx_unlock(&mountlist_mtx); 893 return (NULL); 894 } 895 cache[hash] = mp; 896 return (mp); 897 } 898 } 899 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 900 mtx_unlock(&mountlist_mtx); 901 return ((struct mount *) 0); 902 } 903 904 /* 905 * Check if a user can access privileged mount options. 906 */ 907 int 908 vfs_suser(struct mount *mp, struct thread *td) 909 { 910 int error; 911 912 if (jailed(td->td_ucred)) { 913 /* 914 * If the jail of the calling thread lacks permission for 915 * this type of file system, deny immediately. 916 */ 917 if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) 918 return (EPERM); 919 920 /* 921 * If the file system was mounted outside the jail of the 922 * calling thread, deny immediately. 
923 */ 924 if (prison_check(td->td_ucred, mp->mnt_cred) != 0) 925 return (EPERM); 926 } 927 928 /* 929 * If file system supports delegated administration, we don't check 930 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified 931 * by the file system itself. 932 * If this is not the user that did original mount, we check for 933 * the PRIV_VFS_MOUNT_OWNER privilege. 934 */ 935 if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && 936 mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { 937 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) 938 return (error); 939 } 940 return (0); 941 } 942 943 /* 944 * Get a new unique fsid. Try to make its val[0] unique, since this value 945 * will be used to create fake device numbers for stat(). Also try (but 946 * not so hard) make its val[0] unique mod 2^16, since some emulators only 947 * support 16-bit device numbers. We end up with unique val[0]'s for the 948 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. 949 * 950 * Keep in mind that several mounts may be running in parallel. Starting 951 * the search one past where the previous search terminated is both a 952 * micro-optimization and a defense against returning the same fsid to 953 * different mounts. 954 */ 955 void 956 vfs_getnewfsid(struct mount *mp) 957 { 958 static uint16_t mntid_base; 959 struct mount *nmp; 960 fsid_t tfsid; 961 int mtype; 962 963 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 964 mtx_lock(&mntid_mtx); 965 mtype = mp->mnt_vfc->vfc_typenum; 966 tfsid.val[1] = mtype; 967 mtype = (mtype & 0xFF) << 24; 968 for (;;) { 969 tfsid.val[0] = makedev(255, 970 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); 971 mntid_base++; 972 if ((nmp = vfs_getvfs(&tfsid)) == NULL) 973 break; 974 vfs_rel(nmp); 975 } 976 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; 977 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; 978 mtx_unlock(&mntid_mtx); 979 } 980 981 /* 982 * Knob to control the precision of file timestamps: 983 * 984 * 0 = seconds only; nanoseconds zeroed. 985 * 1 = seconds and nanoseconds, accurate within 1/HZ. 986 * 2 = seconds and nanoseconds, truncated to microseconds. 987 * >=3 = seconds and nanoseconds, maximum precision. 988 */ 989 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; 990 991 static int timestamp_precision = TSP_USEC; 992 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, 993 ×tamp_precision, 0, "File timestamp precision (0: seconds, " 994 "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " 995 "3+: sec + ns (max. precision))"); 996 997 /* 998 * Get a current timestamp. 
999 */ 1000 void 1001 vfs_timestamp(struct timespec *tsp) 1002 { 1003 struct timeval tv; 1004 1005 switch (timestamp_precision) { 1006 case TSP_SEC: 1007 tsp->tv_sec = time_second; 1008 tsp->tv_nsec = 0; 1009 break; 1010 case TSP_HZ: 1011 getnanotime(tsp); 1012 break; 1013 case TSP_USEC: 1014 microtime(&tv); 1015 TIMEVAL_TO_TIMESPEC(&tv, tsp); 1016 break; 1017 case TSP_NSEC: 1018 default: 1019 nanotime(tsp); 1020 break; 1021 } 1022 } 1023 1024 /* 1025 * Set vnode attributes to VNOVAL 1026 */ 1027 void 1028 vattr_null(struct vattr *vap) 1029 { 1030 1031 vap->va_type = VNON; 1032 vap->va_size = VNOVAL; 1033 vap->va_bytes = VNOVAL; 1034 vap->va_mode = VNOVAL; 1035 vap->va_nlink = VNOVAL; 1036 vap->va_uid = VNOVAL; 1037 vap->va_gid = VNOVAL; 1038 vap->va_fsid = VNOVAL; 1039 vap->va_fileid = VNOVAL; 1040 vap->va_blocksize = VNOVAL; 1041 vap->va_rdev = VNOVAL; 1042 vap->va_atime.tv_sec = VNOVAL; 1043 vap->va_atime.tv_nsec = VNOVAL; 1044 vap->va_mtime.tv_sec = VNOVAL; 1045 vap->va_mtime.tv_nsec = VNOVAL; 1046 vap->va_ctime.tv_sec = VNOVAL; 1047 vap->va_ctime.tv_nsec = VNOVAL; 1048 vap->va_birthtime.tv_sec = VNOVAL; 1049 vap->va_birthtime.tv_nsec = VNOVAL; 1050 vap->va_flags = VNOVAL; 1051 vap->va_gen = VNOVAL; 1052 vap->va_vaflags = 0; 1053 } 1054 1055 /* 1056 * Try to reduce the total number of vnodes. 1057 * 1058 * This routine (and its user) are buggy in at least the following ways: 1059 * - all parameters were picked years ago when RAM sizes were significantly 1060 * smaller 1061 * - it can pick vnodes based on pages used by the vm object, but filesystems 1062 * like ZFS don't use it making the pick broken 1063 * - since ZFS has its own aging policy it gets partially combated by this one 1064 * - a dedicated method should be provided for filesystems to let them decide 1065 * whether the vnode should be recycled 1066 * 1067 * This routine is called when we have too many vnodes. It attempts 1068 * to free <count> vnodes and will potentially free vnodes that still 1069 * have VM backing store (VM backing store is typically the cause 1070 * of a vnode blowout so we want to do this). Therefore, this operation 1071 * is not considered cheap. 1072 * 1073 * A number of conditions may prevent a vnode from being reclaimed. 1074 * the buffer cache may have references on the vnode, a directory 1075 * vnode may still have references due to the namei cache representing 1076 * underlying files, or the vnode may be in active use. It is not 1077 * desirable to reuse such vnodes. These conditions may cause the 1078 * number of vnodes to reach some minimum value regardless of what 1079 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. 1080 * 1081 * @param reclaim_nc_src Only reclaim directories with outgoing namecache 1082 * entries if this argument is strue 1083 * @param trigger Only reclaim vnodes with fewer than this many resident 1084 * pages. 1085 * @param target How many vnodes to reclaim. 1086 * @return The number of vnodes that were reclaimed. 
1087 */ 1088 static int 1089 vlrureclaim(bool reclaim_nc_src, int trigger, u_long target) 1090 { 1091 struct vnode *vp, *mvp; 1092 struct mount *mp; 1093 struct vm_object *object; 1094 u_long done; 1095 bool retried; 1096 1097 mtx_assert(&vnode_list_mtx, MA_OWNED); 1098 1099 retried = false; 1100 done = 0; 1101 1102 mvp = vnode_list_reclaim_marker; 1103 restart: 1104 vp = mvp; 1105 while (done < target) { 1106 vp = TAILQ_NEXT(vp, v_vnodelist); 1107 if (__predict_false(vp == NULL)) 1108 break; 1109 1110 if (__predict_false(vp->v_type == VMARKER)) 1111 continue; 1112 1113 /* 1114 * If it's been deconstructed already, it's still 1115 * referenced, or it exceeds the trigger, skip it. 1116 * Also skip free vnodes. We are trying to make space 1117 * to expand the free list, not reduce it. 1118 */ 1119 if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || 1120 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src))) 1121 goto next_iter; 1122 1123 if (vp->v_type == VBAD || vp->v_type == VNON) 1124 goto next_iter; 1125 1126 if (!VI_TRYLOCK(vp)) 1127 goto next_iter; 1128 1129 if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || 1130 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || 1131 VN_IS_DOOMED(vp) || vp->v_type == VNON) { 1132 VI_UNLOCK(vp); 1133 goto next_iter; 1134 } 1135 1136 object = atomic_load_ptr(&vp->v_object); 1137 if (object == NULL || object->resident_page_count > trigger) { 1138 VI_UNLOCK(vp); 1139 goto next_iter; 1140 } 1141 1142 vholdl(vp); 1143 VI_UNLOCK(vp); 1144 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1145 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1146 mtx_unlock(&vnode_list_mtx); 1147 1148 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 1149 vdrop(vp); 1150 goto next_iter_unlocked; 1151 } 1152 if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) { 1153 vdrop(vp); 1154 vn_finished_write(mp); 1155 goto next_iter_unlocked; 1156 } 1157 1158 VI_LOCK(vp); 1159 if (vp->v_usecount > 0 || 1160 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || 1161 (vp->v_object != NULL && 1162 vp->v_object->resident_page_count > trigger)) { 1163 VOP_UNLOCK(vp); 1164 vdropl(vp); 1165 vn_finished_write(mp); 1166 goto next_iter_unlocked; 1167 } 1168 counter_u64_add(recycles_count, 1); 1169 vgonel(vp); 1170 VOP_UNLOCK(vp); 1171 vdropl(vp); 1172 vn_finished_write(mp); 1173 done++; 1174 next_iter_unlocked: 1175 if (should_yield()) 1176 kern_yield(PRI_USER); 1177 mtx_lock(&vnode_list_mtx); 1178 goto restart; 1179 next_iter: 1180 MPASS(vp->v_type != VMARKER); 1181 if (!should_yield()) 1182 continue; 1183 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1184 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1185 mtx_unlock(&vnode_list_mtx); 1186 kern_yield(PRI_USER); 1187 mtx_lock(&vnode_list_mtx); 1188 goto restart; 1189 } 1190 if (done == 0 && !retried) { 1191 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1192 TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); 1193 retried = true; 1194 goto restart; 1195 } 1196 return (done); 1197 } 1198 1199 static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ 1200 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, 1201 0, 1202 "limit on vnode free requests per call to the vnlru_free routine"); 1203 1204 /* 1205 * Attempt to reduce the free list by the requested amount. 
1206 */ 1207 static int 1208 vnlru_free_locked(int count, struct vfsops *mnt_op) 1209 { 1210 struct vnode *vp, *mvp; 1211 struct mount *mp; 1212 int ocount; 1213 1214 mtx_assert(&vnode_list_mtx, MA_OWNED); 1215 if (count > max_vnlru_free) 1216 count = max_vnlru_free; 1217 ocount = count; 1218 mvp = vnode_list_free_marker; 1219 restart: 1220 vp = mvp; 1221 while (count > 0) { 1222 vp = TAILQ_NEXT(vp, v_vnodelist); 1223 if (__predict_false(vp == NULL)) { 1224 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1225 TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist); 1226 break; 1227 } 1228 if (__predict_false(vp->v_type == VMARKER)) 1229 continue; 1230 1231 /* 1232 * Don't recycle if our vnode is from different type 1233 * of mount point. Note that mp is type-safe, the 1234 * check does not reach unmapped address even if 1235 * vnode is reclaimed. 1236 * Don't recycle if we can't get the interlock without 1237 * blocking. 1238 */ 1239 if (vp->v_holdcnt > 0 || (mnt_op != NULL && (mp = vp->v_mount) != NULL && 1240 mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) { 1241 continue; 1242 } 1243 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1244 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1245 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { 1246 VI_UNLOCK(vp); 1247 continue; 1248 } 1249 vholdl(vp); 1250 count--; 1251 mtx_unlock(&vnode_list_mtx); 1252 VI_UNLOCK(vp); 1253 vtryrecycle(vp); 1254 vdrop(vp); 1255 mtx_lock(&vnode_list_mtx); 1256 goto restart; 1257 } 1258 return (ocount - count); 1259 } 1260 1261 void 1262 vnlru_free(int count, struct vfsops *mnt_op) 1263 { 1264 1265 mtx_lock(&vnode_list_mtx); 1266 vnlru_free_locked(count, mnt_op); 1267 mtx_unlock(&vnode_list_mtx); 1268 } 1269 1270 static void 1271 vnlru_recalc(void) 1272 { 1273 1274 mtx_assert(&vnode_list_mtx, MA_OWNED); 1275 gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); 1276 vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ 1277 vlowat = vhiwat / 2; 1278 } 1279 1280 /* 1281 * Attempt to recycle vnodes in a context that is always safe to block. 1282 * Calling vlrurecycle() from the bowels of filesystem code has some 1283 * interesting deadlock problems. 1284 */ 1285 static struct proc *vnlruproc; 1286 static int vnlruproc_sig; 1287 1288 /* 1289 * The main freevnodes counter is only updated when threads requeue their vnode 1290 * batches. CPUs are conditionally walked to compute a more accurate total. 1291 * 1292 * Limit how much of a slop are we willing to tolerate. Note: the actual value 1293 * at any given moment can still exceed slop, but it should not be by significant 1294 * margin in practice. 1295 */ 1296 #define VNLRU_FREEVNODES_SLOP 128 1297 1298 static u_long 1299 vnlru_read_freevnodes(void) 1300 { 1301 struct vdbatch *vd; 1302 long slop; 1303 int cpu; 1304 1305 mtx_assert(&vnode_list_mtx, MA_OWNED); 1306 if (freevnodes > freevnodes_old) 1307 slop = freevnodes - freevnodes_old; 1308 else 1309 slop = freevnodes_old - freevnodes; 1310 if (slop < VNLRU_FREEVNODES_SLOP) 1311 return (freevnodes >= 0 ? freevnodes : 0); 1312 freevnodes_old = freevnodes; 1313 CPU_FOREACH(cpu) { 1314 vd = DPCPU_ID_PTR((cpu), vd); 1315 freevnodes_old += vd->freevnodes; 1316 } 1317 return (freevnodes_old >= 0 ? 
freevnodes_old : 0); 1318 } 1319 1320 static bool 1321 vnlru_under(u_long rnumvnodes, u_long limit) 1322 { 1323 u_long rfreevnodes, space; 1324 1325 if (__predict_false(rnumvnodes > desiredvnodes)) 1326 return (true); 1327 1328 space = desiredvnodes - rnumvnodes; 1329 if (space < limit) { 1330 rfreevnodes = vnlru_read_freevnodes(); 1331 if (rfreevnodes > wantfreevnodes) 1332 space += rfreevnodes - wantfreevnodes; 1333 } 1334 return (space < limit); 1335 } 1336 1337 static bool 1338 vnlru_under_unlocked(u_long rnumvnodes, u_long limit) 1339 { 1340 long rfreevnodes, space; 1341 1342 if (__predict_false(rnumvnodes > desiredvnodes)) 1343 return (true); 1344 1345 space = desiredvnodes - rnumvnodes; 1346 if (space < limit) { 1347 rfreevnodes = atomic_load_long(&freevnodes); 1348 if (rfreevnodes > wantfreevnodes) 1349 space += rfreevnodes - wantfreevnodes; 1350 } 1351 return (space < limit); 1352 } 1353 1354 static void 1355 vnlru_kick(void) 1356 { 1357 1358 mtx_assert(&vnode_list_mtx, MA_OWNED); 1359 if (vnlruproc_sig == 0) { 1360 vnlruproc_sig = 1; 1361 wakeup(vnlruproc); 1362 } 1363 } 1364 1365 static void 1366 vnlru_proc(void) 1367 { 1368 u_long rnumvnodes, rfreevnodes, target; 1369 unsigned long onumvnodes; 1370 int done, force, trigger, usevnodes; 1371 bool reclaim_nc_src, want_reread; 1372 1373 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, 1374 SHUTDOWN_PRI_FIRST); 1375 1376 force = 0; 1377 want_reread = false; 1378 for (;;) { 1379 kproc_suspend_check(vnlruproc); 1380 mtx_lock(&vnode_list_mtx); 1381 rnumvnodes = atomic_load_long(&numvnodes); 1382 1383 if (want_reread) { 1384 force = vnlru_under(numvnodes, vhiwat) ? 1 : 0; 1385 want_reread = false; 1386 } 1387 1388 /* 1389 * If numvnodes is too large (due to desiredvnodes being 1390 * adjusted using its sysctl, or emergency growth), first 1391 * try to reduce it by discarding from the free list. 1392 */ 1393 if (rnumvnodes > desiredvnodes) { 1394 vnlru_free_locked(rnumvnodes - desiredvnodes, NULL); 1395 rnumvnodes = atomic_load_long(&numvnodes); 1396 } 1397 /* 1398 * Sleep if the vnode cache is in a good state. This is 1399 * when it is not over-full and has space for about a 4% 1400 * or 9% expansion (by growing its size or inexcessively 1401 * reducing its free list). Otherwise, try to reclaim 1402 * space for a 10% expansion. 1403 */ 1404 if (vstir && force == 0) { 1405 force = 1; 1406 vstir = 0; 1407 } 1408 if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { 1409 vnlruproc_sig = 0; 1410 wakeup(&vnlruproc_sig); 1411 msleep(vnlruproc, &vnode_list_mtx, 1412 PVFS|PDROP, "vlruwt", hz); 1413 continue; 1414 } 1415 rfreevnodes = vnlru_read_freevnodes(); 1416 1417 onumvnodes = rnumvnodes; 1418 /* 1419 * Calculate parameters for recycling. These are the same 1420 * throughout the loop to give some semblance of fairness. 1421 * The trigger point is to avoid recycling vnodes with lots 1422 * of resident pages. We aren't trying to free memory; we 1423 * are trying to recycle or at least free vnodes. 1424 */ 1425 if (rnumvnodes <= desiredvnodes) 1426 usevnodes = rnumvnodes - rfreevnodes; 1427 else 1428 usevnodes = rnumvnodes; 1429 if (usevnodes <= 0) 1430 usevnodes = 1; 1431 /* 1432 * The trigger value is is chosen to give a conservatively 1433 * large value to ensure that it alone doesn't prevent 1434 * making progress. The value can easily be so large that 1435 * it is effectively infinite in some congested and 1436 * misconfigured cases, and this is necessary. 
Normally 1437 * it is about 8 to 100 (pages), which is quite large. 1438 */ 1439 trigger = vm_cnt.v_page_count * 2 / usevnodes; 1440 if (force < 2) 1441 trigger = vsmalltrigger; 1442 reclaim_nc_src = force >= 3; 1443 target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); 1444 target = target / 10 + 1; 1445 done = vlrureclaim(reclaim_nc_src, trigger, target); 1446 mtx_unlock(&vnode_list_mtx); 1447 if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) 1448 uma_reclaim(UMA_RECLAIM_DRAIN); 1449 if (done == 0) { 1450 if (force == 0 || force == 1) { 1451 force = 2; 1452 continue; 1453 } 1454 if (force == 2) { 1455 force = 3; 1456 continue; 1457 } 1458 want_reread = true; 1459 force = 0; 1460 vnlru_nowhere++; 1461 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); 1462 } else { 1463 want_reread = true; 1464 kern_yield(PRI_USER); 1465 } 1466 } 1467 } 1468 1469 static struct kproc_desc vnlru_kp = { 1470 "vnlru", 1471 vnlru_proc, 1472 &vnlruproc 1473 }; 1474 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, 1475 &vnlru_kp); 1476 1477 /* 1478 * Routines having to do with the management of the vnode table. 1479 */ 1480 1481 /* 1482 * Try to recycle a freed vnode. We abort if anyone picks up a reference 1483 * before we actually vgone(). This function must be called with the vnode 1484 * held to prevent the vnode from being returned to the free list midway 1485 * through vgone(). 1486 */ 1487 static int 1488 vtryrecycle(struct vnode *vp) 1489 { 1490 struct mount *vnmp; 1491 1492 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 1493 VNASSERT(vp->v_holdcnt, vp, 1494 ("vtryrecycle: Recycling vp %p without a reference.", vp)); 1495 /* 1496 * This vnode may found and locked via some other list, if so we 1497 * can't recycle it yet. 1498 */ 1499 if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 1500 CTR2(KTR_VFS, 1501 "%s: impossible to recycle, vp %p lock is already held", 1502 __func__, vp); 1503 return (EWOULDBLOCK); 1504 } 1505 /* 1506 * Don't recycle if its filesystem is being suspended. 1507 */ 1508 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { 1509 VOP_UNLOCK(vp); 1510 CTR2(KTR_VFS, 1511 "%s: impossible to recycle, cannot start the write for %p", 1512 __func__, vp); 1513 return (EBUSY); 1514 } 1515 /* 1516 * If we got this far, we need to acquire the interlock and see if 1517 * anyone picked up this vnode from another list. If not, we will 1518 * mark it with DOOMED via vgonel() so that anyone who does find it 1519 * will skip over it. 1520 */ 1521 VI_LOCK(vp); 1522 if (vp->v_usecount) { 1523 VOP_UNLOCK(vp); 1524 VI_UNLOCK(vp); 1525 vn_finished_write(vnmp); 1526 CTR2(KTR_VFS, 1527 "%s: impossible to recycle, %p is already referenced", 1528 __func__, vp); 1529 return (EBUSY); 1530 } 1531 if (!VN_IS_DOOMED(vp)) { 1532 counter_u64_add(recycles_free_count, 1); 1533 vgonel(vp); 1534 } 1535 VOP_UNLOCK(vp); 1536 VI_UNLOCK(vp); 1537 vn_finished_write(vnmp); 1538 return (0); 1539 } 1540 1541 /* 1542 * Allocate a new vnode. 1543 * 1544 * The operation never returns an error. Returning an error was disabled 1545 * in r145385 (dated 2005) with the following comment: 1546 * 1547 * XXX Not all VFS_VGET/ffs_vget callers check returns. 1548 * 1549 * Given the age of this commit (almost 15 years at the time of writing this 1550 * comment) restoring the ability to fail requires a significant audit of 1551 * all codepaths. 
1552 * 1553 * The routine can try to free a vnode or stall for up to 1 second waiting for 1554 * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation. 1555 */ 1556 static u_long vn_alloc_cyclecount; 1557 1558 static struct vnode * __noinline 1559 vn_alloc_hard(struct mount *mp) 1560 { 1561 u_long rnumvnodes, rfreevnodes; 1562 1563 mtx_lock(&vnode_list_mtx); 1564 rnumvnodes = atomic_load_long(&numvnodes); 1565 if (rnumvnodes + 1 < desiredvnodes) { 1566 vn_alloc_cyclecount = 0; 1567 goto alloc; 1568 } 1569 rfreevnodes = vnlru_read_freevnodes(); 1570 if (vn_alloc_cyclecount++ >= rfreevnodes) { 1571 vn_alloc_cyclecount = 0; 1572 vstir = 1; 1573 } 1574 /* 1575 * Grow the vnode cache if it will not be above its target max 1576 * after growing. Otherwise, if the free list is nonempty, try 1577 * to reclaim 1 item from it before growing the cache (possibly 1578 * above its target max if the reclamation failed or is delayed). 1579 * Otherwise, wait for some space. In all cases, schedule 1580 * vnlru_proc() if we are getting short of space. The watermarks 1581 * should be chosen so that we never wait or even reclaim from 1582 * the free list to below its target minimum. 1583 */ 1584 if (vnlru_free_locked(1, NULL) > 0) 1585 goto alloc; 1586 if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { 1587 /* 1588 * Wait for space for a new vnode. 1589 */ 1590 vnlru_kick(); 1591 msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); 1592 if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && 1593 vnlru_read_freevnodes() > 1) 1594 vnlru_free_locked(1, NULL); 1595 } 1596 alloc: 1597 rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; 1598 if (vnlru_under(rnumvnodes, vlowat)) 1599 vnlru_kick(); 1600 mtx_unlock(&vnode_list_mtx); 1601 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 1602 } 1603 1604 static struct vnode * 1605 vn_alloc(struct mount *mp) 1606 { 1607 u_long rnumvnodes; 1608 1609 if (__predict_false(vn_alloc_cyclecount != 0)) 1610 return (vn_alloc_hard(mp)); 1611 rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; 1612 if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) { 1613 atomic_subtract_long(&numvnodes, 1); 1614 return (vn_alloc_hard(mp)); 1615 } 1616 1617 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 1618 } 1619 1620 static void 1621 vn_free(struct vnode *vp) 1622 { 1623 1624 atomic_subtract_long(&numvnodes, 1); 1625 uma_zfree_smr(vnode_zone, vp); 1626 } 1627 1628 /* 1629 * Return the next vnode from the free list. 1630 */ 1631 int 1632 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, 1633 struct vnode **vpp) 1634 { 1635 struct vnode *vp; 1636 struct thread *td; 1637 struct lock_object *lo; 1638 1639 CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); 1640 1641 KASSERT(vops->registered, 1642 ("%s: not registered vector op %p\n", __func__, vops)); 1643 1644 td = curthread; 1645 if (td->td_vp_reserved != NULL) { 1646 vp = td->td_vp_reserved; 1647 td->td_vp_reserved = NULL; 1648 } else { 1649 vp = vn_alloc(mp); 1650 } 1651 counter_u64_add(vnodes_created, 1); 1652 /* 1653 * Locks are given the generic name "vnode" when created. 1654 * Follow the historic practice of using the filesystem 1655 * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. 1656 * 1657 * Locks live in a witness group keyed on their name. Thus, 1658 * when a lock is renamed, it must also move from the witness 1659 * group of its old name to the witness group of its new name. 
1660 * 1661 * The change only needs to be made when the vnode moves 1662 * from one filesystem type to another. We ensure that each 1663 * filesystem use a single static name pointer for its tag so 1664 * that we can compare pointers rather than doing a strcmp(). 1665 */ 1666 lo = &vp->v_vnlock->lock_object; 1667 #ifdef WITNESS 1668 if (lo->lo_name != tag) { 1669 #endif 1670 lo->lo_name = tag; 1671 #ifdef WITNESS 1672 WITNESS_DESTROY(lo); 1673 WITNESS_INIT(lo, tag); 1674 } 1675 #endif 1676 /* 1677 * By default, don't allow shared locks unless filesystems opt-in. 1678 */ 1679 vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; 1680 /* 1681 * Finalize various vnode identity bits. 1682 */ 1683 KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); 1684 KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); 1685 KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); 1686 vp->v_type = VNON; 1687 vp->v_op = vops; 1688 v_init_counters(vp); 1689 vp->v_bufobj.bo_ops = &buf_ops_bio; 1690 #ifdef DIAGNOSTIC 1691 if (mp == NULL && vops != &dead_vnodeops) 1692 printf("NULL mp in getnewvnode(9), tag %s\n", tag); 1693 #endif 1694 #ifdef MAC 1695 mac_vnode_init(vp); 1696 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) 1697 mac_vnode_associate_singlelabel(mp, vp); 1698 #endif 1699 if (mp != NULL) { 1700 vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; 1701 if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) 1702 vp->v_vflag |= VV_NOKNOTE; 1703 } 1704 1705 /* 1706 * For the filesystems which do not use vfs_hash_insert(), 1707 * still initialize v_hash to have vfs_hash_index() useful. 1708 * E.g., nullfs uses vfs_hash_index() on the lower vnode for 1709 * its own hashing. 1710 */ 1711 vp->v_hash = (uintptr_t)vp >> vnsz2log; 1712 1713 *vpp = vp; 1714 return (0); 1715 } 1716 1717 void 1718 getnewvnode_reserve(void) 1719 { 1720 struct thread *td; 1721 1722 td = curthread; 1723 MPASS(td->td_vp_reserved == NULL); 1724 td->td_vp_reserved = vn_alloc(NULL); 1725 } 1726 1727 void 1728 getnewvnode_drop_reserve(void) 1729 { 1730 struct thread *td; 1731 1732 td = curthread; 1733 if (td->td_vp_reserved != NULL) { 1734 vn_free(td->td_vp_reserved); 1735 td->td_vp_reserved = NULL; 1736 } 1737 } 1738 1739 static void 1740 freevnode(struct vnode *vp) 1741 { 1742 struct bufobj *bo; 1743 1744 /* 1745 * The vnode has been marked for destruction, so free it. 1746 * 1747 * The vnode will be returned to the zone where it will 1748 * normally remain until it is needed for another vnode. We 1749 * need to cleanup (or verify that the cleanup has already 1750 * been done) any residual data left from its current use 1751 * so as not to contaminate the freshly allocated vnode. 1752 */ 1753 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); 1754 /* 1755 * Paired with vgone. 
1756 */ 1757 vn_seqc_write_end_locked(vp); 1758 VNPASS(vp->v_seqc_users == 0, vp); 1759 1760 bo = &vp->v_bufobj; 1761 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); 1762 VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); 1763 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); 1764 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); 1765 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); 1766 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); 1767 VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, 1768 ("clean blk trie not empty")); 1769 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); 1770 VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, 1771 ("dirty blk trie not empty")); 1772 VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); 1773 VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); 1774 VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); 1775 VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, 1776 ("Dangling rangelock waiters")); 1777 VI_UNLOCK(vp); 1778 #ifdef MAC 1779 mac_vnode_destroy(vp); 1780 #endif 1781 if (vp->v_pollinfo != NULL) { 1782 destroy_vpollinfo(vp->v_pollinfo); 1783 vp->v_pollinfo = NULL; 1784 } 1785 #ifdef INVARIANTS 1786 /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ 1787 vp->v_op = NULL; 1788 #endif 1789 vp->v_mountedhere = NULL; 1790 vp->v_unpcb = NULL; 1791 vp->v_rdev = NULL; 1792 vp->v_fifoinfo = NULL; 1793 vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; 1794 vp->v_irflag = 0; 1795 vp->v_iflag = 0; 1796 vp->v_vflag = 0; 1797 bo->bo_flag = 0; 1798 vn_free(vp); 1799 } 1800 1801 /* 1802 * Delete from old mount point vnode list, if on one. 1803 */ 1804 static void 1805 delmntque(struct vnode *vp) 1806 { 1807 struct mount *mp; 1808 1809 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 1810 1811 mp = vp->v_mount; 1812 if (mp == NULL) 1813 return; 1814 MNT_ILOCK(mp); 1815 VI_LOCK(vp); 1816 vp->v_mount = NULL; 1817 VI_UNLOCK(vp); 1818 VNASSERT(mp->mnt_nvnodelistsize > 0, vp, 1819 ("bad mount point vnode list size")); 1820 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1821 mp->mnt_nvnodelistsize--; 1822 MNT_REL(mp); 1823 MNT_IUNLOCK(mp); 1824 } 1825 1826 static void 1827 insmntque_stddtr(struct vnode *vp, void *dtr_arg) 1828 { 1829 1830 vp->v_data = NULL; 1831 vp->v_op = &dead_vnodeops; 1832 vgone(vp); 1833 vput(vp); 1834 } 1835 1836 /* 1837 * Insert into list of vnodes for the new mount point, if available. 1838 */ 1839 int 1840 insmntque1(struct vnode *vp, struct mount *mp, 1841 void (*dtr)(struct vnode *, void *), void *dtr_arg) 1842 { 1843 1844 KASSERT(vp->v_mount == NULL, 1845 ("insmntque: vnode already on per mount vnode list")); 1846 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); 1847 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); 1848 1849 /* 1850 * We acquire the vnode interlock early to ensure that the 1851 * vnode cannot be recycled by another process releasing a 1852 * holdcnt on it before we get it on both the vnode list 1853 * and the active vnode list. The mount mutex protects only 1854 * manipulation of the vnode list and the vnode freelist 1855 * mutex protects only manipulation of the active vnode list. 1856 * Hence the need to hold the vnode interlock throughout. 
1857 */ 1858 MNT_ILOCK(mp); 1859 VI_LOCK(vp); 1860 if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && 1861 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || 1862 mp->mnt_nvnodelistsize == 0)) && 1863 (vp->v_vflag & VV_FORCEINSMQ) == 0) { 1864 VI_UNLOCK(vp); 1865 MNT_IUNLOCK(mp); 1866 if (dtr != NULL) 1867 dtr(vp, dtr_arg); 1868 return (EBUSY); 1869 } 1870 vp->v_mount = mp; 1871 MNT_REF(mp); 1872 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1873 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, 1874 ("neg mount point vnode list size")); 1875 mp->mnt_nvnodelistsize++; 1876 VI_UNLOCK(vp); 1877 MNT_IUNLOCK(mp); 1878 return (0); 1879 } 1880 1881 int 1882 insmntque(struct vnode *vp, struct mount *mp) 1883 { 1884 1885 return (insmntque1(vp, mp, insmntque_stddtr, NULL)); 1886 } 1887 1888 /* 1889 * Flush out and invalidate all buffers associated with a bufobj 1890 * Called with the underlying object locked. 1891 */ 1892 int 1893 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) 1894 { 1895 int error; 1896 1897 BO_LOCK(bo); 1898 if (flags & V_SAVE) { 1899 error = bufobj_wwait(bo, slpflag, slptimeo); 1900 if (error) { 1901 BO_UNLOCK(bo); 1902 return (error); 1903 } 1904 if (bo->bo_dirty.bv_cnt > 0) { 1905 BO_UNLOCK(bo); 1906 if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) 1907 return (error); 1908 /* 1909 * XXX We could save a lock/unlock if this was only 1910 * enabled under INVARIANTS 1911 */ 1912 BO_LOCK(bo); 1913 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 1914 panic("vinvalbuf: dirty bufs"); 1915 } 1916 } 1917 /* 1918 * If you alter this loop please notice that interlock is dropped and 1919 * reacquired in flushbuflist. Special care is needed to ensure that 1920 * no race conditions occur from this. 1921 */ 1922 do { 1923 error = flushbuflist(&bo->bo_clean, 1924 flags, bo, slpflag, slptimeo); 1925 if (error == 0 && !(flags & V_CLEANONLY)) 1926 error = flushbuflist(&bo->bo_dirty, 1927 flags, bo, slpflag, slptimeo); 1928 if (error != 0 && error != EAGAIN) { 1929 BO_UNLOCK(bo); 1930 return (error); 1931 } 1932 } while (error != 0); 1933 1934 /* 1935 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 1936 * have write I/O in-progress but if there is a VM object then the 1937 * VM object can also have read-I/O in-progress. 1938 */ 1939 do { 1940 bufobj_wwait(bo, 0, 0); 1941 if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { 1942 BO_UNLOCK(bo); 1943 vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); 1944 BO_LOCK(bo); 1945 } 1946 } while (bo->bo_numoutput > 0); 1947 BO_UNLOCK(bo); 1948 1949 /* 1950 * Destroy the copy in the VM cache, too. 1951 */ 1952 if (bo->bo_object != NULL && 1953 (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { 1954 VM_OBJECT_WLOCK(bo->bo_object); 1955 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? 1956 OBJPR_CLEANONLY : 0); 1957 VM_OBJECT_WUNLOCK(bo->bo_object); 1958 } 1959 1960 #ifdef INVARIANTS 1961 BO_LOCK(bo); 1962 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | 1963 V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || 1964 bo->bo_clean.bv_cnt > 0)) 1965 panic("vinvalbuf: flush failed"); 1966 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && 1967 bo->bo_dirty.bv_cnt > 0) 1968 panic("vinvalbuf: flush dirty failed"); 1969 BO_UNLOCK(bo); 1970 #endif 1971 return (0); 1972 } 1973 1974 /* 1975 * Flush out and invalidate all buffers associated with a vnode. 1976 * Called with the underlying object locked. 
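 *
 * A minimal usage sketch (the vnode lock is assumed held, which the
 * function asserts): write out delayed-write buffers first, then
 * invalidate everything:
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = vinvalbuf(vp, V_SAVE, 0, 0);
 *
 * Passing 0 instead of V_SAVE discards dirty buffers without writing
 * them back, as the forced-reclaim path in vgonel() below does.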
1977 */ 1978 int 1979 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 1980 { 1981 1982 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 1983 ASSERT_VOP_LOCKED(vp, "vinvalbuf"); 1984 if (vp->v_object != NULL && vp->v_object->handle != vp) 1985 return (0); 1986 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); 1987 } 1988 1989 /* 1990 * Flush out buffers on the specified list. 1991 * 1992 */ 1993 static int 1994 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, 1995 int slptimeo) 1996 { 1997 struct buf *bp, *nbp; 1998 int retval, error; 1999 daddr_t lblkno; 2000 b_xflags_t xflags; 2001 2002 ASSERT_BO_WLOCKED(bo); 2003 2004 retval = 0; 2005 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { 2006 /* 2007 * If we are flushing both V_NORMAL and V_ALT buffers then 2008 * do not skip any buffers. If we are flushing only V_NORMAL 2009 * buffers then skip buffers marked as BX_ALTDATA. If we are 2010 * flushing only V_ALT buffers then skip buffers not marked 2011 * as BX_ALTDATA. 2012 */ 2013 if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && 2014 (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || 2015 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { 2016 continue; 2017 } 2018 if (nbp != NULL) { 2019 lblkno = nbp->b_lblkno; 2020 xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); 2021 } 2022 retval = EAGAIN; 2023 error = BUF_TIMELOCK(bp, 2024 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), 2025 "flushbuf", slpflag, slptimeo); 2026 if (error) { 2027 BO_LOCK(bo); 2028 return (error != ENOLCK ? error : EAGAIN); 2029 } 2030 KASSERT(bp->b_bufobj == bo, 2031 ("bp %p wrong b_bufobj %p should be %p", 2032 bp, bp->b_bufobj, bo)); 2033 /* 2034 * XXX Since there are no node locks for NFS, I 2035 * believe there is a slight chance that a delayed 2036 * write will occur while sleeping just above, so 2037 * check for it. 2038 */ 2039 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 2040 (flags & V_SAVE)) { 2041 bremfree(bp); 2042 bp->b_flags |= B_ASYNC; 2043 bwrite(bp); 2044 BO_LOCK(bo); 2045 return (EAGAIN); /* XXX: why not loop ? */ 2046 } 2047 bremfree(bp); 2048 bp->b_flags |= (B_INVAL | B_RELBUF); 2049 bp->b_flags &= ~B_ASYNC; 2050 brelse(bp); 2051 BO_LOCK(bo); 2052 if (nbp == NULL) 2053 break; 2054 nbp = gbincore(bo, lblkno); 2055 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2056 != xflags) 2057 break; /* nbp invalid */ 2058 } 2059 return (retval); 2060 } 2061 2062 int 2063 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) 2064 { 2065 struct buf *bp; 2066 int error; 2067 daddr_t lblkno; 2068 2069 ASSERT_BO_LOCKED(bo); 2070 2071 for (lblkno = startn;;) { 2072 again: 2073 bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); 2074 if (bp == NULL || bp->b_lblkno >= endn || 2075 bp->b_lblkno < startn) 2076 break; 2077 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 2078 LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); 2079 if (error != 0) { 2080 BO_RLOCK(bo); 2081 if (error == ENOLCK) 2082 goto again; 2083 return (error); 2084 } 2085 KASSERT(bp->b_bufobj == bo, 2086 ("bp %p wrong b_bufobj %p should be %p", 2087 bp, bp->b_bufobj, bo)); 2088 lblkno = bp->b_lblkno + 1; 2089 if ((bp->b_flags & B_MANAGED) == 0) 2090 bremfree(bp); 2091 bp->b_flags |= B_RELBUF; 2092 /* 2093 * In the VMIO case, use the B_NOREUSE flag to hint that the 2094 * pages backing each buffer in the range are unlikely to be 2095 * reused. 
Dirty buffers will have the hint applied once 2096 * they've been written. 2097 */ 2098 if ((bp->b_flags & B_VMIO) != 0) 2099 bp->b_flags |= B_NOREUSE; 2100 brelse(bp); 2101 BO_RLOCK(bo); 2102 } 2103 return (0); 2104 } 2105 2106 /* 2107 * Truncate a file's buffer and pages to a specified length. This 2108 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 2109 * sync activity. 2110 */ 2111 int 2112 vtruncbuf(struct vnode *vp, off_t length, int blksize) 2113 { 2114 struct buf *bp, *nbp; 2115 struct bufobj *bo; 2116 daddr_t startlbn; 2117 2118 CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, 2119 vp, blksize, (uintmax_t)length); 2120 2121 /* 2122 * Round up to the *next* lbn. 2123 */ 2124 startlbn = howmany(length, blksize); 2125 2126 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 2127 2128 bo = &vp->v_bufobj; 2129 restart_unlocked: 2130 BO_LOCK(bo); 2131 2132 while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) 2133 ; 2134 2135 if (length > 0) { 2136 restartsync: 2137 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2138 if (bp->b_lblkno > 0) 2139 continue; 2140 /* 2141 * Since we hold the vnode lock this should only 2142 * fail if we're racing with the buf daemon. 2143 */ 2144 if (BUF_LOCK(bp, 2145 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2146 BO_LOCKPTR(bo)) == ENOLCK) 2147 goto restart_unlocked; 2148 2149 VNASSERT((bp->b_flags & B_DELWRI), vp, 2150 ("buf(%p) on dirty queue without DELWRI", bp)); 2151 2152 bremfree(bp); 2153 bawrite(bp); 2154 BO_LOCK(bo); 2155 goto restartsync; 2156 } 2157 } 2158 2159 bufobj_wwait(bo, 0, 0); 2160 BO_UNLOCK(bo); 2161 vnode_pager_setsize(vp, length); 2162 2163 return (0); 2164 } 2165 2166 /* 2167 * Invalidate the cached pages of a file's buffer within the range of block 2168 * numbers [startlbn, endlbn). 
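 *
 * A minimal sketch of a caller converting a byte range into logical
 * block numbers ("off" and "len" are illustrative byte values; blksize
 * must match bo_bsize, which the function asserts):
 *
 *	startlbn = off / blksize;
 *	endlbn = howmany(off + len, blksize);
 *	v_inval_buf_range(vp, startlbn, endlbn, blksize);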
2169 */ 2170 void 2171 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, 2172 int blksize) 2173 { 2174 struct bufobj *bo; 2175 off_t start, end; 2176 2177 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); 2178 2179 start = blksize * startlbn; 2180 end = blksize * endlbn; 2181 2182 bo = &vp->v_bufobj; 2183 BO_LOCK(bo); 2184 MPASS(blksize == bo->bo_bsize); 2185 2186 while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) 2187 ; 2188 2189 BO_UNLOCK(bo); 2190 vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); 2191 } 2192 2193 static int 2194 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 2195 daddr_t startlbn, daddr_t endlbn) 2196 { 2197 struct buf *bp, *nbp; 2198 bool anyfreed; 2199 2200 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); 2201 ASSERT_BO_LOCKED(bo); 2202 2203 do { 2204 anyfreed = false; 2205 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { 2206 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2207 continue; 2208 if (BUF_LOCK(bp, 2209 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2210 BO_LOCKPTR(bo)) == ENOLCK) { 2211 BO_LOCK(bo); 2212 return (EAGAIN); 2213 } 2214 2215 bremfree(bp); 2216 bp->b_flags |= B_INVAL | B_RELBUF; 2217 bp->b_flags &= ~B_ASYNC; 2218 brelse(bp); 2219 anyfreed = true; 2220 2221 BO_LOCK(bo); 2222 if (nbp != NULL && 2223 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 2224 nbp->b_vp != vp || 2225 (nbp->b_flags & B_DELWRI) != 0)) 2226 return (EAGAIN); 2227 } 2228 2229 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2230 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2231 continue; 2232 if (BUF_LOCK(bp, 2233 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2234 BO_LOCKPTR(bo)) == ENOLCK) { 2235 BO_LOCK(bo); 2236 return (EAGAIN); 2237 } 2238 bremfree(bp); 2239 bp->b_flags |= B_INVAL | B_RELBUF; 2240 bp->b_flags &= ~B_ASYNC; 2241 brelse(bp); 2242 anyfreed = true; 2243 2244 BO_LOCK(bo); 2245 if (nbp != NULL && 2246 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 2247 (nbp->b_vp != vp) || 2248 (nbp->b_flags & B_DELWRI) == 0)) 2249 return (EAGAIN); 2250 } 2251 } while (anyfreed); 2252 return (0); 2253 } 2254 2255 static void 2256 buf_vlist_remove(struct buf *bp) 2257 { 2258 struct bufv *bv; 2259 b_xflags_t flags; 2260 2261 flags = bp->b_xflags; 2262 2263 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 2264 ASSERT_BO_WLOCKED(bp->b_bufobj); 2265 KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && 2266 (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), 2267 ("%s: buffer %p has invalid queue state", __func__, bp)); 2268 2269 if ((flags & BX_VNDIRTY) != 0) 2270 bv = &bp->b_bufobj->bo_dirty; 2271 else 2272 bv = &bp->b_bufobj->bo_clean; 2273 BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); 2274 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); 2275 bv->bv_cnt--; 2276 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 2277 } 2278 2279 /* 2280 * Add the buffer to the sorted clean or dirty block list. 2281 * 2282 * NOTE: xflags is passed as a constant, optimizing this inline function! 
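 *
 * For orientation, the callers visible in this file pass the flag as a
 * literal, e.g. bgetvp() attaches a fresh buffer with
 *
 *	buf_vlist_add(bp, bo, BX_VNCLEAN);
 *
 * while reassignbuf() re-inserts with BX_VNDIRTY once B_DELWRI is set.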
2283 */ 2284 static void 2285 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 2286 { 2287 struct bufv *bv; 2288 struct buf *n; 2289 int error; 2290 2291 ASSERT_BO_WLOCKED(bo); 2292 KASSERT((bo->bo_flag & BO_NOBUFS) == 0, 2293 ("buf_vlist_add: bo %p does not allow bufs", bo)); 2294 KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, 2295 ("dead bo %p", bo)); 2296 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, 2297 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); 2298 bp->b_xflags |= xflags; 2299 if (xflags & BX_VNDIRTY) 2300 bv = &bo->bo_dirty; 2301 else 2302 bv = &bo->bo_clean; 2303 2304 /* 2305 * Keep the list ordered. Optimize empty list insertion. Assume 2306 * we tend to grow at the tail so lookup_le should usually be cheaper 2307 * than _ge. 2308 */ 2309 if (bv->bv_cnt == 0 || 2310 bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) 2311 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); 2312 else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) 2313 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); 2314 else 2315 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); 2316 error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); 2317 if (error) 2318 panic("buf_vlist_add: Preallocated nodes insufficient."); 2319 bv->bv_cnt++; 2320 } 2321 2322 /* 2323 * Look up a buffer using the buffer tries. 2324 */ 2325 struct buf * 2326 gbincore(struct bufobj *bo, daddr_t lblkno) 2327 { 2328 struct buf *bp; 2329 2330 ASSERT_BO_LOCKED(bo); 2331 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); 2332 if (bp != NULL) 2333 return (bp); 2334 return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); 2335 } 2336 2337 /* 2338 * Look up a buf using the buffer tries, without the bufobj lock. This relies 2339 * on SMR for safe lookup, and bufs being in a no-free zone to provide type 2340 * stability of the result. Like other lockless lookups, the found buf may 2341 * already be invalid by the time this function returns. 2342 */ 2343 struct buf * 2344 gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) 2345 { 2346 struct buf *bp; 2347 2348 ASSERT_BO_UNLOCKED(bo); 2349 bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); 2350 if (bp != NULL) 2351 return (bp); 2352 return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); 2353 } 2354 2355 /* 2356 * Associate a buffer with a vnode. 2357 */ 2358 void 2359 bgetvp(struct vnode *vp, struct buf *bp) 2360 { 2361 struct bufobj *bo; 2362 2363 bo = &vp->v_bufobj; 2364 ASSERT_BO_WLOCKED(bo); 2365 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 2366 2367 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 2368 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 2369 ("bgetvp: bp already attached! %p", bp)); 2370 2371 vhold(vp); 2372 bp->b_vp = vp; 2373 bp->b_bufobj = bo; 2374 /* 2375 * Insert onto list for new vnode. 2376 */ 2377 buf_vlist_add(bp, bo, BX_VNCLEAN); 2378 } 2379 2380 /* 2381 * Disassociate a buffer from a vnode. 2382 */ 2383 void 2384 brelvp(struct buf *bp) 2385 { 2386 struct bufobj *bo; 2387 struct vnode *vp; 2388 2389 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 2390 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 2391 2392 /* 2393 * Delete from old vnode list, if on one. 
2394 */ 2395 vp = bp->b_vp; /* XXX */ 2396 bo = bp->b_bufobj; 2397 BO_LOCK(bo); 2398 buf_vlist_remove(bp); 2399 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2400 bo->bo_flag &= ~BO_ONWORKLST; 2401 mtx_lock(&sync_mtx); 2402 LIST_REMOVE(bo, bo_synclist); 2403 syncer_worklist_len--; 2404 mtx_unlock(&sync_mtx); 2405 } 2406 bp->b_vp = NULL; 2407 bp->b_bufobj = NULL; 2408 BO_UNLOCK(bo); 2409 vdrop(vp); 2410 } 2411 2412 /* 2413 * Add an item to the syncer work queue. 2414 */ 2415 static void 2416 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 2417 { 2418 int slot; 2419 2420 ASSERT_BO_WLOCKED(bo); 2421 2422 mtx_lock(&sync_mtx); 2423 if (bo->bo_flag & BO_ONWORKLST) 2424 LIST_REMOVE(bo, bo_synclist); 2425 else { 2426 bo->bo_flag |= BO_ONWORKLST; 2427 syncer_worklist_len++; 2428 } 2429 2430 if (delay > syncer_maxdelay - 2) 2431 delay = syncer_maxdelay - 2; 2432 slot = (syncer_delayno + delay) & syncer_mask; 2433 2434 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); 2435 mtx_unlock(&sync_mtx); 2436 } 2437 2438 static int 2439 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 2440 { 2441 int error, len; 2442 2443 mtx_lock(&sync_mtx); 2444 len = syncer_worklist_len - sync_vnode_count; 2445 mtx_unlock(&sync_mtx); 2446 error = SYSCTL_OUT(req, &len, sizeof(len)); 2447 return (error); 2448 } 2449 2450 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, 2451 CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, 2452 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 2453 2454 static struct proc *updateproc; 2455 static void sched_sync(void); 2456 static struct kproc_desc up_kp = { 2457 "syncer", 2458 sched_sync, 2459 &updateproc 2460 }; 2461 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 2462 2463 static int 2464 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 2465 { 2466 struct vnode *vp; 2467 struct mount *mp; 2468 2469 *bo = LIST_FIRST(slp); 2470 if (*bo == NULL) 2471 return (0); 2472 vp = bo2vnode(*bo); 2473 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 2474 return (1); 2475 /* 2476 * We use vhold in case the vnode does not 2477 * successfully sync. vhold prevents the vnode from 2478 * going away when we unlock the sync_mtx so that 2479 * we can acquire the vnode interlock. 2480 */ 2481 vholdl(vp); 2482 mtx_unlock(&sync_mtx); 2483 VI_UNLOCK(vp); 2484 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2485 vdrop(vp); 2486 mtx_lock(&sync_mtx); 2487 return (*bo == LIST_FIRST(slp)); 2488 } 2489 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2490 (void) VOP_FSYNC(vp, MNT_LAZY, td); 2491 VOP_UNLOCK(vp); 2492 vn_finished_write(mp); 2493 BO_LOCK(*bo); 2494 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 2495 /* 2496 * Put us back on the worklist. The worklist 2497 * routine will remove us from our current 2498 * position and then add us back in at a later 2499 * position. 2500 */ 2501 vn_syncer_add_to_worklist(*bo, syncdelay); 2502 } 2503 BO_UNLOCK(*bo); 2504 vdrop(vp); 2505 mtx_lock(&sync_mtx); 2506 return (0); 2507 } 2508 2509 static int first_printf = 1; 2510 2511 /* 2512 * System filesystem synchronizer daemon. 
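 *
 * Rough sketch of the mechanism, based on the code above and below:
 * vn_syncer_add_to_worklist() spreads dirty bufobjs over per-second
 * buckets,
 *
 *	slot = (syncer_delayno + delay) & syncer_mask;
 *
 * and this daemon advances syncer_delayno roughly once per second,
 * flushing whatever has accumulated in the bucket it lands on.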
2513 */ 2514 static void 2515 sched_sync(void) 2516 { 2517 struct synclist *next, *slp; 2518 struct bufobj *bo; 2519 long starttime; 2520 struct thread *td = curthread; 2521 int last_work_seen; 2522 int net_worklist_len; 2523 int syncer_final_iter; 2524 int error; 2525 2526 last_work_seen = 0; 2527 syncer_final_iter = 0; 2528 syncer_state = SYNCER_RUNNING; 2529 starttime = time_uptime; 2530 td->td_pflags |= TDP_NORUNNINGBUF; 2531 2532 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, 2533 SHUTDOWN_PRI_LAST); 2534 2535 mtx_lock(&sync_mtx); 2536 for (;;) { 2537 if (syncer_state == SYNCER_FINAL_DELAY && 2538 syncer_final_iter == 0) { 2539 mtx_unlock(&sync_mtx); 2540 kproc_suspend_check(td->td_proc); 2541 mtx_lock(&sync_mtx); 2542 } 2543 net_worklist_len = syncer_worklist_len - sync_vnode_count; 2544 if (syncer_state != SYNCER_RUNNING && 2545 starttime != time_uptime) { 2546 if (first_printf) { 2547 printf("\nSyncing disks, vnodes remaining... "); 2548 first_printf = 0; 2549 } 2550 printf("%d ", net_worklist_len); 2551 } 2552 starttime = time_uptime; 2553 2554 /* 2555 * Push files whose dirty time has expired. Be careful 2556 * of interrupt race on slp queue. 2557 * 2558 * Skip over empty worklist slots when shutting down. 2559 */ 2560 do { 2561 slp = &syncer_workitem_pending[syncer_delayno]; 2562 syncer_delayno += 1; 2563 if (syncer_delayno == syncer_maxdelay) 2564 syncer_delayno = 0; 2565 next = &syncer_workitem_pending[syncer_delayno]; 2566 /* 2567 * If the worklist has wrapped since the 2568 * it was emptied of all but syncer vnodes, 2569 * switch to the FINAL_DELAY state and run 2570 * for one more second. 2571 */ 2572 if (syncer_state == SYNCER_SHUTTING_DOWN && 2573 net_worklist_len == 0 && 2574 last_work_seen == syncer_delayno) { 2575 syncer_state = SYNCER_FINAL_DELAY; 2576 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; 2577 } 2578 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && 2579 syncer_worklist_len > 0); 2580 2581 /* 2582 * Keep track of the last time there was anything 2583 * on the worklist other than syncer vnodes. 2584 * Return to the SHUTTING_DOWN state if any 2585 * new work appears. 2586 */ 2587 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) 2588 last_work_seen = syncer_delayno; 2589 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) 2590 syncer_state = SYNCER_SHUTTING_DOWN; 2591 while (!LIST_EMPTY(slp)) { 2592 error = sync_vnode(slp, &bo, td); 2593 if (error == 1) { 2594 LIST_REMOVE(bo, bo_synclist); 2595 LIST_INSERT_HEAD(next, bo, bo_synclist); 2596 continue; 2597 } 2598 2599 if (first_printf == 0) { 2600 /* 2601 * Drop the sync mutex, because some watchdog 2602 * drivers need to sleep while patting 2603 */ 2604 mtx_unlock(&sync_mtx); 2605 wdog_kern_pat(WD_LASTVAL); 2606 mtx_lock(&sync_mtx); 2607 } 2608 2609 } 2610 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) 2611 syncer_final_iter--; 2612 /* 2613 * The variable rushjob allows the kernel to speed up the 2614 * processing of the filesystem syncer process. A rushjob 2615 * value of N tells the filesystem syncer to process the next 2616 * N seconds worth of work on its queue ASAP. Currently rushjob 2617 * is used by the soft update code to speed up the filesystem 2618 * syncer process when the incore state is getting so far 2619 * ahead of the disk that the kernel memory pool is being 2620 * threatened with exhaustion. 
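 *
 * The speedup is requested through speedup_syncer() below; a
 * hypothetical consumer that notices it is falling behind would simply
 * do
 *
 *	(void)speedup_syncer();
 *
 * which bumps rushjob by one, capped at syncdelay / 2.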
2621 */ 2622 if (rushjob > 0) { 2623 rushjob -= 1; 2624 continue; 2625 } 2626 /* 2627 * Just sleep for a short period of time between 2628 * iterations when shutting down to allow some I/O 2629 * to happen. 2630 * 2631 * If it has taken us less than a second to process the 2632 * current work, then wait. Otherwise start right over 2633 * again. We can still lose time if any single round 2634 * takes more than two seconds, but it does not really 2635 * matter as we are just trying to generally pace the 2636 * filesystem activity. 2637 */ 2638 if (syncer_state != SYNCER_RUNNING || 2639 time_uptime == starttime) { 2640 thread_lock(td); 2641 sched_prio(td, PPAUSE); 2642 thread_unlock(td); 2643 } 2644 if (syncer_state != SYNCER_RUNNING) 2645 cv_timedwait(&sync_wakeup, &sync_mtx, 2646 hz / SYNCER_SHUTDOWN_SPEEDUP); 2647 else if (time_uptime == starttime) 2648 cv_timedwait(&sync_wakeup, &sync_mtx, hz); 2649 } 2650 } 2651 2652 /* 2653 * Request the syncer daemon to speed up its work. 2654 * We never push it to speed up more than half of its 2655 * normal turn time, otherwise it could take over the cpu. 2656 */ 2657 int 2658 speedup_syncer(void) 2659 { 2660 int ret = 0; 2661 2662 mtx_lock(&sync_mtx); 2663 if (rushjob < syncdelay / 2) { 2664 rushjob += 1; 2665 stat_rush_requests += 1; 2666 ret = 1; 2667 } 2668 mtx_unlock(&sync_mtx); 2669 cv_broadcast(&sync_wakeup); 2670 return (ret); 2671 } 2672 2673 /* 2674 * Tell the syncer to speed up its work and run though its work 2675 * list several times, then tell it to shut down. 2676 */ 2677 static void 2678 syncer_shutdown(void *arg, int howto) 2679 { 2680 2681 if (howto & RB_NOSYNC) 2682 return; 2683 mtx_lock(&sync_mtx); 2684 syncer_state = SYNCER_SHUTTING_DOWN; 2685 rushjob = 0; 2686 mtx_unlock(&sync_mtx); 2687 cv_broadcast(&sync_wakeup); 2688 kproc_shutdown(arg, howto); 2689 } 2690 2691 void 2692 syncer_suspend(void) 2693 { 2694 2695 syncer_shutdown(updateproc, 0); 2696 } 2697 2698 void 2699 syncer_resume(void) 2700 { 2701 2702 mtx_lock(&sync_mtx); 2703 first_printf = 1; 2704 syncer_state = SYNCER_RUNNING; 2705 mtx_unlock(&sync_mtx); 2706 cv_broadcast(&sync_wakeup); 2707 kproc_resume(updateproc); 2708 } 2709 2710 /* 2711 * Move the buffer between the clean and dirty lists of its vnode. 2712 */ 2713 void 2714 reassignbuf(struct buf *bp) 2715 { 2716 struct vnode *vp; 2717 struct bufobj *bo; 2718 int delay; 2719 #ifdef INVARIANTS 2720 struct bufv *bv; 2721 #endif 2722 2723 vp = bp->b_vp; 2724 bo = bp->b_bufobj; 2725 2726 KASSERT((bp->b_flags & B_PAGING) == 0, 2727 ("%s: cannot reassign paging buffer %p", __func__, bp)); 2728 2729 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", 2730 bp, bp->b_vp, bp->b_flags); 2731 2732 BO_LOCK(bo); 2733 buf_vlist_remove(bp); 2734 2735 /* 2736 * If dirty, put on list of dirty buffers; otherwise insert onto list 2737 * of clean buffers. 
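 *
 * (For orientation only, not an exhaustive list: bdirty() sets B_DELWRI
 * and then calls reassignbuf() to migrate the buffer to the dirty list,
 * and bundirty() clears the flag and calls it to migrate the buffer
 * back.)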
2738 */ 2739 if (bp->b_flags & B_DELWRI) { 2740 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 2741 switch (vp->v_type) { 2742 case VDIR: 2743 delay = dirdelay; 2744 break; 2745 case VCHR: 2746 delay = metadelay; 2747 break; 2748 default: 2749 delay = filedelay; 2750 } 2751 vn_syncer_add_to_worklist(bo, delay); 2752 } 2753 buf_vlist_add(bp, bo, BX_VNDIRTY); 2754 } else { 2755 buf_vlist_add(bp, bo, BX_VNCLEAN); 2756 2757 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2758 mtx_lock(&sync_mtx); 2759 LIST_REMOVE(bo, bo_synclist); 2760 syncer_worklist_len--; 2761 mtx_unlock(&sync_mtx); 2762 bo->bo_flag &= ~BO_ONWORKLST; 2763 } 2764 } 2765 #ifdef INVARIANTS 2766 bv = &bo->bo_clean; 2767 bp = TAILQ_FIRST(&bv->bv_hd); 2768 KASSERT(bp == NULL || bp->b_bufobj == bo, 2769 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2770 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2771 KASSERT(bp == NULL || bp->b_bufobj == bo, 2772 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2773 bv = &bo->bo_dirty; 2774 bp = TAILQ_FIRST(&bv->bv_hd); 2775 KASSERT(bp == NULL || bp->b_bufobj == bo, 2776 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2777 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2778 KASSERT(bp == NULL || bp->b_bufobj == bo, 2779 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2780 #endif 2781 BO_UNLOCK(bo); 2782 } 2783 2784 static void 2785 v_init_counters(struct vnode *vp) 2786 { 2787 2788 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, 2789 vp, ("%s called for an initialized vnode", __FUNCTION__)); 2790 ASSERT_VI_UNLOCKED(vp, __FUNCTION__); 2791 2792 refcount_init(&vp->v_holdcnt, 1); 2793 refcount_init(&vp->v_usecount, 1); 2794 } 2795 2796 /* 2797 * Increment si_usecount of the associated device, if any. 2798 */ 2799 static void 2800 v_incr_devcount(struct vnode *vp) 2801 { 2802 2803 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2804 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2805 dev_lock(); 2806 vp->v_rdev->si_usecount++; 2807 dev_unlock(); 2808 } 2809 } 2810 2811 /* 2812 * Decrement si_usecount of the associated device, if any. 2813 * 2814 * The caller is required to hold the interlock when transitioning a VCHR use 2815 * count to zero. This prevents a race with devfs_reclaim_vchr() that would 2816 * leak a si_usecount reference. The vnode lock will also prevent this race 2817 * if it is held while dropping the last ref. 2818 * 2819 * The race is: 2820 * 2821 * CPU1 CPU2 2822 * devfs_reclaim_vchr 2823 * make v_usecount == 0 2824 * VI_LOCK 2825 * sees v_usecount == 0, no updates 2826 * vp->v_rdev = NULL; 2827 * ... 2828 * VI_UNLOCK 2829 * VI_LOCK 2830 * v_decr_devcount 2831 * sees v_rdev == NULL, no updates 2832 * 2833 * In this scenario si_devcount decrement is not performed. 2834 */ 2835 static void 2836 v_decr_devcount(struct vnode *vp) 2837 { 2838 2839 ASSERT_VOP_LOCKED(vp, __func__); 2840 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2841 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2842 dev_lock(); 2843 VNPASS(vp->v_rdev->si_usecount > 0, vp); 2844 vp->v_rdev->si_usecount--; 2845 dev_unlock(); 2846 } 2847 } 2848 2849 /* 2850 * Grab a particular vnode from the free list, increment its 2851 * reference count and lock it. VIRF_DOOMED is set if the vnode 2852 * is being destroyed. Only callers who specify LK_RETRY will 2853 * see doomed vnodes. If inactive processing was delayed in 2854 * vput try to do it here. 2855 * 2856 * usecount is manipulated using atomics without holding any locks. 
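 *
 * A minimal sketch of the intended two-step acquisition, assuming the
 * vnode was found under some external lock which must be dropped before
 * blocking on the vnode lock:
 *
 *	vs = vget_prep(vp);
 *	<drop the external lock>
 *	error = vget_finish(vp, LK_EXCLUSIVE, vs);
 *	if (error != 0)
 *		return (error);
 *
 * On lock failure vget_finish() calls vget_abort() itself, so the
 * caller must not drop the reference again.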
2857 * 2858 * holdcnt can be manipulated using atomics without holding any locks, 2859 * except when transitioning 1<->0, in which case the interlock is held. 2860 * 2861 * Consumers which don't guarantee liveness of the vnode can use SMR to 2862 * try to get a reference. Note this operation can fail since the vnode 2863 * may be awaiting getting freed by the time they get to it. 2864 */ 2865 enum vgetstate 2866 vget_prep_smr(struct vnode *vp) 2867 { 2868 enum vgetstate vs; 2869 2870 VFS_SMR_ASSERT_ENTERED(); 2871 2872 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2873 vs = VGET_USECOUNT; 2874 } else { 2875 if (vhold_smr(vp)) 2876 vs = VGET_HOLDCNT; 2877 else 2878 vs = VGET_NONE; 2879 } 2880 return (vs); 2881 } 2882 2883 enum vgetstate 2884 vget_prep(struct vnode *vp) 2885 { 2886 enum vgetstate vs; 2887 2888 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2889 vs = VGET_USECOUNT; 2890 } else { 2891 vhold(vp); 2892 vs = VGET_HOLDCNT; 2893 } 2894 return (vs); 2895 } 2896 2897 void 2898 vget_abort(struct vnode *vp, enum vgetstate vs) 2899 { 2900 2901 switch (vs) { 2902 case VGET_USECOUNT: 2903 vrele(vp); 2904 break; 2905 case VGET_HOLDCNT: 2906 vdrop(vp); 2907 break; 2908 default: 2909 __assert_unreachable(); 2910 } 2911 } 2912 2913 int 2914 vget(struct vnode *vp, int flags, struct thread *td) 2915 { 2916 enum vgetstate vs; 2917 2918 MPASS(td == curthread); 2919 2920 vs = vget_prep(vp); 2921 return (vget_finish(vp, flags, vs)); 2922 } 2923 2924 static void __noinline 2925 vget_finish_vchr(struct vnode *vp) 2926 { 2927 2928 VNASSERT(vp->v_type == VCHR, vp, ("type != VCHR)")); 2929 2930 /* 2931 * See the comment in vget_finish before usecount bump. 2932 */ 2933 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2934 #ifdef INVARIANTS 2935 int old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 2936 VNASSERT(old > 0, vp, ("%s: wrong hold count %d", __func__, old)); 2937 #else 2938 refcount_release(&vp->v_holdcnt); 2939 #endif 2940 return; 2941 } 2942 2943 VI_LOCK(vp); 2944 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2945 #ifdef INVARIANTS 2946 int old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 2947 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); 2948 #else 2949 refcount_release(&vp->v_holdcnt); 2950 #endif 2951 VI_UNLOCK(vp); 2952 return; 2953 } 2954 v_incr_devcount(vp); 2955 refcount_acquire(&vp->v_usecount); 2956 VI_UNLOCK(vp); 2957 } 2958 2959 int 2960 vget_finish(struct vnode *vp, int flags, enum vgetstate vs) 2961 { 2962 int error; 2963 2964 if ((flags & LK_INTERLOCK) != 0) 2965 ASSERT_VI_LOCKED(vp, __func__); 2966 else 2967 ASSERT_VI_UNLOCKED(vp, __func__); 2968 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 2969 VNPASS(vp->v_holdcnt > 0, vp); 2970 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 2971 2972 error = vn_lock(vp, flags); 2973 if (__predict_false(error != 0)) { 2974 vget_abort(vp, vs); 2975 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, 2976 vp); 2977 return (error); 2978 } 2979 2980 vget_finish_ref(vp, vs); 2981 return (0); 2982 } 2983 2984 void 2985 vget_finish_ref(struct vnode *vp, enum vgetstate vs) 2986 { 2987 int old; 2988 2989 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 2990 VNPASS(vp->v_holdcnt > 0, vp); 2991 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 2992 2993 if (vs == VGET_USECOUNT) 2994 return; 2995 2996 if (__predict_false(vp->v_type == VCHR)) { 2997 vget_finish_vchr(vp); 2998 return; 2999 } 3000 3001 /* 3002 * We hold the vnode. 
If the usecount is 0 it will be utilized to keep 3003 * the vnode around. Otherwise someone else lended their hold count and 3004 * we have to drop ours. 3005 */ 3006 old = atomic_fetchadd_int(&vp->v_usecount, 1); 3007 VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); 3008 if (old != 0) { 3009 #ifdef INVARIANTS 3010 old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 3011 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); 3012 #else 3013 refcount_release(&vp->v_holdcnt); 3014 #endif 3015 } 3016 } 3017 3018 /* 3019 * Increase the reference (use) and hold count of a vnode. 3020 * This will also remove the vnode from the free list if it is presently free. 3021 */ 3022 static void __noinline 3023 vref_vchr(struct vnode *vp, bool interlock) 3024 { 3025 3026 /* 3027 * See the comment in vget_finish before usecount bump. 3028 */ 3029 if (!interlock) { 3030 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 3031 VNODE_REFCOUNT_FENCE_ACQ(); 3032 VNASSERT(vp->v_holdcnt > 0, vp, 3033 ("%s: active vnode not held", __func__)); 3034 return; 3035 } 3036 VI_LOCK(vp); 3037 /* 3038 * By the time we get here the vnode might have been doomed, at 3039 * which point the 0->1 use count transition is no longer 3040 * protected by the interlock. Since it can't bounce back to 3041 * VCHR and requires vref semantics, punt it back 3042 */ 3043 if (__predict_false(vp->v_type == VBAD)) { 3044 VI_UNLOCK(vp); 3045 vref(vp); 3046 return; 3047 } 3048 } 3049 VNASSERT(vp->v_type == VCHR, vp, ("type != VCHR)")); 3050 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 3051 VNODE_REFCOUNT_FENCE_ACQ(); 3052 VNASSERT(vp->v_holdcnt > 0, vp, 3053 ("%s: active vnode not held", __func__)); 3054 if (!interlock) 3055 VI_UNLOCK(vp); 3056 return; 3057 } 3058 vhold(vp); 3059 v_incr_devcount(vp); 3060 refcount_acquire(&vp->v_usecount); 3061 if (!interlock) 3062 VI_UNLOCK(vp); 3063 return; 3064 } 3065 3066 void 3067 vref(struct vnode *vp) 3068 { 3069 int old; 3070 3071 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3072 if (__predict_false(vp->v_type == VCHR)) { 3073 vref_vchr(vp, false); 3074 return; 3075 } 3076 3077 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 3078 VNODE_REFCOUNT_FENCE_ACQ(); 3079 VNASSERT(vp->v_holdcnt > 0, vp, 3080 ("%s: active vnode not held", __func__)); 3081 return; 3082 } 3083 vhold(vp); 3084 /* 3085 * See the comment in vget_finish. 
3086 */ 3087 old = atomic_fetchadd_int(&vp->v_usecount, 1); 3088 VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); 3089 if (old != 0) { 3090 #ifdef INVARIANTS 3091 old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 3092 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); 3093 #else 3094 refcount_release(&vp->v_holdcnt); 3095 #endif 3096 } 3097 } 3098 3099 void 3100 vrefl(struct vnode *vp) 3101 { 3102 3103 ASSERT_VI_LOCKED(vp, __func__); 3104 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3105 if (__predict_false(vp->v_type == VCHR)) { 3106 vref_vchr(vp, true); 3107 return; 3108 } 3109 vref(vp); 3110 } 3111 3112 void 3113 vrefact(struct vnode *vp) 3114 { 3115 3116 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3117 #ifdef INVARIANTS 3118 int old = atomic_fetchadd_int(&vp->v_usecount, 1); 3119 VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); 3120 #else 3121 refcount_acquire(&vp->v_usecount); 3122 #endif 3123 } 3124 3125 void 3126 vrefactn(struct vnode *vp, u_int n) 3127 { 3128 3129 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3130 #ifdef INVARIANTS 3131 int old = atomic_fetchadd_int(&vp->v_usecount, n); 3132 VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); 3133 #else 3134 atomic_add_int(&vp->v_usecount, n); 3135 #endif 3136 } 3137 3138 /* 3139 * Return reference count of a vnode. 3140 * 3141 * The results of this call are only guaranteed when some mechanism is used to 3142 * stop other processes from gaining references to the vnode. This may be the 3143 * case if the caller holds the only reference. This is also useful when stale 3144 * data is acceptable as race conditions may be accounted for by some other 3145 * means. 3146 */ 3147 int 3148 vrefcnt(struct vnode *vp) 3149 { 3150 3151 return (vp->v_usecount); 3152 } 3153 3154 void 3155 vlazy(struct vnode *vp) 3156 { 3157 struct mount *mp; 3158 3159 VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); 3160 3161 if ((vp->v_mflag & VMP_LAZYLIST) != 0) 3162 return; 3163 /* 3164 * We may get here for inactive routines after the vnode got doomed. 3165 */ 3166 if (VN_IS_DOOMED(vp)) 3167 return; 3168 mp = vp->v_mount; 3169 mtx_lock(&mp->mnt_listmtx); 3170 if ((vp->v_mflag & VMP_LAZYLIST) == 0) { 3171 vp->v_mflag |= VMP_LAZYLIST; 3172 TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3173 mp->mnt_lazyvnodelistsize++; 3174 } 3175 mtx_unlock(&mp->mnt_listmtx); 3176 } 3177 3178 /* 3179 * This routine is only meant to be called from vgonel prior to dooming 3180 * the vnode. 
3181 */ 3182 static void 3183 vunlazy_gone(struct vnode *vp) 3184 { 3185 struct mount *mp; 3186 3187 ASSERT_VOP_ELOCKED(vp, __func__); 3188 ASSERT_VI_LOCKED(vp, __func__); 3189 VNPASS(!VN_IS_DOOMED(vp), vp); 3190 3191 if (vp->v_mflag & VMP_LAZYLIST) { 3192 mp = vp->v_mount; 3193 mtx_lock(&mp->mnt_listmtx); 3194 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3195 vp->v_mflag &= ~VMP_LAZYLIST; 3196 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3197 mp->mnt_lazyvnodelistsize--; 3198 mtx_unlock(&mp->mnt_listmtx); 3199 } 3200 } 3201 3202 static void 3203 vdefer_inactive(struct vnode *vp) 3204 { 3205 3206 ASSERT_VI_LOCKED(vp, __func__); 3207 VNASSERT(vp->v_holdcnt > 0, vp, 3208 ("%s: vnode without hold count", __func__)); 3209 if (VN_IS_DOOMED(vp)) { 3210 vdropl(vp); 3211 return; 3212 } 3213 if (vp->v_iflag & VI_DEFINACT) { 3214 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); 3215 vdropl(vp); 3216 return; 3217 } 3218 if (vp->v_usecount > 0) { 3219 vp->v_iflag &= ~VI_OWEINACT; 3220 vdropl(vp); 3221 return; 3222 } 3223 vlazy(vp); 3224 vp->v_iflag |= VI_DEFINACT; 3225 VI_UNLOCK(vp); 3226 counter_u64_add(deferred_inact, 1); 3227 } 3228 3229 static void 3230 vdefer_inactive_unlocked(struct vnode *vp) 3231 { 3232 3233 VI_LOCK(vp); 3234 if ((vp->v_iflag & VI_OWEINACT) == 0) { 3235 vdropl(vp); 3236 return; 3237 } 3238 vdefer_inactive(vp); 3239 } 3240 3241 enum vput_op { VRELE, VPUT, VUNREF }; 3242 3243 /* 3244 * Handle ->v_usecount transitioning to 0. 3245 * 3246 * By releasing the last usecount we take ownership of the hold count which 3247 * provides liveness of the vnode, meaning we have to vdrop. 3248 * 3249 * If the vnode is of type VCHR we may need to decrement si_usecount, see 3250 * v_decr_devcount for details. 3251 * 3252 * For all vnodes we may need to perform inactive processing. It requires an 3253 * exclusive lock on the vnode, while it is legal to call here with only a 3254 * shared lock (or no locks). If locking the vnode in an expected manner fails, 3255 * inactive processing gets deferred to the syncer. 3256 * 3257 * XXX Some filesystems pass in an exclusively locked vnode and strongly depend 3258 * on the lock being held all the way until VOP_INACTIVE. This in particular 3259 * happens with UFS which adds half-constructed vnodes to the hash, where they 3260 * can be found by other code. 3261 */ 3262 static void 3263 vput_final(struct vnode *vp, enum vput_op func) 3264 { 3265 int error; 3266 bool want_unlock; 3267 3268 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3269 VNPASS(vp->v_holdcnt > 0, vp); 3270 3271 VI_LOCK(vp); 3272 if (__predict_false(vp->v_type == VCHR && func != VRELE)) 3273 v_decr_devcount(vp); 3274 3275 /* 3276 * By the time we got here someone else might have transitioned 3277 * the count back to > 0. 3278 */ 3279 if (vp->v_usecount > 0) 3280 goto out; 3281 3282 /* 3283 * If the vnode is doomed vgone already performed inactive processing 3284 * (if needed). 3285 */ 3286 if (VN_IS_DOOMED(vp)) 3287 goto out; 3288 3289 if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) 3290 goto out; 3291 3292 if (vp->v_iflag & VI_DOINGINACT) 3293 goto out; 3294 3295 /* 3296 * Locking operations here will drop the interlock and possibly the 3297 * vnode lock, opening a window where the vnode can get doomed all the 3298 * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to 3299 * perform inactive. 
3300 */ 3301 vp->v_iflag |= VI_OWEINACT; 3302 want_unlock = false; 3303 error = 0; 3304 switch (func) { 3305 case VRELE: 3306 switch (VOP_ISLOCKED(vp)) { 3307 case LK_EXCLUSIVE: 3308 break; 3309 case LK_EXCLOTHER: 3310 case 0: 3311 want_unlock = true; 3312 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 3313 VI_LOCK(vp); 3314 break; 3315 default: 3316 /* 3317 * The lock has at least one sharer, but we have no way 3318 * to conclude whether this is us. Play it safe and 3319 * defer processing. 3320 */ 3321 error = EAGAIN; 3322 break; 3323 } 3324 break; 3325 case VPUT: 3326 want_unlock = true; 3327 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3328 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | 3329 LK_NOWAIT); 3330 VI_LOCK(vp); 3331 } 3332 break; 3333 case VUNREF: 3334 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3335 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); 3336 VI_LOCK(vp); 3337 } 3338 break; 3339 } 3340 if (error == 0) { 3341 vinactive(vp); 3342 if (want_unlock) 3343 VOP_UNLOCK(vp); 3344 vdropl(vp); 3345 } else { 3346 vdefer_inactive(vp); 3347 } 3348 return; 3349 out: 3350 if (func == VPUT) 3351 VOP_UNLOCK(vp); 3352 vdropl(vp); 3353 } 3354 3355 /* 3356 * Decrement ->v_usecount for a vnode. 3357 * 3358 * Releasing the last use count requires additional processing, see vput_final 3359 * above for details. 3360 * 3361 * Note that releasing use count without the vnode lock requires special casing 3362 * for VCHR, see v_decr_devcount for details. 3363 * 3364 * Comment above each variant denotes lock state on entry and exit. 3365 */ 3366 3367 static void __noinline 3368 vrele_vchr(struct vnode *vp) 3369 { 3370 3371 if (refcount_release_if_not_last(&vp->v_usecount)) 3372 return; 3373 VI_LOCK(vp); 3374 if (!refcount_release(&vp->v_usecount)) { 3375 VI_UNLOCK(vp); 3376 return; 3377 } 3378 v_decr_devcount(vp); 3379 VI_UNLOCK(vp); 3380 vput_final(vp, VRELE); 3381 } 3382 3383 /* 3384 * in: any 3385 * out: same as passed in 3386 */ 3387 void 3388 vrele(struct vnode *vp) 3389 { 3390 3391 ASSERT_VI_UNLOCKED(vp, __func__); 3392 if (__predict_false(vp->v_type == VCHR)) { 3393 vrele_vchr(vp); 3394 return; 3395 } 3396 if (!refcount_release(&vp->v_usecount)) 3397 return; 3398 vput_final(vp, VRELE); 3399 } 3400 3401 /* 3402 * in: locked 3403 * out: unlocked 3404 */ 3405 void 3406 vput(struct vnode *vp) 3407 { 3408 3409 ASSERT_VOP_LOCKED(vp, __func__); 3410 ASSERT_VI_UNLOCKED(vp, __func__); 3411 if (!refcount_release(&vp->v_usecount)) { 3412 VOP_UNLOCK(vp); 3413 return; 3414 } 3415 vput_final(vp, VPUT); 3416 } 3417 3418 /* 3419 * in: locked 3420 * out: locked 3421 */ 3422 void 3423 vunref(struct vnode *vp) 3424 { 3425 3426 ASSERT_VOP_LOCKED(vp, __func__); 3427 ASSERT_VI_UNLOCKED(vp, __func__); 3428 if (!refcount_release(&vp->v_usecount)) 3429 return; 3430 vput_final(vp, VUNREF); 3431 } 3432 3433 void 3434 vhold(struct vnode *vp) 3435 { 3436 struct vdbatch *vd; 3437 int old; 3438 3439 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3440 old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3441 VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3442 ("%s: wrong hold count %d", __func__, old)); 3443 if (old != 0) 3444 return; 3445 critical_enter(); 3446 vd = DPCPU_PTR(vd); 3447 vd->freevnodes--; 3448 critical_exit(); 3449 } 3450 3451 void 3452 vholdl(struct vnode *vp) 3453 { 3454 3455 ASSERT_VI_LOCKED(vp, __func__); 3456 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3457 vhold(vp); 3458 } 3459 3460 void 3461 vholdnz(struct vnode *vp) 3462 { 3463 3464 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3465 #ifdef INVARIANTS 3466 
int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3467 VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3468 ("%s: wrong hold count %d", __func__, old)); 3469 #else 3470 atomic_add_int(&vp->v_holdcnt, 1); 3471 #endif 3472 } 3473 3474 /* 3475 * Grab a hold count unless the vnode is freed. 3476 * 3477 * Only use this routine if vfs smr is the only protection you have against 3478 * freeing the vnode. 3479 * 3480 * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag 3481 * is not set. After the flag is set the vnode becomes immutable to anyone but 3482 * the thread which managed to set the flag. 3483 * 3484 * It may be tempting to replace the loop with: 3485 * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3486 * if (count & VHOLD_NO_SMR) { 3487 * backpedal and error out; 3488 * } 3489 * 3490 * However, while this is more performant, it hinders debugging by eliminating 3491 * the previously mentioned invariant. 3492 */ 3493 bool 3494 vhold_smr(struct vnode *vp) 3495 { 3496 int count; 3497 3498 VFS_SMR_ASSERT_ENTERED(); 3499 3500 count = atomic_load_int(&vp->v_holdcnt); 3501 for (;;) { 3502 if (count & VHOLD_NO_SMR) { 3503 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3504 ("non-zero hold count with flags %d\n", count)); 3505 return (false); 3506 } 3507 3508 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3509 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) 3510 return (true); 3511 } 3512 } 3513 3514 static void __noinline 3515 vdbatch_process(struct vdbatch *vd) 3516 { 3517 struct vnode *vp; 3518 int i; 3519 3520 mtx_assert(&vd->lock, MA_OWNED); 3521 MPASS(curthread->td_pinned > 0); 3522 MPASS(vd->index == VDBATCH_SIZE); 3523 3524 mtx_lock(&vnode_list_mtx); 3525 critical_enter(); 3526 freevnodes += vd->freevnodes; 3527 for (i = 0; i < VDBATCH_SIZE; i++) { 3528 vp = vd->tab[i]; 3529 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 3530 TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); 3531 MPASS(vp->v_dbatchcpu != NOCPU); 3532 vp->v_dbatchcpu = NOCPU; 3533 } 3534 mtx_unlock(&vnode_list_mtx); 3535 vd->freevnodes = 0; 3536 bzero(vd->tab, sizeof(vd->tab)); 3537 vd->index = 0; 3538 critical_exit(); 3539 } 3540 3541 static void 3542 vdbatch_enqueue(struct vnode *vp) 3543 { 3544 struct vdbatch *vd; 3545 3546 ASSERT_VI_LOCKED(vp, __func__); 3547 VNASSERT(!VN_IS_DOOMED(vp), vp, 3548 ("%s: deferring requeue of a doomed vnode", __func__)); 3549 3550 critical_enter(); 3551 vd = DPCPU_PTR(vd); 3552 vd->freevnodes++; 3553 if (vp->v_dbatchcpu != NOCPU) { 3554 VI_UNLOCK(vp); 3555 critical_exit(); 3556 return; 3557 } 3558 3559 sched_pin(); 3560 critical_exit(); 3561 mtx_lock(&vd->lock); 3562 MPASS(vd->index < VDBATCH_SIZE); 3563 MPASS(vd->tab[vd->index] == NULL); 3564 /* 3565 * A hack: we depend on being pinned so that we know what to put in 3566 * ->v_dbatchcpu. 3567 */ 3568 vp->v_dbatchcpu = curcpu; 3569 vd->tab[vd->index] = vp; 3570 vd->index++; 3571 VI_UNLOCK(vp); 3572 if (vd->index == VDBATCH_SIZE) 3573 vdbatch_process(vd); 3574 mtx_unlock(&vd->lock); 3575 sched_unpin(); 3576 } 3577 3578 /* 3579 * This routine must only be called for vnodes which are about to be 3580 * deallocated. Supporting dequeue for arbitrary vndoes would require 3581 * validating that the locked batch matches. 
3582 */ 3583 static void 3584 vdbatch_dequeue(struct vnode *vp) 3585 { 3586 struct vdbatch *vd; 3587 int i; 3588 short cpu; 3589 3590 VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp, 3591 ("%s: called for a used vnode\n", __func__)); 3592 3593 cpu = vp->v_dbatchcpu; 3594 if (cpu == NOCPU) 3595 return; 3596 3597 vd = DPCPU_ID_PTR(cpu, vd); 3598 mtx_lock(&vd->lock); 3599 for (i = 0; i < vd->index; i++) { 3600 if (vd->tab[i] != vp) 3601 continue; 3602 vp->v_dbatchcpu = NOCPU; 3603 vd->index--; 3604 vd->tab[i] = vd->tab[vd->index]; 3605 vd->tab[vd->index] = NULL; 3606 break; 3607 } 3608 mtx_unlock(&vd->lock); 3609 /* 3610 * Either we dequeued the vnode above or the target CPU beat us to it. 3611 */ 3612 MPASS(vp->v_dbatchcpu == NOCPU); 3613 } 3614 3615 /* 3616 * Drop the hold count of the vnode. If this is the last reference to 3617 * the vnode we place it on the free list unless it has been vgone'd 3618 * (marked VIRF_DOOMED) in which case we will free it. 3619 * 3620 * Because the vnode vm object keeps a hold reference on the vnode if 3621 * there is at least one resident non-cached page, the vnode cannot 3622 * leave the active list without the page cleanup done. 3623 */ 3624 static void 3625 vdrop_deactivate(struct vnode *vp) 3626 { 3627 struct mount *mp; 3628 3629 ASSERT_VI_LOCKED(vp, __func__); 3630 /* 3631 * Mark a vnode as free: remove it from its active list 3632 * and put it up for recycling on the freelist. 3633 */ 3634 VNASSERT(!VN_IS_DOOMED(vp), vp, 3635 ("vdrop: returning doomed vnode")); 3636 VNASSERT(vp->v_op != NULL, vp, 3637 ("vdrop: vnode already reclaimed.")); 3638 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 3639 ("vnode with VI_OWEINACT set")); 3640 VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, 3641 ("vnode with VI_DEFINACT set")); 3642 if (vp->v_mflag & VMP_LAZYLIST) { 3643 mp = vp->v_mount; 3644 mtx_lock(&mp->mnt_listmtx); 3645 VNASSERT(vp->v_mflag & VMP_LAZYLIST, vp, ("lost VMP_LAZYLIST")); 3646 /* 3647 * Don't remove the vnode from the lazy list if another thread 3648 * has increased the hold count. It may have re-enqueued the 3649 * vnode to the lazy list and is now responsible for its 3650 * removal. 3651 */ 3652 if (vp->v_holdcnt == 0) { 3653 vp->v_mflag &= ~VMP_LAZYLIST; 3654 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3655 mp->mnt_lazyvnodelistsize--; 3656 } 3657 mtx_unlock(&mp->mnt_listmtx); 3658 } 3659 vdbatch_enqueue(vp); 3660 } 3661 3662 void 3663 vdrop(struct vnode *vp) 3664 { 3665 3666 ASSERT_VI_UNLOCKED(vp, __func__); 3667 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3668 if (refcount_release_if_not_last(&vp->v_holdcnt)) 3669 return; 3670 VI_LOCK(vp); 3671 vdropl(vp); 3672 } 3673 3674 void 3675 vdropl(struct vnode *vp) 3676 { 3677 3678 ASSERT_VI_LOCKED(vp, __func__); 3679 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3680 if (!refcount_release(&vp->v_holdcnt)) { 3681 VI_UNLOCK(vp); 3682 return; 3683 } 3684 if (!VN_IS_DOOMED(vp)) { 3685 vdrop_deactivate(vp); 3686 return; 3687 } 3688 /* 3689 * We may be racing against vhold_smr. 3690 * 3691 * If they win we can just pretend we never got this far, they will 3692 * vdrop later. 3693 */ 3694 if (!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR)) { 3695 /* 3696 * We lost the aforementioned race. Note that any subsequent 3697 * access is invalid as they might have managed to vdropl on 3698 * their own. 3699 */ 3700 return; 3701 } 3702 freevnode(vp); 3703 } 3704 3705 /* 3706 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 3707 * flags. 
DOINGINACT prevents us from recursing in calls to vinactive. 3708 */ 3709 static void 3710 vinactivef(struct vnode *vp) 3711 { 3712 struct vm_object *obj; 3713 3714 ASSERT_VOP_ELOCKED(vp, "vinactive"); 3715 ASSERT_VI_LOCKED(vp, "vinactive"); 3716 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, 3717 ("vinactive: recursed on VI_DOINGINACT")); 3718 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3719 vp->v_iflag |= VI_DOINGINACT; 3720 vp->v_iflag &= ~VI_OWEINACT; 3721 VI_UNLOCK(vp); 3722 /* 3723 * Before moving off the active list, we must be sure that any 3724 * modified pages are converted into the vnode's dirty 3725 * buffers, since these will no longer be checked once the 3726 * vnode is on the inactive list. 3727 * 3728 * The write-out of the dirty pages is asynchronous. At the 3729 * point that VOP_INACTIVE() is called, there could still be 3730 * pending I/O and dirty pages in the object. 3731 */ 3732 if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 3733 vm_object_mightbedirty(obj)) { 3734 VM_OBJECT_WLOCK(obj); 3735 vm_object_page_clean(obj, 0, 0, 0); 3736 VM_OBJECT_WUNLOCK(obj); 3737 } 3738 VOP_INACTIVE(vp, curthread); 3739 VI_LOCK(vp); 3740 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, 3741 ("vinactive: lost VI_DOINGINACT")); 3742 vp->v_iflag &= ~VI_DOINGINACT; 3743 } 3744 3745 void 3746 vinactive(struct vnode *vp) 3747 { 3748 3749 ASSERT_VOP_ELOCKED(vp, "vinactive"); 3750 ASSERT_VI_LOCKED(vp, "vinactive"); 3751 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3752 3753 if ((vp->v_iflag & VI_OWEINACT) == 0) 3754 return; 3755 if (vp->v_iflag & VI_DOINGINACT) 3756 return; 3757 if (vp->v_usecount > 0) { 3758 vp->v_iflag &= ~VI_OWEINACT; 3759 return; 3760 } 3761 vinactivef(vp); 3762 } 3763 3764 /* 3765 * Remove any vnodes in the vnode table belonging to mount point mp. 3766 * 3767 * If FORCECLOSE is not specified, there should not be any active ones, 3768 * return error if any are found (nb: this is a user error, not a 3769 * system error). If FORCECLOSE is specified, detach any active vnodes 3770 * that are found. 3771 * 3772 * If WRITECLOSE is set, only flush out regular file vnodes open for 3773 * writing. 3774 * 3775 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 3776 * 3777 * `rootrefs' specifies the base reference count for the root vnode 3778 * of this filesystem. The root vnode is considered busy if its 3779 * v_usecount exceeds this value. On a successful return, vflush(, td) 3780 * will call vrele() on the root vnode exactly rootrefs times. 3781 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 3782 * be zero. 3783 */ 3784 #ifdef DIAGNOSTIC 3785 static int busyprt = 0; /* print out busy vnodes */ 3786 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); 3787 #endif 3788 3789 int 3790 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) 3791 { 3792 struct vnode *vp, *mvp, *rootvp = NULL; 3793 struct vattr vattr; 3794 int busy = 0, error; 3795 3796 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, 3797 rootrefs, flags); 3798 if (rootrefs > 0) { 3799 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 3800 ("vflush: bad args")); 3801 /* 3802 * Get the filesystem root vnode. We can vput() it 3803 * immediately, since with rootrefs > 0, it won't go away. 
3804 */ 3805 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { 3806 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", 3807 __func__, error); 3808 return (error); 3809 } 3810 vput(rootvp); 3811 } 3812 loop: 3813 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 3814 vholdl(vp); 3815 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); 3816 if (error) { 3817 vdrop(vp); 3818 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 3819 goto loop; 3820 } 3821 /* 3822 * Skip over a vnodes marked VV_SYSTEM. 3823 */ 3824 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 3825 VOP_UNLOCK(vp); 3826 vdrop(vp); 3827 continue; 3828 } 3829 /* 3830 * If WRITECLOSE is set, flush out unlinked but still open 3831 * files (even if open only for reading) and regular file 3832 * vnodes open for writing. 3833 */ 3834 if (flags & WRITECLOSE) { 3835 if (vp->v_object != NULL) { 3836 VM_OBJECT_WLOCK(vp->v_object); 3837 vm_object_page_clean(vp->v_object, 0, 0, 0); 3838 VM_OBJECT_WUNLOCK(vp->v_object); 3839 } 3840 error = VOP_FSYNC(vp, MNT_WAIT, td); 3841 if (error != 0) { 3842 VOP_UNLOCK(vp); 3843 vdrop(vp); 3844 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 3845 return (error); 3846 } 3847 error = VOP_GETATTR(vp, &vattr, td->td_ucred); 3848 VI_LOCK(vp); 3849 3850 if ((vp->v_type == VNON || 3851 (error == 0 && vattr.va_nlink > 0)) && 3852 (vp->v_writecount <= 0 || vp->v_type != VREG)) { 3853 VOP_UNLOCK(vp); 3854 vdropl(vp); 3855 continue; 3856 } 3857 } else 3858 VI_LOCK(vp); 3859 /* 3860 * With v_usecount == 0, all we need to do is clear out the 3861 * vnode data structures and we are done. 3862 * 3863 * If FORCECLOSE is set, forcibly close the vnode. 3864 */ 3865 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 3866 vgonel(vp); 3867 } else { 3868 busy++; 3869 #ifdef DIAGNOSTIC 3870 if (busyprt) 3871 vn_printf(vp, "vflush: busy vnode "); 3872 #endif 3873 } 3874 VOP_UNLOCK(vp); 3875 vdropl(vp); 3876 } 3877 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 3878 /* 3879 * If just the root vnode is busy, and if its refcount 3880 * is equal to `rootrefs', then go ahead and kill it. 3881 */ 3882 VI_LOCK(rootvp); 3883 KASSERT(busy > 0, ("vflush: not busy")); 3884 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 3885 ("vflush: usecount %d < rootrefs %d", 3886 rootvp->v_usecount, rootrefs)); 3887 if (busy == 1 && rootvp->v_usecount == rootrefs) { 3888 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 3889 vgone(rootvp); 3890 VOP_UNLOCK(rootvp); 3891 busy = 0; 3892 } else 3893 VI_UNLOCK(rootvp); 3894 } 3895 if (busy) { 3896 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 3897 busy); 3898 return (EBUSY); 3899 } 3900 for (; rootrefs > 0; rootrefs--) 3901 vrele(rootvp); 3902 return (0); 3903 } 3904 3905 /* 3906 * Recycle an unused vnode to the front of the free list. 3907 */ 3908 int 3909 vrecycle(struct vnode *vp) 3910 { 3911 int recycled; 3912 3913 VI_LOCK(vp); 3914 recycled = vrecyclel(vp); 3915 VI_UNLOCK(vp); 3916 return (recycled); 3917 } 3918 3919 /* 3920 * vrecycle, with the vp interlock held. 3921 */ 3922 int 3923 vrecyclel(struct vnode *vp) 3924 { 3925 int recycled; 3926 3927 ASSERT_VOP_ELOCKED(vp, __func__); 3928 ASSERT_VI_LOCKED(vp, __func__); 3929 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3930 recycled = 0; 3931 if (vp->v_usecount == 0) { 3932 recycled = 1; 3933 vgonel(vp); 3934 } 3935 return (recycled); 3936 } 3937 3938 /* 3939 * Eliminate all activity associated with a vnode 3940 * in preparation for reuse. 
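 *
 * (Within this file, insmntque_stddtr() uses vgone() to retire a vnode
 * whose mount insertion failed, and vflush() reaches the same teardown
 * via vgonel() when forcibly reclaiming vnodes at unmount time.)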
3941 */ 3942 void 3943 vgone(struct vnode *vp) 3944 { 3945 VI_LOCK(vp); 3946 vgonel(vp); 3947 VI_UNLOCK(vp); 3948 } 3949 3950 static void 3951 notify_lowervp_vfs_dummy(struct mount *mp __unused, 3952 struct vnode *lowervp __unused) 3953 { 3954 } 3955 3956 /* 3957 * Notify upper mounts about reclaimed or unlinked vnode. 3958 */ 3959 void 3960 vfs_notify_upper(struct vnode *vp, int event) 3961 { 3962 static struct vfsops vgonel_vfsops = { 3963 .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, 3964 .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, 3965 }; 3966 struct mount *mp, *ump, *mmp; 3967 3968 mp = vp->v_mount; 3969 if (mp == NULL) 3970 return; 3971 if (TAILQ_EMPTY(&mp->mnt_uppers)) 3972 return; 3973 3974 mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); 3975 mmp->mnt_op = &vgonel_vfsops; 3976 mmp->mnt_kern_flag |= MNTK_MARKER; 3977 MNT_ILOCK(mp); 3978 mp->mnt_kern_flag |= MNTK_VGONE_UPPER; 3979 for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { 3980 if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { 3981 ump = TAILQ_NEXT(ump, mnt_upper_link); 3982 continue; 3983 } 3984 TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); 3985 MNT_IUNLOCK(mp); 3986 switch (event) { 3987 case VFS_NOTIFY_UPPER_RECLAIM: 3988 VFS_RECLAIM_LOWERVP(ump, vp); 3989 break; 3990 case VFS_NOTIFY_UPPER_UNLINK: 3991 VFS_UNLINK_LOWERVP(ump, vp); 3992 break; 3993 default: 3994 KASSERT(0, ("invalid event %d", event)); 3995 break; 3996 } 3997 MNT_ILOCK(mp); 3998 ump = TAILQ_NEXT(mmp, mnt_upper_link); 3999 TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); 4000 } 4001 free(mmp, M_TEMP); 4002 mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; 4003 if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { 4004 mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; 4005 wakeup(&mp->mnt_uppers); 4006 } 4007 MNT_IUNLOCK(mp); 4008 } 4009 4010 /* 4011 * vgone, with the vp interlock held. 4012 */ 4013 static void 4014 vgonel(struct vnode *vp) 4015 { 4016 struct thread *td; 4017 struct mount *mp; 4018 vm_object_t object; 4019 bool active, oweinact; 4020 4021 ASSERT_VOP_ELOCKED(vp, "vgonel"); 4022 ASSERT_VI_LOCKED(vp, "vgonel"); 4023 VNASSERT(vp->v_holdcnt, vp, 4024 ("vgonel: vp %p has no reference.", vp)); 4025 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4026 td = curthread; 4027 4028 /* 4029 * Don't vgonel if we're already doomed. 4030 */ 4031 if (vp->v_irflag & VIRF_DOOMED) 4032 return; 4033 /* 4034 * Paired with freevnode. 4035 */ 4036 vn_seqc_write_begin_locked(vp); 4037 vunlazy_gone(vp); 4038 vp->v_irflag |= VIRF_DOOMED; 4039 4040 /* 4041 * Check to see if the vnode is in use. If so, we have to call 4042 * VOP_CLOSE() and VOP_INACTIVE(). 4043 */ 4044 active = vp->v_usecount > 0; 4045 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4046 /* 4047 * If we need to do inactive VI_OWEINACT will be set. 4048 */ 4049 if (vp->v_iflag & VI_DEFINACT) { 4050 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); 4051 vp->v_iflag &= ~VI_DEFINACT; 4052 vdropl(vp); 4053 } else { 4054 VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); 4055 VI_UNLOCK(vp); 4056 } 4057 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 4058 4059 /* 4060 * If purging an active vnode, it must be closed and 4061 * deactivated before being reclaimed. 4062 */ 4063 if (active) 4064 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 4065 if (oweinact || active) { 4066 VI_LOCK(vp); 4067 vinactivef(vp); 4068 VI_UNLOCK(vp); 4069 } 4070 if (vp->v_type == VSOCK) 4071 vfs_unp_reclaim(vp); 4072 4073 /* 4074 * Clean out any buffers associated with the vnode. 
4075 * If the flush fails, just toss the buffers. 4076 */ 4077 mp = NULL; 4078 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 4079 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 4080 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { 4081 while (vinvalbuf(vp, 0, 0, 0) != 0) 4082 ; 4083 } 4084 4085 BO_LOCK(&vp->v_bufobj); 4086 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && 4087 vp->v_bufobj.bo_dirty.bv_cnt == 0 && 4088 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && 4089 vp->v_bufobj.bo_clean.bv_cnt == 0, 4090 ("vp %p bufobj not invalidated", vp)); 4091 4092 /* 4093 * For VMIO bufobj, BO_DEAD is set later, or in 4094 * vm_object_terminate() after the object's page queue is 4095 * flushed. 4096 */ 4097 object = vp->v_bufobj.bo_object; 4098 if (object == NULL) 4099 vp->v_bufobj.bo_flag |= BO_DEAD; 4100 BO_UNLOCK(&vp->v_bufobj); 4101 4102 /* 4103 * Handle the VM part. Tmpfs handles v_object on its own (the 4104 * OBJT_VNODE check). Nullfs or other bypassing filesystems 4105 * should not touch the object borrowed from the lower vnode 4106 * (the handle check). 4107 */ 4108 if (object != NULL && object->type == OBJT_VNODE && 4109 object->handle == vp) 4110 vnode_destroy_vobject(vp); 4111 4112 /* 4113 * Reclaim the vnode. 4114 */ 4115 if (VOP_RECLAIM(vp, td)) 4116 panic("vgone: cannot reclaim"); 4117 if (mp != NULL) 4118 vn_finished_secondary_write(mp); 4119 VNASSERT(vp->v_object == NULL, vp, 4120 ("vop_reclaim left v_object vp=%p", vp)); 4121 /* 4122 * Clear the advisory locks and wake up waiting threads. 4123 */ 4124 (void)VOP_ADVLOCKPURGE(vp); 4125 vp->v_lockf = NULL; 4126 /* 4127 * Delete from old mount point vnode list. 4128 */ 4129 delmntque(vp); 4130 cache_purge_vgone(vp); 4131 /* 4132 * Done with purge, reset to the standard lock and invalidate 4133 * the vnode. 4134 */ 4135 VI_LOCK(vp); 4136 vp->v_vnlock = &vp->v_lock; 4137 vp->v_op = &dead_vnodeops; 4138 vp->v_type = VBAD; 4139 } 4140 4141 /* 4142 * Calculate the total number of references to a special device. 4143 */ 4144 int 4145 vcount(struct vnode *vp) 4146 { 4147 int count; 4148 4149 dev_lock(); 4150 count = vp->v_rdev->si_usecount; 4151 dev_unlock(); 4152 return (count); 4153 } 4154 4155 /* 4156 * Print out a description of a vnode. 4157 */ 4158 static const char * const typename[] = 4159 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", 4160 "VMARKER"}; 4161 4162 _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, 4163 "new hold count flag not added to vn_printf"); 4164 4165 void 4166 vn_printf(struct vnode *vp, const char *fmt, ...) 
4167 { 4168 va_list ap; 4169 char buf[256], buf2[16]; 4170 u_long flags; 4171 u_int holdcnt; 4172 4173 va_start(ap, fmt); 4174 vprintf(fmt, ap); 4175 va_end(ap); 4176 printf("%p: ", (void *)vp); 4177 printf("type %s\n", typename[vp->v_type]); 4178 holdcnt = atomic_load_int(&vp->v_holdcnt); 4179 printf(" usecount %d, writecount %d, refcount %d seqc users %d", 4180 vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, 4181 vp->v_seqc_users); 4182 switch (vp->v_type) { 4183 case VDIR: 4184 printf(" mountedhere %p\n", vp->v_mountedhere); 4185 break; 4186 case VCHR: 4187 printf(" rdev %p\n", vp->v_rdev); 4188 break; 4189 case VSOCK: 4190 printf(" socket %p\n", vp->v_unpcb); 4191 break; 4192 case VFIFO: 4193 printf(" fifoinfo %p\n", vp->v_fifoinfo); 4194 break; 4195 default: 4196 printf("\n"); 4197 break; 4198 } 4199 buf[0] = '\0'; 4200 buf[1] = '\0'; 4201 if (holdcnt & VHOLD_NO_SMR) 4202 strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); 4203 printf(" hold count flags (%s)\n", buf + 1); 4204 4205 buf[0] = '\0'; 4206 buf[1] = '\0'; 4207 if (vp->v_irflag & VIRF_DOOMED) 4208 strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); 4209 flags = vp->v_irflag & ~(VIRF_DOOMED); 4210 if (flags != 0) { 4211 snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); 4212 strlcat(buf, buf2, sizeof(buf)); 4213 } 4214 if (vp->v_vflag & VV_ROOT) 4215 strlcat(buf, "|VV_ROOT", sizeof(buf)); 4216 if (vp->v_vflag & VV_ISTTY) 4217 strlcat(buf, "|VV_ISTTY", sizeof(buf)); 4218 if (vp->v_vflag & VV_NOSYNC) 4219 strlcat(buf, "|VV_NOSYNC", sizeof(buf)); 4220 if (vp->v_vflag & VV_ETERNALDEV) 4221 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); 4222 if (vp->v_vflag & VV_CACHEDLABEL) 4223 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); 4224 if (vp->v_vflag & VV_VMSIZEVNLOCK) 4225 strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); 4226 if (vp->v_vflag & VV_COPYONWRITE) 4227 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); 4228 if (vp->v_vflag & VV_SYSTEM) 4229 strlcat(buf, "|VV_SYSTEM", sizeof(buf)); 4230 if (vp->v_vflag & VV_PROCDEP) 4231 strlcat(buf, "|VV_PROCDEP", sizeof(buf)); 4232 if (vp->v_vflag & VV_NOKNOTE) 4233 strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); 4234 if (vp->v_vflag & VV_DELETED) 4235 strlcat(buf, "|VV_DELETED", sizeof(buf)); 4236 if (vp->v_vflag & VV_MD) 4237 strlcat(buf, "|VV_MD", sizeof(buf)); 4238 if (vp->v_vflag & VV_FORCEINSMQ) 4239 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); 4240 if (vp->v_vflag & VV_READLINK) 4241 strlcat(buf, "|VV_READLINK", sizeof(buf)); 4242 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | 4243 VV_CACHEDLABEL | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | 4244 VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ); 4245 if (flags != 0) { 4246 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); 4247 strlcat(buf, buf2, sizeof(buf)); 4248 } 4249 if (vp->v_iflag & VI_TEXT_REF) 4250 strlcat(buf, "|VI_TEXT_REF", sizeof(buf)); 4251 if (vp->v_iflag & VI_MOUNT) 4252 strlcat(buf, "|VI_MOUNT", sizeof(buf)); 4253 if (vp->v_iflag & VI_DOINGINACT) 4254 strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); 4255 if (vp->v_iflag & VI_OWEINACT) 4256 strlcat(buf, "|VI_OWEINACT", sizeof(buf)); 4257 if (vp->v_iflag & VI_DEFINACT) 4258 strlcat(buf, "|VI_DEFINACT", sizeof(buf)); 4259 flags = vp->v_iflag & ~(VI_TEXT_REF | VI_MOUNT | VI_DOINGINACT | 4260 VI_OWEINACT | VI_DEFINACT); 4261 if (flags != 0) { 4262 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); 4263 strlcat(buf, buf2, sizeof(buf)); 4264 } 4265 if (vp->v_mflag & VMP_LAZYLIST) 4266 strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); 4267 flags = vp->v_mflag & 
~(VMP_LAZYLIST); 4268 if (flags != 0) { 4269 snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); 4270 strlcat(buf, buf2, sizeof(buf)); 4271 } 4272 printf(" flags (%s)\n", buf + 1); 4273 if (mtx_owned(VI_MTX(vp))) 4274 printf(" VI_LOCKed"); 4275 if (vp->v_object != NULL) 4276 printf(" v_object %p ref %d pages %d " 4277 "cleanbuf %d dirtybuf %d\n", 4278 vp->v_object, vp->v_object->ref_count, 4279 vp->v_object->resident_page_count, 4280 vp->v_bufobj.bo_clean.bv_cnt, 4281 vp->v_bufobj.bo_dirty.bv_cnt); 4282 printf(" "); 4283 lockmgr_printinfo(vp->v_vnlock); 4284 if (vp->v_data != NULL) 4285 VOP_PRINT(vp); 4286 } 4287 4288 #ifdef DDB 4289 /* 4290 * List all of the locked vnodes in the system. 4291 * Called when debugging the kernel. 4292 */ 4293 DB_SHOW_COMMAND(lockedvnods, lockedvnodes) 4294 { 4295 struct mount *mp; 4296 struct vnode *vp; 4297 4298 /* 4299 * Note: because this is DDB, we can't obey the locking semantics 4300 * for these structures, which means we could catch an inconsistent 4301 * state and dereference a nasty pointer. Not much to be done 4302 * about that. 4303 */ 4304 db_printf("Locked vnodes\n"); 4305 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4306 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4307 if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) 4308 vn_printf(vp, "vnode "); 4309 } 4310 } 4311 } 4312 4313 /* 4314 * Show details about the given vnode. 4315 */ 4316 DB_SHOW_COMMAND(vnode, db_show_vnode) 4317 { 4318 struct vnode *vp; 4319 4320 if (!have_addr) 4321 return; 4322 vp = (struct vnode *)addr; 4323 vn_printf(vp, "vnode "); 4324 } 4325 4326 /* 4327 * Show details about the given mount point. 4328 */ 4329 DB_SHOW_COMMAND(mount, db_show_mount) 4330 { 4331 struct mount *mp; 4332 struct vfsopt *opt; 4333 struct statfs *sp; 4334 struct vnode *vp; 4335 char buf[512]; 4336 uint64_t mflags; 4337 u_int flags; 4338 4339 if (!have_addr) { 4340 /* No address given, print short info about all mount points. 
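		 * From the ddb prompt this looks roughly like the following
		 * (pointer value and names are made up):
		 *
		 *	db> show mount
		 *	0xfffff800034fc000 /dev/ada0p2 on / (ufs)
		 *
		 *	More info: show mount <addr>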
*/ 4341 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4342 db_printf("%p %s on %s (%s)\n", mp, 4343 mp->mnt_stat.f_mntfromname, 4344 mp->mnt_stat.f_mntonname, 4345 mp->mnt_stat.f_fstypename); 4346 if (db_pager_quit) 4347 break; 4348 } 4349 db_printf("\nMore info: show mount <addr>\n"); 4350 return; 4351 } 4352 4353 mp = (struct mount *)addr; 4354 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 4355 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 4356 4357 buf[0] = '\0'; 4358 mflags = mp->mnt_flag; 4359 #define MNT_FLAG(flag) do { \ 4360 if (mflags & (flag)) { \ 4361 if (buf[0] != '\0') \ 4362 strlcat(buf, ", ", sizeof(buf)); \ 4363 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 4364 mflags &= ~(flag); \ 4365 } \ 4366 } while (0) 4367 MNT_FLAG(MNT_RDONLY); 4368 MNT_FLAG(MNT_SYNCHRONOUS); 4369 MNT_FLAG(MNT_NOEXEC); 4370 MNT_FLAG(MNT_NOSUID); 4371 MNT_FLAG(MNT_NFS4ACLS); 4372 MNT_FLAG(MNT_UNION); 4373 MNT_FLAG(MNT_ASYNC); 4374 MNT_FLAG(MNT_SUIDDIR); 4375 MNT_FLAG(MNT_SOFTDEP); 4376 MNT_FLAG(MNT_NOSYMFOLLOW); 4377 MNT_FLAG(MNT_GJOURNAL); 4378 MNT_FLAG(MNT_MULTILABEL); 4379 MNT_FLAG(MNT_ACLS); 4380 MNT_FLAG(MNT_NOATIME); 4381 MNT_FLAG(MNT_NOCLUSTERR); 4382 MNT_FLAG(MNT_NOCLUSTERW); 4383 MNT_FLAG(MNT_SUJ); 4384 MNT_FLAG(MNT_EXRDONLY); 4385 MNT_FLAG(MNT_EXPORTED); 4386 MNT_FLAG(MNT_DEFEXPORTED); 4387 MNT_FLAG(MNT_EXPORTANON); 4388 MNT_FLAG(MNT_EXKERB); 4389 MNT_FLAG(MNT_EXPUBLIC); 4390 MNT_FLAG(MNT_LOCAL); 4391 MNT_FLAG(MNT_QUOTA); 4392 MNT_FLAG(MNT_ROOTFS); 4393 MNT_FLAG(MNT_USER); 4394 MNT_FLAG(MNT_IGNORE); 4395 MNT_FLAG(MNT_UPDATE); 4396 MNT_FLAG(MNT_DELEXPORT); 4397 MNT_FLAG(MNT_RELOAD); 4398 MNT_FLAG(MNT_FORCE); 4399 MNT_FLAG(MNT_SNAPSHOT); 4400 MNT_FLAG(MNT_BYFSID); 4401 #undef MNT_FLAG 4402 if (mflags != 0) { 4403 if (buf[0] != '\0') 4404 strlcat(buf, ", ", sizeof(buf)); 4405 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4406 "0x%016jx", mflags); 4407 } 4408 db_printf(" mnt_flag = %s\n", buf); 4409 4410 buf[0] = '\0'; 4411 flags = mp->mnt_kern_flag; 4412 #define MNT_KERN_FLAG(flag) do { \ 4413 if (flags & (flag)) { \ 4414 if (buf[0] != '\0') \ 4415 strlcat(buf, ", ", sizeof(buf)); \ 4416 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 4417 flags &= ~(flag); \ 4418 } \ 4419 } while (0) 4420 MNT_KERN_FLAG(MNTK_UNMOUNTF); 4421 MNT_KERN_FLAG(MNTK_ASYNC); 4422 MNT_KERN_FLAG(MNTK_SOFTDEP); 4423 MNT_KERN_FLAG(MNTK_DRAINING); 4424 MNT_KERN_FLAG(MNTK_REFEXPIRE); 4425 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 4426 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 4427 MNT_KERN_FLAG(MNTK_NO_IOPF); 4428 MNT_KERN_FLAG(MNTK_VGONE_UPPER); 4429 MNT_KERN_FLAG(MNTK_VGONE_WAITER); 4430 MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); 4431 MNT_KERN_FLAG(MNTK_MARKER); 4432 MNT_KERN_FLAG(MNTK_USES_BCACHE); 4433 MNT_KERN_FLAG(MNTK_FPLOOKUP); 4434 MNT_KERN_FLAG(MNTK_NOASYNC); 4435 MNT_KERN_FLAG(MNTK_UNMOUNT); 4436 MNT_KERN_FLAG(MNTK_MWAIT); 4437 MNT_KERN_FLAG(MNTK_SUSPEND); 4438 MNT_KERN_FLAG(MNTK_SUSPEND2); 4439 MNT_KERN_FLAG(MNTK_SUSPENDED); 4440 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 4441 MNT_KERN_FLAG(MNTK_NOKNOTE); 4442 #undef MNT_KERN_FLAG 4443 if (flags != 0) { 4444 if (buf[0] != '\0') 4445 strlcat(buf, ", ", sizeof(buf)); 4446 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4447 "0x%08x", flags); 4448 } 4449 db_printf(" mnt_kern_flag = %s\n", buf); 4450 4451 db_printf(" mnt_opt = "); 4452 opt = TAILQ_FIRST(mp->mnt_opt); 4453 if (opt != NULL) { 4454 db_printf("%s", opt->name); 4455 opt = TAILQ_NEXT(opt, link); 4456 while (opt != NULL) { 4457 db_printf(", %s", opt->name); 4458 opt = TAILQ_NEXT(opt, link); 4459 } 4460 
} 4461 db_printf("\n"); 4462 4463 sp = &mp->mnt_stat; 4464 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " 4465 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " 4466 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " 4467 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", 4468 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, 4469 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, 4470 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, 4471 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, 4472 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, 4473 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 4474 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, 4475 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); 4476 4477 db_printf(" mnt_cred = { uid=%u ruid=%u", 4478 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); 4479 if (jailed(mp->mnt_cred)) 4480 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); 4481 db_printf(" }\n"); 4482 db_printf(" mnt_ref = %d (with %d in the struct)\n", 4483 vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); 4484 db_printf(" mnt_gen = %d\n", mp->mnt_gen); 4485 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); 4486 db_printf(" mnt_lazyvnodelistsize = %d\n", 4487 mp->mnt_lazyvnodelistsize); 4488 db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", 4489 vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); 4490 db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); 4491 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 4492 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 4493 db_printf(" mnt_lockref = %d (with %d in the struct)\n", 4494 vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); 4495 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 4496 db_printf(" mnt_secondary_accwrites = %d\n", 4497 mp->mnt_secondary_accwrites); 4498 db_printf(" mnt_gjprovider = %s\n", 4499 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); 4500 db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); 4501 4502 db_printf("\n\nList of active vnodes\n"); 4503 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4504 if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { 4505 vn_printf(vp, "vnode "); 4506 if (db_pager_quit) 4507 break; 4508 } 4509 } 4510 db_printf("\n\nList of inactive vnodes\n"); 4511 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4512 if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { 4513 vn_printf(vp, "vnode "); 4514 if (db_pager_quit) 4515 break; 4516 } 4517 } 4518 } 4519 #endif /* DDB */ 4520 4521 /* 4522 * Fill in a struct xvfsconf based on a struct vfsconf. 4523 */ 4524 static int 4525 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) 4526 { 4527 struct xvfsconf xvfsp; 4528 4529 bzero(&xvfsp, sizeof(xvfsp)); 4530 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4531 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4532 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4533 xvfsp.vfc_flags = vfsp->vfc_flags; 4534 /* 4535 * These are unused in userland, we keep them 4536 * to not break binary compatibility. 
4537 */ 4538 xvfsp.vfc_vfsops = NULL; 4539 xvfsp.vfc_next = NULL; 4540 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4541 } 4542 4543 #ifdef COMPAT_FREEBSD32 4544 struct xvfsconf32 { 4545 uint32_t vfc_vfsops; 4546 char vfc_name[MFSNAMELEN]; 4547 int32_t vfc_typenum; 4548 int32_t vfc_refcount; 4549 int32_t vfc_flags; 4550 uint32_t vfc_next; 4551 }; 4552 4553 static int 4554 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) 4555 { 4556 struct xvfsconf32 xvfsp; 4557 4558 bzero(&xvfsp, sizeof(xvfsp)); 4559 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4560 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4561 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4562 xvfsp.vfc_flags = vfsp->vfc_flags; 4563 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4564 } 4565 #endif 4566 4567 /* 4568 * Top level filesystem related information gathering. 4569 */ 4570 static int 4571 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 4572 { 4573 struct vfsconf *vfsp; 4574 int error; 4575 4576 error = 0; 4577 vfsconf_slock(); 4578 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4579 #ifdef COMPAT_FREEBSD32 4580 if (req->flags & SCTL_MASK32) 4581 error = vfsconf2x32(req, vfsp); 4582 else 4583 #endif 4584 error = vfsconf2x(req, vfsp); 4585 if (error) 4586 break; 4587 } 4588 vfsconf_sunlock(); 4589 return (error); 4590 } 4591 4592 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | 4593 CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, 4594 "S,xvfsconf", "List of all configured filesystems"); 4595 4596 #ifndef BURN_BRIDGES 4597 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 4598 4599 static int 4600 vfs_sysctl(SYSCTL_HANDLER_ARGS) 4601 { 4602 int *name = (int *)arg1 - 1; /* XXX */ 4603 u_int namelen = arg2 + 1; /* XXX */ 4604 struct vfsconf *vfsp; 4605 4606 log(LOG_WARNING, "userland calling deprecated sysctl, " 4607 "please rebuild world\n"); 4608 4609 #if 1 || defined(COMPAT_PRELITE2) 4610 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
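	 * With a single name component this is the old-style VFS_VFSCONF
	 * query; it is handed to sysctl_ovfs_conf() below, which replies
	 * using the historic struct ovfsconf layout.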
*/ 4611 if (namelen == 1) 4612 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 4613 #endif 4614 4615 switch (name[1]) { 4616 case VFS_MAXTYPENUM: 4617 if (namelen != 2) 4618 return (ENOTDIR); 4619 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 4620 case VFS_CONF: 4621 if (namelen != 3) 4622 return (ENOTDIR); /* overloaded */ 4623 vfsconf_slock(); 4624 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4625 if (vfsp->vfc_typenum == name[2]) 4626 break; 4627 } 4628 vfsconf_sunlock(); 4629 if (vfsp == NULL) 4630 return (EOPNOTSUPP); 4631 #ifdef COMPAT_FREEBSD32 4632 if (req->flags & SCTL_MASK32) 4633 return (vfsconf2x32(req, vfsp)); 4634 else 4635 #endif 4636 return (vfsconf2x(req, vfsp)); 4637 } 4638 return (EOPNOTSUPP); 4639 } 4640 4641 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | 4642 CTLFLAG_MPSAFE, vfs_sysctl, 4643 "Generic filesystem"); 4644 4645 #if 1 || defined(COMPAT_PRELITE2) 4646 4647 static int 4648 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 4649 { 4650 int error; 4651 struct vfsconf *vfsp; 4652 struct ovfsconf ovfs; 4653 4654 vfsconf_slock(); 4655 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4656 bzero(&ovfs, sizeof(ovfs)); 4657 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 4658 strcpy(ovfs.vfc_name, vfsp->vfc_name); 4659 ovfs.vfc_index = vfsp->vfc_typenum; 4660 ovfs.vfc_refcount = vfsp->vfc_refcount; 4661 ovfs.vfc_flags = vfsp->vfc_flags; 4662 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 4663 if (error != 0) { 4664 vfsconf_sunlock(); 4665 return (error); 4666 } 4667 } 4668 vfsconf_sunlock(); 4669 return (0); 4670 } 4671 4672 #endif /* 1 || COMPAT_PRELITE2 */ 4673 #endif /* !BURN_BRIDGES */ 4674 4675 #define KINFO_VNODESLOP 10 4676 #ifdef notyet 4677 /* 4678 * Dump vnode list (via sysctl). 4679 */ 4680 /* ARGSUSED */ 4681 static int 4682 sysctl_vnode(SYSCTL_HANDLER_ARGS) 4683 { 4684 struct xvnode *xvn; 4685 struct mount *mp; 4686 struct vnode *vp; 4687 int error, len, n; 4688 4689 /* 4690 * Stale numvnodes access is not fatal here. 4691 */ 4692 req->lock = 0; 4693 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; 4694 if (!req->oldptr) 4695 /* Make an estimate */ 4696 return (SYSCTL_OUT(req, 0, len)); 4697 4698 error = sysctl_wire_old_buffer(req, 0); 4699 if (error != 0) 4700 return (error); 4701 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); 4702 n = 0; 4703 mtx_lock(&mountlist_mtx); 4704 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4705 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 4706 continue; 4707 MNT_ILOCK(mp); 4708 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4709 if (n == len) 4710 break; 4711 vref(vp); 4712 xvn[n].xv_size = sizeof *xvn; 4713 xvn[n].xv_vnode = vp; 4714 xvn[n].xv_id = 0; /* XXX compat */ 4715 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field 4716 XV_COPY(usecount); 4717 XV_COPY(writecount); 4718 XV_COPY(holdcnt); 4719 XV_COPY(mount); 4720 XV_COPY(numoutput); 4721 XV_COPY(type); 4722 #undef XV_COPY 4723 xvn[n].xv_flag = vp->v_vflag; 4724 4725 switch (vp->v_type) { 4726 case VREG: 4727 case VDIR: 4728 case VLNK: 4729 break; 4730 case VBLK: 4731 case VCHR: 4732 if (vp->v_rdev == NULL) { 4733 vrele(vp); 4734 continue; 4735 } 4736 xvn[n].xv_dev = dev2udev(vp->v_rdev); 4737 break; 4738 case VSOCK: 4739 xvn[n].xv_socket = vp->v_socket; 4740 break; 4741 case VFIFO: 4742 xvn[n].xv_fifo = vp->v_fifoinfo; 4743 break; 4744 case VNON: 4745 case VBAD: 4746 default: 4747 /* shouldn't happen? 
*/ 4748 vrele(vp); 4749 continue; 4750 } 4751 vrele(vp); 4752 ++n; 4753 } 4754 MNT_IUNLOCK(mp); 4755 mtx_lock(&mountlist_mtx); 4756 vfs_unbusy(mp); 4757 if (n == len) 4758 break; 4759 } 4760 mtx_unlock(&mountlist_mtx); 4761 4762 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); 4763 free(xvn, M_TEMP); 4764 return (error); 4765 } 4766 4767 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | 4768 CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", 4769 ""); 4770 #endif 4771 4772 static void 4773 unmount_or_warn(struct mount *mp) 4774 { 4775 int error; 4776 4777 error = dounmount(mp, MNT_FORCE, curthread); 4778 if (error != 0) { 4779 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); 4780 if (error == EBUSY) 4781 printf("BUSY)\n"); 4782 else 4783 printf("%d)\n", error); 4784 } 4785 } 4786 4787 /* 4788 * Unmount all filesystems. The list is traversed in reverse order 4789 * of mounting to avoid dependencies. 4790 */ 4791 void 4792 vfs_unmountall(void) 4793 { 4794 struct mount *mp, *tmp; 4795 4796 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); 4797 4798 /* 4799 * Since this only runs when rebooting, it is not interlocked. 4800 */ 4801 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { 4802 vfs_ref(mp); 4803 4804 /* 4805 * Forcibly unmounting "/dev" before "/" would prevent clean 4806 * unmount of the latter. 4807 */ 4808 if (mp == rootdevmp) 4809 continue; 4810 4811 unmount_or_warn(mp); 4812 } 4813 4814 if (rootdevmp != NULL) 4815 unmount_or_warn(rootdevmp); 4816 } 4817 4818 static void 4819 vfs_deferred_inactive(struct vnode *vp, int lkflags) 4820 { 4821 4822 ASSERT_VI_LOCKED(vp, __func__); 4823 VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set")); 4824 if ((vp->v_iflag & VI_OWEINACT) == 0) { 4825 vdropl(vp); 4826 return; 4827 } 4828 if (vn_lock(vp, lkflags) == 0) { 4829 VI_LOCK(vp); 4830 vinactive(vp); 4831 VOP_UNLOCK(vp); 4832 vdropl(vp); 4833 return; 4834 } 4835 vdefer_inactive_unlocked(vp); 4836 } 4837 4838 static int 4839 vfs_periodic_inactive_filter(struct vnode *vp, void *arg) 4840 { 4841 4842 return (vp->v_iflag & VI_DEFINACT); 4843 } 4844 4845 static void __noinline 4846 vfs_periodic_inactive(struct mount *mp, int flags) 4847 { 4848 struct vnode *vp, *mvp; 4849 int lkflags; 4850 4851 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 4852 if (flags != MNT_WAIT) 4853 lkflags |= LK_NOWAIT; 4854 4855 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { 4856 if ((vp->v_iflag & VI_DEFINACT) == 0) { 4857 VI_UNLOCK(vp); 4858 continue; 4859 } 4860 vp->v_iflag &= ~VI_DEFINACT; 4861 vfs_deferred_inactive(vp, lkflags); 4862 } 4863 } 4864 4865 static inline bool 4866 vfs_want_msync(struct vnode *vp) 4867 { 4868 struct vm_object *obj; 4869 4870 /* 4871 * This test may be performed without any locks held. 4872 * We rely on vm_object's type stability. 
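	 * ("Type stability" means the memory backing a vm_object is never
	 * reused for a different type, so dereferencing a possibly stale
	 * pointer here is safe; at worst the answer is stale, and the
	 * caller re-checks under the proper locks before doing any work.)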
4873 */ 4874 if (vp->v_vflag & VV_NOSYNC) 4875 return (false); 4876 obj = vp->v_object; 4877 return (obj != NULL && vm_object_mightbedirty(obj)); 4878 } 4879 4880 static int 4881 vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) 4882 { 4883 4884 if (vp->v_vflag & VV_NOSYNC) 4885 return (false); 4886 if (vp->v_iflag & VI_DEFINACT) 4887 return (true); 4888 return (vfs_want_msync(vp)); 4889 } 4890 4891 static void __noinline 4892 vfs_periodic_msync_inactive(struct mount *mp, int flags) 4893 { 4894 struct vnode *vp, *mvp; 4895 struct vm_object *obj; 4896 struct thread *td; 4897 int lkflags, objflags; 4898 bool seen_defer; 4899 4900 td = curthread; 4901 4902 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 4903 if (flags != MNT_WAIT) { 4904 lkflags |= LK_NOWAIT; 4905 objflags = OBJPC_NOSYNC; 4906 } else { 4907 objflags = OBJPC_SYNC; 4908 } 4909 4910 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { 4911 seen_defer = false; 4912 if (vp->v_iflag & VI_DEFINACT) { 4913 vp->v_iflag &= ~VI_DEFINACT; 4914 seen_defer = true; 4915 } 4916 if (!vfs_want_msync(vp)) { 4917 if (seen_defer) 4918 vfs_deferred_inactive(vp, lkflags); 4919 else 4920 VI_UNLOCK(vp); 4921 continue; 4922 } 4923 if (vget(vp, lkflags, td) == 0) { 4924 obj = vp->v_object; 4925 if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) { 4926 VM_OBJECT_WLOCK(obj); 4927 vm_object_page_clean(obj, 0, 0, objflags); 4928 VM_OBJECT_WUNLOCK(obj); 4929 } 4930 vput(vp); 4931 if (seen_defer) 4932 vdrop(vp); 4933 } else { 4934 if (seen_defer) 4935 vdefer_inactive_unlocked(vp); 4936 } 4937 } 4938 } 4939 4940 void 4941 vfs_periodic(struct mount *mp, int flags) 4942 { 4943 4944 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 4945 4946 if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0) 4947 vfs_periodic_inactive(mp, flags); 4948 else 4949 vfs_periodic_msync_inactive(mp, flags); 4950 } 4951 4952 static void 4953 destroy_vpollinfo_free(struct vpollinfo *vi) 4954 { 4955 4956 knlist_destroy(&vi->vpi_selinfo.si_note); 4957 mtx_destroy(&vi->vpi_lock); 4958 uma_zfree(vnodepoll_zone, vi); 4959 } 4960 4961 static void 4962 destroy_vpollinfo(struct vpollinfo *vi) 4963 { 4964 4965 knlist_clear(&vi->vpi_selinfo.si_note, 1); 4966 seldrain(&vi->vpi_selinfo); 4967 destroy_vpollinfo_free(vi); 4968 } 4969 4970 /* 4971 * Initialize per-vnode helper structure to hold poll-related state. 4972 */ 4973 void 4974 v_addpollinfo(struct vnode *vp) 4975 { 4976 struct vpollinfo *vi; 4977 4978 if (vp->v_pollinfo != NULL) 4979 return; 4980 vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO); 4981 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); 4982 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, 4983 vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked); 4984 VI_LOCK(vp); 4985 if (vp->v_pollinfo != NULL) { 4986 VI_UNLOCK(vp); 4987 destroy_vpollinfo_free(vi); 4988 return; 4989 } 4990 vp->v_pollinfo = vi; 4991 VI_UNLOCK(vp); 4992 } 4993 4994 /* 4995 * Record a process's interest in events which might happen to 4996 * a vnode. Because poll uses the historic select-style interface 4997 * internally, this routine serves as both the ``check for any 4998 * pending events'' and the ``record my interest in future events'' 4999 * functions. (These are done together, while the lock is held, 5000 * to avoid race conditions.) 
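 *
 * A filesystem's VOP_POLL implementation would typically finish with
 * something like the following (sketch only, not a drop-in body):
 *
 *	return (vn_pollrecord(vp, td, events));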
5001 */ 5002 int 5003 vn_pollrecord(struct vnode *vp, struct thread *td, int events) 5004 { 5005 5006 v_addpollinfo(vp); 5007 mtx_lock(&vp->v_pollinfo->vpi_lock); 5008 if (vp->v_pollinfo->vpi_revents & events) { 5009 /* 5010 * This leaves events we are not interested 5011 * in available for the other process which 5012 * which presumably had requested them 5013 * (otherwise they would never have been 5014 * recorded). 5015 */ 5016 events &= vp->v_pollinfo->vpi_revents; 5017 vp->v_pollinfo->vpi_revents &= ~events; 5018 5019 mtx_unlock(&vp->v_pollinfo->vpi_lock); 5020 return (events); 5021 } 5022 vp->v_pollinfo->vpi_events |= events; 5023 selrecord(td, &vp->v_pollinfo->vpi_selinfo); 5024 mtx_unlock(&vp->v_pollinfo->vpi_lock); 5025 return (0); 5026 } 5027 5028 /* 5029 * Routine to create and manage a filesystem syncer vnode. 5030 */ 5031 #define sync_close ((int (*)(struct vop_close_args *))nullop) 5032 static int sync_fsync(struct vop_fsync_args *); 5033 static int sync_inactive(struct vop_inactive_args *); 5034 static int sync_reclaim(struct vop_reclaim_args *); 5035 5036 static struct vop_vector sync_vnodeops = { 5037 .vop_bypass = VOP_EOPNOTSUPP, 5038 .vop_close = sync_close, /* close */ 5039 .vop_fsync = sync_fsync, /* fsync */ 5040 .vop_inactive = sync_inactive, /* inactive */ 5041 .vop_need_inactive = vop_stdneed_inactive, /* need_inactive */ 5042 .vop_reclaim = sync_reclaim, /* reclaim */ 5043 .vop_lock1 = vop_stdlock, /* lock */ 5044 .vop_unlock = vop_stdunlock, /* unlock */ 5045 .vop_islocked = vop_stdislocked, /* islocked */ 5046 }; 5047 VFS_VOP_VECTOR_REGISTER(sync_vnodeops); 5048 5049 /* 5050 * Create a new filesystem syncer vnode for the specified mount point. 5051 */ 5052 void 5053 vfs_allocate_syncvnode(struct mount *mp) 5054 { 5055 struct vnode *vp; 5056 struct bufobj *bo; 5057 static long start, incr, next; 5058 int error; 5059 5060 /* Allocate a new vnode */ 5061 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); 5062 if (error != 0) 5063 panic("vfs_allocate_syncvnode: getnewvnode() failed"); 5064 vp->v_type = VNON; 5065 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 5066 vp->v_vflag |= VV_FORCEINSMQ; 5067 error = insmntque(vp, mp); 5068 if (error != 0) 5069 panic("vfs_allocate_syncvnode: insmntque() failed"); 5070 vp->v_vflag &= ~VV_FORCEINSMQ; 5071 VOP_UNLOCK(vp); 5072 /* 5073 * Place the vnode onto the syncer worklist. We attempt to 5074 * scatter them about on the list so that they will go off 5075 * at evenly distributed times even if all the filesystems 5076 * are mounted at once. 5077 */ 5078 next += incr; 5079 if (next == 0 || next > syncer_maxdelay) { 5080 start /= 2; 5081 incr /= 2; 5082 if (start == 0) { 5083 start = syncer_maxdelay / 2; 5084 incr = syncer_maxdelay; 5085 } 5086 next = start; 5087 } 5088 bo = &vp->v_bufobj; 5089 BO_LOCK(bo); 5090 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); 5091 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. 
*/ 5092 mtx_lock(&sync_mtx); 5093 sync_vnode_count++; 5094 if (mp->mnt_syncer == NULL) { 5095 mp->mnt_syncer = vp; 5096 vp = NULL; 5097 } 5098 mtx_unlock(&sync_mtx); 5099 BO_UNLOCK(bo); 5100 if (vp != NULL) { 5101 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 5102 vgone(vp); 5103 vput(vp); 5104 } 5105 } 5106 5107 void 5108 vfs_deallocate_syncvnode(struct mount *mp) 5109 { 5110 struct vnode *vp; 5111 5112 mtx_lock(&sync_mtx); 5113 vp = mp->mnt_syncer; 5114 if (vp != NULL) 5115 mp->mnt_syncer = NULL; 5116 mtx_unlock(&sync_mtx); 5117 if (vp != NULL) 5118 vrele(vp); 5119 } 5120 5121 /* 5122 * Do a lazy sync of the filesystem. 5123 */ 5124 static int 5125 sync_fsync(struct vop_fsync_args *ap) 5126 { 5127 struct vnode *syncvp = ap->a_vp; 5128 struct mount *mp = syncvp->v_mount; 5129 int error, save; 5130 struct bufobj *bo; 5131 5132 /* 5133 * We only need to do something if this is a lazy evaluation. 5134 */ 5135 if (ap->a_waitfor != MNT_LAZY) 5136 return (0); 5137 5138 /* 5139 * Move ourselves to the back of the sync list. 5140 */ 5141 bo = &syncvp->v_bufobj; 5142 BO_LOCK(bo); 5143 vn_syncer_add_to_worklist(bo, syncdelay); 5144 BO_UNLOCK(bo); 5145 5146 /* 5147 * Walk the list of vnodes pushing all that are dirty and 5148 * not already on the sync list. 5149 */ 5150 if (vfs_busy(mp, MBF_NOWAIT) != 0) 5151 return (0); 5152 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { 5153 vfs_unbusy(mp); 5154 return (0); 5155 } 5156 save = curthread_pflags_set(TDP_SYNCIO); 5157 /* 5158 * The filesystem at hand may be idle with free vnodes stored in the 5159 * batch. Return them instead of letting them stay there indefinitely. 5160 */ 5161 vfs_periodic(mp, MNT_NOWAIT); 5162 error = VFS_SYNC(mp, MNT_LAZY); 5163 curthread_pflags_restore(save); 5164 vn_finished_write(mp); 5165 vfs_unbusy(mp); 5166 return (error); 5167 } 5168 5169 /* 5170 * The syncer vnode is no referenced. 5171 */ 5172 static int 5173 sync_inactive(struct vop_inactive_args *ap) 5174 { 5175 5176 vgone(ap->a_vp); 5177 return (0); 5178 } 5179 5180 /* 5181 * The syncer vnode is no longer needed and is being decommissioned. 5182 * 5183 * Modifications to the worklist must be protected by sync_mtx. 
5184 */ 5185 static int 5186 sync_reclaim(struct vop_reclaim_args *ap) 5187 { 5188 struct vnode *vp = ap->a_vp; 5189 struct bufobj *bo; 5190 5191 bo = &vp->v_bufobj; 5192 BO_LOCK(bo); 5193 mtx_lock(&sync_mtx); 5194 if (vp->v_mount->mnt_syncer == vp) 5195 vp->v_mount->mnt_syncer = NULL; 5196 if (bo->bo_flag & BO_ONWORKLST) { 5197 LIST_REMOVE(bo, bo_synclist); 5198 syncer_worklist_len--; 5199 sync_vnode_count--; 5200 bo->bo_flag &= ~BO_ONWORKLST; 5201 } 5202 mtx_unlock(&sync_mtx); 5203 BO_UNLOCK(bo); 5204 5205 return (0); 5206 } 5207 5208 int 5209 vn_need_pageq_flush(struct vnode *vp) 5210 { 5211 struct vm_object *obj; 5212 int need; 5213 5214 MPASS(mtx_owned(VI_MTX(vp))); 5215 need = 0; 5216 if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 5217 vm_object_mightbedirty(obj)) 5218 need = 1; 5219 return (need); 5220 } 5221 5222 /* 5223 * Check if vnode represents a disk device 5224 */ 5225 int 5226 vn_isdisk(struct vnode *vp, int *errp) 5227 { 5228 int error; 5229 5230 if (vp->v_type != VCHR) { 5231 error = ENOTBLK; 5232 goto out; 5233 } 5234 error = 0; 5235 dev_lock(); 5236 if (vp->v_rdev == NULL) 5237 error = ENXIO; 5238 else if (vp->v_rdev->si_devsw == NULL) 5239 error = ENXIO; 5240 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 5241 error = ENOTBLK; 5242 dev_unlock(); 5243 out: 5244 if (errp != NULL) 5245 *errp = error; 5246 return (error == 0); 5247 } 5248 5249 /* 5250 * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see 5251 * the comment above cache_fplookup for details. 5252 * 5253 * We never deny as priv_check_cred calls are not yet supported, see vaccess. 5254 */ 5255 int 5256 vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) 5257 { 5258 5259 VFS_SMR_ASSERT_ENTERED(); 5260 5261 /* Check the owner. */ 5262 if (cred->cr_uid == file_uid) { 5263 if (file_mode & S_IXUSR) 5264 return (0); 5265 return (EAGAIN); 5266 } 5267 5268 /* Otherwise, check the groups (first match) */ 5269 if (groupmember(file_gid, cred)) { 5270 if (file_mode & S_IXGRP) 5271 return (0); 5272 return (EAGAIN); 5273 } 5274 5275 /* Otherwise, check everyone else. */ 5276 if (file_mode & S_IXOTH) 5277 return (0); 5278 return (EAGAIN); 5279 } 5280 5281 /* 5282 * Common filesystem object access control check routine. Accepts a 5283 * vnode's type, "mode", uid and gid, requested access mode, and credentials. 5284 * Returns 0 on success, or an errno on failure. 5285 */ 5286 int 5287 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 5288 accmode_t accmode, struct ucred *cred) 5289 { 5290 accmode_t dac_granted; 5291 accmode_t priv_granted; 5292 5293 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 5294 ("invalid bit in accmode")); 5295 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 5296 ("VAPPEND without VWRITE")); 5297 5298 /* 5299 * Look for a normal, non-privileged way to access the file/directory 5300 * as requested. If it exists, go with that. 5301 */ 5302 5303 dac_granted = 0; 5304 5305 /* Check the owner. 
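	 * For example (purely illustrative): with file_mode 0750 and the
	 * caller as owner, dac_granted below becomes
	 * VADMIN | VEXEC | VREAD | VWRITE | VAPPEND, so any accmode built
	 * from those bits is granted here without reaching privcheck.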
*/ 5306 if (cred->cr_uid == file_uid) { 5307 dac_granted |= VADMIN; 5308 if (file_mode & S_IXUSR) 5309 dac_granted |= VEXEC; 5310 if (file_mode & S_IRUSR) 5311 dac_granted |= VREAD; 5312 if (file_mode & S_IWUSR) 5313 dac_granted |= (VWRITE | VAPPEND); 5314 5315 if ((accmode & dac_granted) == accmode) 5316 return (0); 5317 5318 goto privcheck; 5319 } 5320 5321 /* Otherwise, check the groups (first match) */ 5322 if (groupmember(file_gid, cred)) { 5323 if (file_mode & S_IXGRP) 5324 dac_granted |= VEXEC; 5325 if (file_mode & S_IRGRP) 5326 dac_granted |= VREAD; 5327 if (file_mode & S_IWGRP) 5328 dac_granted |= (VWRITE | VAPPEND); 5329 5330 if ((accmode & dac_granted) == accmode) 5331 return (0); 5332 5333 goto privcheck; 5334 } 5335 5336 /* Otherwise, check everyone else. */ 5337 if (file_mode & S_IXOTH) 5338 dac_granted |= VEXEC; 5339 if (file_mode & S_IROTH) 5340 dac_granted |= VREAD; 5341 if (file_mode & S_IWOTH) 5342 dac_granted |= (VWRITE | VAPPEND); 5343 if ((accmode & dac_granted) == accmode) 5344 return (0); 5345 5346 privcheck: 5347 /* 5348 * Build a privilege mask to determine if the set of privileges 5349 * satisfies the requirements when combined with the granted mask 5350 * from above. For each privilege, if the privilege is required, 5351 * bitwise or the request type onto the priv_granted mask. 5352 */ 5353 priv_granted = 0; 5354 5355 if (type == VDIR) { 5356 /* 5357 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 5358 * requests, instead of PRIV_VFS_EXEC. 5359 */ 5360 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5361 !priv_check_cred(cred, PRIV_VFS_LOOKUP)) 5362 priv_granted |= VEXEC; 5363 } else { 5364 /* 5365 * Ensure that at least one execute bit is on. Otherwise, 5366 * a privileged user will always succeed, and we don't want 5367 * this to happen unless the file really is executable. 5368 */ 5369 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5370 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 5371 !priv_check_cred(cred, PRIV_VFS_EXEC)) 5372 priv_granted |= VEXEC; 5373 } 5374 5375 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 5376 !priv_check_cred(cred, PRIV_VFS_READ)) 5377 priv_granted |= VREAD; 5378 5379 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 5380 !priv_check_cred(cred, PRIV_VFS_WRITE)) 5381 priv_granted |= (VWRITE | VAPPEND); 5382 5383 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 5384 !priv_check_cred(cred, PRIV_VFS_ADMIN)) 5385 priv_granted |= VADMIN; 5386 5387 if ((accmode & (priv_granted | dac_granted)) == accmode) { 5388 return (0); 5389 } 5390 5391 return ((accmode & VADMIN) ? EPERM : EACCES); 5392 } 5393 5394 /* 5395 * Credential check based on process requesting service, and per-attribute 5396 * permissions. 5397 */ 5398 int 5399 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 5400 struct thread *td, accmode_t accmode) 5401 { 5402 5403 /* 5404 * Kernel-invoked always succeeds. 5405 */ 5406 if (cred == NOCRED) 5407 return (0); 5408 5409 /* 5410 * Do not allow privileged processes in jail to directly manipulate 5411 * system attributes. 
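	 * For example (illustrative), a jailed superuser updating an
	 * attribute in the system namespace is turned away by the priv(9)
	 * check below, while user namespace attributes fall through to
	 * the ordinary VOP_ACCESS() check.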
5412 */ 5413 switch (attrnamespace) { 5414 case EXTATTR_NAMESPACE_SYSTEM: 5415 /* Potentially should be: return (EPERM); */ 5416 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); 5417 case EXTATTR_NAMESPACE_USER: 5418 return (VOP_ACCESS(vp, accmode, cred, td)); 5419 default: 5420 return (EPERM); 5421 } 5422 } 5423 5424 #ifdef DEBUG_VFS_LOCKS 5425 /* 5426 * This only exists to suppress warnings from unlocked specfs accesses. It is 5427 * no longer ok to have an unlocked VFS. 5428 */ 5429 #define IGNORE_LOCK(vp) (KERNEL_PANICKED() || (vp) == NULL || \ 5430 (vp)->v_type == VCHR || (vp)->v_type == VBAD) 5431 5432 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 5433 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, 5434 "Drop into debugger on lock violation"); 5435 5436 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 5437 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 5438 0, "Check for interlock across VOPs"); 5439 5440 int vfs_badlock_print = 1; /* Print lock violations. */ 5441 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 5442 0, "Print lock violations"); 5443 5444 int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ 5445 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 5446 0, "Print vnode details on lock violations"); 5447 5448 #ifdef KDB 5449 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ 5450 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, 5451 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); 5452 #endif 5453 5454 static void 5455 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 5456 { 5457 5458 #ifdef KDB 5459 if (vfs_badlock_backtrace) 5460 kdb_backtrace(); 5461 #endif 5462 if (vfs_badlock_vnode) 5463 vn_printf(vp, "vnode "); 5464 if (vfs_badlock_print) 5465 printf("%s: %p %s\n", str, (void *)vp, msg); 5466 if (vfs_badlock_ddb) 5467 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5468 } 5469 5470 void 5471 assert_vi_locked(struct vnode *vp, const char *str) 5472 { 5473 5474 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 5475 vfs_badlock("interlock is not locked but should be", str, vp); 5476 } 5477 5478 void 5479 assert_vi_unlocked(struct vnode *vp, const char *str) 5480 { 5481 5482 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 5483 vfs_badlock("interlock is locked but should not be", str, vp); 5484 } 5485 5486 void 5487 assert_vop_locked(struct vnode *vp, const char *str) 5488 { 5489 int locked; 5490 5491 if (!IGNORE_LOCK(vp)) { 5492 locked = VOP_ISLOCKED(vp); 5493 if (locked == 0 || locked == LK_EXCLOTHER) 5494 vfs_badlock("is not locked but should be", str, vp); 5495 } 5496 } 5497 5498 void 5499 assert_vop_unlocked(struct vnode *vp, const char *str) 5500 { 5501 5502 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 5503 vfs_badlock("is locked but should not be", str, vp); 5504 } 5505 5506 void 5507 assert_vop_elocked(struct vnode *vp, const char *str) 5508 { 5509 5510 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 5511 vfs_badlock("is not exclusive locked but should be", str, vp); 5512 } 5513 #endif /* DEBUG_VFS_LOCKS */ 5514 5515 void 5516 vop_rename_fail(struct vop_rename_args *ap) 5517 { 5518 5519 if (ap->a_tvp != NULL) 5520 vput(ap->a_tvp); 5521 if (ap->a_tdvp == ap->a_tvp) 5522 vrele(ap->a_tdvp); 5523 else 5524 vput(ap->a_tdvp); 5525 vrele(ap->a_fdvp); 5526 vrele(ap->a_fvp); 5527 } 5528 5529 void 
5530 vop_rename_pre(void *ap) 5531 { 5532 struct vop_rename_args *a = ap; 5533 5534 #ifdef DEBUG_VFS_LOCKS 5535 if (a->a_tvp) 5536 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 5537 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 5538 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 5539 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 5540 5541 /* Check the source (from). */ 5542 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && 5543 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) 5544 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 5545 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) 5546 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); 5547 5548 /* Check the target. */ 5549 if (a->a_tvp) 5550 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 5551 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 5552 #endif 5553 /* 5554 * It may be tempting to add vn_seqc_write_begin/end calls here and 5555 * in vop_rename_post but that's not going to work out since some 5556 * filesystems relookup vnodes mid-rename. This is probably a bug. 5557 * 5558 * For now filesystems are expected to do the relevant calls after they 5559 * decide what vnodes to operate on. 5560 */ 5561 if (a->a_tdvp != a->a_fdvp) 5562 vhold(a->a_fdvp); 5563 if (a->a_tvp != a->a_fvp) 5564 vhold(a->a_fvp); 5565 vhold(a->a_tdvp); 5566 if (a->a_tvp) 5567 vhold(a->a_tvp); 5568 } 5569 5570 #ifdef DEBUG_VFS_LOCKS 5571 void 5572 vop_fplookup_vexec_debugpre(void *ap __unused) 5573 { 5574 5575 VFS_SMR_ASSERT_ENTERED(); 5576 } 5577 5578 void 5579 vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused) 5580 { 5581 5582 VFS_SMR_ASSERT_ENTERED(); 5583 } 5584 5585 void 5586 vop_strategy_debugpre(void *ap) 5587 { 5588 struct vop_strategy_args *a; 5589 struct buf *bp; 5590 5591 a = ap; 5592 bp = a->a_bp; 5593 5594 /* 5595 * Cluster ops lock their component buffers but not the IO container. 
5596 */ 5597 if ((bp->b_flags & B_CLUSTER) != 0) 5598 return; 5599 5600 if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { 5601 if (vfs_badlock_print) 5602 printf( 5603 "VOP_STRATEGY: bp is not locked but should be\n"); 5604 if (vfs_badlock_ddb) 5605 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5606 } 5607 } 5608 5609 void 5610 vop_lock_debugpre(void *ap) 5611 { 5612 struct vop_lock1_args *a = ap; 5613 5614 if ((a->a_flags & LK_INTERLOCK) == 0) 5615 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 5616 else 5617 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 5618 } 5619 5620 void 5621 vop_lock_debugpost(void *ap, int rc) 5622 { 5623 struct vop_lock1_args *a = ap; 5624 5625 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 5626 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) 5627 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 5628 } 5629 5630 void 5631 vop_unlock_debugpre(void *ap) 5632 { 5633 struct vop_unlock_args *a = ap; 5634 5635 ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); 5636 } 5637 5638 void 5639 vop_need_inactive_debugpre(void *ap) 5640 { 5641 struct vop_need_inactive_args *a = ap; 5642 5643 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 5644 } 5645 5646 void 5647 vop_need_inactive_debugpost(void *ap, int rc) 5648 { 5649 struct vop_need_inactive_args *a = ap; 5650 5651 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 5652 } 5653 #endif 5654 5655 void 5656 vop_create_pre(void *ap) 5657 { 5658 struct vop_create_args *a; 5659 struct vnode *dvp; 5660 5661 a = ap; 5662 dvp = a->a_dvp; 5663 vn_seqc_write_begin(dvp); 5664 } 5665 5666 void 5667 vop_create_post(void *ap, int rc) 5668 { 5669 struct vop_create_args *a; 5670 struct vnode *dvp; 5671 5672 a = ap; 5673 dvp = a->a_dvp; 5674 vn_seqc_write_end(dvp); 5675 if (!rc) 5676 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5677 } 5678 5679 void 5680 vop_whiteout_pre(void *ap) 5681 { 5682 struct vop_whiteout_args *a; 5683 struct vnode *dvp; 5684 5685 a = ap; 5686 dvp = a->a_dvp; 5687 vn_seqc_write_begin(dvp); 5688 } 5689 5690 void 5691 vop_whiteout_post(void *ap, int rc) 5692 { 5693 struct vop_whiteout_args *a; 5694 struct vnode *dvp; 5695 5696 a = ap; 5697 dvp = a->a_dvp; 5698 vn_seqc_write_end(dvp); 5699 } 5700 5701 void 5702 vop_deleteextattr_pre(void *ap) 5703 { 5704 struct vop_deleteextattr_args *a; 5705 struct vnode *vp; 5706 5707 a = ap; 5708 vp = a->a_vp; 5709 vn_seqc_write_begin(vp); 5710 } 5711 5712 void 5713 vop_deleteextattr_post(void *ap, int rc) 5714 { 5715 struct vop_deleteextattr_args *a; 5716 struct vnode *vp; 5717 5718 a = ap; 5719 vp = a->a_vp; 5720 vn_seqc_write_end(vp); 5721 if (!rc) 5722 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 5723 } 5724 5725 void 5726 vop_link_pre(void *ap) 5727 { 5728 struct vop_link_args *a; 5729 struct vnode *vp, *tdvp; 5730 5731 a = ap; 5732 vp = a->a_vp; 5733 tdvp = a->a_tdvp; 5734 vn_seqc_write_begin(vp); 5735 vn_seqc_write_begin(tdvp); 5736 } 5737 5738 void 5739 vop_link_post(void *ap, int rc) 5740 { 5741 struct vop_link_args *a; 5742 struct vnode *vp, *tdvp; 5743 5744 a = ap; 5745 vp = a->a_vp; 5746 tdvp = a->a_tdvp; 5747 vn_seqc_write_end(vp); 5748 vn_seqc_write_end(tdvp); 5749 if (!rc) { 5750 VFS_KNOTE_LOCKED(vp, NOTE_LINK); 5751 VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); 5752 } 5753 } 5754 5755 void 5756 vop_mkdir_pre(void *ap) 5757 { 5758 struct vop_mkdir_args *a; 5759 struct vnode *dvp; 5760 5761 a = ap; 5762 dvp = a->a_dvp; 5763 vn_seqc_write_begin(dvp); 5764 } 5765 5766 void 5767 vop_mkdir_post(void *ap, int rc) 5768 { 5769 struct vop_mkdir_args *a; 5770 struct vnode *dvp; 5771 5772 a = ap; 5773 dvp = a->a_dvp; 5774 vn_seqc_write_end(dvp); 
5775 if (!rc) 5776 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 5777 } 5778 5779 void 5780 vop_mknod_pre(void *ap) 5781 { 5782 struct vop_mknod_args *a; 5783 struct vnode *dvp; 5784 5785 a = ap; 5786 dvp = a->a_dvp; 5787 vn_seqc_write_begin(dvp); 5788 } 5789 5790 void 5791 vop_mknod_post(void *ap, int rc) 5792 { 5793 struct vop_mknod_args *a; 5794 struct vnode *dvp; 5795 5796 a = ap; 5797 dvp = a->a_dvp; 5798 vn_seqc_write_end(dvp); 5799 if (!rc) 5800 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5801 } 5802 5803 void 5804 vop_reclaim_post(void *ap, int rc) 5805 { 5806 struct vop_reclaim_args *a; 5807 struct vnode *vp; 5808 5809 a = ap; 5810 vp = a->a_vp; 5811 ASSERT_VOP_IN_SEQC(vp); 5812 if (!rc) 5813 VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); 5814 } 5815 5816 void 5817 vop_remove_pre(void *ap) 5818 { 5819 struct vop_remove_args *a; 5820 struct vnode *dvp, *vp; 5821 5822 a = ap; 5823 dvp = a->a_dvp; 5824 vp = a->a_vp; 5825 vn_seqc_write_begin(dvp); 5826 vn_seqc_write_begin(vp); 5827 } 5828 5829 void 5830 vop_remove_post(void *ap, int rc) 5831 { 5832 struct vop_remove_args *a; 5833 struct vnode *dvp, *vp; 5834 5835 a = ap; 5836 dvp = a->a_dvp; 5837 vp = a->a_vp; 5838 vn_seqc_write_end(dvp); 5839 vn_seqc_write_end(vp); 5840 if (!rc) { 5841 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5842 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 5843 } 5844 } 5845 5846 void 5847 vop_rename_post(void *ap, int rc) 5848 { 5849 struct vop_rename_args *a = ap; 5850 long hint; 5851 5852 if (!rc) { 5853 hint = NOTE_WRITE; 5854 if (a->a_fdvp == a->a_tdvp) { 5855 if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) 5856 hint |= NOTE_LINK; 5857 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 5858 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 5859 } else { 5860 hint |= NOTE_EXTEND; 5861 if (a->a_fvp->v_type == VDIR) 5862 hint |= NOTE_LINK; 5863 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 5864 5865 if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && 5866 a->a_tvp->v_type == VDIR) 5867 hint &= ~NOTE_LINK; 5868 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 5869 } 5870 5871 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 5872 if (a->a_tvp) 5873 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 5874 } 5875 if (a->a_tdvp != a->a_fdvp) 5876 vdrop(a->a_fdvp); 5877 if (a->a_tvp != a->a_fvp) 5878 vdrop(a->a_fvp); 5879 vdrop(a->a_tdvp); 5880 if (a->a_tvp) 5881 vdrop(a->a_tvp); 5882 } 5883 5884 void 5885 vop_rmdir_pre(void *ap) 5886 { 5887 struct vop_rmdir_args *a; 5888 struct vnode *dvp, *vp; 5889 5890 a = ap; 5891 dvp = a->a_dvp; 5892 vp = a->a_vp; 5893 vn_seqc_write_begin(dvp); 5894 vn_seqc_write_begin(vp); 5895 } 5896 5897 void 5898 vop_rmdir_post(void *ap, int rc) 5899 { 5900 struct vop_rmdir_args *a; 5901 struct vnode *dvp, *vp; 5902 5903 a = ap; 5904 dvp = a->a_dvp; 5905 vp = a->a_vp; 5906 vn_seqc_write_end(dvp); 5907 vn_seqc_write_end(vp); 5908 if (!rc) { 5909 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 5910 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 5911 } 5912 } 5913 5914 void 5915 vop_setattr_pre(void *ap) 5916 { 5917 struct vop_setattr_args *a; 5918 struct vnode *vp; 5919 5920 a = ap; 5921 vp = a->a_vp; 5922 vn_seqc_write_begin(vp); 5923 } 5924 5925 void 5926 vop_setattr_post(void *ap, int rc) 5927 { 5928 struct vop_setattr_args *a; 5929 struct vnode *vp; 5930 5931 a = ap; 5932 vp = a->a_vp; 5933 vn_seqc_write_end(vp); 5934 if (!rc) 5935 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 5936 } 5937 5938 void 5939 vop_setacl_pre(void *ap) 5940 { 5941 struct vop_setacl_args *a; 5942 struct vnode *vp; 5943 5944 a = ap; 5945 vp = a->a_vp; 5946 vn_seqc_write_begin(vp); 5947 } 5948 5949 void 5950 vop_setacl_post(void *ap, int 
rc __unused) 5951 { 5952 struct vop_setacl_args *a; 5953 struct vnode *vp; 5954 5955 a = ap; 5956 vp = a->a_vp; 5957 vn_seqc_write_end(vp); 5958 } 5959 5960 void 5961 vop_setextattr_pre(void *ap) 5962 { 5963 struct vop_setextattr_args *a; 5964 struct vnode *vp; 5965 5966 a = ap; 5967 vp = a->a_vp; 5968 vn_seqc_write_begin(vp); 5969 } 5970 5971 void 5972 vop_setextattr_post(void *ap, int rc) 5973 { 5974 struct vop_setextattr_args *a; 5975 struct vnode *vp; 5976 5977 a = ap; 5978 vp = a->a_vp; 5979 vn_seqc_write_end(vp); 5980 if (!rc) 5981 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 5982 } 5983 5984 void 5985 vop_symlink_pre(void *ap) 5986 { 5987 struct vop_symlink_args *a; 5988 struct vnode *dvp; 5989 5990 a = ap; 5991 dvp = a->a_dvp; 5992 vn_seqc_write_begin(dvp); 5993 } 5994 5995 void 5996 vop_symlink_post(void *ap, int rc) 5997 { 5998 struct vop_symlink_args *a; 5999 struct vnode *dvp; 6000 6001 a = ap; 6002 dvp = a->a_dvp; 6003 vn_seqc_write_end(dvp); 6004 if (!rc) 6005 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6006 } 6007 6008 void 6009 vop_open_post(void *ap, int rc) 6010 { 6011 struct vop_open_args *a = ap; 6012 6013 if (!rc) 6014 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); 6015 } 6016 6017 void 6018 vop_close_post(void *ap, int rc) 6019 { 6020 struct vop_close_args *a = ap; 6021 6022 if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ 6023 !VN_IS_DOOMED(a->a_vp))) { 6024 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 6025 NOTE_CLOSE_WRITE : NOTE_CLOSE); 6026 } 6027 } 6028 6029 void 6030 vop_read_post(void *ap, int rc) 6031 { 6032 struct vop_read_args *a = ap; 6033 6034 if (!rc) 6035 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6036 } 6037 6038 void 6039 vop_readdir_post(void *ap, int rc) 6040 { 6041 struct vop_readdir_args *a = ap; 6042 6043 if (!rc) 6044 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6045 } 6046 6047 static struct knlist fs_knlist; 6048 6049 static void 6050 vfs_event_init(void *arg) 6051 { 6052 knlist_init_mtx(&fs_knlist, NULL); 6053 } 6054 /* XXX - correct order? */ 6055 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 6056 6057 void 6058 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 6059 { 6060 6061 KNOTE_UNLOCKED(&fs_knlist, event); 6062 } 6063 6064 static int filt_fsattach(struct knote *kn); 6065 static void filt_fsdetach(struct knote *kn); 6066 static int filt_fsevent(struct knote *kn, long hint); 6067 6068 struct filterops fs_filtops = { 6069 .f_isfd = 0, 6070 .f_attach = filt_fsattach, 6071 .f_detach = filt_fsdetach, 6072 .f_event = filt_fsevent 6073 }; 6074 6075 static int 6076 filt_fsattach(struct knote *kn) 6077 { 6078 6079 kn->kn_flags |= EV_CLEAR; 6080 knlist_add(&fs_knlist, kn, 0); 6081 return (0); 6082 } 6083 6084 static void 6085 filt_fsdetach(struct knote *kn) 6086 { 6087 6088 knlist_remove(&fs_knlist, kn, 0); 6089 } 6090 6091 static int 6092 filt_fsevent(struct knote *kn, long hint) 6093 { 6094 6095 kn->kn_fflags |= hint; 6096 return (kn->kn_fflags != 0); 6097 } 6098 6099 static int 6100 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 6101 { 6102 struct vfsidctl vc; 6103 int error; 6104 struct mount *mp; 6105 6106 error = SYSCTL_IN(req, &vc, sizeof(vc)); 6107 if (error) 6108 return (error); 6109 if (vc.vc_vers != VFS_CTL_VERS1) 6110 return (EINVAL); 6111 mp = vfs_getvfs(&vc.vc_fsid); 6112 if (mp == NULL) 6113 return (ENOENT); 6114 /* ensure that a specific sysctl goes to the right filesystem. 
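	 * A vc_fstypename of "*" is accepted as a wildcard for any
	 * filesystem type; anything else must match the mount's vfc_name
	 * exactly, otherwise the request is rejected with EINVAL below.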
*/ 6115 if (strcmp(vc.vc_fstypename, "*") != 0 && 6116 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 6117 vfs_rel(mp); 6118 return (EINVAL); 6119 } 6120 VCTLTOREQ(&vc, req); 6121 error = VFS_SYSCTL(mp, vc.vc_op, req); 6122 vfs_rel(mp); 6123 return (error); 6124 } 6125 6126 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, 6127 NULL, 0, sysctl_vfs_ctl, "", 6128 "Sysctl by fsid"); 6129 6130 /* 6131 * Function to initialize a va_filerev field sensibly. 6132 * XXX: Wouldn't a random number make a lot more sense ?? 6133 */ 6134 u_quad_t 6135 init_va_filerev(void) 6136 { 6137 struct bintime bt; 6138 6139 getbinuptime(&bt); 6140 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 6141 } 6142 6143 static int filt_vfsread(struct knote *kn, long hint); 6144 static int filt_vfswrite(struct knote *kn, long hint); 6145 static int filt_vfsvnode(struct knote *kn, long hint); 6146 static void filt_vfsdetach(struct knote *kn); 6147 static struct filterops vfsread_filtops = { 6148 .f_isfd = 1, 6149 .f_detach = filt_vfsdetach, 6150 .f_event = filt_vfsread 6151 }; 6152 static struct filterops vfswrite_filtops = { 6153 .f_isfd = 1, 6154 .f_detach = filt_vfsdetach, 6155 .f_event = filt_vfswrite 6156 }; 6157 static struct filterops vfsvnode_filtops = { 6158 .f_isfd = 1, 6159 .f_detach = filt_vfsdetach, 6160 .f_event = filt_vfsvnode 6161 }; 6162 6163 static void 6164 vfs_knllock(void *arg) 6165 { 6166 struct vnode *vp = arg; 6167 6168 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 6169 } 6170 6171 static void 6172 vfs_knlunlock(void *arg) 6173 { 6174 struct vnode *vp = arg; 6175 6176 VOP_UNLOCK(vp); 6177 } 6178 6179 static void 6180 vfs_knl_assert_locked(void *arg) 6181 { 6182 #ifdef DEBUG_VFS_LOCKS 6183 struct vnode *vp = arg; 6184 6185 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); 6186 #endif 6187 } 6188 6189 static void 6190 vfs_knl_assert_unlocked(void *arg) 6191 { 6192 #ifdef DEBUG_VFS_LOCKS 6193 struct vnode *vp = arg; 6194 6195 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); 6196 #endif 6197 } 6198 6199 int 6200 vfs_kqfilter(struct vop_kqfilter_args *ap) 6201 { 6202 struct vnode *vp = ap->a_vp; 6203 struct knote *kn = ap->a_kn; 6204 struct knlist *knl; 6205 6206 switch (kn->kn_filter) { 6207 case EVFILT_READ: 6208 kn->kn_fop = &vfsread_filtops; 6209 break; 6210 case EVFILT_WRITE: 6211 kn->kn_fop = &vfswrite_filtops; 6212 break; 6213 case EVFILT_VNODE: 6214 kn->kn_fop = &vfsvnode_filtops; 6215 break; 6216 default: 6217 return (EINVAL); 6218 } 6219 6220 kn->kn_hook = (caddr_t)vp; 6221 6222 v_addpollinfo(vp); 6223 if (vp->v_pollinfo == NULL) 6224 return (ENOMEM); 6225 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 6226 vhold(vp); 6227 knlist_add(knl, kn, 0); 6228 6229 return (0); 6230 } 6231 6232 /* 6233 * Detach knote from vnode 6234 */ 6235 static void 6236 filt_vfsdetach(struct knote *kn) 6237 { 6238 struct vnode *vp = (struct vnode *)kn->kn_hook; 6239 6240 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 6241 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 6242 vdrop(vp); 6243 } 6244 6245 /*ARGSUSED*/ 6246 static int 6247 filt_vfsread(struct knote *kn, long hint) 6248 { 6249 struct vnode *vp = (struct vnode *)kn->kn_hook; 6250 struct vattr va; 6251 int res; 6252 6253 /* 6254 * filesystem is gone, so set the EOF flag and schedule 6255 * the knote for deletion. 
6256 */ 6257 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6258 VI_LOCK(vp); 6259 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6260 VI_UNLOCK(vp); 6261 return (1); 6262 } 6263 6264 if (VOP_GETATTR(vp, &va, curthread->td_ucred)) 6265 return (0); 6266 6267 VI_LOCK(vp); 6268 kn->kn_data = va.va_size - kn->kn_fp->f_offset; 6269 res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; 6270 VI_UNLOCK(vp); 6271 return (res); 6272 } 6273 6274 /*ARGSUSED*/ 6275 static int 6276 filt_vfswrite(struct knote *kn, long hint) 6277 { 6278 struct vnode *vp = (struct vnode *)kn->kn_hook; 6279 6280 VI_LOCK(vp); 6281 6282 /* 6283 * filesystem is gone, so set the EOF flag and schedule 6284 * the knote for deletion. 6285 */ 6286 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) 6287 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6288 6289 kn->kn_data = 0; 6290 VI_UNLOCK(vp); 6291 return (1); 6292 } 6293 6294 static int 6295 filt_vfsvnode(struct knote *kn, long hint) 6296 { 6297 struct vnode *vp = (struct vnode *)kn->kn_hook; 6298 int res; 6299 6300 VI_LOCK(vp); 6301 if (kn->kn_sfflags & hint) 6302 kn->kn_fflags |= hint; 6303 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6304 kn->kn_flags |= EV_EOF; 6305 VI_UNLOCK(vp); 6306 return (1); 6307 } 6308 res = (kn->kn_fflags != 0); 6309 VI_UNLOCK(vp); 6310 return (res); 6311 } 6312 6313 /* 6314 * Returns whether the directory is empty or not. 6315 * If it is empty, the return value is 0; otherwise 6316 * the return value is an error value (which may 6317 * be ENOTEMPTY). 6318 */ 6319 int 6320 vfs_emptydir(struct vnode *vp) 6321 { 6322 struct uio uio; 6323 struct iovec iov; 6324 struct dirent *dirent, *dp, *endp; 6325 int error, eof; 6326 6327 error = 0; 6328 eof = 0; 6329 6330 ASSERT_VOP_LOCKED(vp, "vfs_emptydir"); 6331 6332 dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK); 6333 iov.iov_base = dirent; 6334 iov.iov_len = sizeof(struct dirent); 6335 6336 uio.uio_iov = &iov; 6337 uio.uio_iovcnt = 1; 6338 uio.uio_offset = 0; 6339 uio.uio_resid = sizeof(struct dirent); 6340 uio.uio_segflg = UIO_SYSSPACE; 6341 uio.uio_rw = UIO_READ; 6342 uio.uio_td = curthread; 6343 6344 while (eof == 0 && error == 0) { 6345 error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof, 6346 NULL, NULL); 6347 if (error != 0) 6348 break; 6349 endp = (void *)((uint8_t *)dirent + 6350 sizeof(struct dirent) - uio.uio_resid); 6351 for (dp = dirent; dp < endp; 6352 dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) { 6353 if (dp->d_type == DT_WHT) 6354 continue; 6355 if (dp->d_namlen == 0) 6356 continue; 6357 if (dp->d_type != DT_DIR && 6358 dp->d_type != DT_UNKNOWN) { 6359 error = ENOTEMPTY; 6360 break; 6361 } 6362 if (dp->d_namlen > 2) { 6363 error = ENOTEMPTY; 6364 break; 6365 } 6366 if (dp->d_namlen == 1 && 6367 dp->d_name[0] != '.') { 6368 error = ENOTEMPTY; 6369 break; 6370 } 6371 if (dp->d_namlen == 2 && 6372 dp->d_name[1] != '.') { 6373 error = ENOTEMPTY; 6374 break; 6375 } 6376 uio.uio_resid = sizeof(struct dirent); 6377 } 6378 } 6379 free(dirent, M_TEMP); 6380 return (error); 6381 } 6382 6383 int 6384 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) 6385 { 6386 int error; 6387 6388 if (dp->d_reclen > ap->a_uio->uio_resid) 6389 return (ENAMETOOLONG); 6390 error = uiomove(dp, dp->d_reclen, ap->a_uio); 6391 if (error) { 6392 if (ap->a_ncookies != NULL) { 6393 if (ap->a_cookies != NULL) 6394 free(ap->a_cookies, M_TEMP); 6395 ap->a_cookies = NULL; 6396 *ap->a_ncookies = 0; 6397 } 6398 return (error); 6399 } 6400 if 
(ap->a_ncookies == NULL) 6401 return (0); 6402 6403 KASSERT(ap->a_cookies, 6404 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); 6405 6406 *ap->a_cookies = realloc(*ap->a_cookies, 6407 (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); 6408 (*ap->a_cookies)[*ap->a_ncookies] = off; 6409 *ap->a_ncookies += 1; 6410 return (0); 6411 } 6412 6413 /* 6414 * The purpose of this routine is to remove granularity from accmode_t, 6415 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, 6416 * VADMIN and VAPPEND. 6417 * 6418 * If it returns 0, the caller is supposed to continue with the usual 6419 * access checks using 'accmode' as modified by this routine. If it 6420 * returns nonzero value, the caller is supposed to return that value 6421 * as errno. 6422 * 6423 * Note that after this routine runs, accmode may be zero. 6424 */ 6425 int 6426 vfs_unixify_accmode(accmode_t *accmode) 6427 { 6428 /* 6429 * There is no way to specify explicit "deny" rule using 6430 * file mode or POSIX.1e ACLs. 6431 */ 6432 if (*accmode & VEXPLICIT_DENY) { 6433 *accmode = 0; 6434 return (0); 6435 } 6436 6437 /* 6438 * None of these can be translated into usual access bits. 6439 * Also, the common case for NFSv4 ACLs is to not contain 6440 * either of these bits. Caller should check for VWRITE 6441 * on the containing directory instead. 6442 */ 6443 if (*accmode & (VDELETE_CHILD | VDELETE)) 6444 return (EPERM); 6445 6446 if (*accmode & VADMIN_PERMS) { 6447 *accmode &= ~VADMIN_PERMS; 6448 *accmode |= VADMIN; 6449 } 6450 6451 /* 6452 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL 6453 * or VSYNCHRONIZE using file mode or POSIX.1e ACL. 6454 */ 6455 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); 6456 6457 return (0); 6458 } 6459 6460 /* 6461 * Clear out a doomed vnode (if any) and replace it with a new one as long 6462 * as the fs is not being unmounted. Return the root vnode to the caller. 6463 */ 6464 static int __noinline 6465 vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) 6466 { 6467 struct vnode *vp; 6468 int error; 6469 6470 restart: 6471 if (mp->mnt_rootvnode != NULL) { 6472 MNT_ILOCK(mp); 6473 vp = mp->mnt_rootvnode; 6474 if (vp != NULL) { 6475 if (!VN_IS_DOOMED(vp)) { 6476 vrefact(vp); 6477 MNT_IUNLOCK(mp); 6478 error = vn_lock(vp, flags); 6479 if (error == 0) { 6480 *vpp = vp; 6481 return (0); 6482 } 6483 vrele(vp); 6484 goto restart; 6485 } 6486 /* 6487 * Clear the old one. 
6488 */ 6489 mp->mnt_rootvnode = NULL; 6490 } 6491 MNT_IUNLOCK(mp); 6492 if (vp != NULL) { 6493 vfs_op_barrier_wait(mp); 6494 vrele(vp); 6495 } 6496 } 6497 error = VFS_CACHEDROOT(mp, flags, vpp); 6498 if (error != 0) 6499 return (error); 6500 if (mp->mnt_vfs_ops == 0) { 6501 MNT_ILOCK(mp); 6502 if (mp->mnt_vfs_ops != 0) { 6503 MNT_IUNLOCK(mp); 6504 return (0); 6505 } 6506 if (mp->mnt_rootvnode == NULL) { 6507 vrefact(*vpp); 6508 mp->mnt_rootvnode = *vpp; 6509 } else { 6510 if (mp->mnt_rootvnode != *vpp) { 6511 if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { 6512 panic("%s: mismatch between vnode returned " 6513 " by VFS_CACHEDROOT and the one cached " 6514 " (%p != %p)", 6515 __func__, *vpp, mp->mnt_rootvnode); 6516 } 6517 } 6518 } 6519 MNT_IUNLOCK(mp); 6520 } 6521 return (0); 6522 } 6523 6524 int 6525 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) 6526 { 6527 struct vnode *vp; 6528 int error; 6529 6530 if (!vfs_op_thread_enter(mp)) 6531 return (vfs_cache_root_fallback(mp, flags, vpp)); 6532 vp = atomic_load_ptr(&mp->mnt_rootvnode); 6533 if (vp == NULL || VN_IS_DOOMED(vp)) { 6534 vfs_op_thread_exit(mp); 6535 return (vfs_cache_root_fallback(mp, flags, vpp)); 6536 } 6537 vrefact(vp); 6538 vfs_op_thread_exit(mp); 6539 error = vn_lock(vp, flags); 6540 if (error != 0) { 6541 vrele(vp); 6542 return (vfs_cache_root_fallback(mp, flags, vpp)); 6543 } 6544 *vpp = vp; 6545 return (0); 6546 } 6547 6548 struct vnode * 6549 vfs_cache_root_clear(struct mount *mp) 6550 { 6551 struct vnode *vp; 6552 6553 /* 6554 * ops > 0 guarantees there is nobody who can see this vnode 6555 */ 6556 MPASS(mp->mnt_vfs_ops > 0); 6557 vp = mp->mnt_rootvnode; 6558 if (vp != NULL) 6559 vn_seqc_write_begin(vp); 6560 mp->mnt_rootvnode = NULL; 6561 return (vp); 6562 } 6563 6564 void 6565 vfs_cache_root_set(struct mount *mp, struct vnode *vp) 6566 { 6567 6568 MPASS(mp->mnt_vfs_ops > 0); 6569 vrefact(vp); 6570 mp->mnt_rootvnode = vp; 6571 } 6572 6573 /* 6574 * These are helper functions for filesystems to traverse all 6575 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. 6576 * 6577 * This interface replaces MNT_VNODE_FOREACH. 6578 */ 6579 6580 struct vnode * 6581 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) 6582 { 6583 struct vnode *vp; 6584 6585 if (should_yield()) 6586 kern_yield(PRI_USER); 6587 MNT_ILOCK(mp); 6588 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6589 for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; 6590 vp = TAILQ_NEXT(vp, v_nmntvnodes)) { 6591 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ 6592 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 6593 continue; 6594 VI_LOCK(vp); 6595 if (VN_IS_DOOMED(vp)) { 6596 VI_UNLOCK(vp); 6597 continue; 6598 } 6599 break; 6600 } 6601 if (vp == NULL) { 6602 __mnt_vnode_markerfree_all(mvp, mp); 6603 /* MNT_IUNLOCK(mp); -- done in above function */ 6604 mtx_assert(MNT_MTX(mp), MA_NOTOWNED); 6605 return (NULL); 6606 } 6607 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 6608 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 6609 MNT_IUNLOCK(mp); 6610 return (vp); 6611 } 6612 6613 struct vnode * 6614 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) 6615 { 6616 struct vnode *vp; 6617 6618 *mvp = vn_alloc_marker(mp); 6619 MNT_ILOCK(mp); 6620 MNT_REF(mp); 6621 6622 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 6623 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. 
*/ 6624 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 6625 continue; 6626 VI_LOCK(vp); 6627 if (VN_IS_DOOMED(vp)) { 6628 VI_UNLOCK(vp); 6629 continue; 6630 } 6631 break; 6632 } 6633 if (vp == NULL) { 6634 MNT_REL(mp); 6635 MNT_IUNLOCK(mp); 6636 vn_free_marker(*mvp); 6637 *mvp = NULL; 6638 return (NULL); 6639 } 6640 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 6641 MNT_IUNLOCK(mp); 6642 return (vp); 6643 } 6644 6645 void 6646 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) 6647 { 6648 6649 if (*mvp == NULL) { 6650 MNT_IUNLOCK(mp); 6651 return; 6652 } 6653 6654 mtx_assert(MNT_MTX(mp), MA_OWNED); 6655 6656 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6657 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 6658 MNT_REL(mp); 6659 MNT_IUNLOCK(mp); 6660 vn_free_marker(*mvp); 6661 *mvp = NULL; 6662 } 6663 6664 /* 6665 * These are helper functions for filesystems to traverse their 6666 * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h 6667 */ 6668 static void 6669 mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 6670 { 6671 6672 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6673 6674 MNT_ILOCK(mp); 6675 MNT_REL(mp); 6676 MNT_IUNLOCK(mp); 6677 vn_free_marker(*mvp); 6678 *mvp = NULL; 6679 } 6680 6681 /* 6682 * Relock the mp mount vnode list lock with the vp vnode interlock in the 6683 * conventional lock order during mnt_vnode_next_lazy iteration. 6684 * 6685 * On entry, the mount vnode list lock is held and the vnode interlock is not. 6686 * The list lock is dropped and reacquired. On success, both locks are held. 6687 * On failure, the mount vnode list lock is held but the vnode interlock is 6688 * not, and the procedure may have yielded. 6689 */ 6690 static bool 6691 mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, 6692 struct vnode *vp) 6693 { 6694 6695 VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && 6696 TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, 6697 ("%s: bad marker", __func__)); 6698 VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, 6699 ("%s: inappropriate vnode", __func__)); 6700 ASSERT_VI_UNLOCKED(vp, __func__); 6701 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 6702 6703 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); 6704 TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); 6705 6706 /* 6707 * Note we may be racing against vdrop which transitioned the hold 6708 * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, 6709 * if we are the only user after we get the interlock we will just 6710 * vdrop. 6711 */ 6712 vhold(vp); 6713 mtx_unlock(&mp->mnt_listmtx); 6714 VI_LOCK(vp); 6715 if (VN_IS_DOOMED(vp)) { 6716 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 6717 goto out_lost; 6718 } 6719 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 6720 /* 6721 * There is nothing to do if we are the last user. 
6722 */ 6723 if (!refcount_release_if_not_last(&vp->v_holdcnt)) 6724 goto out_lost; 6725 mtx_lock(&mp->mnt_listmtx); 6726 return (true); 6727 out_lost: 6728 vdropl(vp); 6729 maybe_yield(); 6730 mtx_lock(&mp->mnt_listmtx); 6731 return (false); 6732 } 6733 6734 static struct vnode * 6735 mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 6736 void *cbarg) 6737 { 6738 struct vnode *vp; 6739 6740 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 6741 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6742 restart: 6743 vp = TAILQ_NEXT(*mvp, v_lazylist); 6744 while (vp != NULL) { 6745 if (vp->v_type == VMARKER) { 6746 vp = TAILQ_NEXT(vp, v_lazylist); 6747 continue; 6748 } 6749 /* 6750 * See if we want to process the vnode. Note we may encounter a 6751 * long string of vnodes we don't care about and hog the list 6752 * as a result. Check for it and requeue the marker. 6753 */ 6754 VNPASS(!VN_IS_DOOMED(vp), vp); 6755 if (!cb(vp, cbarg)) { 6756 if (!should_yield()) { 6757 vp = TAILQ_NEXT(vp, v_lazylist); 6758 continue; 6759 } 6760 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, 6761 v_lazylist); 6762 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, 6763 v_lazylist); 6764 mtx_unlock(&mp->mnt_listmtx); 6765 kern_yield(PRI_USER); 6766 mtx_lock(&mp->mnt_listmtx); 6767 goto restart; 6768 } 6769 /* 6770 * Try-lock because this is the wrong lock order. 6771 */ 6772 if (!VI_TRYLOCK(vp) && 6773 !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) 6774 goto restart; 6775 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); 6776 KASSERT(vp->v_mount == mp || vp->v_mount == NULL, 6777 ("alien vnode on the lazy list %p %p", vp, mp)); 6778 VNPASS(vp->v_mount == mp, vp); 6779 VNPASS(!VN_IS_DOOMED(vp), vp); 6780 break; 6781 } 6782 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 6783 6784 /* Check if we are done */ 6785 if (vp == NULL) { 6786 mtx_unlock(&mp->mnt_listmtx); 6787 mnt_vnode_markerfree_lazy(mvp, mp); 6788 return (NULL); 6789 } 6790 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); 6791 mtx_unlock(&mp->mnt_listmtx); 6792 ASSERT_VI_LOCKED(vp, "lazy iter"); 6793 return (vp); 6794 } 6795 6796 struct vnode * 6797 __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 6798 void *cbarg) 6799 { 6800 6801 if (should_yield()) 6802 kern_yield(PRI_USER); 6803 mtx_lock(&mp->mnt_listmtx); 6804 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 6805 } 6806 6807 struct vnode * 6808 __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 6809 void *cbarg) 6810 { 6811 struct vnode *vp; 6812 6813 if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) 6814 return (NULL); 6815 6816 *mvp = vn_alloc_marker(mp); 6817 MNT_ILOCK(mp); 6818 MNT_REF(mp); 6819 MNT_IUNLOCK(mp); 6820 6821 mtx_lock(&mp->mnt_listmtx); 6822 vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); 6823 if (vp == NULL) { 6824 mtx_unlock(&mp->mnt_listmtx); 6825 mnt_vnode_markerfree_lazy(mvp, mp); 6826 return (NULL); 6827 } 6828 TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); 6829 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 6830 } 6831 6832 void 6833 __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 6834 { 6835 6836 if (*mvp == NULL) 6837 return; 6838 6839 mtx_lock(&mp->mnt_listmtx); 6840 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 6841 mtx_unlock(&mp->mnt_listmtx); 6842 mnt_vnode_markerfree_lazy(mvp, mp); 6843 } 6844 6845 int 6846 vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) 6847 { 6848 6849 if ((cnp->cn_flags & NOEXECCHECK) != 0) { 6850 
cnp->cn_flags &= ~NOEXECCHECK; 6851 return (0); 6852 } 6853 6854 return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, cnp->cn_thread)); 6855 } 6856 6857 /* 6858 * Do not use this variant unless you have means other than the hold count 6859 * to prevent the vnode from getting freed. 6860 */ 6861 void 6862 vn_seqc_write_begin_unheld_locked(struct vnode *vp) 6863 { 6864 6865 ASSERT_VI_LOCKED(vp, __func__); 6866 VNPASS(vp->v_seqc_users >= 0, vp); 6867 vp->v_seqc_users++; 6868 if (vp->v_seqc_users == 1) 6869 seqc_sleepable_write_begin(&vp->v_seqc); 6870 } 6871 6872 void 6873 vn_seqc_write_begin_locked(struct vnode *vp) 6874 { 6875 6876 ASSERT_VI_LOCKED(vp, __func__); 6877 VNPASS(vp->v_holdcnt > 0, vp); 6878 vn_seqc_write_begin_unheld_locked(vp); 6879 } 6880 6881 void 6882 vn_seqc_write_begin(struct vnode *vp) 6883 { 6884 6885 VI_LOCK(vp); 6886 vn_seqc_write_begin_locked(vp); 6887 VI_UNLOCK(vp); 6888 } 6889 6890 void 6891 vn_seqc_write_begin_unheld(struct vnode *vp) 6892 { 6893 6894 VI_LOCK(vp); 6895 vn_seqc_write_begin_unheld_locked(vp); 6896 VI_UNLOCK(vp); 6897 } 6898 6899 void 6900 vn_seqc_write_end_locked(struct vnode *vp) 6901 { 6902 6903 ASSERT_VI_LOCKED(vp, __func__); 6904 VNPASS(vp->v_seqc_users > 0, vp); 6905 vp->v_seqc_users--; 6906 if (vp->v_seqc_users == 0) 6907 seqc_sleepable_write_end(&vp->v_seqc); 6908 } 6909 6910 void 6911 vn_seqc_write_end(struct vnode *vp) 6912 { 6913 6914 VI_LOCK(vp); 6915 vn_seqc_write_end_locked(vp); 6916 VI_UNLOCK(vp); 6917 } 6918
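/*
 * Editorial sketch, not part of the original file: the vop_*_pre()/_post()
 * hooks earlier in this file and the vn_seqc_write_*() helpers above all
 * follow one pattern: bracket a metadata-changing VOP with a seqc write
 * section on every vnode it may modify, then post a knote only when the
 * operation succeeded.  The hypothetical vop_foo_pre()/vop_foo_post() pair
 * below illustrates that pattern; "foo", struct vop_foo_args and the
 * NOTE_WRITE hint are placeholders, not an existing VOP.
 */
#if 0
void
vop_foo_pre(void *ap)
{
	struct vop_foo_args *a;
	struct vnode *dvp;

	a = ap;
	dvp = a->a_dvp;
	/*
	 * Enter a seqc write section so sequence-counter based (lockless)
	 * consumers treat dvp as undergoing modification.
	 */
	vn_seqc_write_begin(dvp);
}

void
vop_foo_post(void *ap, int rc)
{
	struct vop_foo_args *a;
	struct vnode *dvp;

	a = ap;
	dvp = a->a_dvp;
	/* Leave the write section regardless of the operation's outcome. */
	vn_seqc_write_end(dvp);
	/* Notify kevent listeners only on success. */
	if (!rc)
		VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
}
#endif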