/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 */

/*
 * External virtual filesystem routines
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_watchdog.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/capsicum.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/counter.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/extattr.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/lockf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/pctrie.h>
#include <sys/priv.h>
#include <sys/reboot.h>
#include <sys/refcount.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepqueue.h>
#include <sys/smr.h>
#include <sys/smp.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/watchdog.h>

#include <machine/stdarg.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#include <vm/uma.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

static void	delmntque(struct vnode *vp);
static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
		    int slpflag, int slptimeo);
static void	syncer_shutdown(void *arg, int howto);
static int	vtryrecycle(struct vnode *vp);
static void	v_init_counters(struct vnode *);
static void	v_incr_devcount(struct vnode *);
static void	v_decr_devcount(struct vnode *);
static void	vgonel(struct vnode *);
static void	vfs_knllock(void *arg);
static void	vfs_knlunlock(void *arg);
static void	vfs_knl_assert_locked(void *arg);
static void	vfs_knl_assert_unlocked(void *arg);
static void	destroy_vpollinfo(struct vpollinfo *vi);
static int	v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
		    daddr_t startlbn, daddr_t endlbn);
static void	vnlru_recalc(void);

/*
 * These fences are intended for cases where some synchronization is
 * needed between access of v_iflags and lockless vnode refcount (v_holdcnt
 * and v_usecount) updates.  Access to v_iflags is generally synchronized
 * by the interlock, but we have some internal assertions that check vnode
 * flags without acquiring the lock.  Thus, these fences are INVARIANTS-only
 * for now.
 */
#ifdef INVARIANTS
#define	VNODE_REFCOUNT_FENCE_ACQ()	atomic_thread_fence_acq()
#define	VNODE_REFCOUNT_FENCE_REL()	atomic_thread_fence_rel()
#else
#define	VNODE_REFCOUNT_FENCE_ACQ()
#define	VNODE_REFCOUNT_FENCE_REL()
#endif

/*
 * Number of vnodes in existence.  Increased whenever getnewvnode()
 * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode.
 */
static u_long __exclusive_cache_line numvnodes;

SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
    "Number of vnodes in existence");

static counter_u64_t vnodes_created;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
    "Number of vnodes created by getnewvnode");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};
int vttoif_tab[10] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
};

/*
 * List of allocated vnodes in the system.
 */
static TAILQ_HEAD(freelst, vnode) vnode_list;
static struct vnode *vnode_list_free_marker;
static struct vnode *vnode_list_reclaim_marker;

/*
 * "Free" vnode target.  Free vnodes are rarely completely free, but are
 * just ones that are cheap to recycle.  Usually they are for files which
 * have been stat'd but not read; these usually have inode and namecache
 * data attached to them.  This target is the preferred minimum size of a
 * sub-cache consisting mostly of such files.  The system balances the size
 * of this sub-cache with its complement to try to prevent either from
 * thrashing while the other is relatively inactive.  The targets express
 * a preference for the best balance.
 *
 * "Above" this target there are 2 further targets (watermarks) related
 * to recycling of free vnodes.  In the best-operating case, the cache is
 * exactly full, the free list has size between vlowat and vhiwat above the
 * free target, and recycling from it and normal use maintains this state.
 * Sometimes the free list is below vlowat or even empty, but this state
 * is even better for immediate use provided the cache is not full.
 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
 * ones) to reach one of these states.  The watermarks are currently hard-
 * coded as 4% and 9% of the available space higher.  These and the default
 * of 25% for wantfreevnodes are too large if the memory size is large.
 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
 * whenever vnlru_proc() becomes active.
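 *
 * As a rough worked example (added for exposition, not taken from the
 * original sources): with desiredvnodes = 1,000,000 and the default
 * wantfreevnodes of 25% (250,000), vnlru_recalc() below gives
 * gapvnodes = 750,000, vhiwat = gapvnodes / 11 ~= 68,000 (about 9%) and
 * vlowat = vhiwat / 2 ~= 34,000 (about 4.5%).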
 */
static long wantfreevnodes;
static long __exclusive_cache_line freevnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
    &freevnodes, 0, "Number of \"free\" vnodes");
static long freevnodes_old;

static counter_u64_t recycles_count;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
    "Number of vnodes recycled to meet vnode cache targets");

static counter_u64_t recycles_free_count;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count,
    "Number of free vnodes recycled to meet vnode cache targets");

static counter_u64_t deferred_inact;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact,
    "Number of times inactive processing was deferred");

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/*
 * Lock for any access to the following:
 *	vnode_list
 *	numvnodes
 *	freevnodes
 */
static struct mtx __exclusive_cache_line vnode_list_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

static uma_zone_t buf_trie_zone;
static smr_t buf_trie_smr;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
static uma_zone_t vnodepoll_zone;

__read_frequently smr_t vfs_smr;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, metadata updates are
 * delayed only about half the time that file data is delayed.
 * Similarly, directory updates are more critical, so are only delayed
 * about a third the time that file data is delayed.  Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syncer process).  The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
static int syncer_delayno;
static long syncer_mask;
LIST_HEAD(synclist, bufobj);
static struct synclist *syncer_workitem_pending;
/*
 * The sync_mtx protects:
 *	bo->bo_synclist
 *	sync_vnode_count
 *	syncer_delayno
 *	syncer_state
 *	syncer_workitem_pending
 *	syncer_worklist_len
 *	rushjob
 */
static struct mtx sync_mtx;
static struct cv sync_wakeup;

#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
static int syncdelay = 30;		/* max time to delay syncing data */
static int filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
    "Time to delay syncing files (in seconds)");
static int dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
    "Time to delay syncing directories (in seconds)");
static int metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
    "Time to delay syncing metadata (in seconds)");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
    "Number of times I/O speeded up (rush requests)");

#define	VDBATCH_SIZE 8
struct vdbatch {
	u_int index;
	long freevnodes;
	struct mtx lock;
	struct vnode *tab[VDBATCH_SIZE];
};
DPCPU_DEFINE_STATIC(struct vdbatch, vd);

static void	vdbatch_dequeue(struct vnode *vp);

/*
 * When shutting down the syncer, run it at four times normal speed.
 */
#define SYNCER_SHUTDOWN_SPEEDUP		4
static int sync_vnode_count;
static int syncer_worklist_len;
static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
    syncer_state;

/* Target for maximum number of vnodes. */
u_long desiredvnodes;
static u_long gapvnodes;		/* gap between wanted and desired */
static u_long vhiwat;			/* enough extras after expansion */
static u_long vlowat;			/* minimal extras before expansion */
static u_long vstir;			/* nonzero to stir non-free vnodes */
static volatile int vsmalltrigger = 8;	/* pref to keep if > this many pages */

static u_long vnlru_read_freevnodes(void);

/*
 * Note that no attempt is made to sanitize these parameters.
 */
static int
sysctl_maxvnodes(SYSCTL_HANDLER_ARGS)
{
	u_long val;
	int error;

	val = desiredvnodes;
	error = sysctl_handle_long(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (val == desiredvnodes)
		return (0);
	mtx_lock(&vnode_list_mtx);
	desiredvnodes = val;
	wantfreevnodes = desiredvnodes / 4;
	vnlru_recalc();
	mtx_unlock(&vnode_list_mtx);
	/*
	 * XXX There is no protection against multiple threads changing
	 * desiredvnodes at the same time.  Locking above only helps vnlru and
	 * getnewvnode.
	 */
	vfs_hash_changesize(desiredvnodes);
	cache_changesize(desiredvnodes);
	return (0);
}

SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes,
    "LU", "Target for maximum number of vnodes");

static int
sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS)
{
	u_long val;
	int error;

	val = wantfreevnodes;
	error = sysctl_handle_long(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (val == wantfreevnodes)
		return (0);
	mtx_lock(&vnode_list_mtx);
	wantfreevnodes = val;
	vnlru_recalc();
	mtx_unlock(&vnode_list_mtx);
	return (0);
}

SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes,
    CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes,
    "LU", "Target for minimum number of \"free\" vnodes");

SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
    &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
static int vnlru_nowhere;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");

static int
sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS)
{
	struct vnode *vp;
	struct nameidata nd;
	char *buf;
	unsigned long ndflags;
	int error;

	if (req->newptr == NULL)
		return (EINVAL);
	if (req->newlen >= PATH_MAX)
		return (E2BIG);

	buf = malloc(PATH_MAX, M_TEMP, M_WAITOK);
	error = SYSCTL_IN(req, buf, req->newlen);
	if (error != 0)
		goto out;

	buf[req->newlen] = '\0';

	ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | NOCACHE | SAVENAME;
	NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread);
	if ((error = namei(&nd)) != 0)
		goto out;
	vp = nd.ni_vp;

	if (VN_IS_DOOMED(vp)) {
		/*
		 * This vnode is being recycled.  Return != 0 to let the caller
		 * know that the sysctl had no effect.  Return EAGAIN because a
		 * subsequent call will likely succeed (since namei will create
		 * a new vnode if necessary)
		 */
		error = EAGAIN;
		goto putvnode;
	}

	counter_u64_add(recycles_count, 1);
	vgone(vp);
putvnode:
	NDFREE(&nd, 0);
out:
	free(buf, M_TEMP);
	return (error);
}

static int
sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS)
{
	struct thread *td = curthread;
	struct vnode *vp;
	struct file *fp;
	int error;
	int fd;

	if (req->newptr == NULL)
		return (EBADF);

	error = sysctl_handle_int(oidp, &fd, 0, req);
	if (error != 0)
		return (error);
	error = getvnode(curthread, fd, &cap_fcntl_rights, &fp);
	if (error != 0)
		return (error);
	vp = fp->f_vnode;

	error = vn_lock(vp, LK_EXCLUSIVE);
	if (error != 0)
		goto drop;

	counter_u64_add(recycles_count, 1);
	vgone(vp);
	VOP_UNLOCK(vp);
drop:
	fdrop(fp, td);
	return (error);
}

SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode,
    CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
    sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname");
SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode,
    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
    sysctl_ftry_reclaim_vnode, "I",
    "Try to reclaim a vnode by its file descriptor");

/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
static int vnsz2log;

/*
 * Support for the bufobj clean & dirty pctrie.
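 *
 * Note (added for exposition): the PCTRIE_DEFINE_SMR() instantiation below
 * generates the BUF_PCTRIE_*() helpers used elsewhere in this file, e.g.
 * BUF_PCTRIE_LOOKUP_GE() in bnoreuselist().  The trie is keyed on b_lblkno
 * and its nodes come from buf_trie_zone via buf_trie_alloc()/buf_trie_free().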
 */
static void *
buf_trie_alloc(struct pctrie *ptree)
{
	return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT));
}

static void
buf_trie_free(struct pctrie *ptree, void *node)
{
	uma_zfree_smr(buf_trie_zone, node);
}
PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free,
    buf_trie_smr);

/*
 * Initialize the vnode management data structures.
 *
 * Reevaluate the following cap on the number of vnodes after the physical
 * memory size exceeds 512GB.  In the limit, as the physical memory size
 * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
 */
#ifndef	MAXVNODES_MAX
#define	MAXVNODES_MAX	(512UL * 1024 * 1024 / 64)	/* 8M */
#endif

static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");

static struct vnode *
vn_alloc_marker(struct mount *mp)
{
	struct vnode *vp;

	vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
	vp->v_type = VMARKER;
	vp->v_mount = mp;

	return (vp);
}

static void
vn_free_marker(struct vnode *vp)
{

	MPASS(vp->v_type == VMARKER);
	free(vp, M_VNODE_MARKER);
}

/*
 * Initialize a vnode as it first enters the zone.
 */
static int
vnode_init(void *mem, int size, int flags)
{
	struct vnode *vp;

	vp = mem;
	bzero(vp, size);
	/*
	 * Setup locks.
	 */
	vp->v_vnlock = &vp->v_lock;
	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
	/*
	 * By default, don't allow shared locks unless filesystems opt-in.
	 */
	lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
	    LK_NOSHARE | LK_IS_VNODE);
	/*
	 * Initialize bufobj.
	 */
	bufobj_init(&vp->v_bufobj, vp);
	/*
	 * Initialize namecache.
	 */
	cache_vnode_init(vp);
	/*
	 * Initialize rangelocks.
	 */
	rangelock_init(&vp->v_rl);

	vp->v_dbatchcpu = NOCPU;

	mtx_lock(&vnode_list_mtx);
	TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
	return (0);
}

/*
 * Free a vnode when it is cleared from the zone.
 */
static void
vnode_fini(void *mem, int size)
{
	struct vnode *vp;
	struct bufobj *bo;

	vp = mem;
	vdbatch_dequeue(vp);
	mtx_lock(&vnode_list_mtx);
	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
	mtx_unlock(&vnode_list_mtx);
	rangelock_destroy(&vp->v_rl);
	lockdestroy(vp->v_vnlock);
	mtx_destroy(&vp->v_interlock);
	bo = &vp->v_bufobj;
	rw_destroy(BO_LOCKPTR(bo));
}

/*
 * Provide the size of NFS nclnode and NFS fh for calculation of the
 * vnode memory consumption.  The size is specified directly to
 * eliminate dependency on NFS-private header.
 *
 * Other filesystems may use bigger or smaller (like UFS and ZFS)
 * private inode data, but the NFS-based estimation is ample enough.
 * Still, we care about differences in the size between 64- and 32-bit
 * platforms.
 *
 * Namecache structure size is heuristically
 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
 */
#ifdef _LP64
#define	NFS_NCLNODE_SZ	(528 + 64)
#define	NC_SZ		148
#else
#define	NFS_NCLNODE_SZ	(360 + 32)
#define	NC_SZ		92
#endif

static void
vntblinit(void *dummy __unused)
{
	struct vdbatch *vd;
	int cpu, physvnodes, virtvnodes;
	u_int i;

	/*
	 * Desiredvnodes is a function of the physical memory size and the
	 * kernel's heap size.  Generally speaking, it scales with the
	 * physical memory size.  The ratio of desiredvnodes to the physical
	 * memory size is 1:16 until desiredvnodes exceeds 98,304.
	 * Thereafter, the marginal ratio of desiredvnodes to the physical
	 * memory size is 1:64.  However, desiredvnodes is limited by the
	 * kernel's heap size.  The memory required by desiredvnodes vnodes
	 * and vm objects must not exceed 1/10th of the kernel's heap size.
	 */
	physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
	    3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
	virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
	    sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
	desiredvnodes = min(physvnodes, virtvnodes);
	if (desiredvnodes > MAXVNODES_MAX) {
		if (bootverbose)
			printf("Reducing kern.maxvnodes %lu -> %lu\n",
			    desiredvnodes, MAXVNODES_MAX);
		desiredvnodes = MAXVNODES_MAX;
	}
	wantfreevnodes = desiredvnodes / 4;
	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
	TAILQ_INIT(&vnode_list);
	mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF);
	/*
	 * The lock is taken to appease WITNESS.
	 */
	mtx_lock(&vnode_list_mtx);
	vnlru_recalc();
	mtx_unlock(&vnode_list_mtx);
	vnode_list_free_marker = vn_alloc_marker(NULL);
	TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist);
	vnode_list_reclaim_marker = vn_alloc_marker(NULL);
	TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);
	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
	    vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
	uma_zone_set_smr(vnode_zone, vfs_smr);
	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/*
	 * Preallocate enough nodes to support one-per buf so that
	 * we can not fail an insert.  reassignbuf() callers can not
	 * tolerate the insertion failure.
	 */
	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_NOFREE | UMA_ZONE_SMR);
	buf_trie_smr = uma_zone_get_smr(buf_trie_zone);
	uma_prealloc(buf_trie_zone, nbuf);

	vnodes_created = counter_u64_alloc(M_WAITOK);
	recycles_count = counter_u64_alloc(M_WAITOK);
	recycles_free_count = counter_u64_alloc(M_WAITOK);
	deferred_inact = counter_u64_alloc(M_WAITOK);

	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
	cv_init(&sync_wakeup, "syncer");
	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
		vnsz2log++;
	vnsz2log--;

	CPU_FOREACH(cpu) {
		vd = DPCPU_ID_PTR((cpu), vd);
		bzero(vd, sizeof(*vd));
		mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
	}
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);

/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.
 * Note that mountlist_mtx is not released on failure.
 *
 * vfs_busy() is a custom lock, it can block the caller.
 * vfs_busy() only sleeps if the unmount is active on the mount point.
 * For a mountpoint mp, the vfs_busy-enforced lock is ordered before the
 * lock of any vnode belonging to mp.
 *
 * Lookup uses vfs_busy() to traverse mount points.
 * root fs			var fs
 * / vnode lock		A	/ vnode lock (/var)		D
 * /var vnode lock	B	/log vnode lock(/var/log)	E
 * vfs_busy lock	C	vfs_busy lock			F
 *
 * Within each file system, the lock order is C->A->B and F->D->E.
 *
 * When traversing across mounts, the system follows that lock order:
 *
 *	C->A->B
 *	        |
 *	        +->F->D->E
 *
 * The lookup() process for namei("/var") illustrates the process:
 *  VOP_LOOKUP() obtains B while A is held
 *  vfs_busy() obtains a shared lock on F while A and B are held
 *  vput() releases lock on B
 *  vput() releases lock on A
 *  VFS_ROOT() obtains lock on D while shared lock on F is held
 *  vfs_unbusy() releases shared lock on F
 *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
 *    An attempt to lock A (instead of vp_crossmp) while D is held would
 *    violate the global order, causing deadlocks.
 *
 * dounmount() locks B while F is drained.
 */
int
vfs_busy(struct mount *mp, int flags)
{

	MPASS((flags & ~MBF_MASK) == 0);
	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);

	if (vfs_op_thread_enter(mp)) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
		vfs_mp_count_add_pcpu(mp, ref, 1);
		vfs_mp_count_add_pcpu(mp, lockref, 1);
		vfs_op_thread_exit(mp);
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		return (0);
	}

	MNT_ILOCK(mp);
	vfs_assert_mount_counters(mp);
	MNT_REF(mp);
	/*
	 * If the mount point is currently being unmounted, sleep until the
	 * mount point fate is decided.  If the thread doing the unmounting
	 * fails, it will clear the MNTK_UNMOUNT flag before waking us up,
	 * indicating that this mount point has survived the unmount attempt
	 * and vfs_busy should retry.  Otherwise the unmounter thread will set
	 * the MNTK_REFEXPIRE flag in addition to MNTK_UNMOUNT, indicating
	 * that the mount point is about to be really destroyed.  vfs_busy
	 * needs to release its reference on the mount point in this case and
	 * return with ENOENT, telling the caller that the mount it tried to
	 * busy is no longer valid.
	 */
	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
			MNT_REL(mp);
			MNT_IUNLOCK(mp);
			CTR1(KTR_VFS, "%s: failed busying before sleeping",
			    __func__);
			return (ENOENT);
		}
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
		if (flags & MBF_MNTLSTLOCK)
			mtx_lock(&mountlist_mtx);
		MNT_ILOCK(mp);
	}
	if (flags & MBF_MNTLSTLOCK)
		mtx_unlock(&mountlist_mtx);
	mp->mnt_lockref++;
	MNT_IUNLOCK(mp);
	return (0);
}

/*
 * Free a busy filesystem.
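 *
 * A minimal usage sketch (illustrative, not lifted from any particular
 * caller): a successful vfs_busy() is paired with vfs_unbusy() once the
 * mount point is no longer needed:
 *
 *	if (vfs_busy(mp, 0) == 0) {
 *		... operate on the busied mount point ...
 *		vfs_unbusy(mp);
 *	}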
 */
void
vfs_unbusy(struct mount *mp)
{
	int c;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);

	if (vfs_op_thread_enter(mp)) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		vfs_mp_count_sub_pcpu(mp, lockref, 1);
		vfs_mp_count_sub_pcpu(mp, ref, 1);
		vfs_op_thread_exit(mp);
		return;
	}

	MNT_ILOCK(mp);
	vfs_assert_mount_counters(mp);
	MNT_REL(mp);
	c = --mp->mnt_lockref;
	if (mp->mnt_vfs_ops == 0) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		MNT_IUNLOCK(mp);
		return;
	}
	if (c < 0)
		vfs_dump_mount_counters(mp);
	if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
		mp->mnt_kern_flag &= ~MNTK_DRAINING;
		wakeup(&mp->mnt_lockref);
	}
	MNT_IUNLOCK(mp);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	struct mount *mp;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
			vfs_ref(mp);
			mtx_unlock(&mountlist_mtx);
			return (mp);
		}
	}
	mtx_unlock(&mountlist_mtx);
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	return ((struct mount *) 0);
}

/*
 * Lookup a mount point by filesystem identifier, busying it before
 * returning.
 *
 * To avoid congestion on mountlist_mtx, implement a simple direct-mapped
 * cache for popular filesystem identifiers.  The cache is lockless, relying
 * on the fact that struct mount's are never freed.  In the worst case we may
 * get a pointer to an unmounted or even a different filesystem, so we have
 * to check what we got and fall back to the slow path if so.
 */
struct mount *
vfs_busyfs(fsid_t *fsid)
{
#define	FSID_CACHE_SIZE	256
	typedef struct mount * volatile vmp_t;
	static vmp_t cache[FSID_CACHE_SIZE];
	struct mount *mp;
	int error;
	uint32_t hash;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	hash = fsid->val[0] ^ fsid->val[1];
	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
	mp = cache[hash];
	if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0)
		goto slow;
	if (vfs_busy(mp, 0) != 0) {
		cache[hash] = NULL;
		goto slow;
	}
	if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0)
		return (mp);
	else
		vfs_unbusy(mp);

slow:
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
			error = vfs_busy(mp, MBF_MNTLSTLOCK);
			if (error) {
				cache[hash] = NULL;
				mtx_unlock(&mountlist_mtx);
				return (NULL);
			}
			cache[hash] = mp;
			return (mp);
		}
	}
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	mtx_unlock(&mountlist_mtx);
	return ((struct mount *) 0);
}

/*
 * Check if a user can access privileged mount options.
 */
int
vfs_suser(struct mount *mp, struct thread *td)
{
	int error;

	if (jailed(td->td_ucred)) {
		/*
		 * If the jail of the calling thread lacks permission for
		 * this type of file system, deny immediately.
		 */
		if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag))
			return (EPERM);

		/*
		 * If the file system was mounted outside the jail of the
		 * calling thread, deny immediately.
		 */
		if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
			return (EPERM);
	}

	/*
	 * If the file system supports delegated administration, we don't
	 * check for the PRIV_VFS_MOUNT_OWNER privilege - it will be better
	 * verified by the file system itself.
	 * If this is not the user that did the original mount, we check for
	 * the PRIV_VFS_MOUNT_OWNER privilege.
	 */
	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
			return (error);
	}
	return (0);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
 * support 16-bit device numbers.  We end up with unique val[0]'s for the
 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
 */
void
vfs_getnewfsid(struct mount *mp)
{
	static uint16_t mntid_base;
	struct mount *nmp;
	fsid_t tfsid;
	int mtype;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
	mtx_lock(&mntid_mtx);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makedev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
			break;
		vfs_rel(nmp);
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	mtx_unlock(&mntid_mtx);
}

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_USEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "File timestamp precision (0: seconds, "
    "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
    "3+: sec + ns (max. precision))");

/*
 * Get a current timestamp.
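 *
 * As an illustrative example (added here, not part of the original text),
 * the knob above can be raised to full precision from userland with:
 *
 *	sysctl vfs.timestamp_precision=3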
 */
void
vfs_timestamp(struct timespec *tsp)
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(struct vattr *vap)
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Try to reduce the total number of vnodes.
 *
 * This routine (and its user) are buggy in at least the following ways:
 * - all parameters were picked years ago when RAM sizes were significantly
 *   smaller
 * - it can pick vnodes based on pages used by the vm object, but filesystems
 *   like ZFS don't use it making the pick broken
 * - since ZFS has its own aging policy it gets partially combated by this one
 * - a dedicated method should be provided for filesystems to let them decide
 *   whether the vnode should be recycled
 *
 * This routine is called when we have too many vnodes.  It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).  Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * The buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 *
 * @param reclaim_nc_src Only reclaim directories with outgoing namecache
 *    entries if this argument is true
 * @param trigger Only reclaim vnodes with fewer than this many resident
 *    pages.
 * @param target How many vnodes to reclaim.
 * @return The number of vnodes that were reclaimed.
 */
static int
vlrureclaim(bool reclaim_nc_src, int trigger, u_long target)
{
	struct vnode *vp, *mvp;
	struct mount *mp;
	struct vm_object *object;
	u_long done;
	bool retried;

	mtx_assert(&vnode_list_mtx, MA_OWNED);

	retried = false;
	done = 0;

	mvp = vnode_list_reclaim_marker;
restart:
	vp = mvp;
	while (done < target) {
		vp = TAILQ_NEXT(vp, v_vnodelist);
		if (__predict_false(vp == NULL))
			break;

		if (__predict_false(vp->v_type == VMARKER))
			continue;

		/*
		 * If it's been deconstructed already, it's still
		 * referenced, or it exceeds the trigger, skip it.
		 * Also skip free vnodes.  We are trying to make space
		 * to expand the free list, not reduce it.
		 */
		if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)))
			goto next_iter;

		if (vp->v_type == VBAD || vp->v_type == VNON)
			goto next_iter;

		if (!VI_TRYLOCK(vp))
			goto next_iter;

		if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
		    VN_IS_DOOMED(vp) || vp->v_type == VNON) {
			VI_UNLOCK(vp);
			goto next_iter;
		}

		object = atomic_load_ptr(&vp->v_object);
		if (object == NULL || object->resident_page_count > trigger) {
			VI_UNLOCK(vp);
			goto next_iter;
		}

		vholdl(vp);
		VI_UNLOCK(vp);
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
		mtx_unlock(&vnode_list_mtx);

		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
			vdrop(vp);
			goto next_iter_unlocked;
		}
		if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) {
			vdrop(vp);
			vn_finished_write(mp);
			goto next_iter_unlocked;
		}

		VI_LOCK(vp);
		if (vp->v_usecount > 0 ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
		    (vp->v_object != NULL &&
		    vp->v_object->resident_page_count > trigger)) {
			VOP_UNLOCK(vp);
			vdropl(vp);
			vn_finished_write(mp);
			goto next_iter_unlocked;
		}
		counter_u64_add(recycles_count, 1);
		vgonel(vp);
		VOP_UNLOCK(vp);
		vdropl(vp);
		vn_finished_write(mp);
		done++;
next_iter_unlocked:
		if (should_yield())
			kern_yield(PRI_USER);
		mtx_lock(&vnode_list_mtx);
		goto restart;
next_iter:
		MPASS(vp->v_type != VMARKER);
		if (!should_yield())
			continue;
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
		mtx_unlock(&vnode_list_mtx);
		kern_yield(PRI_USER);
		mtx_lock(&vnode_list_mtx);
		goto restart;
	}
	if (done == 0 && !retried) {
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
		retried = true;
		goto restart;
	}
	return (done);
}

static int max_vnlru_free = 10000; /* limit on vnode free requests per call */
SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
    0,
    "limit on vnode free requests per call to the vnlru_free routine");

/*
 * Attempt to reduce the free list by the requested amount.
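 *
 * A note on the mechanism (added for exposition): the list is walked by
 * advancing the dedicated marker vnode (vnode_list_free_marker), which is
 * re-inserted after each examined vnode so that the scan can resume from
 * the same position after vnode_list_mtx is dropped and reacquired.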
 */
static int
vnlru_free_locked(int count, struct vfsops *mnt_op)
{
	struct vnode *vp, *mvp;
	struct mount *mp;
	int ocount;

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	if (count > max_vnlru_free)
		count = max_vnlru_free;
	ocount = count;
	mvp = vnode_list_free_marker;
restart:
	vp = mvp;
	while (count > 0) {
		vp = TAILQ_NEXT(vp, v_vnodelist);
		if (__predict_false(vp == NULL)) {
			TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
			TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist);
			break;
		}
		if (__predict_false(vp->v_type == VMARKER))
			continue;

		/*
		 * Don't recycle if our vnode is from a different type of
		 * mount point.  Note that mp is type-safe, the check does
		 * not reach an unmapped address even if the vnode is
		 * reclaimed.
		 * Don't recycle if we can't get the interlock without
		 * blocking.
		 */
		if (vp->v_holdcnt > 0 || (mnt_op != NULL &&
		    (mp = vp->v_mount) != NULL &&
		    mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) {
			continue;
		}
		TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
		TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
		if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
			VI_UNLOCK(vp);
			continue;
		}
		vholdl(vp);
		count--;
		mtx_unlock(&vnode_list_mtx);
		VI_UNLOCK(vp);
		vtryrecycle(vp);
		vdrop(vp);
		mtx_lock(&vnode_list_mtx);
		goto restart;
	}
	return (ocount - count);
}

void
vnlru_free(int count, struct vfsops *mnt_op)
{

	mtx_lock(&vnode_list_mtx);
	vnlru_free_locked(count, mnt_op);
	mtx_unlock(&vnode_list_mtx);
}

static void
vnlru_recalc(void)
{

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
	vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
	vlowat = vhiwat / 2;
}

/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrurecycle() from the bowels of filesystem code has some
 * interesting deadlock problems.
 */
static struct proc *vnlruproc;
static int vnlruproc_sig;

/*
 * The main freevnodes counter is only updated when threads requeue their vnode
 * batches.  CPUs are conditionally walked to compute a more accurate total.
 *
 * Limit how much slop we are willing to tolerate.  Note: the actual value
 * at any given moment can still exceed slop, but it should not be by a
 * significant margin in practice.
 */
#define VNLRU_FREEVNODES_SLOP 128

static u_long
vnlru_read_freevnodes(void)
{
	struct vdbatch *vd;
	long slop;
	int cpu;

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	if (freevnodes > freevnodes_old)
		slop = freevnodes - freevnodes_old;
	else
		slop = freevnodes_old - freevnodes;
	if (slop < VNLRU_FREEVNODES_SLOP)
		return (freevnodes >= 0 ? freevnodes : 0);
	freevnodes_old = freevnodes;
	CPU_FOREACH(cpu) {
		vd = DPCPU_ID_PTR((cpu), vd);
		freevnodes_old += vd->freevnodes;
	}
	return (freevnodes_old >= 0 ?
	    freevnodes_old : 0);
}

static bool
vnlru_under(u_long rnumvnodes, u_long limit)
{
	u_long rfreevnodes, space;

	if (__predict_false(rnumvnodes > desiredvnodes))
		return (true);

	space = desiredvnodes - rnumvnodes;
	if (space < limit) {
		rfreevnodes = vnlru_read_freevnodes();
		if (rfreevnodes > wantfreevnodes)
			space += rfreevnodes - wantfreevnodes;
	}
	return (space < limit);
}

static bool
vnlru_under_unlocked(u_long rnumvnodes, u_long limit)
{
	long rfreevnodes, space;

	if (__predict_false(rnumvnodes > desiredvnodes))
		return (true);

	space = desiredvnodes - rnumvnodes;
	if (space < limit) {
		rfreevnodes = atomic_load_long(&freevnodes);
		if (rfreevnodes > wantfreevnodes)
			space += rfreevnodes - wantfreevnodes;
	}
	return (space < limit);
}

static void
vnlru_kick(void)
{

	mtx_assert(&vnode_list_mtx, MA_OWNED);
	if (vnlruproc_sig == 0) {
		vnlruproc_sig = 1;
		wakeup(vnlruproc);
	}
}

static void
vnlru_proc(void)
{
	u_long rnumvnodes, rfreevnodes, target;
	unsigned long onumvnodes;
	int done, force, trigger, usevnodes;
	bool reclaim_nc_src, want_reread;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
	    SHUTDOWN_PRI_FIRST);

	force = 0;
	want_reread = false;
	for (;;) {
		kproc_suspend_check(vnlruproc);
		mtx_lock(&vnode_list_mtx);
		rnumvnodes = atomic_load_long(&numvnodes);

		if (want_reread) {
			force = vnlru_under(numvnodes, vhiwat) ? 1 : 0;
			want_reread = false;
		}

		/*
		 * If numvnodes is too large (due to desiredvnodes being
		 * adjusted using its sysctl, or emergency growth), first
		 * try to reduce it by discarding from the free list.
		 */
		if (rnumvnodes > desiredvnodes) {
			vnlru_free_locked(rnumvnodes - desiredvnodes, NULL);
			rnumvnodes = atomic_load_long(&numvnodes);
		}
		/*
		 * Sleep if the vnode cache is in a good state.  This is
		 * when it is not over-full and has space for about a 4%
		 * or 9% expansion (by growing its size or not excessively
		 * reducing its free list).  Otherwise, try to reclaim
		 * space for a 10% expansion.
		 */
		if (vstir && force == 0) {
			force = 1;
			vstir = 0;
		}
		if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) {
			vnlruproc_sig = 0;
			wakeup(&vnlruproc_sig);
			msleep(vnlruproc, &vnode_list_mtx,
			    PVFS|PDROP, "vlruwt", hz);
			continue;
		}
		rfreevnodes = vnlru_read_freevnodes();

		onumvnodes = rnumvnodes;
		/*
		 * Calculate parameters for recycling.  These are the same
		 * throughout the loop to give some semblance of fairness.
		 * The trigger point is to avoid recycling vnodes with lots
		 * of resident pages.  We aren't trying to free memory; we
		 * are trying to recycle or at least free vnodes.
		 */
		if (rnumvnodes <= desiredvnodes)
			usevnodes = rnumvnodes - rfreevnodes;
		else
			usevnodes = rnumvnodes;
		if (usevnodes <= 0)
			usevnodes = 1;
		/*
		 * The trigger value is chosen to be conservatively large so
		 * that it alone doesn't prevent making progress.  The value
		 * can easily be so large that it is effectively infinite in
		 * some congested and misconfigured cases, and this is
		 * necessary.
		 * Normally it is about 8 to 100 (pages), which is quite large.
		 */
		trigger = vm_cnt.v_page_count * 2 / usevnodes;
		if (force < 2)
			trigger = vsmalltrigger;
		reclaim_nc_src = force >= 3;
		target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1);
		target = target / 10 + 1;
		done = vlrureclaim(reclaim_nc_src, trigger, target);
		mtx_unlock(&vnode_list_mtx);
		if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
			uma_reclaim(UMA_RECLAIM_DRAIN);
		if (done == 0) {
			if (force == 0 || force == 1) {
				force = 2;
				continue;
			}
			if (force == 2) {
				force = 3;
				continue;
			}
			want_reread = true;
			force = 0;
			vnlru_nowhere++;
			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
		} else {
			want_reread = true;
			kern_yield(PRI_USER);
		}
	}
}

static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
    &vnlru_kp);

/*
 * Routines having to do with the management of the vnode table.
 */

/*
 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
 * before we actually vgone().  This function must be called with the vnode
 * held to prevent the vnode from being returned to the free list midway
 * through vgone().
 */
static int
vtryrecycle(struct vnode *vp)
{
	struct mount *vnmp;

	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	VNASSERT(vp->v_holdcnt, vp,
	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
	/*
	 * This vnode may be found and locked via some other list, if so we
	 * can't recycle it yet.
	 */
	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, vp %p lock is already held",
		    __func__, vp);
		return (EWOULDBLOCK);
	}
	/*
	 * Don't recycle if its filesystem is being suspended.
	 */
	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
		VOP_UNLOCK(vp);
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, cannot start the write for %p",
		    __func__, vp);
		return (EBUSY);
	}
	/*
	 * If we got this far, we need to acquire the interlock and see if
	 * anyone picked up this vnode from another list.  If not, we will
	 * mark it with DOOMED via vgonel() so that anyone who does find it
	 * will skip over it.
	 */
	VI_LOCK(vp);
	if (vp->v_usecount) {
		VOP_UNLOCK(vp);
		VI_UNLOCK(vp);
		vn_finished_write(vnmp);
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, %p is already referenced",
		    __func__, vp);
		return (EBUSY);
	}
	if (!VN_IS_DOOMED(vp)) {
		counter_u64_add(recycles_free_count, 1);
		vgonel(vp);
	}
	VOP_UNLOCK(vp);
	VI_UNLOCK(vp);
	vn_finished_write(vnmp);
	return (0);
}

/*
 * Allocate a new vnode.
 *
 * The operation never returns an error.  Returning an error was disabled
 * in r145385 (dated 2005) with the following comment:
 *
 * XXX Not all VFS_VGET/ffs_vget callers check returns.
 *
 * Given the age of this commit (almost 15 years at the time of writing this
 * comment), restoring the ability to fail requires a significant audit of
 * all codepaths.
 *
 * The routine can try to free a vnode or stall for up to 1 second waiting for
 * vnlru to clear things up, but ultimately always performs an M_WAITOK
 * allocation.
 */
static u_long vn_alloc_cyclecount;

static struct vnode * __noinline
vn_alloc_hard(struct mount *mp)
{
	u_long rnumvnodes, rfreevnodes;

	mtx_lock(&vnode_list_mtx);
	rnumvnodes = atomic_load_long(&numvnodes);
	if (rnumvnodes + 1 < desiredvnodes) {
		vn_alloc_cyclecount = 0;
		goto alloc;
	}
	rfreevnodes = vnlru_read_freevnodes();
	if (vn_alloc_cyclecount++ >= rfreevnodes) {
		vn_alloc_cyclecount = 0;
		vstir = 1;
	}
	/*
	 * Grow the vnode cache if it will not be above its target max
	 * after growing.  Otherwise, if the free list is nonempty, try
	 * to reclaim 1 item from it before growing the cache (possibly
	 * above its target max if the reclamation failed or is delayed).
	 * Otherwise, wait for some space.  In all cases, schedule
	 * vnlru_proc() if we are getting short of space.  The watermarks
	 * should be chosen so that we never wait or even reclaim from
	 * the free list to below its target minimum.
	 */
	if (vnlru_free_locked(1, NULL) > 0)
		goto alloc;
	if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
		/*
		 * Wait for space for a new vnode.
		 */
		vnlru_kick();
		msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz);
		if (atomic_load_long(&numvnodes) + 1 > desiredvnodes &&
		    vnlru_read_freevnodes() > 1)
			vnlru_free_locked(1, NULL);
	}
alloc:
	rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
	if (vnlru_under(rnumvnodes, vlowat))
		vnlru_kick();
	mtx_unlock(&vnode_list_mtx);
	return (uma_zalloc_smr(vnode_zone, M_WAITOK));
}

static struct vnode *
vn_alloc(struct mount *mp)
{
	u_long rnumvnodes;

	if (__predict_false(vn_alloc_cyclecount != 0))
		return (vn_alloc_hard(mp));
	rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
	if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) {
		atomic_subtract_long(&numvnodes, 1);
		return (vn_alloc_hard(mp));
	}

	return (uma_zalloc_smr(vnode_zone, M_WAITOK));
}

static void
vn_free(struct vnode *vp)
{

	atomic_subtract_long(&numvnodes, 1);
	uma_zfree_smr(vnode_zone, vp);
}

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
    struct vnode **vpp)
{
	struct vnode *vp;
	struct thread *td;
	struct lock_object *lo;

	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);

	KASSERT(vops->registered,
	    ("%s: not registered vector op %p\n", __func__, vops));

	td = curthread;
	if (td->td_vp_reserved != NULL) {
		vp = td->td_vp_reserved;
		td->td_vp_reserved = NULL;
	} else {
		vp = vn_alloc(mp);
	}
	counter_u64_add(vnodes_created, 1);
	/*
	 * Locks are given the generic name "vnode" when created.
	 * Follow the historic practice of using the filesystem
	 * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
	 *
	 * Locks live in a witness group keyed on their name.  Thus,
	 * when a lock is renamed, it must also move from the witness
	 * group of its old name to the witness group of its new name.
	 *
	 * The change only needs to be made when the vnode moves
	 * from one filesystem type to another.  We ensure that each
	 * filesystem uses a single static name pointer for its tag so
	 * that we can compare pointers rather than doing a strcmp().
	 */
	lo = &vp->v_vnlock->lock_object;
#ifdef WITNESS
	if (lo->lo_name != tag) {
#endif
		lo->lo_name = tag;
#ifdef WITNESS
		WITNESS_DESTROY(lo);
		WITNESS_INIT(lo, tag);
	}
#endif
	/*
	 * By default, don't allow shared locks unless filesystems opt-in.
	 */
	vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
	/*
	 * Finalize various vnode identity bits.
	 */
	KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
	KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
	KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
	vp->v_type = VNON;
	vp->v_op = vops;
	v_init_counters(vp);
	vp->v_bufobj.bo_ops = &buf_ops_bio;
#ifdef DIAGNOSTIC
	if (mp == NULL && vops != &dead_vnodeops)
		printf("NULL mp in getnewvnode(9), tag %s\n", tag);
#endif
#ifdef MAC
	mac_vnode_init(vp);
	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
		mac_vnode_associate_singlelabel(mp, vp);
#endif
	if (mp != NULL) {
		vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
			vp->v_vflag |= VV_NOKNOTE;
	}

	/*
	 * For the filesystems which do not use vfs_hash_insert(),
	 * still initialize v_hash to have vfs_hash_index() useful.
	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
	 * its own hashing.
	 */
	vp->v_hash = (uintptr_t)vp >> vnsz2log;

	*vpp = vp;
	return (0);
}

void
getnewvnode_reserve(void)
{
	struct thread *td;

	td = curthread;
	MPASS(td->td_vp_reserved == NULL);
	td->td_vp_reserved = vn_alloc(NULL);
}

void
getnewvnode_drop_reserve(void)
{
	struct thread *td;

	td = curthread;
	if (td->td_vp_reserved != NULL) {
		vn_free(td->td_vp_reserved);
		td->td_vp_reserved = NULL;
	}
}

static void
freevnode(struct vnode *vp)
{
	struct bufobj *bo;

	/*
	 * The vnode has been marked for destruction, so free it.
	 *
	 * The vnode will be returned to the zone where it will
	 * normally remain until it is needed for another vnode.  We
	 * need to clean up (or verify that the cleanup has already
	 * been done) any residual data left from its current use
	 * so as not to contaminate the freshly allocated vnode.
	 */
	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
	/*
	 * Paired with vgone.
	 */
	vn_seqc_write_end_locked(vp);
	VNPASS(vp->v_seqc_users == 0, vp);

	bo = &vp->v_bufobj;
	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
	VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp);
	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
	VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
	    ("clean blk trie not empty"));
	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
	VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
	    ("dirty blk trie not empty"));
	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
	VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
	    ("Dangling rangelock waiters"));
	VI_UNLOCK(vp);
#ifdef MAC
	mac_vnode_destroy(vp);
#endif
	if (vp->v_pollinfo != NULL) {
		destroy_vpollinfo(vp->v_pollinfo);
		vp->v_pollinfo = NULL;
	}
#ifdef INVARIANTS
	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
	vp->v_op = NULL;
#endif
	vp->v_mountedhere = NULL;
	vp->v_unpcb = NULL;
	vp->v_rdev = NULL;
	vp->v_fifoinfo = NULL;
	vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
	vp->v_irflag = 0;
	vp->v_iflag = 0;
	vp->v_vflag = 0;
	bo->bo_flag = 0;
	vn_free(vp);
}

/*
 * Delete from old mount point vnode list, if on one.
 */
static void
delmntque(struct vnode *vp)
{
	struct mount *mp;

	VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);

	mp = vp->v_mount;
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	VI_LOCK(vp);
	vp->v_mount = NULL;
	VI_UNLOCK(vp);
	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
	    ("bad mount point vnode list size"));
	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	mp->mnt_nvnodelistsize--;
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
}

static void
insmntque_stddtr(struct vnode *vp, void *dtr_arg)
{

	vp->v_data = NULL;
	vp->v_op = &dead_vnodeops;
	vgone(vp);
	vput(vp);
}

/*
 * Insert into list of vnodes for the new mount point, if available.
 */
int
insmntque1(struct vnode *vp, struct mount *mp,
    void (*dtr)(struct vnode *, void *), void *dtr_arg)
{

	KASSERT(vp->v_mount == NULL,
	    ("insmntque: vnode already on per mount vnode list"));
	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
	ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");

	/*
	 * We acquire the vnode interlock early to ensure that the
	 * vnode cannot be recycled by another process releasing a
	 * holdcnt on it before we get it on both the vnode list
	 * and the active vnode list.  The mount mutex protects only
	 * manipulation of the vnode list and the vnode freelist
	 * mutex protects only manipulation of the active vnode list.
	 * Hence the need to hold the vnode interlock throughout.
1857 */ 1858 MNT_ILOCK(mp); 1859 VI_LOCK(vp); 1860 if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && 1861 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || 1862 mp->mnt_nvnodelistsize == 0)) && 1863 (vp->v_vflag & VV_FORCEINSMQ) == 0) { 1864 VI_UNLOCK(vp); 1865 MNT_IUNLOCK(mp); 1866 if (dtr != NULL) 1867 dtr(vp, dtr_arg); 1868 return (EBUSY); 1869 } 1870 vp->v_mount = mp; 1871 MNT_REF(mp); 1872 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1873 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, 1874 ("neg mount point vnode list size")); 1875 mp->mnt_nvnodelistsize++; 1876 VI_UNLOCK(vp); 1877 MNT_IUNLOCK(mp); 1878 return (0); 1879 } 1880 1881 int 1882 insmntque(struct vnode *vp, struct mount *mp) 1883 { 1884 1885 return (insmntque1(vp, mp, insmntque_stddtr, NULL)); 1886 } 1887 1888 /* 1889 * Flush out and invalidate all buffers associated with a bufobj 1890 * Called with the underlying object locked. 1891 */ 1892 int 1893 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) 1894 { 1895 int error; 1896 1897 BO_LOCK(bo); 1898 if (flags & V_SAVE) { 1899 error = bufobj_wwait(bo, slpflag, slptimeo); 1900 if (error) { 1901 BO_UNLOCK(bo); 1902 return (error); 1903 } 1904 if (bo->bo_dirty.bv_cnt > 0) { 1905 BO_UNLOCK(bo); 1906 if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) 1907 return (error); 1908 /* 1909 * XXX We could save a lock/unlock if this was only 1910 * enabled under INVARIANTS 1911 */ 1912 BO_LOCK(bo); 1913 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) 1914 panic("vinvalbuf: dirty bufs"); 1915 } 1916 } 1917 /* 1918 * If you alter this loop please notice that interlock is dropped and 1919 * reacquired in flushbuflist. Special care is needed to ensure that 1920 * no race conditions occur from this. 1921 */ 1922 do { 1923 error = flushbuflist(&bo->bo_clean, 1924 flags, bo, slpflag, slptimeo); 1925 if (error == 0 && !(flags & V_CLEANONLY)) 1926 error = flushbuflist(&bo->bo_dirty, 1927 flags, bo, slpflag, slptimeo); 1928 if (error != 0 && error != EAGAIN) { 1929 BO_UNLOCK(bo); 1930 return (error); 1931 } 1932 } while (error != 0); 1933 1934 /* 1935 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 1936 * have write I/O in-progress but if there is a VM object then the 1937 * VM object can also have read-I/O in-progress. 1938 */ 1939 do { 1940 bufobj_wwait(bo, 0, 0); 1941 if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { 1942 BO_UNLOCK(bo); 1943 vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); 1944 BO_LOCK(bo); 1945 } 1946 } while (bo->bo_numoutput > 0); 1947 BO_UNLOCK(bo); 1948 1949 /* 1950 * Destroy the copy in the VM cache, too. 1951 */ 1952 if (bo->bo_object != NULL && 1953 (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { 1954 VM_OBJECT_WLOCK(bo->bo_object); 1955 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? 1956 OBJPR_CLEANONLY : 0); 1957 VM_OBJECT_WUNLOCK(bo->bo_object); 1958 } 1959 1960 #ifdef INVARIANTS 1961 BO_LOCK(bo); 1962 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | 1963 V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || 1964 bo->bo_clean.bv_cnt > 0)) 1965 panic("vinvalbuf: flush failed"); 1966 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && 1967 bo->bo_dirty.bv_cnt > 0) 1968 panic("vinvalbuf: flush dirty failed"); 1969 BO_UNLOCK(bo); 1970 #endif 1971 return (0); 1972 } 1973 1974 /* 1975 * Flush out and invalidate all buffers associated with a vnode. 1976 * Called with the underlying object locked. 
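 *
 * A minimal usage sketch; the caller supplies the vnode lock, as the
 * assertion below requires:
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = vinvalbuf(vp, V_SAVE, 0, 0);
 *	VOP_UNLOCK(vp);
 *
 * V_SAVE writes dirty buffers out before discarding them; passing 0
 * for the flags simply tosses everything, as vgonel() does when its
 * V_SAVE pass fails.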
1977 */ 1978 int 1979 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 1980 { 1981 1982 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 1983 ASSERT_VOP_LOCKED(vp, "vinvalbuf"); 1984 if (vp->v_object != NULL && vp->v_object->handle != vp) 1985 return (0); 1986 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); 1987 } 1988 1989 /* 1990 * Flush out buffers on the specified list. 1991 * 1992 */ 1993 static int 1994 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, 1995 int slptimeo) 1996 { 1997 struct buf *bp, *nbp; 1998 int retval, error; 1999 daddr_t lblkno; 2000 b_xflags_t xflags; 2001 2002 ASSERT_BO_WLOCKED(bo); 2003 2004 retval = 0; 2005 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { 2006 /* 2007 * If we are flushing both V_NORMAL and V_ALT buffers then 2008 * do not skip any buffers. If we are flushing only V_NORMAL 2009 * buffers then skip buffers marked as BX_ALTDATA. If we are 2010 * flushing only V_ALT buffers then skip buffers not marked 2011 * as BX_ALTDATA. 2012 */ 2013 if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && 2014 (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || 2015 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { 2016 continue; 2017 } 2018 if (nbp != NULL) { 2019 lblkno = nbp->b_lblkno; 2020 xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); 2021 } 2022 retval = EAGAIN; 2023 error = BUF_TIMELOCK(bp, 2024 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), 2025 "flushbuf", slpflag, slptimeo); 2026 if (error) { 2027 BO_LOCK(bo); 2028 return (error != ENOLCK ? error : EAGAIN); 2029 } 2030 KASSERT(bp->b_bufobj == bo, 2031 ("bp %p wrong b_bufobj %p should be %p", 2032 bp, bp->b_bufobj, bo)); 2033 /* 2034 * XXX Since there are no node locks for NFS, I 2035 * believe there is a slight chance that a delayed 2036 * write will occur while sleeping just above, so 2037 * check for it. 2038 */ 2039 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 2040 (flags & V_SAVE)) { 2041 bremfree(bp); 2042 bp->b_flags |= B_ASYNC; 2043 bwrite(bp); 2044 BO_LOCK(bo); 2045 return (EAGAIN); /* XXX: why not loop ? */ 2046 } 2047 bremfree(bp); 2048 bp->b_flags |= (B_INVAL | B_RELBUF); 2049 bp->b_flags &= ~B_ASYNC; 2050 brelse(bp); 2051 BO_LOCK(bo); 2052 if (nbp == NULL) 2053 break; 2054 nbp = gbincore(bo, lblkno); 2055 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2056 != xflags) 2057 break; /* nbp invalid */ 2058 } 2059 return (retval); 2060 } 2061 2062 int 2063 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) 2064 { 2065 struct buf *bp; 2066 int error; 2067 daddr_t lblkno; 2068 2069 ASSERT_BO_LOCKED(bo); 2070 2071 for (lblkno = startn;;) { 2072 again: 2073 bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); 2074 if (bp == NULL || bp->b_lblkno >= endn || 2075 bp->b_lblkno < startn) 2076 break; 2077 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 2078 LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); 2079 if (error != 0) { 2080 BO_RLOCK(bo); 2081 if (error == ENOLCK) 2082 goto again; 2083 return (error); 2084 } 2085 KASSERT(bp->b_bufobj == bo, 2086 ("bp %p wrong b_bufobj %p should be %p", 2087 bp, bp->b_bufobj, bo)); 2088 lblkno = bp->b_lblkno + 1; 2089 if ((bp->b_flags & B_MANAGED) == 0) 2090 bremfree(bp); 2091 bp->b_flags |= B_RELBUF; 2092 /* 2093 * In the VMIO case, use the B_NOREUSE flag to hint that the 2094 * pages backing each buffer in the range are unlikely to be 2095 * reused. 
Dirty buffers will have the hint applied once 2096 * they've been written. 2097 */ 2098 if ((bp->b_flags & B_VMIO) != 0) 2099 bp->b_flags |= B_NOREUSE; 2100 brelse(bp); 2101 BO_RLOCK(bo); 2102 } 2103 return (0); 2104 } 2105 2106 /* 2107 * Truncate a file's buffer and pages to a specified length. This 2108 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 2109 * sync activity. 2110 */ 2111 int 2112 vtruncbuf(struct vnode *vp, off_t length, int blksize) 2113 { 2114 struct buf *bp, *nbp; 2115 struct bufobj *bo; 2116 daddr_t startlbn; 2117 2118 CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, 2119 vp, blksize, (uintmax_t)length); 2120 2121 /* 2122 * Round up to the *next* lbn. 2123 */ 2124 startlbn = howmany(length, blksize); 2125 2126 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 2127 2128 bo = &vp->v_bufobj; 2129 restart_unlocked: 2130 BO_LOCK(bo); 2131 2132 while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) 2133 ; 2134 2135 if (length > 0) { 2136 restartsync: 2137 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2138 if (bp->b_lblkno > 0) 2139 continue; 2140 /* 2141 * Since we hold the vnode lock this should only 2142 * fail if we're racing with the buf daemon. 2143 */ 2144 if (BUF_LOCK(bp, 2145 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2146 BO_LOCKPTR(bo)) == ENOLCK) 2147 goto restart_unlocked; 2148 2149 VNASSERT((bp->b_flags & B_DELWRI), vp, 2150 ("buf(%p) on dirty queue without DELWRI", bp)); 2151 2152 bremfree(bp); 2153 bawrite(bp); 2154 BO_LOCK(bo); 2155 goto restartsync; 2156 } 2157 } 2158 2159 bufobj_wwait(bo, 0, 0); 2160 BO_UNLOCK(bo); 2161 vnode_pager_setsize(vp, length); 2162 2163 return (0); 2164 } 2165 2166 /* 2167 * Invalidate the cached pages of a file's buffer within the range of block 2168 * numbers [startlbn, endlbn). 
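 *
 * Sketch of a typical call, with the vnode locked and "bsize" standing
 * in for the filesystem block size (the routine asserts that it matches
 * the bufobj's bo_bsize):
 *
 *	v_inval_buf_range(vp, startlbn, endlbn, bsize);
 *
 * Both the buffers and the backing VM pages covering the range are
 * discarded.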
2169 */ 2170 void 2171 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, 2172 int blksize) 2173 { 2174 struct bufobj *bo; 2175 off_t start, end; 2176 2177 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); 2178 2179 start = blksize * startlbn; 2180 end = blksize * endlbn; 2181 2182 bo = &vp->v_bufobj; 2183 BO_LOCK(bo); 2184 MPASS(blksize == bo->bo_bsize); 2185 2186 while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) 2187 ; 2188 2189 BO_UNLOCK(bo); 2190 vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); 2191 } 2192 2193 static int 2194 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 2195 daddr_t startlbn, daddr_t endlbn) 2196 { 2197 struct buf *bp, *nbp; 2198 bool anyfreed; 2199 2200 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); 2201 ASSERT_BO_LOCKED(bo); 2202 2203 do { 2204 anyfreed = false; 2205 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { 2206 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2207 continue; 2208 if (BUF_LOCK(bp, 2209 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2210 BO_LOCKPTR(bo)) == ENOLCK) { 2211 BO_LOCK(bo); 2212 return (EAGAIN); 2213 } 2214 2215 bremfree(bp); 2216 bp->b_flags |= B_INVAL | B_RELBUF; 2217 bp->b_flags &= ~B_ASYNC; 2218 brelse(bp); 2219 anyfreed = true; 2220 2221 BO_LOCK(bo); 2222 if (nbp != NULL && 2223 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 2224 nbp->b_vp != vp || 2225 (nbp->b_flags & B_DELWRI) != 0)) 2226 return (EAGAIN); 2227 } 2228 2229 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2230 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2231 continue; 2232 if (BUF_LOCK(bp, 2233 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2234 BO_LOCKPTR(bo)) == ENOLCK) { 2235 BO_LOCK(bo); 2236 return (EAGAIN); 2237 } 2238 bremfree(bp); 2239 bp->b_flags |= B_INVAL | B_RELBUF; 2240 bp->b_flags &= ~B_ASYNC; 2241 brelse(bp); 2242 anyfreed = true; 2243 2244 BO_LOCK(bo); 2245 if (nbp != NULL && 2246 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 2247 (nbp->b_vp != vp) || 2248 (nbp->b_flags & B_DELWRI) == 0)) 2249 return (EAGAIN); 2250 } 2251 } while (anyfreed); 2252 return (0); 2253 } 2254 2255 static void 2256 buf_vlist_remove(struct buf *bp) 2257 { 2258 struct bufv *bv; 2259 b_xflags_t flags; 2260 2261 flags = bp->b_xflags; 2262 2263 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 2264 ASSERT_BO_WLOCKED(bp->b_bufobj); 2265 KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && 2266 (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), 2267 ("%s: buffer %p has invalid queue state", __func__, bp)); 2268 2269 if ((flags & BX_VNDIRTY) != 0) 2270 bv = &bp->b_bufobj->bo_dirty; 2271 else 2272 bv = &bp->b_bufobj->bo_clean; 2273 BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); 2274 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); 2275 bv->bv_cnt--; 2276 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 2277 } 2278 2279 /* 2280 * Add the buffer to the sorted clean or dirty block list. 2281 * 2282 * NOTE: xflags is passed as a constant, optimizing this inline function! 
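 * For example, bgetvp() always passes BX_VNCLEAN and reassignbuf()
 * passes either BX_VNCLEAN or BX_VNDIRTY as a literal, so the
 * dirty/clean branch below can fold away when the compiler inlines
 * this routine.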
2283 */ 2284 static void 2285 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 2286 { 2287 struct bufv *bv; 2288 struct buf *n; 2289 int error; 2290 2291 ASSERT_BO_WLOCKED(bo); 2292 KASSERT((bo->bo_flag & BO_NOBUFS) == 0, 2293 ("buf_vlist_add: bo %p does not allow bufs", bo)); 2294 KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, 2295 ("dead bo %p", bo)); 2296 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, 2297 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); 2298 bp->b_xflags |= xflags; 2299 if (xflags & BX_VNDIRTY) 2300 bv = &bo->bo_dirty; 2301 else 2302 bv = &bo->bo_clean; 2303 2304 /* 2305 * Keep the list ordered. Optimize empty list insertion. Assume 2306 * we tend to grow at the tail so lookup_le should usually be cheaper 2307 * than _ge. 2308 */ 2309 if (bv->bv_cnt == 0 || 2310 bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) 2311 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); 2312 else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) 2313 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); 2314 else 2315 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); 2316 error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); 2317 if (error) 2318 panic("buf_vlist_add: Preallocated nodes insufficient."); 2319 bv->bv_cnt++; 2320 } 2321 2322 /* 2323 * Look up a buffer using the buffer tries. 2324 */ 2325 struct buf * 2326 gbincore(struct bufobj *bo, daddr_t lblkno) 2327 { 2328 struct buf *bp; 2329 2330 ASSERT_BO_LOCKED(bo); 2331 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); 2332 if (bp != NULL) 2333 return (bp); 2334 return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); 2335 } 2336 2337 /* 2338 * Look up a buf using the buffer tries, without the bufobj lock. This relies 2339 * on SMR for safe lookup, and bufs being in a no-free zone to provide type 2340 * stability of the result. Like other lockless lookups, the found buf may 2341 * already be invalid by the time this function returns. 2342 */ 2343 struct buf * 2344 gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) 2345 { 2346 struct buf *bp; 2347 2348 ASSERT_BO_UNLOCKED(bo); 2349 bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); 2350 if (bp != NULL) 2351 return (bp); 2352 return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); 2353 } 2354 2355 /* 2356 * Associate a buffer with a vnode. 2357 */ 2358 void 2359 bgetvp(struct vnode *vp, struct buf *bp) 2360 { 2361 struct bufobj *bo; 2362 2363 bo = &vp->v_bufobj; 2364 ASSERT_BO_WLOCKED(bo); 2365 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 2366 2367 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 2368 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 2369 ("bgetvp: bp already attached! %p", bp)); 2370 2371 vhold(vp); 2372 bp->b_vp = vp; 2373 bp->b_bufobj = bo; 2374 /* 2375 * Insert onto list for new vnode. 2376 */ 2377 buf_vlist_add(bp, bo, BX_VNCLEAN); 2378 } 2379 2380 /* 2381 * Disassociate a buffer from a vnode. 2382 */ 2383 void 2384 brelvp(struct buf *bp) 2385 { 2386 struct bufobj *bo; 2387 struct vnode *vp; 2388 2389 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 2390 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 2391 2392 /* 2393 * Delete from old vnode list, if on one. 
2394 */ 2395 vp = bp->b_vp; /* XXX */ 2396 bo = bp->b_bufobj; 2397 BO_LOCK(bo); 2398 buf_vlist_remove(bp); 2399 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2400 bo->bo_flag &= ~BO_ONWORKLST; 2401 mtx_lock(&sync_mtx); 2402 LIST_REMOVE(bo, bo_synclist); 2403 syncer_worklist_len--; 2404 mtx_unlock(&sync_mtx); 2405 } 2406 bp->b_vp = NULL; 2407 bp->b_bufobj = NULL; 2408 BO_UNLOCK(bo); 2409 vdrop(vp); 2410 } 2411 2412 /* 2413 * Add an item to the syncer work queue. 2414 */ 2415 static void 2416 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 2417 { 2418 int slot; 2419 2420 ASSERT_BO_WLOCKED(bo); 2421 2422 mtx_lock(&sync_mtx); 2423 if (bo->bo_flag & BO_ONWORKLST) 2424 LIST_REMOVE(bo, bo_synclist); 2425 else { 2426 bo->bo_flag |= BO_ONWORKLST; 2427 syncer_worklist_len++; 2428 } 2429 2430 if (delay > syncer_maxdelay - 2) 2431 delay = syncer_maxdelay - 2; 2432 slot = (syncer_delayno + delay) & syncer_mask; 2433 2434 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); 2435 mtx_unlock(&sync_mtx); 2436 } 2437 2438 static int 2439 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 2440 { 2441 int error, len; 2442 2443 mtx_lock(&sync_mtx); 2444 len = syncer_worklist_len - sync_vnode_count; 2445 mtx_unlock(&sync_mtx); 2446 error = SYSCTL_OUT(req, &len, sizeof(len)); 2447 return (error); 2448 } 2449 2450 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, 2451 CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, 2452 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 2453 2454 static struct proc *updateproc; 2455 static void sched_sync(void); 2456 static struct kproc_desc up_kp = { 2457 "syncer", 2458 sched_sync, 2459 &updateproc 2460 }; 2461 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 2462 2463 static int 2464 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 2465 { 2466 struct vnode *vp; 2467 struct mount *mp; 2468 2469 *bo = LIST_FIRST(slp); 2470 if (*bo == NULL) 2471 return (0); 2472 vp = bo2vnode(*bo); 2473 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 2474 return (1); 2475 /* 2476 * We use vhold in case the vnode does not 2477 * successfully sync. vhold prevents the vnode from 2478 * going away when we unlock the sync_mtx so that 2479 * we can acquire the vnode interlock. 2480 */ 2481 vholdl(vp); 2482 mtx_unlock(&sync_mtx); 2483 VI_UNLOCK(vp); 2484 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2485 vdrop(vp); 2486 mtx_lock(&sync_mtx); 2487 return (*bo == LIST_FIRST(slp)); 2488 } 2489 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2490 (void) VOP_FSYNC(vp, MNT_LAZY, td); 2491 VOP_UNLOCK(vp); 2492 vn_finished_write(mp); 2493 BO_LOCK(*bo); 2494 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 2495 /* 2496 * Put us back on the worklist. The worklist 2497 * routine will remove us from our current 2498 * position and then add us back in at a later 2499 * position. 2500 */ 2501 vn_syncer_add_to_worklist(*bo, syncdelay); 2502 } 2503 BO_UNLOCK(*bo); 2504 vdrop(vp); 2505 mtx_lock(&sync_mtx); 2506 return (0); 2507 } 2508 2509 static int first_printf = 1; 2510 2511 /* 2512 * System filesystem synchronizer daemon. 
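 *
 * The daemon advances syncer_delayno by one slot roughly once per
 * second and flushes every bufobj found on that slot, so an entry
 * queued by vn_syncer_add_to_worklist() with a delay of N is normally
 * written out about N seconds later (modulo the size of the wheel and
 * any rushjob speedups).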
2513 */ 2514 static void 2515 sched_sync(void) 2516 { 2517 struct synclist *next, *slp; 2518 struct bufobj *bo; 2519 long starttime; 2520 struct thread *td = curthread; 2521 int last_work_seen; 2522 int net_worklist_len; 2523 int syncer_final_iter; 2524 int error; 2525 2526 last_work_seen = 0; 2527 syncer_final_iter = 0; 2528 syncer_state = SYNCER_RUNNING; 2529 starttime = time_uptime; 2530 td->td_pflags |= TDP_NORUNNINGBUF; 2531 2532 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, 2533 SHUTDOWN_PRI_LAST); 2534 2535 mtx_lock(&sync_mtx); 2536 for (;;) { 2537 if (syncer_state == SYNCER_FINAL_DELAY && 2538 syncer_final_iter == 0) { 2539 mtx_unlock(&sync_mtx); 2540 kproc_suspend_check(td->td_proc); 2541 mtx_lock(&sync_mtx); 2542 } 2543 net_worklist_len = syncer_worklist_len - sync_vnode_count; 2544 if (syncer_state != SYNCER_RUNNING && 2545 starttime != time_uptime) { 2546 if (first_printf) { 2547 printf("\nSyncing disks, vnodes remaining... "); 2548 first_printf = 0; 2549 } 2550 printf("%d ", net_worklist_len); 2551 } 2552 starttime = time_uptime; 2553 2554 /* 2555 * Push files whose dirty time has expired. Be careful 2556 * of interrupt race on slp queue. 2557 * 2558 * Skip over empty worklist slots when shutting down. 2559 */ 2560 do { 2561 slp = &syncer_workitem_pending[syncer_delayno]; 2562 syncer_delayno += 1; 2563 if (syncer_delayno == syncer_maxdelay) 2564 syncer_delayno = 0; 2565 next = &syncer_workitem_pending[syncer_delayno]; 2566 /* 2567 * If the worklist has wrapped since the 2568 * it was emptied of all but syncer vnodes, 2569 * switch to the FINAL_DELAY state and run 2570 * for one more second. 2571 */ 2572 if (syncer_state == SYNCER_SHUTTING_DOWN && 2573 net_worklist_len == 0 && 2574 last_work_seen == syncer_delayno) { 2575 syncer_state = SYNCER_FINAL_DELAY; 2576 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; 2577 } 2578 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && 2579 syncer_worklist_len > 0); 2580 2581 /* 2582 * Keep track of the last time there was anything 2583 * on the worklist other than syncer vnodes. 2584 * Return to the SHUTTING_DOWN state if any 2585 * new work appears. 2586 */ 2587 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) 2588 last_work_seen = syncer_delayno; 2589 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) 2590 syncer_state = SYNCER_SHUTTING_DOWN; 2591 while (!LIST_EMPTY(slp)) { 2592 error = sync_vnode(slp, &bo, td); 2593 if (error == 1) { 2594 LIST_REMOVE(bo, bo_synclist); 2595 LIST_INSERT_HEAD(next, bo, bo_synclist); 2596 continue; 2597 } 2598 2599 if (first_printf == 0) { 2600 /* 2601 * Drop the sync mutex, because some watchdog 2602 * drivers need to sleep while patting 2603 */ 2604 mtx_unlock(&sync_mtx); 2605 wdog_kern_pat(WD_LASTVAL); 2606 mtx_lock(&sync_mtx); 2607 } 2608 2609 } 2610 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) 2611 syncer_final_iter--; 2612 /* 2613 * The variable rushjob allows the kernel to speed up the 2614 * processing of the filesystem syncer process. A rushjob 2615 * value of N tells the filesystem syncer to process the next 2616 * N seconds worth of work on its queue ASAP. Currently rushjob 2617 * is used by the soft update code to speed up the filesystem 2618 * syncer process when the incore state is getting so far 2619 * ahead of the disk that the kernel memory pool is being 2620 * threatened with exhaustion. 
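 *
 * For example, once repeated speedup_syncer() calls have bumped rushjob
 * to 3, the check below skips the one second sleep on the next three
 * iterations, draining roughly three seconds worth of queued work back
 * to back.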
2621 */ 2622 if (rushjob > 0) { 2623 rushjob -= 1; 2624 continue; 2625 } 2626 /* 2627 * Just sleep for a short period of time between 2628 * iterations when shutting down to allow some I/O 2629 * to happen. 2630 * 2631 * If it has taken us less than a second to process the 2632 * current work, then wait. Otherwise start right over 2633 * again. We can still lose time if any single round 2634 * takes more than two seconds, but it does not really 2635 * matter as we are just trying to generally pace the 2636 * filesystem activity. 2637 */ 2638 if (syncer_state != SYNCER_RUNNING || 2639 time_uptime == starttime) { 2640 thread_lock(td); 2641 sched_prio(td, PPAUSE); 2642 thread_unlock(td); 2643 } 2644 if (syncer_state != SYNCER_RUNNING) 2645 cv_timedwait(&sync_wakeup, &sync_mtx, 2646 hz / SYNCER_SHUTDOWN_SPEEDUP); 2647 else if (time_uptime == starttime) 2648 cv_timedwait(&sync_wakeup, &sync_mtx, hz); 2649 } 2650 } 2651 2652 /* 2653 * Request the syncer daemon to speed up its work. 2654 * We never push it to speed up more than half of its 2655 * normal turn time, otherwise it could take over the cpu. 2656 */ 2657 int 2658 speedup_syncer(void) 2659 { 2660 int ret = 0; 2661 2662 mtx_lock(&sync_mtx); 2663 if (rushjob < syncdelay / 2) { 2664 rushjob += 1; 2665 stat_rush_requests += 1; 2666 ret = 1; 2667 } 2668 mtx_unlock(&sync_mtx); 2669 cv_broadcast(&sync_wakeup); 2670 return (ret); 2671 } 2672 2673 /* 2674 * Tell the syncer to speed up its work and run though its work 2675 * list several times, then tell it to shut down. 2676 */ 2677 static void 2678 syncer_shutdown(void *arg, int howto) 2679 { 2680 2681 if (howto & RB_NOSYNC) 2682 return; 2683 mtx_lock(&sync_mtx); 2684 syncer_state = SYNCER_SHUTTING_DOWN; 2685 rushjob = 0; 2686 mtx_unlock(&sync_mtx); 2687 cv_broadcast(&sync_wakeup); 2688 kproc_shutdown(arg, howto); 2689 } 2690 2691 void 2692 syncer_suspend(void) 2693 { 2694 2695 syncer_shutdown(updateproc, 0); 2696 } 2697 2698 void 2699 syncer_resume(void) 2700 { 2701 2702 mtx_lock(&sync_mtx); 2703 first_printf = 1; 2704 syncer_state = SYNCER_RUNNING; 2705 mtx_unlock(&sync_mtx); 2706 cv_broadcast(&sync_wakeup); 2707 kproc_resume(updateproc); 2708 } 2709 2710 /* 2711 * Move the buffer between the clean and dirty lists of its vnode. 2712 */ 2713 void 2714 reassignbuf(struct buf *bp) 2715 { 2716 struct vnode *vp; 2717 struct bufobj *bo; 2718 int delay; 2719 #ifdef INVARIANTS 2720 struct bufv *bv; 2721 #endif 2722 2723 vp = bp->b_vp; 2724 bo = bp->b_bufobj; 2725 2726 KASSERT((bp->b_flags & B_PAGING) == 0, 2727 ("%s: cannot reassign paging buffer %p", __func__, bp)); 2728 2729 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", 2730 bp, bp->b_vp, bp->b_flags); 2731 2732 BO_LOCK(bo); 2733 buf_vlist_remove(bp); 2734 2735 /* 2736 * If dirty, put on list of dirty buffers; otherwise insert onto list 2737 * of clean buffers. 
2738 */ 2739 if (bp->b_flags & B_DELWRI) { 2740 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 2741 switch (vp->v_type) { 2742 case VDIR: 2743 delay = dirdelay; 2744 break; 2745 case VCHR: 2746 delay = metadelay; 2747 break; 2748 default: 2749 delay = filedelay; 2750 } 2751 vn_syncer_add_to_worklist(bo, delay); 2752 } 2753 buf_vlist_add(bp, bo, BX_VNDIRTY); 2754 } else { 2755 buf_vlist_add(bp, bo, BX_VNCLEAN); 2756 2757 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2758 mtx_lock(&sync_mtx); 2759 LIST_REMOVE(bo, bo_synclist); 2760 syncer_worklist_len--; 2761 mtx_unlock(&sync_mtx); 2762 bo->bo_flag &= ~BO_ONWORKLST; 2763 } 2764 } 2765 #ifdef INVARIANTS 2766 bv = &bo->bo_clean; 2767 bp = TAILQ_FIRST(&bv->bv_hd); 2768 KASSERT(bp == NULL || bp->b_bufobj == bo, 2769 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2770 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2771 KASSERT(bp == NULL || bp->b_bufobj == bo, 2772 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2773 bv = &bo->bo_dirty; 2774 bp = TAILQ_FIRST(&bv->bv_hd); 2775 KASSERT(bp == NULL || bp->b_bufobj == bo, 2776 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2777 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2778 KASSERT(bp == NULL || bp->b_bufobj == bo, 2779 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2780 #endif 2781 BO_UNLOCK(bo); 2782 } 2783 2784 static void 2785 v_init_counters(struct vnode *vp) 2786 { 2787 2788 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, 2789 vp, ("%s called for an initialized vnode", __FUNCTION__)); 2790 ASSERT_VI_UNLOCKED(vp, __FUNCTION__); 2791 2792 refcount_init(&vp->v_holdcnt, 1); 2793 refcount_init(&vp->v_usecount, 1); 2794 } 2795 2796 /* 2797 * Increment si_usecount of the associated device, if any. 2798 */ 2799 static void 2800 v_incr_devcount(struct vnode *vp) 2801 { 2802 2803 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2804 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2805 dev_lock(); 2806 vp->v_rdev->si_usecount++; 2807 dev_unlock(); 2808 } 2809 } 2810 2811 /* 2812 * Decrement si_usecount of the associated device, if any. 2813 * 2814 * The caller is required to hold the interlock when transitioning a VCHR use 2815 * count to zero. This prevents a race with devfs_reclaim_vchr() that would 2816 * leak a si_usecount reference. The vnode lock will also prevent this race 2817 * if it is held while dropping the last ref. 2818 * 2819 * The race is: 2820 * 2821 * CPU1 CPU2 2822 * devfs_reclaim_vchr 2823 * make v_usecount == 0 2824 * VI_LOCK 2825 * sees v_usecount == 0, no updates 2826 * vp->v_rdev = NULL; 2827 * ... 2828 * VI_UNLOCK 2829 * VI_LOCK 2830 * v_decr_devcount 2831 * sees v_rdev == NULL, no updates 2832 * 2833 * In this scenario si_devcount decrement is not performed. 2834 */ 2835 static void 2836 v_decr_devcount(struct vnode *vp) 2837 { 2838 2839 ASSERT_VOP_LOCKED(vp, __func__); 2840 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2841 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2842 dev_lock(); 2843 VNPASS(vp->v_rdev->si_usecount > 0, vp); 2844 vp->v_rdev->si_usecount--; 2845 dev_unlock(); 2846 } 2847 } 2848 2849 /* 2850 * Grab a particular vnode from the free list, increment its 2851 * reference count and lock it. VIRF_DOOMED is set if the vnode 2852 * is being destroyed. Only callers who specify LK_RETRY will 2853 * see doomed vnodes. If inactive processing was delayed in 2854 * vput try to do it here. 2855 * 2856 * usecount is manipulated using atomics without holding any locks. 
2857 * 2858 * holdcnt can be manipulated using atomics without holding any locks, 2859 * except when transitioning 1<->0, in which case the interlock is held. 2860 * 2861 * Consumers which don't guarantee liveness of the vnode can use SMR to 2862 * try to get a reference. Note this operation can fail since the vnode 2863 * may be awaiting getting freed by the time they get to it. 2864 */ 2865 enum vgetstate 2866 vget_prep_smr(struct vnode *vp) 2867 { 2868 enum vgetstate vs; 2869 2870 VFS_SMR_ASSERT_ENTERED(); 2871 2872 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2873 vs = VGET_USECOUNT; 2874 } else { 2875 if (vhold_smr(vp)) 2876 vs = VGET_HOLDCNT; 2877 else 2878 vs = VGET_NONE; 2879 } 2880 return (vs); 2881 } 2882 2883 enum vgetstate 2884 vget_prep(struct vnode *vp) 2885 { 2886 enum vgetstate vs; 2887 2888 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2889 vs = VGET_USECOUNT; 2890 } else { 2891 vhold(vp); 2892 vs = VGET_HOLDCNT; 2893 } 2894 return (vs); 2895 } 2896 2897 void 2898 vget_abort(struct vnode *vp, enum vgetstate vs) 2899 { 2900 2901 switch (vs) { 2902 case VGET_USECOUNT: 2903 vrele(vp); 2904 break; 2905 case VGET_HOLDCNT: 2906 vdrop(vp); 2907 break; 2908 default: 2909 __assert_unreachable(); 2910 } 2911 } 2912 2913 int 2914 vget(struct vnode *vp, int flags, struct thread *td) 2915 { 2916 enum vgetstate vs; 2917 2918 MPASS(td == curthread); 2919 2920 vs = vget_prep(vp); 2921 return (vget_finish(vp, flags, vs)); 2922 } 2923 2924 static void __noinline 2925 vget_finish_vchr(struct vnode *vp) 2926 { 2927 2928 VNASSERT(vp->v_type == VCHR, vp, ("type != VCHR)")); 2929 2930 /* 2931 * See the comment in vget_finish before usecount bump. 2932 */ 2933 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2934 #ifdef INVARIANTS 2935 int old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 2936 VNASSERT(old > 0, vp, ("%s: wrong hold count %d", __func__, old)); 2937 #else 2938 refcount_release(&vp->v_holdcnt); 2939 #endif 2940 return; 2941 } 2942 2943 VI_LOCK(vp); 2944 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2945 #ifdef INVARIANTS 2946 int old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 2947 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); 2948 #else 2949 refcount_release(&vp->v_holdcnt); 2950 #endif 2951 VI_UNLOCK(vp); 2952 return; 2953 } 2954 v_incr_devcount(vp); 2955 refcount_acquire(&vp->v_usecount); 2956 VI_UNLOCK(vp); 2957 } 2958 2959 int 2960 vget_finish(struct vnode *vp, int flags, enum vgetstate vs) 2961 { 2962 int error; 2963 2964 if ((flags & LK_INTERLOCK) != 0) 2965 ASSERT_VI_LOCKED(vp, __func__); 2966 else 2967 ASSERT_VI_UNLOCKED(vp, __func__); 2968 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 2969 VNPASS(vp->v_holdcnt > 0, vp); 2970 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 2971 2972 error = vn_lock(vp, flags); 2973 if (__predict_false(error != 0)) { 2974 vget_abort(vp, vs); 2975 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, 2976 vp); 2977 return (error); 2978 } 2979 2980 vget_finish_ref(vp, vs); 2981 return (0); 2982 } 2983 2984 void 2985 vget_finish_ref(struct vnode *vp, enum vgetstate vs) 2986 { 2987 int old; 2988 2989 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 2990 VNPASS(vp->v_holdcnt > 0, vp); 2991 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 2992 2993 if (vs == VGET_USECOUNT) 2994 return; 2995 2996 if (__predict_false(vp->v_type == VCHR)) { 2997 vget_finish_vchr(vp); 2998 return; 2999 } 3000 3001 /* 3002 * We hold the vnode. 
If the usecount is 0 it will be utilized to keep 3003 * the vnode around. Otherwise someone else lended their hold count and 3004 * we have to drop ours. 3005 */ 3006 old = atomic_fetchadd_int(&vp->v_usecount, 1); 3007 VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); 3008 if (old != 0) { 3009 #ifdef INVARIANTS 3010 old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 3011 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); 3012 #else 3013 refcount_release(&vp->v_holdcnt); 3014 #endif 3015 } 3016 } 3017 3018 /* 3019 * Increase the reference (use) and hold count of a vnode. 3020 * This will also remove the vnode from the free list if it is presently free. 3021 */ 3022 static void __noinline 3023 vref_vchr(struct vnode *vp, bool interlock) 3024 { 3025 3026 /* 3027 * See the comment in vget_finish before usecount bump. 3028 */ 3029 if (!interlock) { 3030 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 3031 VNODE_REFCOUNT_FENCE_ACQ(); 3032 VNASSERT(vp->v_holdcnt > 0, vp, 3033 ("%s: active vnode not held", __func__)); 3034 return; 3035 } 3036 VI_LOCK(vp); 3037 /* 3038 * By the time we get here the vnode might have been doomed, at 3039 * which point the 0->1 use count transition is no longer 3040 * protected by the interlock. Since it can't bounce back to 3041 * VCHR and requires vref semantics, punt it back 3042 */ 3043 if (__predict_false(vp->v_type == VBAD)) { 3044 VI_UNLOCK(vp); 3045 vref(vp); 3046 return; 3047 } 3048 } 3049 VNASSERT(vp->v_type == VCHR, vp, ("type != VCHR)")); 3050 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 3051 VNODE_REFCOUNT_FENCE_ACQ(); 3052 VNASSERT(vp->v_holdcnt > 0, vp, 3053 ("%s: active vnode not held", __func__)); 3054 if (!interlock) 3055 VI_UNLOCK(vp); 3056 return; 3057 } 3058 vhold(vp); 3059 v_incr_devcount(vp); 3060 refcount_acquire(&vp->v_usecount); 3061 if (!interlock) 3062 VI_UNLOCK(vp); 3063 return; 3064 } 3065 3066 void 3067 vref(struct vnode *vp) 3068 { 3069 int old; 3070 3071 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3072 if (__predict_false(vp->v_type == VCHR)) { 3073 vref_vchr(vp, false); 3074 return; 3075 } 3076 3077 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 3078 VNODE_REFCOUNT_FENCE_ACQ(); 3079 VNASSERT(vp->v_holdcnt > 0, vp, 3080 ("%s: active vnode not held", __func__)); 3081 return; 3082 } 3083 vhold(vp); 3084 /* 3085 * See the comment in vget_finish. 
3086 */ 3087 old = atomic_fetchadd_int(&vp->v_usecount, 1); 3088 VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); 3089 if (old != 0) { 3090 #ifdef INVARIANTS 3091 old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 3092 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); 3093 #else 3094 refcount_release(&vp->v_holdcnt); 3095 #endif 3096 } 3097 } 3098 3099 void 3100 vrefl(struct vnode *vp) 3101 { 3102 3103 ASSERT_VI_LOCKED(vp, __func__); 3104 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3105 if (__predict_false(vp->v_type == VCHR)) { 3106 vref_vchr(vp, true); 3107 return; 3108 } 3109 vref(vp); 3110 } 3111 3112 void 3113 vrefact(struct vnode *vp) 3114 { 3115 3116 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3117 #ifdef INVARIANTS 3118 int old = atomic_fetchadd_int(&vp->v_usecount, 1); 3119 VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); 3120 #else 3121 refcount_acquire(&vp->v_usecount); 3122 #endif 3123 } 3124 3125 void 3126 vrefactn(struct vnode *vp, u_int n) 3127 { 3128 3129 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3130 #ifdef INVARIANTS 3131 int old = atomic_fetchadd_int(&vp->v_usecount, n); 3132 VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); 3133 #else 3134 atomic_add_int(&vp->v_usecount, n); 3135 #endif 3136 } 3137 3138 /* 3139 * Return reference count of a vnode. 3140 * 3141 * The results of this call are only guaranteed when some mechanism is used to 3142 * stop other processes from gaining references to the vnode. This may be the 3143 * case if the caller holds the only reference. This is also useful when stale 3144 * data is acceptable as race conditions may be accounted for by some other 3145 * means. 3146 */ 3147 int 3148 vrefcnt(struct vnode *vp) 3149 { 3150 3151 return (vp->v_usecount); 3152 } 3153 3154 void 3155 vlazy(struct vnode *vp) 3156 { 3157 struct mount *mp; 3158 3159 VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); 3160 3161 if ((vp->v_mflag & VMP_LAZYLIST) != 0) 3162 return; 3163 /* 3164 * We may get here for inactive routines after the vnode got doomed. 3165 */ 3166 if (VN_IS_DOOMED(vp)) 3167 return; 3168 mp = vp->v_mount; 3169 mtx_lock(&mp->mnt_listmtx); 3170 if ((vp->v_mflag & VMP_LAZYLIST) == 0) { 3171 vp->v_mflag |= VMP_LAZYLIST; 3172 TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3173 mp->mnt_lazyvnodelistsize++; 3174 } 3175 mtx_unlock(&mp->mnt_listmtx); 3176 } 3177 3178 /* 3179 * This routine is only meant to be called from vgonel prior to dooming 3180 * the vnode. 
3181 */ 3182 static void 3183 vunlazy_gone(struct vnode *vp) 3184 { 3185 struct mount *mp; 3186 3187 ASSERT_VOP_ELOCKED(vp, __func__); 3188 ASSERT_VI_LOCKED(vp, __func__); 3189 VNPASS(!VN_IS_DOOMED(vp), vp); 3190 3191 if (vp->v_mflag & VMP_LAZYLIST) { 3192 mp = vp->v_mount; 3193 mtx_lock(&mp->mnt_listmtx); 3194 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3195 vp->v_mflag &= ~VMP_LAZYLIST; 3196 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3197 mp->mnt_lazyvnodelistsize--; 3198 mtx_unlock(&mp->mnt_listmtx); 3199 } 3200 } 3201 3202 static void 3203 vdefer_inactive(struct vnode *vp) 3204 { 3205 3206 ASSERT_VI_LOCKED(vp, __func__); 3207 VNASSERT(vp->v_holdcnt > 0, vp, 3208 ("%s: vnode without hold count", __func__)); 3209 if (VN_IS_DOOMED(vp)) { 3210 vdropl(vp); 3211 return; 3212 } 3213 if (vp->v_iflag & VI_DEFINACT) { 3214 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); 3215 vdropl(vp); 3216 return; 3217 } 3218 if (vp->v_usecount > 0) { 3219 vp->v_iflag &= ~VI_OWEINACT; 3220 vdropl(vp); 3221 return; 3222 } 3223 vlazy(vp); 3224 vp->v_iflag |= VI_DEFINACT; 3225 VI_UNLOCK(vp); 3226 counter_u64_add(deferred_inact, 1); 3227 } 3228 3229 static void 3230 vdefer_inactive_unlocked(struct vnode *vp) 3231 { 3232 3233 VI_LOCK(vp); 3234 if ((vp->v_iflag & VI_OWEINACT) == 0) { 3235 vdropl(vp); 3236 return; 3237 } 3238 vdefer_inactive(vp); 3239 } 3240 3241 enum vput_op { VRELE, VPUT, VUNREF }; 3242 3243 /* 3244 * Handle ->v_usecount transitioning to 0. 3245 * 3246 * By releasing the last usecount we take ownership of the hold count which 3247 * provides liveness of the vnode, meaning we have to vdrop. 3248 * 3249 * If the vnode is of type VCHR we may need to decrement si_usecount, see 3250 * v_decr_devcount for details. 3251 * 3252 * For all vnodes we may need to perform inactive processing. It requires an 3253 * exclusive lock on the vnode, while it is legal to call here with only a 3254 * shared lock (or no locks). If locking the vnode in an expected manner fails, 3255 * inactive processing gets deferred to the syncer. 3256 * 3257 * XXX Some filesystems pass in an exclusively locked vnode and strongly depend 3258 * on the lock being held all the way until VOP_INACTIVE. This in particular 3259 * happens with UFS which adds half-constructed vnodes to the hash, where they 3260 * can be found by other code. 3261 */ 3262 static void 3263 vput_final(struct vnode *vp, enum vput_op func) 3264 { 3265 int error; 3266 bool want_unlock; 3267 3268 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3269 VNPASS(vp->v_holdcnt > 0, vp); 3270 3271 VI_LOCK(vp); 3272 if (__predict_false(vp->v_type == VCHR && func != VRELE)) 3273 v_decr_devcount(vp); 3274 3275 /* 3276 * By the time we got here someone else might have transitioned 3277 * the count back to > 0. 3278 */ 3279 if (vp->v_usecount > 0) 3280 goto out; 3281 3282 /* 3283 * If the vnode is doomed vgone already performed inactive processing 3284 * (if needed). 3285 */ 3286 if (VN_IS_DOOMED(vp)) 3287 goto out; 3288 3289 if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) 3290 goto out; 3291 3292 if (vp->v_iflag & VI_DOINGINACT) 3293 goto out; 3294 3295 /* 3296 * Locking operations here will drop the interlock and possibly the 3297 * vnode lock, opening a window where the vnode can get doomed all the 3298 * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to 3299 * perform inactive. 
3300 */ 3301 vp->v_iflag |= VI_OWEINACT; 3302 want_unlock = false; 3303 error = 0; 3304 switch (func) { 3305 case VRELE: 3306 switch (VOP_ISLOCKED(vp)) { 3307 case LK_EXCLUSIVE: 3308 break; 3309 case LK_EXCLOTHER: 3310 case 0: 3311 want_unlock = true; 3312 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 3313 VI_LOCK(vp); 3314 break; 3315 default: 3316 /* 3317 * The lock has at least one sharer, but we have no way 3318 * to conclude whether this is us. Play it safe and 3319 * defer processing. 3320 */ 3321 error = EAGAIN; 3322 break; 3323 } 3324 break; 3325 case VPUT: 3326 want_unlock = true; 3327 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3328 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | 3329 LK_NOWAIT); 3330 VI_LOCK(vp); 3331 } 3332 break; 3333 case VUNREF: 3334 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3335 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); 3336 VI_LOCK(vp); 3337 } 3338 break; 3339 } 3340 if (error == 0) { 3341 vinactive(vp); 3342 if (want_unlock) 3343 VOP_UNLOCK(vp); 3344 vdropl(vp); 3345 } else { 3346 vdefer_inactive(vp); 3347 } 3348 return; 3349 out: 3350 if (func == VPUT) 3351 VOP_UNLOCK(vp); 3352 vdropl(vp); 3353 } 3354 3355 /* 3356 * Decrement ->v_usecount for a vnode. 3357 * 3358 * Releasing the last use count requires additional processing, see vput_final 3359 * above for details. 3360 * 3361 * Note that releasing use count without the vnode lock requires special casing 3362 * for VCHR, see v_decr_devcount for details. 3363 * 3364 * Comment above each variant denotes lock state on entry and exit. 3365 */ 3366 3367 static void __noinline 3368 vrele_vchr(struct vnode *vp) 3369 { 3370 3371 if (refcount_release_if_not_last(&vp->v_usecount)) 3372 return; 3373 VI_LOCK(vp); 3374 if (!refcount_release(&vp->v_usecount)) { 3375 VI_UNLOCK(vp); 3376 return; 3377 } 3378 v_decr_devcount(vp); 3379 VI_UNLOCK(vp); 3380 vput_final(vp, VRELE); 3381 } 3382 3383 /* 3384 * in: any 3385 * out: same as passed in 3386 */ 3387 void 3388 vrele(struct vnode *vp) 3389 { 3390 3391 ASSERT_VI_UNLOCKED(vp, __func__); 3392 if (__predict_false(vp->v_type == VCHR)) { 3393 vrele_vchr(vp); 3394 return; 3395 } 3396 if (!refcount_release(&vp->v_usecount)) 3397 return; 3398 vput_final(vp, VRELE); 3399 } 3400 3401 /* 3402 * in: locked 3403 * out: unlocked 3404 */ 3405 void 3406 vput(struct vnode *vp) 3407 { 3408 3409 ASSERT_VOP_LOCKED(vp, __func__); 3410 ASSERT_VI_UNLOCKED(vp, __func__); 3411 if (!refcount_release(&vp->v_usecount)) { 3412 VOP_UNLOCK(vp); 3413 return; 3414 } 3415 vput_final(vp, VPUT); 3416 } 3417 3418 /* 3419 * in: locked 3420 * out: locked 3421 */ 3422 void 3423 vunref(struct vnode *vp) 3424 { 3425 3426 ASSERT_VOP_LOCKED(vp, __func__); 3427 ASSERT_VI_UNLOCKED(vp, __func__); 3428 if (!refcount_release(&vp->v_usecount)) 3429 return; 3430 vput_final(vp, VUNREF); 3431 } 3432 3433 void 3434 vhold(struct vnode *vp) 3435 { 3436 struct vdbatch *vd; 3437 int old; 3438 3439 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3440 old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3441 VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3442 ("%s: wrong hold count %d", __func__, old)); 3443 if (old != 0) 3444 return; 3445 critical_enter(); 3446 vd = DPCPU_PTR(vd); 3447 vd->freevnodes--; 3448 critical_exit(); 3449 } 3450 3451 void 3452 vholdl(struct vnode *vp) 3453 { 3454 3455 ASSERT_VI_LOCKED(vp, __func__); 3456 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3457 vhold(vp); 3458 } 3459 3460 void 3461 vholdnz(struct vnode *vp) 3462 { 3463 3464 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3465 #ifdef INVARIANTS 3466 
int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3467 VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3468 ("%s: wrong hold count %d", __func__, old)); 3469 #else 3470 atomic_add_int(&vp->v_holdcnt, 1); 3471 #endif 3472 } 3473 3474 /* 3475 * Grab a hold count unless the vnode is freed. 3476 * 3477 * Only use this routine if vfs smr is the only protection you have against 3478 * freeing the vnode. 3479 * 3480 * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag 3481 * is not set. After the flag is set the vnode becomes immutable to anyone but 3482 * the thread which managed to set the flag. 3483 * 3484 * It may be tempting to replace the loop with: 3485 * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3486 * if (count & VHOLD_NO_SMR) { 3487 * backpedal and error out; 3488 * } 3489 * 3490 * However, while this is more performant, it hinders debugging by eliminating 3491 * the previously mentioned invariant. 3492 */ 3493 bool 3494 vhold_smr(struct vnode *vp) 3495 { 3496 int count; 3497 3498 VFS_SMR_ASSERT_ENTERED(); 3499 3500 count = atomic_load_int(&vp->v_holdcnt); 3501 for (;;) { 3502 if (count & VHOLD_NO_SMR) { 3503 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3504 ("non-zero hold count with flags %d\n", count)); 3505 return (false); 3506 } 3507 3508 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3509 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) 3510 return (true); 3511 } 3512 } 3513 3514 static void __noinline 3515 vdbatch_process(struct vdbatch *vd) 3516 { 3517 struct vnode *vp; 3518 int i; 3519 3520 mtx_assert(&vd->lock, MA_OWNED); 3521 MPASS(curthread->td_pinned > 0); 3522 MPASS(vd->index == VDBATCH_SIZE); 3523 3524 mtx_lock(&vnode_list_mtx); 3525 critical_enter(); 3526 freevnodes += vd->freevnodes; 3527 for (i = 0; i < VDBATCH_SIZE; i++) { 3528 vp = vd->tab[i]; 3529 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 3530 TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); 3531 MPASS(vp->v_dbatchcpu != NOCPU); 3532 vp->v_dbatchcpu = NOCPU; 3533 } 3534 mtx_unlock(&vnode_list_mtx); 3535 vd->freevnodes = 0; 3536 bzero(vd->tab, sizeof(vd->tab)); 3537 vd->index = 0; 3538 critical_exit(); 3539 } 3540 3541 static void 3542 vdbatch_enqueue(struct vnode *vp) 3543 { 3544 struct vdbatch *vd; 3545 3546 ASSERT_VI_LOCKED(vp, __func__); 3547 VNASSERT(!VN_IS_DOOMED(vp), vp, 3548 ("%s: deferring requeue of a doomed vnode", __func__)); 3549 3550 critical_enter(); 3551 vd = DPCPU_PTR(vd); 3552 vd->freevnodes++; 3553 if (vp->v_dbatchcpu != NOCPU) { 3554 VI_UNLOCK(vp); 3555 critical_exit(); 3556 return; 3557 } 3558 3559 sched_pin(); 3560 critical_exit(); 3561 mtx_lock(&vd->lock); 3562 MPASS(vd->index < VDBATCH_SIZE); 3563 MPASS(vd->tab[vd->index] == NULL); 3564 /* 3565 * A hack: we depend on being pinned so that we know what to put in 3566 * ->v_dbatchcpu. 3567 */ 3568 vp->v_dbatchcpu = curcpu; 3569 vd->tab[vd->index] = vp; 3570 vd->index++; 3571 VI_UNLOCK(vp); 3572 if (vd->index == VDBATCH_SIZE) 3573 vdbatch_process(vd); 3574 mtx_unlock(&vd->lock); 3575 sched_unpin(); 3576 } 3577 3578 /* 3579 * This routine must only be called for vnodes which are about to be 3580 * deallocated. Supporting dequeue for arbitrary vndoes would require 3581 * validating that the locked batch matches. 
3582 */ 3583 static void 3584 vdbatch_dequeue(struct vnode *vp) 3585 { 3586 struct vdbatch *vd; 3587 int i; 3588 short cpu; 3589 3590 VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp, 3591 ("%s: called for a used vnode\n", __func__)); 3592 3593 cpu = vp->v_dbatchcpu; 3594 if (cpu == NOCPU) 3595 return; 3596 3597 vd = DPCPU_ID_PTR(cpu, vd); 3598 mtx_lock(&vd->lock); 3599 for (i = 0; i < vd->index; i++) { 3600 if (vd->tab[i] != vp) 3601 continue; 3602 vp->v_dbatchcpu = NOCPU; 3603 vd->index--; 3604 vd->tab[i] = vd->tab[vd->index]; 3605 vd->tab[vd->index] = NULL; 3606 break; 3607 } 3608 mtx_unlock(&vd->lock); 3609 /* 3610 * Either we dequeued the vnode above or the target CPU beat us to it. 3611 */ 3612 MPASS(vp->v_dbatchcpu == NOCPU); 3613 } 3614 3615 /* 3616 * Drop the hold count of the vnode. If this is the last reference to 3617 * the vnode we place it on the free list unless it has been vgone'd 3618 * (marked VIRF_DOOMED) in which case we will free it. 3619 * 3620 * Because the vnode vm object keeps a hold reference on the vnode if 3621 * there is at least one resident non-cached page, the vnode cannot 3622 * leave the active list without the page cleanup done. 3623 */ 3624 static void 3625 vdrop_deactivate(struct vnode *vp) 3626 { 3627 struct mount *mp; 3628 3629 ASSERT_VI_LOCKED(vp, __func__); 3630 /* 3631 * Mark a vnode as free: remove it from its active list 3632 * and put it up for recycling on the freelist. 3633 */ 3634 VNASSERT(!VN_IS_DOOMED(vp), vp, 3635 ("vdrop: returning doomed vnode")); 3636 VNASSERT(vp->v_op != NULL, vp, 3637 ("vdrop: vnode already reclaimed.")); 3638 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 3639 ("vnode with VI_OWEINACT set")); 3640 VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, 3641 ("vnode with VI_DEFINACT set")); 3642 if (vp->v_mflag & VMP_LAZYLIST) { 3643 mp = vp->v_mount; 3644 mtx_lock(&mp->mnt_listmtx); 3645 VNASSERT(vp->v_mflag & VMP_LAZYLIST, vp, ("lost VMP_LAZYLIST")); 3646 /* 3647 * Don't remove the vnode from the lazy list if another thread 3648 * has increased the hold count. It may have re-enqueued the 3649 * vnode to the lazy list and is now responsible for its 3650 * removal. 3651 */ 3652 if (vp->v_holdcnt == 0) { 3653 vp->v_mflag &= ~VMP_LAZYLIST; 3654 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3655 mp->mnt_lazyvnodelistsize--; 3656 } 3657 mtx_unlock(&mp->mnt_listmtx); 3658 } 3659 vdbatch_enqueue(vp); 3660 } 3661 3662 void 3663 vdrop(struct vnode *vp) 3664 { 3665 3666 ASSERT_VI_UNLOCKED(vp, __func__); 3667 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3668 if (refcount_release_if_not_last(&vp->v_holdcnt)) 3669 return; 3670 VI_LOCK(vp); 3671 vdropl(vp); 3672 } 3673 3674 void 3675 vdropl(struct vnode *vp) 3676 { 3677 3678 ASSERT_VI_LOCKED(vp, __func__); 3679 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3680 if (!refcount_release(&vp->v_holdcnt)) { 3681 VI_UNLOCK(vp); 3682 return; 3683 } 3684 if (!VN_IS_DOOMED(vp)) { 3685 vdrop_deactivate(vp); 3686 /* 3687 * Also unlocks the interlock. We can't assert on it as we 3688 * released our hold and by now the vnode might have been 3689 * freed. 3690 */ 3691 return; 3692 } 3693 /* 3694 * Set the VHOLD_NO_SMR flag. 3695 * 3696 * We may be racing against vhold_smr. If they win we can just pretend 3697 * we never got this far, they will vdrop later. 3698 */ 3699 if (!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR)) { 3700 VI_UNLOCK(vp); 3701 /* 3702 * We lost the aforementioned race. Any subsequent access is 3703 * invalid as they might have managed to vdropl on their own. 
3704 */ 3705 return; 3706 } 3707 freevnode(vp); 3708 } 3709 3710 /* 3711 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 3712 * flags. DOINGINACT prevents us from recursing in calls to vinactive. 3713 */ 3714 static void 3715 vinactivef(struct vnode *vp) 3716 { 3717 struct vm_object *obj; 3718 3719 ASSERT_VOP_ELOCKED(vp, "vinactive"); 3720 ASSERT_VI_LOCKED(vp, "vinactive"); 3721 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, 3722 ("vinactive: recursed on VI_DOINGINACT")); 3723 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3724 vp->v_iflag |= VI_DOINGINACT; 3725 vp->v_iflag &= ~VI_OWEINACT; 3726 VI_UNLOCK(vp); 3727 /* 3728 * Before moving off the active list, we must be sure that any 3729 * modified pages are converted into the vnode's dirty 3730 * buffers, since these will no longer be checked once the 3731 * vnode is on the inactive list. 3732 * 3733 * The write-out of the dirty pages is asynchronous. At the 3734 * point that VOP_INACTIVE() is called, there could still be 3735 * pending I/O and dirty pages in the object. 3736 */ 3737 if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 3738 vm_object_mightbedirty(obj)) { 3739 VM_OBJECT_WLOCK(obj); 3740 vm_object_page_clean(obj, 0, 0, 0); 3741 VM_OBJECT_WUNLOCK(obj); 3742 } 3743 VOP_INACTIVE(vp, curthread); 3744 VI_LOCK(vp); 3745 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, 3746 ("vinactive: lost VI_DOINGINACT")); 3747 vp->v_iflag &= ~VI_DOINGINACT; 3748 } 3749 3750 void 3751 vinactive(struct vnode *vp) 3752 { 3753 3754 ASSERT_VOP_ELOCKED(vp, "vinactive"); 3755 ASSERT_VI_LOCKED(vp, "vinactive"); 3756 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3757 3758 if ((vp->v_iflag & VI_OWEINACT) == 0) 3759 return; 3760 if (vp->v_iflag & VI_DOINGINACT) 3761 return; 3762 if (vp->v_usecount > 0) { 3763 vp->v_iflag &= ~VI_OWEINACT; 3764 return; 3765 } 3766 vinactivef(vp); 3767 } 3768 3769 /* 3770 * Remove any vnodes in the vnode table belonging to mount point mp. 3771 * 3772 * If FORCECLOSE is not specified, there should not be any active ones, 3773 * return error if any are found (nb: this is a user error, not a 3774 * system error). If FORCECLOSE is specified, detach any active vnodes 3775 * that are found. 3776 * 3777 * If WRITECLOSE is set, only flush out regular file vnodes open for 3778 * writing. 3779 * 3780 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 3781 * 3782 * `rootrefs' specifies the base reference count for the root vnode 3783 * of this filesystem. The root vnode is considered busy if its 3784 * v_usecount exceeds this value. On a successful return, vflush(, td) 3785 * will call vrele() on the root vnode exactly rootrefs times. 3786 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 3787 * be zero. 3788 */ 3789 #ifdef DIAGNOSTIC 3790 static int busyprt = 0; /* print out busy vnodes */ 3791 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); 3792 #endif 3793 3794 int 3795 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) 3796 { 3797 struct vnode *vp, *mvp, *rootvp = NULL; 3798 struct vattr vattr; 3799 int busy = 0, error; 3800 3801 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, 3802 rootrefs, flags); 3803 if (rootrefs > 0) { 3804 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 3805 ("vflush: bad args")); 3806 /* 3807 * Get the filesystem root vnode. We can vput() it 3808 * immediately, since with rootrefs > 0, it won't go away. 
3809 */ 3810 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { 3811 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", 3812 __func__, error); 3813 return (error); 3814 } 3815 vput(rootvp); 3816 } 3817 loop: 3818 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 3819 vholdl(vp); 3820 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); 3821 if (error) { 3822 vdrop(vp); 3823 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 3824 goto loop; 3825 } 3826 /* 3827 * Skip over a vnodes marked VV_SYSTEM. 3828 */ 3829 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 3830 VOP_UNLOCK(vp); 3831 vdrop(vp); 3832 continue; 3833 } 3834 /* 3835 * If WRITECLOSE is set, flush out unlinked but still open 3836 * files (even if open only for reading) and regular file 3837 * vnodes open for writing. 3838 */ 3839 if (flags & WRITECLOSE) { 3840 if (vp->v_object != NULL) { 3841 VM_OBJECT_WLOCK(vp->v_object); 3842 vm_object_page_clean(vp->v_object, 0, 0, 0); 3843 VM_OBJECT_WUNLOCK(vp->v_object); 3844 } 3845 error = VOP_FSYNC(vp, MNT_WAIT, td); 3846 if (error != 0) { 3847 VOP_UNLOCK(vp); 3848 vdrop(vp); 3849 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 3850 return (error); 3851 } 3852 error = VOP_GETATTR(vp, &vattr, td->td_ucred); 3853 VI_LOCK(vp); 3854 3855 if ((vp->v_type == VNON || 3856 (error == 0 && vattr.va_nlink > 0)) && 3857 (vp->v_writecount <= 0 || vp->v_type != VREG)) { 3858 VOP_UNLOCK(vp); 3859 vdropl(vp); 3860 continue; 3861 } 3862 } else 3863 VI_LOCK(vp); 3864 /* 3865 * With v_usecount == 0, all we need to do is clear out the 3866 * vnode data structures and we are done. 3867 * 3868 * If FORCECLOSE is set, forcibly close the vnode. 3869 */ 3870 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 3871 vgonel(vp); 3872 } else { 3873 busy++; 3874 #ifdef DIAGNOSTIC 3875 if (busyprt) 3876 vn_printf(vp, "vflush: busy vnode "); 3877 #endif 3878 } 3879 VOP_UNLOCK(vp); 3880 vdropl(vp); 3881 } 3882 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 3883 /* 3884 * If just the root vnode is busy, and if its refcount 3885 * is equal to `rootrefs', then go ahead and kill it. 3886 */ 3887 VI_LOCK(rootvp); 3888 KASSERT(busy > 0, ("vflush: not busy")); 3889 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 3890 ("vflush: usecount %d < rootrefs %d", 3891 rootvp->v_usecount, rootrefs)); 3892 if (busy == 1 && rootvp->v_usecount == rootrefs) { 3893 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 3894 vgone(rootvp); 3895 VOP_UNLOCK(rootvp); 3896 busy = 0; 3897 } else 3898 VI_UNLOCK(rootvp); 3899 } 3900 if (busy) { 3901 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 3902 busy); 3903 return (EBUSY); 3904 } 3905 for (; rootrefs > 0; rootrefs--) 3906 vrele(rootvp); 3907 return (0); 3908 } 3909 3910 /* 3911 * Recycle an unused vnode to the front of the free list. 3912 */ 3913 int 3914 vrecycle(struct vnode *vp) 3915 { 3916 int recycled; 3917 3918 VI_LOCK(vp); 3919 recycled = vrecyclel(vp); 3920 VI_UNLOCK(vp); 3921 return (recycled); 3922 } 3923 3924 /* 3925 * vrecycle, with the vp interlock held. 3926 */ 3927 int 3928 vrecyclel(struct vnode *vp) 3929 { 3930 int recycled; 3931 3932 ASSERT_VOP_ELOCKED(vp, __func__); 3933 ASSERT_VI_LOCKED(vp, __func__); 3934 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3935 recycled = 0; 3936 if (vp->v_usecount == 0) { 3937 recycled = 1; 3938 vgonel(vp); 3939 } 3940 return (recycled); 3941 } 3942 3943 /* 3944 * Eliminate all activity associated with a vnode 3945 * in preparation for reuse. 
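 *
 * On return the vnode is VIRF_DOOMED: its type is VBAD, its operations
 * vector points at dead_vnodeops and its lock has been reset to the
 * standard vnode lock, so any remaining references merely keep the
 * memory alive until the final vdrop() frees it.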
3946 */ 3947 void 3948 vgone(struct vnode *vp) 3949 { 3950 VI_LOCK(vp); 3951 vgonel(vp); 3952 VI_UNLOCK(vp); 3953 } 3954 3955 static void 3956 notify_lowervp_vfs_dummy(struct mount *mp __unused, 3957 struct vnode *lowervp __unused) 3958 { 3959 } 3960 3961 /* 3962 * Notify upper mounts about reclaimed or unlinked vnode. 3963 */ 3964 void 3965 vfs_notify_upper(struct vnode *vp, int event) 3966 { 3967 static struct vfsops vgonel_vfsops = { 3968 .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, 3969 .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, 3970 }; 3971 struct mount *mp, *ump, *mmp; 3972 3973 mp = vp->v_mount; 3974 if (mp == NULL) 3975 return; 3976 if (TAILQ_EMPTY(&mp->mnt_uppers)) 3977 return; 3978 3979 mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); 3980 mmp->mnt_op = &vgonel_vfsops; 3981 mmp->mnt_kern_flag |= MNTK_MARKER; 3982 MNT_ILOCK(mp); 3983 mp->mnt_kern_flag |= MNTK_VGONE_UPPER; 3984 for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { 3985 if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { 3986 ump = TAILQ_NEXT(ump, mnt_upper_link); 3987 continue; 3988 } 3989 TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); 3990 MNT_IUNLOCK(mp); 3991 switch (event) { 3992 case VFS_NOTIFY_UPPER_RECLAIM: 3993 VFS_RECLAIM_LOWERVP(ump, vp); 3994 break; 3995 case VFS_NOTIFY_UPPER_UNLINK: 3996 VFS_UNLINK_LOWERVP(ump, vp); 3997 break; 3998 default: 3999 KASSERT(0, ("invalid event %d", event)); 4000 break; 4001 } 4002 MNT_ILOCK(mp); 4003 ump = TAILQ_NEXT(mmp, mnt_upper_link); 4004 TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); 4005 } 4006 free(mmp, M_TEMP); 4007 mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; 4008 if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { 4009 mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; 4010 wakeup(&mp->mnt_uppers); 4011 } 4012 MNT_IUNLOCK(mp); 4013 } 4014 4015 /* 4016 * vgone, with the vp interlock held. 4017 */ 4018 static void 4019 vgonel(struct vnode *vp) 4020 { 4021 struct thread *td; 4022 struct mount *mp; 4023 vm_object_t object; 4024 bool active, oweinact; 4025 4026 ASSERT_VOP_ELOCKED(vp, "vgonel"); 4027 ASSERT_VI_LOCKED(vp, "vgonel"); 4028 VNASSERT(vp->v_holdcnt, vp, 4029 ("vgonel: vp %p has no reference.", vp)); 4030 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4031 td = curthread; 4032 4033 /* 4034 * Don't vgonel if we're already doomed. 4035 */ 4036 if (vp->v_irflag & VIRF_DOOMED) 4037 return; 4038 /* 4039 * Paired with freevnode. 4040 */ 4041 vn_seqc_write_begin_locked(vp); 4042 vunlazy_gone(vp); 4043 vp->v_irflag |= VIRF_DOOMED; 4044 4045 /* 4046 * Check to see if the vnode is in use. If so, we have to call 4047 * VOP_CLOSE() and VOP_INACTIVE(). 4048 */ 4049 active = vp->v_usecount > 0; 4050 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4051 /* 4052 * If we need to do inactive VI_OWEINACT will be set. 4053 */ 4054 if (vp->v_iflag & VI_DEFINACT) { 4055 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); 4056 vp->v_iflag &= ~VI_DEFINACT; 4057 vdropl(vp); 4058 } else { 4059 VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); 4060 VI_UNLOCK(vp); 4061 } 4062 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 4063 4064 /* 4065 * If purging an active vnode, it must be closed and 4066 * deactivated before being reclaimed. 4067 */ 4068 if (active) 4069 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 4070 if (oweinact || active) { 4071 VI_LOCK(vp); 4072 vinactivef(vp); 4073 VI_UNLOCK(vp); 4074 } 4075 if (vp->v_type == VSOCK) 4076 vfs_unp_reclaim(vp); 4077 4078 /* 4079 * Clean out any buffers associated with the vnode. 
4080 * If the flush fails, just toss the buffers. 4081 */ 4082 mp = NULL; 4083 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 4084 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 4085 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { 4086 while (vinvalbuf(vp, 0, 0, 0) != 0) 4087 ; 4088 } 4089 4090 BO_LOCK(&vp->v_bufobj); 4091 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && 4092 vp->v_bufobj.bo_dirty.bv_cnt == 0 && 4093 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && 4094 vp->v_bufobj.bo_clean.bv_cnt == 0, 4095 ("vp %p bufobj not invalidated", vp)); 4096 4097 /* 4098 * For VMIO bufobj, BO_DEAD is set later, or in 4099 * vm_object_terminate() after the object's page queue is 4100 * flushed. 4101 */ 4102 object = vp->v_bufobj.bo_object; 4103 if (object == NULL) 4104 vp->v_bufobj.bo_flag |= BO_DEAD; 4105 BO_UNLOCK(&vp->v_bufobj); 4106 4107 /* 4108 * Handle the VM part. Tmpfs handles v_object on its own (the 4109 * OBJT_VNODE check). Nullfs or other bypassing filesystems 4110 * should not touch the object borrowed from the lower vnode 4111 * (the handle check). 4112 */ 4113 if (object != NULL && object->type == OBJT_VNODE && 4114 object->handle == vp) 4115 vnode_destroy_vobject(vp); 4116 4117 /* 4118 * Reclaim the vnode. 4119 */ 4120 if (VOP_RECLAIM(vp, td)) 4121 panic("vgone: cannot reclaim"); 4122 if (mp != NULL) 4123 vn_finished_secondary_write(mp); 4124 VNASSERT(vp->v_object == NULL, vp, 4125 ("vop_reclaim left v_object vp=%p", vp)); 4126 /* 4127 * Clear the advisory locks and wake up waiting threads. 4128 */ 4129 (void)VOP_ADVLOCKPURGE(vp); 4130 vp->v_lockf = NULL; 4131 /* 4132 * Delete from old mount point vnode list. 4133 */ 4134 delmntque(vp); 4135 cache_purge_vgone(vp); 4136 /* 4137 * Done with purge, reset to the standard lock and invalidate 4138 * the vnode. 4139 */ 4140 VI_LOCK(vp); 4141 vp->v_vnlock = &vp->v_lock; 4142 vp->v_op = &dead_vnodeops; 4143 vp->v_type = VBAD; 4144 } 4145 4146 /* 4147 * Calculate the total number of references to a special device. 4148 */ 4149 int 4150 vcount(struct vnode *vp) 4151 { 4152 int count; 4153 4154 dev_lock(); 4155 count = vp->v_rdev->si_usecount; 4156 dev_unlock(); 4157 return (count); 4158 } 4159 4160 /* 4161 * Print out a description of a vnode. 4162 */ 4163 static const char * const typename[] = 4164 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", 4165 "VMARKER"}; 4166 4167 _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, 4168 "new hold count flag not added to vn_printf"); 4169 4170 void 4171 vn_printf(struct vnode *vp, const char *fmt, ...) 
4172 { 4173 va_list ap; 4174 char buf[256], buf2[16]; 4175 u_long flags; 4176 u_int holdcnt; 4177 4178 va_start(ap, fmt); 4179 vprintf(fmt, ap); 4180 va_end(ap); 4181 printf("%p: ", (void *)vp); 4182 printf("type %s\n", typename[vp->v_type]); 4183 holdcnt = atomic_load_int(&vp->v_holdcnt); 4184 printf(" usecount %d, writecount %d, refcount %d seqc users %d", 4185 vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, 4186 vp->v_seqc_users); 4187 switch (vp->v_type) { 4188 case VDIR: 4189 printf(" mountedhere %p\n", vp->v_mountedhere); 4190 break; 4191 case VCHR: 4192 printf(" rdev %p\n", vp->v_rdev); 4193 break; 4194 case VSOCK: 4195 printf(" socket %p\n", vp->v_unpcb); 4196 break; 4197 case VFIFO: 4198 printf(" fifoinfo %p\n", vp->v_fifoinfo); 4199 break; 4200 default: 4201 printf("\n"); 4202 break; 4203 } 4204 buf[0] = '\0'; 4205 buf[1] = '\0'; 4206 if (holdcnt & VHOLD_NO_SMR) 4207 strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); 4208 printf(" hold count flags (%s)\n", buf + 1); 4209 4210 buf[0] = '\0'; 4211 buf[1] = '\0'; 4212 if (vp->v_irflag & VIRF_DOOMED) 4213 strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); 4214 flags = vp->v_irflag & ~(VIRF_DOOMED); 4215 if (flags != 0) { 4216 snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); 4217 strlcat(buf, buf2, sizeof(buf)); 4218 } 4219 if (vp->v_vflag & VV_ROOT) 4220 strlcat(buf, "|VV_ROOT", sizeof(buf)); 4221 if (vp->v_vflag & VV_ISTTY) 4222 strlcat(buf, "|VV_ISTTY", sizeof(buf)); 4223 if (vp->v_vflag & VV_NOSYNC) 4224 strlcat(buf, "|VV_NOSYNC", sizeof(buf)); 4225 if (vp->v_vflag & VV_ETERNALDEV) 4226 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); 4227 if (vp->v_vflag & VV_CACHEDLABEL) 4228 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); 4229 if (vp->v_vflag & VV_VMSIZEVNLOCK) 4230 strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); 4231 if (vp->v_vflag & VV_COPYONWRITE) 4232 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); 4233 if (vp->v_vflag & VV_SYSTEM) 4234 strlcat(buf, "|VV_SYSTEM", sizeof(buf)); 4235 if (vp->v_vflag & VV_PROCDEP) 4236 strlcat(buf, "|VV_PROCDEP", sizeof(buf)); 4237 if (vp->v_vflag & VV_NOKNOTE) 4238 strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); 4239 if (vp->v_vflag & VV_DELETED) 4240 strlcat(buf, "|VV_DELETED", sizeof(buf)); 4241 if (vp->v_vflag & VV_MD) 4242 strlcat(buf, "|VV_MD", sizeof(buf)); 4243 if (vp->v_vflag & VV_FORCEINSMQ) 4244 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); 4245 if (vp->v_vflag & VV_READLINK) 4246 strlcat(buf, "|VV_READLINK", sizeof(buf)); 4247 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | 4248 VV_CACHEDLABEL | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | 4249 VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ); 4250 if (flags != 0) { 4251 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); 4252 strlcat(buf, buf2, sizeof(buf)); 4253 } 4254 if (vp->v_iflag & VI_TEXT_REF) 4255 strlcat(buf, "|VI_TEXT_REF", sizeof(buf)); 4256 if (vp->v_iflag & VI_MOUNT) 4257 strlcat(buf, "|VI_MOUNT", sizeof(buf)); 4258 if (vp->v_iflag & VI_DOINGINACT) 4259 strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); 4260 if (vp->v_iflag & VI_OWEINACT) 4261 strlcat(buf, "|VI_OWEINACT", sizeof(buf)); 4262 if (vp->v_iflag & VI_DEFINACT) 4263 strlcat(buf, "|VI_DEFINACT", sizeof(buf)); 4264 flags = vp->v_iflag & ~(VI_TEXT_REF | VI_MOUNT | VI_DOINGINACT | 4265 VI_OWEINACT | VI_DEFINACT); 4266 if (flags != 0) { 4267 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); 4268 strlcat(buf, buf2, sizeof(buf)); 4269 } 4270 if (vp->v_mflag & VMP_LAZYLIST) 4271 strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); 4272 flags = vp->v_mflag & 
~(VMP_LAZYLIST); 4273 if (flags != 0) { 4274 snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); 4275 strlcat(buf, buf2, sizeof(buf)); 4276 } 4277 printf(" flags (%s)\n", buf + 1); 4278 if (mtx_owned(VI_MTX(vp))) 4279 printf(" VI_LOCKed"); 4280 if (vp->v_object != NULL) 4281 printf(" v_object %p ref %d pages %d " 4282 "cleanbuf %d dirtybuf %d\n", 4283 vp->v_object, vp->v_object->ref_count, 4284 vp->v_object->resident_page_count, 4285 vp->v_bufobj.bo_clean.bv_cnt, 4286 vp->v_bufobj.bo_dirty.bv_cnt); 4287 printf(" "); 4288 lockmgr_printinfo(vp->v_vnlock); 4289 if (vp->v_data != NULL) 4290 VOP_PRINT(vp); 4291 } 4292 4293 #ifdef DDB 4294 /* 4295 * List all of the locked vnodes in the system. 4296 * Called when debugging the kernel. 4297 */ 4298 DB_SHOW_COMMAND(lockedvnods, lockedvnodes) 4299 { 4300 struct mount *mp; 4301 struct vnode *vp; 4302 4303 /* 4304 * Note: because this is DDB, we can't obey the locking semantics 4305 * for these structures, which means we could catch an inconsistent 4306 * state and dereference a nasty pointer. Not much to be done 4307 * about that. 4308 */ 4309 db_printf("Locked vnodes\n"); 4310 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4311 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4312 if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) 4313 vn_printf(vp, "vnode "); 4314 } 4315 } 4316 } 4317 4318 /* 4319 * Show details about the given vnode. 4320 */ 4321 DB_SHOW_COMMAND(vnode, db_show_vnode) 4322 { 4323 struct vnode *vp; 4324 4325 if (!have_addr) 4326 return; 4327 vp = (struct vnode *)addr; 4328 vn_printf(vp, "vnode "); 4329 } 4330 4331 /* 4332 * Show details about the given mount point. 4333 */ 4334 DB_SHOW_COMMAND(mount, db_show_mount) 4335 { 4336 struct mount *mp; 4337 struct vfsopt *opt; 4338 struct statfs *sp; 4339 struct vnode *vp; 4340 char buf[512]; 4341 uint64_t mflags; 4342 u_int flags; 4343 4344 if (!have_addr) { 4345 /* No address given, print short info about all mount points. 
*/ 4346 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4347 db_printf("%p %s on %s (%s)\n", mp, 4348 mp->mnt_stat.f_mntfromname, 4349 mp->mnt_stat.f_mntonname, 4350 mp->mnt_stat.f_fstypename); 4351 if (db_pager_quit) 4352 break; 4353 } 4354 db_printf("\nMore info: show mount <addr>\n"); 4355 return; 4356 } 4357 4358 mp = (struct mount *)addr; 4359 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 4360 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 4361 4362 buf[0] = '\0'; 4363 mflags = mp->mnt_flag; 4364 #define MNT_FLAG(flag) do { \ 4365 if (mflags & (flag)) { \ 4366 if (buf[0] != '\0') \ 4367 strlcat(buf, ", ", sizeof(buf)); \ 4368 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 4369 mflags &= ~(flag); \ 4370 } \ 4371 } while (0) 4372 MNT_FLAG(MNT_RDONLY); 4373 MNT_FLAG(MNT_SYNCHRONOUS); 4374 MNT_FLAG(MNT_NOEXEC); 4375 MNT_FLAG(MNT_NOSUID); 4376 MNT_FLAG(MNT_NFS4ACLS); 4377 MNT_FLAG(MNT_UNION); 4378 MNT_FLAG(MNT_ASYNC); 4379 MNT_FLAG(MNT_SUIDDIR); 4380 MNT_FLAG(MNT_SOFTDEP); 4381 MNT_FLAG(MNT_NOSYMFOLLOW); 4382 MNT_FLAG(MNT_GJOURNAL); 4383 MNT_FLAG(MNT_MULTILABEL); 4384 MNT_FLAG(MNT_ACLS); 4385 MNT_FLAG(MNT_NOATIME); 4386 MNT_FLAG(MNT_NOCLUSTERR); 4387 MNT_FLAG(MNT_NOCLUSTERW); 4388 MNT_FLAG(MNT_SUJ); 4389 MNT_FLAG(MNT_EXRDONLY); 4390 MNT_FLAG(MNT_EXPORTED); 4391 MNT_FLAG(MNT_DEFEXPORTED); 4392 MNT_FLAG(MNT_EXPORTANON); 4393 MNT_FLAG(MNT_EXKERB); 4394 MNT_FLAG(MNT_EXPUBLIC); 4395 MNT_FLAG(MNT_LOCAL); 4396 MNT_FLAG(MNT_QUOTA); 4397 MNT_FLAG(MNT_ROOTFS); 4398 MNT_FLAG(MNT_USER); 4399 MNT_FLAG(MNT_IGNORE); 4400 MNT_FLAG(MNT_UPDATE); 4401 MNT_FLAG(MNT_DELEXPORT); 4402 MNT_FLAG(MNT_RELOAD); 4403 MNT_FLAG(MNT_FORCE); 4404 MNT_FLAG(MNT_SNAPSHOT); 4405 MNT_FLAG(MNT_BYFSID); 4406 #undef MNT_FLAG 4407 if (mflags != 0) { 4408 if (buf[0] != '\0') 4409 strlcat(buf, ", ", sizeof(buf)); 4410 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4411 "0x%016jx", mflags); 4412 } 4413 db_printf(" mnt_flag = %s\n", buf); 4414 4415 buf[0] = '\0'; 4416 flags = mp->mnt_kern_flag; 4417 #define MNT_KERN_FLAG(flag) do { \ 4418 if (flags & (flag)) { \ 4419 if (buf[0] != '\0') \ 4420 strlcat(buf, ", ", sizeof(buf)); \ 4421 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 4422 flags &= ~(flag); \ 4423 } \ 4424 } while (0) 4425 MNT_KERN_FLAG(MNTK_UNMOUNTF); 4426 MNT_KERN_FLAG(MNTK_ASYNC); 4427 MNT_KERN_FLAG(MNTK_SOFTDEP); 4428 MNT_KERN_FLAG(MNTK_DRAINING); 4429 MNT_KERN_FLAG(MNTK_REFEXPIRE); 4430 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 4431 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 4432 MNT_KERN_FLAG(MNTK_NO_IOPF); 4433 MNT_KERN_FLAG(MNTK_VGONE_UPPER); 4434 MNT_KERN_FLAG(MNTK_VGONE_WAITER); 4435 MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); 4436 MNT_KERN_FLAG(MNTK_MARKER); 4437 MNT_KERN_FLAG(MNTK_USES_BCACHE); 4438 MNT_KERN_FLAG(MNTK_FPLOOKUP); 4439 MNT_KERN_FLAG(MNTK_NOASYNC); 4440 MNT_KERN_FLAG(MNTK_UNMOUNT); 4441 MNT_KERN_FLAG(MNTK_MWAIT); 4442 MNT_KERN_FLAG(MNTK_SUSPEND); 4443 MNT_KERN_FLAG(MNTK_SUSPEND2); 4444 MNT_KERN_FLAG(MNTK_SUSPENDED); 4445 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 4446 MNT_KERN_FLAG(MNTK_NOKNOTE); 4447 #undef MNT_KERN_FLAG 4448 if (flags != 0) { 4449 if (buf[0] != '\0') 4450 strlcat(buf, ", ", sizeof(buf)); 4451 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4452 "0x%08x", flags); 4453 } 4454 db_printf(" mnt_kern_flag = %s\n", buf); 4455 4456 db_printf(" mnt_opt = "); 4457 opt = TAILQ_FIRST(mp->mnt_opt); 4458 if (opt != NULL) { 4459 db_printf("%s", opt->name); 4460 opt = TAILQ_NEXT(opt, link); 4461 while (opt != NULL) { 4462 db_printf(", %s", opt->name); 4463 opt = TAILQ_NEXT(opt, link); 4464 } 4465 
} 4466 db_printf("\n"); 4467 4468 sp = &mp->mnt_stat; 4469 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " 4470 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " 4471 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " 4472 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", 4473 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, 4474 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, 4475 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, 4476 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, 4477 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, 4478 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 4479 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, 4480 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); 4481 4482 db_printf(" mnt_cred = { uid=%u ruid=%u", 4483 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); 4484 if (jailed(mp->mnt_cred)) 4485 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); 4486 db_printf(" }\n"); 4487 db_printf(" mnt_ref = %d (with %d in the struct)\n", 4488 vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); 4489 db_printf(" mnt_gen = %d\n", mp->mnt_gen); 4490 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); 4491 db_printf(" mnt_lazyvnodelistsize = %d\n", 4492 mp->mnt_lazyvnodelistsize); 4493 db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", 4494 vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); 4495 db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); 4496 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 4497 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 4498 db_printf(" mnt_lockref = %d (with %d in the struct)\n", 4499 vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); 4500 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 4501 db_printf(" mnt_secondary_accwrites = %d\n", 4502 mp->mnt_secondary_accwrites); 4503 db_printf(" mnt_gjprovider = %s\n", 4504 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); 4505 db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); 4506 4507 db_printf("\n\nList of active vnodes\n"); 4508 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4509 if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { 4510 vn_printf(vp, "vnode "); 4511 if (db_pager_quit) 4512 break; 4513 } 4514 } 4515 db_printf("\n\nList of inactive vnodes\n"); 4516 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4517 if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { 4518 vn_printf(vp, "vnode "); 4519 if (db_pager_quit) 4520 break; 4521 } 4522 } 4523 } 4524 #endif /* DDB */ 4525 4526 /* 4527 * Fill in a struct xvfsconf based on a struct vfsconf. 4528 */ 4529 static int 4530 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) 4531 { 4532 struct xvfsconf xvfsp; 4533 4534 bzero(&xvfsp, sizeof(xvfsp)); 4535 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4536 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4537 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4538 xvfsp.vfc_flags = vfsp->vfc_flags; 4539 /* 4540 * These are unused in userland, we keep them 4541 * to not break binary compatibility. 
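 *
 * Userland consumers (e.g. getvfsbyname(3)) read these records through
 * the vfs.conflist sysctl defined below; a rough, purely illustrative
 * sketch of such a consumer would be:
 *
 *	size_t len;
 *	if (sysctlbyname("vfs.conflist", NULL, &len, NULL, 0) == 0) {
 *		struct xvfsconf *xp = malloc(len);
 *		if (xp != NULL)
 *			(void)sysctlbyname("vfs.conflist", xp, &len, NULL, 0);
 *	}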
4542 */ 4543 xvfsp.vfc_vfsops = NULL; 4544 xvfsp.vfc_next = NULL; 4545 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4546 } 4547 4548 #ifdef COMPAT_FREEBSD32 4549 struct xvfsconf32 { 4550 uint32_t vfc_vfsops; 4551 char vfc_name[MFSNAMELEN]; 4552 int32_t vfc_typenum; 4553 int32_t vfc_refcount; 4554 int32_t vfc_flags; 4555 uint32_t vfc_next; 4556 }; 4557 4558 static int 4559 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) 4560 { 4561 struct xvfsconf32 xvfsp; 4562 4563 bzero(&xvfsp, sizeof(xvfsp)); 4564 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4565 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4566 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4567 xvfsp.vfc_flags = vfsp->vfc_flags; 4568 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4569 } 4570 #endif 4571 4572 /* 4573 * Top level filesystem related information gathering. 4574 */ 4575 static int 4576 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 4577 { 4578 struct vfsconf *vfsp; 4579 int error; 4580 4581 error = 0; 4582 vfsconf_slock(); 4583 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4584 #ifdef COMPAT_FREEBSD32 4585 if (req->flags & SCTL_MASK32) 4586 error = vfsconf2x32(req, vfsp); 4587 else 4588 #endif 4589 error = vfsconf2x(req, vfsp); 4590 if (error) 4591 break; 4592 } 4593 vfsconf_sunlock(); 4594 return (error); 4595 } 4596 4597 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | 4598 CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, 4599 "S,xvfsconf", "List of all configured filesystems"); 4600 4601 #ifndef BURN_BRIDGES 4602 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 4603 4604 static int 4605 vfs_sysctl(SYSCTL_HANDLER_ARGS) 4606 { 4607 int *name = (int *)arg1 - 1; /* XXX */ 4608 u_int namelen = arg2 + 1; /* XXX */ 4609 struct vfsconf *vfsp; 4610 4611 log(LOG_WARNING, "userland calling deprecated sysctl, " 4612 "please rebuild world\n"); 4613 4614 #if 1 || defined(COMPAT_PRELITE2) 4615 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
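 * A request with only a single remaining name element is the historic
 * VFS_VFSCONF form and is handed off to sysctl_ovfs_conf() below.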
*/ 4616 if (namelen == 1) 4617 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 4618 #endif 4619 4620 switch (name[1]) { 4621 case VFS_MAXTYPENUM: 4622 if (namelen != 2) 4623 return (ENOTDIR); 4624 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 4625 case VFS_CONF: 4626 if (namelen != 3) 4627 return (ENOTDIR); /* overloaded */ 4628 vfsconf_slock(); 4629 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4630 if (vfsp->vfc_typenum == name[2]) 4631 break; 4632 } 4633 vfsconf_sunlock(); 4634 if (vfsp == NULL) 4635 return (EOPNOTSUPP); 4636 #ifdef COMPAT_FREEBSD32 4637 if (req->flags & SCTL_MASK32) 4638 return (vfsconf2x32(req, vfsp)); 4639 else 4640 #endif 4641 return (vfsconf2x(req, vfsp)); 4642 } 4643 return (EOPNOTSUPP); 4644 } 4645 4646 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | 4647 CTLFLAG_MPSAFE, vfs_sysctl, 4648 "Generic filesystem"); 4649 4650 #if 1 || defined(COMPAT_PRELITE2) 4651 4652 static int 4653 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 4654 { 4655 int error; 4656 struct vfsconf *vfsp; 4657 struct ovfsconf ovfs; 4658 4659 vfsconf_slock(); 4660 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4661 bzero(&ovfs, sizeof(ovfs)); 4662 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 4663 strcpy(ovfs.vfc_name, vfsp->vfc_name); 4664 ovfs.vfc_index = vfsp->vfc_typenum; 4665 ovfs.vfc_refcount = vfsp->vfc_refcount; 4666 ovfs.vfc_flags = vfsp->vfc_flags; 4667 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 4668 if (error != 0) { 4669 vfsconf_sunlock(); 4670 return (error); 4671 } 4672 } 4673 vfsconf_sunlock(); 4674 return (0); 4675 } 4676 4677 #endif /* 1 || COMPAT_PRELITE2 */ 4678 #endif /* !BURN_BRIDGES */ 4679 4680 #define KINFO_VNODESLOP 10 4681 #ifdef notyet 4682 /* 4683 * Dump vnode list (via sysctl). 4684 */ 4685 /* ARGSUSED */ 4686 static int 4687 sysctl_vnode(SYSCTL_HANDLER_ARGS) 4688 { 4689 struct xvnode *xvn; 4690 struct mount *mp; 4691 struct vnode *vp; 4692 int error, len, n; 4693 4694 /* 4695 * Stale numvnodes access is not fatal here. 4696 */ 4697 req->lock = 0; 4698 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; 4699 if (!req->oldptr) 4700 /* Make an estimate */ 4701 return (SYSCTL_OUT(req, 0, len)); 4702 4703 error = sysctl_wire_old_buffer(req, 0); 4704 if (error != 0) 4705 return (error); 4706 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); 4707 n = 0; 4708 mtx_lock(&mountlist_mtx); 4709 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4710 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 4711 continue; 4712 MNT_ILOCK(mp); 4713 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4714 if (n == len) 4715 break; 4716 vref(vp); 4717 xvn[n].xv_size = sizeof *xvn; 4718 xvn[n].xv_vnode = vp; 4719 xvn[n].xv_id = 0; /* XXX compat */ 4720 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field 4721 XV_COPY(usecount); 4722 XV_COPY(writecount); 4723 XV_COPY(holdcnt); 4724 XV_COPY(mount); 4725 XV_COPY(numoutput); 4726 XV_COPY(type); 4727 #undef XV_COPY 4728 xvn[n].xv_flag = vp->v_vflag; 4729 4730 switch (vp->v_type) { 4731 case VREG: 4732 case VDIR: 4733 case VLNK: 4734 break; 4735 case VBLK: 4736 case VCHR: 4737 if (vp->v_rdev == NULL) { 4738 vrele(vp); 4739 continue; 4740 } 4741 xvn[n].xv_dev = dev2udev(vp->v_rdev); 4742 break; 4743 case VSOCK: 4744 xvn[n].xv_socket = vp->v_socket; 4745 break; 4746 case VFIFO: 4747 xvn[n].xv_fifo = vp->v_fifoinfo; 4748 break; 4749 case VNON: 4750 case VBAD: 4751 default: 4752 /* shouldn't happen? 
*/ 4753 vrele(vp); 4754 continue; 4755 } 4756 vrele(vp); 4757 ++n; 4758 } 4759 MNT_IUNLOCK(mp); 4760 mtx_lock(&mountlist_mtx); 4761 vfs_unbusy(mp); 4762 if (n == len) 4763 break; 4764 } 4765 mtx_unlock(&mountlist_mtx); 4766 4767 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); 4768 free(xvn, M_TEMP); 4769 return (error); 4770 } 4771 4772 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | 4773 CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", 4774 ""); 4775 #endif 4776 4777 static void 4778 unmount_or_warn(struct mount *mp) 4779 { 4780 int error; 4781 4782 error = dounmount(mp, MNT_FORCE, curthread); 4783 if (error != 0) { 4784 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); 4785 if (error == EBUSY) 4786 printf("BUSY)\n"); 4787 else 4788 printf("%d)\n", error); 4789 } 4790 } 4791 4792 /* 4793 * Unmount all filesystems. The list is traversed in reverse order 4794 * of mounting to avoid dependencies. 4795 */ 4796 void 4797 vfs_unmountall(void) 4798 { 4799 struct mount *mp, *tmp; 4800 4801 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); 4802 4803 /* 4804 * Since this only runs when rebooting, it is not interlocked. 4805 */ 4806 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { 4807 vfs_ref(mp); 4808 4809 /* 4810 * Forcibly unmounting "/dev" before "/" would prevent clean 4811 * unmount of the latter. 4812 */ 4813 if (mp == rootdevmp) 4814 continue; 4815 4816 unmount_or_warn(mp); 4817 } 4818 4819 if (rootdevmp != NULL) 4820 unmount_or_warn(rootdevmp); 4821 } 4822 4823 static void 4824 vfs_deferred_inactive(struct vnode *vp, int lkflags) 4825 { 4826 4827 ASSERT_VI_LOCKED(vp, __func__); 4828 VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set")); 4829 if ((vp->v_iflag & VI_OWEINACT) == 0) { 4830 vdropl(vp); 4831 return; 4832 } 4833 if (vn_lock(vp, lkflags) == 0) { 4834 VI_LOCK(vp); 4835 vinactive(vp); 4836 VOP_UNLOCK(vp); 4837 vdropl(vp); 4838 return; 4839 } 4840 vdefer_inactive_unlocked(vp); 4841 } 4842 4843 static int 4844 vfs_periodic_inactive_filter(struct vnode *vp, void *arg) 4845 { 4846 4847 return (vp->v_iflag & VI_DEFINACT); 4848 } 4849 4850 static void __noinline 4851 vfs_periodic_inactive(struct mount *mp, int flags) 4852 { 4853 struct vnode *vp, *mvp; 4854 int lkflags; 4855 4856 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 4857 if (flags != MNT_WAIT) 4858 lkflags |= LK_NOWAIT; 4859 4860 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { 4861 if ((vp->v_iflag & VI_DEFINACT) == 0) { 4862 VI_UNLOCK(vp); 4863 continue; 4864 } 4865 vp->v_iflag &= ~VI_DEFINACT; 4866 vfs_deferred_inactive(vp, lkflags); 4867 } 4868 } 4869 4870 static inline bool 4871 vfs_want_msync(struct vnode *vp) 4872 { 4873 struct vm_object *obj; 4874 4875 /* 4876 * This test may be performed without any locks held. 4877 * We rely on vm_object's type stability. 
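 * A vnode is considered to want an msync only when VV_NOSYNC is clear
 * and it has a VM object that may contain dirty pages.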
4878 */ 4879 if (vp->v_vflag & VV_NOSYNC) 4880 return (false); 4881 obj = vp->v_object; 4882 return (obj != NULL && vm_object_mightbedirty(obj)); 4883 } 4884 4885 static int 4886 vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) 4887 { 4888 4889 if (vp->v_vflag & VV_NOSYNC) 4890 return (false); 4891 if (vp->v_iflag & VI_DEFINACT) 4892 return (true); 4893 return (vfs_want_msync(vp)); 4894 } 4895 4896 static void __noinline 4897 vfs_periodic_msync_inactive(struct mount *mp, int flags) 4898 { 4899 struct vnode *vp, *mvp; 4900 struct vm_object *obj; 4901 struct thread *td; 4902 int lkflags, objflags; 4903 bool seen_defer; 4904 4905 td = curthread; 4906 4907 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 4908 if (flags != MNT_WAIT) { 4909 lkflags |= LK_NOWAIT; 4910 objflags = OBJPC_NOSYNC; 4911 } else { 4912 objflags = OBJPC_SYNC; 4913 } 4914 4915 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { 4916 seen_defer = false; 4917 if (vp->v_iflag & VI_DEFINACT) { 4918 vp->v_iflag &= ~VI_DEFINACT; 4919 seen_defer = true; 4920 } 4921 if (!vfs_want_msync(vp)) { 4922 if (seen_defer) 4923 vfs_deferred_inactive(vp, lkflags); 4924 else 4925 VI_UNLOCK(vp); 4926 continue; 4927 } 4928 if (vget(vp, lkflags, td) == 0) { 4929 obj = vp->v_object; 4930 if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) { 4931 VM_OBJECT_WLOCK(obj); 4932 vm_object_page_clean(obj, 0, 0, objflags); 4933 VM_OBJECT_WUNLOCK(obj); 4934 } 4935 vput(vp); 4936 if (seen_defer) 4937 vdrop(vp); 4938 } else { 4939 if (seen_defer) 4940 vdefer_inactive_unlocked(vp); 4941 } 4942 } 4943 } 4944 4945 void 4946 vfs_periodic(struct mount *mp, int flags) 4947 { 4948 4949 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 4950 4951 if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0) 4952 vfs_periodic_inactive(mp, flags); 4953 else 4954 vfs_periodic_msync_inactive(mp, flags); 4955 } 4956 4957 static void 4958 destroy_vpollinfo_free(struct vpollinfo *vi) 4959 { 4960 4961 knlist_destroy(&vi->vpi_selinfo.si_note); 4962 mtx_destroy(&vi->vpi_lock); 4963 uma_zfree(vnodepoll_zone, vi); 4964 } 4965 4966 static void 4967 destroy_vpollinfo(struct vpollinfo *vi) 4968 { 4969 4970 knlist_clear(&vi->vpi_selinfo.si_note, 1); 4971 seldrain(&vi->vpi_selinfo); 4972 destroy_vpollinfo_free(vi); 4973 } 4974 4975 /* 4976 * Initialize per-vnode helper structure to hold poll-related state. 4977 */ 4978 void 4979 v_addpollinfo(struct vnode *vp) 4980 { 4981 struct vpollinfo *vi; 4982 4983 if (vp->v_pollinfo != NULL) 4984 return; 4985 vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO); 4986 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); 4987 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, 4988 vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked); 4989 VI_LOCK(vp); 4990 if (vp->v_pollinfo != NULL) { 4991 VI_UNLOCK(vp); 4992 destroy_vpollinfo_free(vi); 4993 return; 4994 } 4995 vp->v_pollinfo = vi; 4996 VI_UNLOCK(vp); 4997 } 4998 4999 /* 5000 * Record a process's interest in events which might happen to 5001 * a vnode. Because poll uses the historic select-style interface 5002 * internally, this routine serves as both the ``check for any 5003 * pending events'' and the ``record my interest in future events'' 5004 * functions. (These are done together, while the lock is held, 5005 * to avoid race conditions.) 
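 *
 * A VOP_POLL implementation that wants this generic behaviour would
 * typically just forward to this routine; a minimal sketch (assuming
 * the usual vop_poll_args layout, not copied from any particular
 * filesystem) is:
 *
 *	return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));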
5006 */ 5007 int 5008 vn_pollrecord(struct vnode *vp, struct thread *td, int events) 5009 { 5010 5011 v_addpollinfo(vp); 5012 mtx_lock(&vp->v_pollinfo->vpi_lock); 5013 if (vp->v_pollinfo->vpi_revents & events) { 5014 /* 5015 * This leaves events we are not interested 5016 * in available for the other process which 5017 * presumably had requested them 5018 * (otherwise they would never have been 5019 * recorded). 5020 */ 5021 events &= vp->v_pollinfo->vpi_revents; 5022 vp->v_pollinfo->vpi_revents &= ~events; 5023 5024 mtx_unlock(&vp->v_pollinfo->vpi_lock); 5025 return (events); 5026 } 5027 vp->v_pollinfo->vpi_events |= events; 5028 selrecord(td, &vp->v_pollinfo->vpi_selinfo); 5029 mtx_unlock(&vp->v_pollinfo->vpi_lock); 5030 return (0); 5031 } 5032 5033 /* 5034 * Routine to create and manage a filesystem syncer vnode. 5035 */ 5036 #define sync_close ((int (*)(struct vop_close_args *))nullop) 5037 static int sync_fsync(struct vop_fsync_args *); 5038 static int sync_inactive(struct vop_inactive_args *); 5039 static int sync_reclaim(struct vop_reclaim_args *); 5040 5041 static struct vop_vector sync_vnodeops = { 5042 .vop_bypass = VOP_EOPNOTSUPP, 5043 .vop_close = sync_close, /* close */ 5044 .vop_fsync = sync_fsync, /* fsync */ 5045 .vop_inactive = sync_inactive, /* inactive */ 5046 .vop_need_inactive = vop_stdneed_inactive, /* need_inactive */ 5047 .vop_reclaim = sync_reclaim, /* reclaim */ 5048 .vop_lock1 = vop_stdlock, /* lock */ 5049 .vop_unlock = vop_stdunlock, /* unlock */ 5050 .vop_islocked = vop_stdislocked, /* islocked */ 5051 }; 5052 VFS_VOP_VECTOR_REGISTER(sync_vnodeops); 5053 5054 /* 5055 * Create a new filesystem syncer vnode for the specified mount point. 5056 */ 5057 void 5058 vfs_allocate_syncvnode(struct mount *mp) 5059 { 5060 struct vnode *vp; 5061 struct bufobj *bo; 5062 static long start, incr, next; 5063 int error; 5064 5065 /* Allocate a new vnode */ 5066 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); 5067 if (error != 0) 5068 panic("vfs_allocate_syncvnode: getnewvnode() failed"); 5069 vp->v_type = VNON; 5070 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 5071 vp->v_vflag |= VV_FORCEINSMQ; 5072 error = insmntque(vp, mp); 5073 if (error != 0) 5074 panic("vfs_allocate_syncvnode: insmntque() failed"); 5075 vp->v_vflag &= ~VV_FORCEINSMQ; 5076 VOP_UNLOCK(vp); 5077 /* 5078 * Place the vnode onto the syncer worklist. We attempt to 5079 * scatter them about on the list so that they will go off 5080 * at evenly distributed times even if all the filesystems 5081 * are mounted at once. 5082 */ 5083 next += incr; 5084 if (next == 0 || next > syncer_maxdelay) { 5085 start /= 2; 5086 incr /= 2; 5087 if (start == 0) { 5088 start = syncer_maxdelay / 2; 5089 incr = syncer_maxdelay; 5090 } 5091 next = start; 5092 } 5093 bo = &vp->v_bufobj; 5094 BO_LOCK(bo); 5095 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); 5096 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx.
*/ 5097 mtx_lock(&sync_mtx); 5098 sync_vnode_count++; 5099 if (mp->mnt_syncer == NULL) { 5100 mp->mnt_syncer = vp; 5101 vp = NULL; 5102 } 5103 mtx_unlock(&sync_mtx); 5104 BO_UNLOCK(bo); 5105 if (vp != NULL) { 5106 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 5107 vgone(vp); 5108 vput(vp); 5109 } 5110 } 5111 5112 void 5113 vfs_deallocate_syncvnode(struct mount *mp) 5114 { 5115 struct vnode *vp; 5116 5117 mtx_lock(&sync_mtx); 5118 vp = mp->mnt_syncer; 5119 if (vp != NULL) 5120 mp->mnt_syncer = NULL; 5121 mtx_unlock(&sync_mtx); 5122 if (vp != NULL) 5123 vrele(vp); 5124 } 5125 5126 /* 5127 * Do a lazy sync of the filesystem. 5128 */ 5129 static int 5130 sync_fsync(struct vop_fsync_args *ap) 5131 { 5132 struct vnode *syncvp = ap->a_vp; 5133 struct mount *mp = syncvp->v_mount; 5134 int error, save; 5135 struct bufobj *bo; 5136 5137 /* 5138 * We only need to do something if this is a lazy evaluation. 5139 */ 5140 if (ap->a_waitfor != MNT_LAZY) 5141 return (0); 5142 5143 /* 5144 * Move ourselves to the back of the sync list. 5145 */ 5146 bo = &syncvp->v_bufobj; 5147 BO_LOCK(bo); 5148 vn_syncer_add_to_worklist(bo, syncdelay); 5149 BO_UNLOCK(bo); 5150 5151 /* 5152 * Walk the list of vnodes pushing all that are dirty and 5153 * not already on the sync list. 5154 */ 5155 if (vfs_busy(mp, MBF_NOWAIT) != 0) 5156 return (0); 5157 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { 5158 vfs_unbusy(mp); 5159 return (0); 5160 } 5161 save = curthread_pflags_set(TDP_SYNCIO); 5162 /* 5163 * The filesystem at hand may be idle with free vnodes stored in the 5164 * batch. Return them instead of letting them stay there indefinitely. 5165 */ 5166 vfs_periodic(mp, MNT_NOWAIT); 5167 error = VFS_SYNC(mp, MNT_LAZY); 5168 curthread_pflags_restore(save); 5169 vn_finished_write(mp); 5170 vfs_unbusy(mp); 5171 return (error); 5172 } 5173 5174 /* 5175 * The syncer vnode is no longer referenced. 5176 */ 5177 static int 5178 sync_inactive(struct vop_inactive_args *ap) 5179 { 5180 5181 vgone(ap->a_vp); 5182 return (0); 5183 } 5184 5185 /* 5186 * The syncer vnode is no longer needed and is being decommissioned. 5187 * 5188 * Modifications to the worklist must be protected by sync_mtx.
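 * The same lock also guards the mp->mnt_syncer pointer cleared below.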
5189 */ 5190 static int 5191 sync_reclaim(struct vop_reclaim_args *ap) 5192 { 5193 struct vnode *vp = ap->a_vp; 5194 struct bufobj *bo; 5195 5196 bo = &vp->v_bufobj; 5197 BO_LOCK(bo); 5198 mtx_lock(&sync_mtx); 5199 if (vp->v_mount->mnt_syncer == vp) 5200 vp->v_mount->mnt_syncer = NULL; 5201 if (bo->bo_flag & BO_ONWORKLST) { 5202 LIST_REMOVE(bo, bo_synclist); 5203 syncer_worklist_len--; 5204 sync_vnode_count--; 5205 bo->bo_flag &= ~BO_ONWORKLST; 5206 } 5207 mtx_unlock(&sync_mtx); 5208 BO_UNLOCK(bo); 5209 5210 return (0); 5211 } 5212 5213 int 5214 vn_need_pageq_flush(struct vnode *vp) 5215 { 5216 struct vm_object *obj; 5217 int need; 5218 5219 MPASS(mtx_owned(VI_MTX(vp))); 5220 need = 0; 5221 if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 5222 vm_object_mightbedirty(obj)) 5223 need = 1; 5224 return (need); 5225 } 5226 5227 /* 5228 * Check if vnode represents a disk device 5229 */ 5230 int 5231 vn_isdisk(struct vnode *vp, int *errp) 5232 { 5233 int error; 5234 5235 if (vp->v_type != VCHR) { 5236 error = ENOTBLK; 5237 goto out; 5238 } 5239 error = 0; 5240 dev_lock(); 5241 if (vp->v_rdev == NULL) 5242 error = ENXIO; 5243 else if (vp->v_rdev->si_devsw == NULL) 5244 error = ENXIO; 5245 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 5246 error = ENOTBLK; 5247 dev_unlock(); 5248 out: 5249 if (errp != NULL) 5250 *errp = error; 5251 return (error == 0); 5252 } 5253 5254 /* 5255 * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see 5256 * the comment above cache_fplookup for details. 5257 * 5258 * We never deny as priv_check_cred calls are not yet supported, see vaccess. 5259 */ 5260 int 5261 vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) 5262 { 5263 5264 VFS_SMR_ASSERT_ENTERED(); 5265 5266 /* Check the owner. */ 5267 if (cred->cr_uid == file_uid) { 5268 if (file_mode & S_IXUSR) 5269 return (0); 5270 return (EAGAIN); 5271 } 5272 5273 /* Otherwise, check the groups (first match) */ 5274 if (groupmember(file_gid, cred)) { 5275 if (file_mode & S_IXGRP) 5276 return (0); 5277 return (EAGAIN); 5278 } 5279 5280 /* Otherwise, check everyone else. */ 5281 if (file_mode & S_IXOTH) 5282 return (0); 5283 return (EAGAIN); 5284 } 5285 5286 /* 5287 * Common filesystem object access control check routine. Accepts a 5288 * vnode's type, "mode", uid and gid, requested access mode, and credentials. 5289 * Returns 0 on success, or an errno on failure. 5290 */ 5291 int 5292 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 5293 accmode_t accmode, struct ucred *cred) 5294 { 5295 accmode_t dac_granted; 5296 accmode_t priv_granted; 5297 5298 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 5299 ("invalid bit in accmode")); 5300 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 5301 ("VAPPEND without VWRITE")); 5302 5303 /* 5304 * Look for a normal, non-privileged way to access the file/directory 5305 * as requested. If it exists, go with that. 5306 */ 5307 5308 dac_granted = 0; 5309 5310 /* Check the owner. 
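 * A uid match is also what grants VADMIN in the DAC pass below.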
*/ 5311 if (cred->cr_uid == file_uid) { 5312 dac_granted |= VADMIN; 5313 if (file_mode & S_IXUSR) 5314 dac_granted |= VEXEC; 5315 if (file_mode & S_IRUSR) 5316 dac_granted |= VREAD; 5317 if (file_mode & S_IWUSR) 5318 dac_granted |= (VWRITE | VAPPEND); 5319 5320 if ((accmode & dac_granted) == accmode) 5321 return (0); 5322 5323 goto privcheck; 5324 } 5325 5326 /* Otherwise, check the groups (first match) */ 5327 if (groupmember(file_gid, cred)) { 5328 if (file_mode & S_IXGRP) 5329 dac_granted |= VEXEC; 5330 if (file_mode & S_IRGRP) 5331 dac_granted |= VREAD; 5332 if (file_mode & S_IWGRP) 5333 dac_granted |= (VWRITE | VAPPEND); 5334 5335 if ((accmode & dac_granted) == accmode) 5336 return (0); 5337 5338 goto privcheck; 5339 } 5340 5341 /* Otherwise, check everyone else. */ 5342 if (file_mode & S_IXOTH) 5343 dac_granted |= VEXEC; 5344 if (file_mode & S_IROTH) 5345 dac_granted |= VREAD; 5346 if (file_mode & S_IWOTH) 5347 dac_granted |= (VWRITE | VAPPEND); 5348 if ((accmode & dac_granted) == accmode) 5349 return (0); 5350 5351 privcheck: 5352 /* 5353 * Build a privilege mask to determine if the set of privileges 5354 * satisfies the requirements when combined with the granted mask 5355 * from above. For each privilege, if the privilege is required, 5356 * bitwise or the request type onto the priv_granted mask. 5357 */ 5358 priv_granted = 0; 5359 5360 if (type == VDIR) { 5361 /* 5362 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 5363 * requests, instead of PRIV_VFS_EXEC. 5364 */ 5365 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5366 !priv_check_cred(cred, PRIV_VFS_LOOKUP)) 5367 priv_granted |= VEXEC; 5368 } else { 5369 /* 5370 * Ensure that at least one execute bit is on. Otherwise, 5371 * a privileged user will always succeed, and we don't want 5372 * this to happen unless the file really is executable. 5373 */ 5374 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5375 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 5376 !priv_check_cred(cred, PRIV_VFS_EXEC)) 5377 priv_granted |= VEXEC; 5378 } 5379 5380 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 5381 !priv_check_cred(cred, PRIV_VFS_READ)) 5382 priv_granted |= VREAD; 5383 5384 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 5385 !priv_check_cred(cred, PRIV_VFS_WRITE)) 5386 priv_granted |= (VWRITE | VAPPEND); 5387 5388 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 5389 !priv_check_cred(cred, PRIV_VFS_ADMIN)) 5390 priv_granted |= VADMIN; 5391 5392 if ((accmode & (priv_granted | dac_granted)) == accmode) { 5393 return (0); 5394 } 5395 5396 return ((accmode & VADMIN) ? EPERM : EACCES); 5397 } 5398 5399 /* 5400 * Credential check based on process requesting service, and per-attribute 5401 * permissions. 5402 */ 5403 int 5404 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 5405 struct thread *td, accmode_t accmode) 5406 { 5407 5408 /* 5409 * Kernel-invoked always succeeds. 5410 */ 5411 if (cred == NOCRED) 5412 return (0); 5413 5414 /* 5415 * Do not allow privileged processes in jail to directly manipulate 5416 * system attributes. 
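 * System-namespace attributes therefore require PRIV_VFS_EXTATTR_SYSTEM,
 * while user-namespace attributes fall back to an ordinary VOP_ACCESS()
 * check against the vnode.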
5417 */ 5418 switch (attrnamespace) { 5419 case EXTATTR_NAMESPACE_SYSTEM: 5420 /* Potentially should be: return (EPERM); */ 5421 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); 5422 case EXTATTR_NAMESPACE_USER: 5423 return (VOP_ACCESS(vp, accmode, cred, td)); 5424 default: 5425 return (EPERM); 5426 } 5427 } 5428 5429 #ifdef DEBUG_VFS_LOCKS 5430 /* 5431 * This only exists to suppress warnings from unlocked specfs accesses. It is 5432 * no longer ok to have an unlocked VFS. 5433 */ 5434 #define IGNORE_LOCK(vp) (KERNEL_PANICKED() || (vp) == NULL || \ 5435 (vp)->v_type == VCHR || (vp)->v_type == VBAD) 5436 5437 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 5438 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, 5439 "Drop into debugger on lock violation"); 5440 5441 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 5442 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 5443 0, "Check for interlock across VOPs"); 5444 5445 int vfs_badlock_print = 1; /* Print lock violations. */ 5446 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 5447 0, "Print lock violations"); 5448 5449 int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ 5450 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 5451 0, "Print vnode details on lock violations"); 5452 5453 #ifdef KDB 5454 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ 5455 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, 5456 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); 5457 #endif 5458 5459 static void 5460 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 5461 { 5462 5463 #ifdef KDB 5464 if (vfs_badlock_backtrace) 5465 kdb_backtrace(); 5466 #endif 5467 if (vfs_badlock_vnode) 5468 vn_printf(vp, "vnode "); 5469 if (vfs_badlock_print) 5470 printf("%s: %p %s\n", str, (void *)vp, msg); 5471 if (vfs_badlock_ddb) 5472 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5473 } 5474 5475 void 5476 assert_vi_locked(struct vnode *vp, const char *str) 5477 { 5478 5479 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 5480 vfs_badlock("interlock is not locked but should be", str, vp); 5481 } 5482 5483 void 5484 assert_vi_unlocked(struct vnode *vp, const char *str) 5485 { 5486 5487 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 5488 vfs_badlock("interlock is locked but should not be", str, vp); 5489 } 5490 5491 void 5492 assert_vop_locked(struct vnode *vp, const char *str) 5493 { 5494 int locked; 5495 5496 if (!IGNORE_LOCK(vp)) { 5497 locked = VOP_ISLOCKED(vp); 5498 if (locked == 0 || locked == LK_EXCLOTHER) 5499 vfs_badlock("is not locked but should be", str, vp); 5500 } 5501 } 5502 5503 void 5504 assert_vop_unlocked(struct vnode *vp, const char *str) 5505 { 5506 5507 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 5508 vfs_badlock("is locked but should not be", str, vp); 5509 } 5510 5511 void 5512 assert_vop_elocked(struct vnode *vp, const char *str) 5513 { 5514 5515 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 5516 vfs_badlock("is not exclusive locked but should be", str, vp); 5517 } 5518 #endif /* DEBUG_VFS_LOCKS */ 5519 5520 void 5521 vop_rename_fail(struct vop_rename_args *ap) 5522 { 5523 5524 if (ap->a_tvp != NULL) 5525 vput(ap->a_tvp); 5526 if (ap->a_tdvp == ap->a_tvp) 5527 vrele(ap->a_tdvp); 5528 else 5529 vput(ap->a_tdvp); 5530 vrele(ap->a_fdvp); 5531 vrele(ap->a_fvp); 5532 } 5533 5534 void 
5535 vop_rename_pre(void *ap) 5536 { 5537 struct vop_rename_args *a = ap; 5538 5539 #ifdef DEBUG_VFS_LOCKS 5540 if (a->a_tvp) 5541 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 5542 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 5543 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 5544 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 5545 5546 /* Check the source (from). */ 5547 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && 5548 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) 5549 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 5550 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) 5551 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); 5552 5553 /* Check the target. */ 5554 if (a->a_tvp) 5555 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 5556 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 5557 #endif 5558 /* 5559 * It may be tempting to add vn_seqc_write_begin/end calls here and 5560 * in vop_rename_post but that's not going to work out since some 5561 * filesystems relookup vnodes mid-rename. This is probably a bug. 5562 * 5563 * For now filesystems are expected to do the relevant calls after they 5564 * decide what vnodes to operate on. 5565 */ 5566 if (a->a_tdvp != a->a_fdvp) 5567 vhold(a->a_fdvp); 5568 if (a->a_tvp != a->a_fvp) 5569 vhold(a->a_fvp); 5570 vhold(a->a_tdvp); 5571 if (a->a_tvp) 5572 vhold(a->a_tvp); 5573 } 5574 5575 #ifdef DEBUG_VFS_LOCKS 5576 void 5577 vop_fplookup_vexec_debugpre(void *ap __unused) 5578 { 5579 5580 VFS_SMR_ASSERT_ENTERED(); 5581 } 5582 5583 void 5584 vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused) 5585 { 5586 5587 VFS_SMR_ASSERT_ENTERED(); 5588 } 5589 5590 void 5591 vop_strategy_debugpre(void *ap) 5592 { 5593 struct vop_strategy_args *a; 5594 struct buf *bp; 5595 5596 a = ap; 5597 bp = a->a_bp; 5598 5599 /* 5600 * Cluster ops lock their component buffers but not the IO container. 
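 * so a B_CLUSTER buffer is exempt from the BUF_ISLOCKED() check below.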
5601 */ 5602 if ((bp->b_flags & B_CLUSTER) != 0) 5603 return; 5604 5605 if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { 5606 if (vfs_badlock_print) 5607 printf( 5608 "VOP_STRATEGY: bp is not locked but should be\n"); 5609 if (vfs_badlock_ddb) 5610 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5611 } 5612 } 5613 5614 void 5615 vop_lock_debugpre(void *ap) 5616 { 5617 struct vop_lock1_args *a = ap; 5618 5619 if ((a->a_flags & LK_INTERLOCK) == 0) 5620 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 5621 else 5622 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 5623 } 5624 5625 void 5626 vop_lock_debugpost(void *ap, int rc) 5627 { 5628 struct vop_lock1_args *a = ap; 5629 5630 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 5631 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) 5632 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 5633 } 5634 5635 void 5636 vop_unlock_debugpre(void *ap) 5637 { 5638 struct vop_unlock_args *a = ap; 5639 5640 ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); 5641 } 5642 5643 void 5644 vop_need_inactive_debugpre(void *ap) 5645 { 5646 struct vop_need_inactive_args *a = ap; 5647 5648 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 5649 } 5650 5651 void 5652 vop_need_inactive_debugpost(void *ap, int rc) 5653 { 5654 struct vop_need_inactive_args *a = ap; 5655 5656 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 5657 } 5658 #endif 5659 5660 void 5661 vop_create_pre(void *ap) 5662 { 5663 struct vop_create_args *a; 5664 struct vnode *dvp; 5665 5666 a = ap; 5667 dvp = a->a_dvp; 5668 vn_seqc_write_begin(dvp); 5669 } 5670 5671 void 5672 vop_create_post(void *ap, int rc) 5673 { 5674 struct vop_create_args *a; 5675 struct vnode *dvp; 5676 5677 a = ap; 5678 dvp = a->a_dvp; 5679 vn_seqc_write_end(dvp); 5680 if (!rc) 5681 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5682 } 5683 5684 void 5685 vop_whiteout_pre(void *ap) 5686 { 5687 struct vop_whiteout_args *a; 5688 struct vnode *dvp; 5689 5690 a = ap; 5691 dvp = a->a_dvp; 5692 vn_seqc_write_begin(dvp); 5693 } 5694 5695 void 5696 vop_whiteout_post(void *ap, int rc) 5697 { 5698 struct vop_whiteout_args *a; 5699 struct vnode *dvp; 5700 5701 a = ap; 5702 dvp = a->a_dvp; 5703 vn_seqc_write_end(dvp); 5704 } 5705 5706 void 5707 vop_deleteextattr_pre(void *ap) 5708 { 5709 struct vop_deleteextattr_args *a; 5710 struct vnode *vp; 5711 5712 a = ap; 5713 vp = a->a_vp; 5714 vn_seqc_write_begin(vp); 5715 } 5716 5717 void 5718 vop_deleteextattr_post(void *ap, int rc) 5719 { 5720 struct vop_deleteextattr_args *a; 5721 struct vnode *vp; 5722 5723 a = ap; 5724 vp = a->a_vp; 5725 vn_seqc_write_end(vp); 5726 if (!rc) 5727 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 5728 } 5729 5730 void 5731 vop_link_pre(void *ap) 5732 { 5733 struct vop_link_args *a; 5734 struct vnode *vp, *tdvp; 5735 5736 a = ap; 5737 vp = a->a_vp; 5738 tdvp = a->a_tdvp; 5739 vn_seqc_write_begin(vp); 5740 vn_seqc_write_begin(tdvp); 5741 } 5742 5743 void 5744 vop_link_post(void *ap, int rc) 5745 { 5746 struct vop_link_args *a; 5747 struct vnode *vp, *tdvp; 5748 5749 a = ap; 5750 vp = a->a_vp; 5751 tdvp = a->a_tdvp; 5752 vn_seqc_write_end(vp); 5753 vn_seqc_write_end(tdvp); 5754 if (!rc) { 5755 VFS_KNOTE_LOCKED(vp, NOTE_LINK); 5756 VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); 5757 } 5758 } 5759 5760 void 5761 vop_mkdir_pre(void *ap) 5762 { 5763 struct vop_mkdir_args *a; 5764 struct vnode *dvp; 5765 5766 a = ap; 5767 dvp = a->a_dvp; 5768 vn_seqc_write_begin(dvp); 5769 } 5770 5771 void 5772 vop_mkdir_post(void *ap, int rc) 5773 { 5774 struct vop_mkdir_args *a; 5775 struct vnode *dvp; 5776 5777 a = ap; 5778 dvp = a->a_dvp; 5779 vn_seqc_write_end(dvp); 
5780 if (!rc) 5781 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 5782 } 5783 5784 void 5785 vop_mknod_pre(void *ap) 5786 { 5787 struct vop_mknod_args *a; 5788 struct vnode *dvp; 5789 5790 a = ap; 5791 dvp = a->a_dvp; 5792 vn_seqc_write_begin(dvp); 5793 } 5794 5795 void 5796 vop_mknod_post(void *ap, int rc) 5797 { 5798 struct vop_mknod_args *a; 5799 struct vnode *dvp; 5800 5801 a = ap; 5802 dvp = a->a_dvp; 5803 vn_seqc_write_end(dvp); 5804 if (!rc) 5805 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5806 } 5807 5808 void 5809 vop_reclaim_post(void *ap, int rc) 5810 { 5811 struct vop_reclaim_args *a; 5812 struct vnode *vp; 5813 5814 a = ap; 5815 vp = a->a_vp; 5816 ASSERT_VOP_IN_SEQC(vp); 5817 if (!rc) 5818 VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); 5819 } 5820 5821 void 5822 vop_remove_pre(void *ap) 5823 { 5824 struct vop_remove_args *a; 5825 struct vnode *dvp, *vp; 5826 5827 a = ap; 5828 dvp = a->a_dvp; 5829 vp = a->a_vp; 5830 vn_seqc_write_begin(dvp); 5831 vn_seqc_write_begin(vp); 5832 } 5833 5834 void 5835 vop_remove_post(void *ap, int rc) 5836 { 5837 struct vop_remove_args *a; 5838 struct vnode *dvp, *vp; 5839 5840 a = ap; 5841 dvp = a->a_dvp; 5842 vp = a->a_vp; 5843 vn_seqc_write_end(dvp); 5844 vn_seqc_write_end(vp); 5845 if (!rc) { 5846 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5847 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 5848 } 5849 } 5850 5851 void 5852 vop_rename_post(void *ap, int rc) 5853 { 5854 struct vop_rename_args *a = ap; 5855 long hint; 5856 5857 if (!rc) { 5858 hint = NOTE_WRITE; 5859 if (a->a_fdvp == a->a_tdvp) { 5860 if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) 5861 hint |= NOTE_LINK; 5862 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 5863 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 5864 } else { 5865 hint |= NOTE_EXTEND; 5866 if (a->a_fvp->v_type == VDIR) 5867 hint |= NOTE_LINK; 5868 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 5869 5870 if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && 5871 a->a_tvp->v_type == VDIR) 5872 hint &= ~NOTE_LINK; 5873 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 5874 } 5875 5876 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 5877 if (a->a_tvp) 5878 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 5879 } 5880 if (a->a_tdvp != a->a_fdvp) 5881 vdrop(a->a_fdvp); 5882 if (a->a_tvp != a->a_fvp) 5883 vdrop(a->a_fvp); 5884 vdrop(a->a_tdvp); 5885 if (a->a_tvp) 5886 vdrop(a->a_tvp); 5887 } 5888 5889 void 5890 vop_rmdir_pre(void *ap) 5891 { 5892 struct vop_rmdir_args *a; 5893 struct vnode *dvp, *vp; 5894 5895 a = ap; 5896 dvp = a->a_dvp; 5897 vp = a->a_vp; 5898 vn_seqc_write_begin(dvp); 5899 vn_seqc_write_begin(vp); 5900 } 5901 5902 void 5903 vop_rmdir_post(void *ap, int rc) 5904 { 5905 struct vop_rmdir_args *a; 5906 struct vnode *dvp, *vp; 5907 5908 a = ap; 5909 dvp = a->a_dvp; 5910 vp = a->a_vp; 5911 vn_seqc_write_end(dvp); 5912 vn_seqc_write_end(vp); 5913 if (!rc) { 5914 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 5915 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 5916 } 5917 } 5918 5919 void 5920 vop_setattr_pre(void *ap) 5921 { 5922 struct vop_setattr_args *a; 5923 struct vnode *vp; 5924 5925 a = ap; 5926 vp = a->a_vp; 5927 vn_seqc_write_begin(vp); 5928 } 5929 5930 void 5931 vop_setattr_post(void *ap, int rc) 5932 { 5933 struct vop_setattr_args *a; 5934 struct vnode *vp; 5935 5936 a = ap; 5937 vp = a->a_vp; 5938 vn_seqc_write_end(vp); 5939 if (!rc) 5940 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 5941 } 5942 5943 void 5944 vop_setacl_pre(void *ap) 5945 { 5946 struct vop_setacl_args *a; 5947 struct vnode *vp; 5948 5949 a = ap; 5950 vp = a->a_vp; 5951 vn_seqc_write_begin(vp); 5952 } 5953 5954 void 5955 vop_setacl_post(void *ap, int 
rc __unused) 5956 { 5957 struct vop_setacl_args *a; 5958 struct vnode *vp; 5959 5960 a = ap; 5961 vp = a->a_vp; 5962 vn_seqc_write_end(vp); 5963 } 5964 5965 void 5966 vop_setextattr_pre(void *ap) 5967 { 5968 struct vop_setextattr_args *a; 5969 struct vnode *vp; 5970 5971 a = ap; 5972 vp = a->a_vp; 5973 vn_seqc_write_begin(vp); 5974 } 5975 5976 void 5977 vop_setextattr_post(void *ap, int rc) 5978 { 5979 struct vop_setextattr_args *a; 5980 struct vnode *vp; 5981 5982 a = ap; 5983 vp = a->a_vp; 5984 vn_seqc_write_end(vp); 5985 if (!rc) 5986 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 5987 } 5988 5989 void 5990 vop_symlink_pre(void *ap) 5991 { 5992 struct vop_symlink_args *a; 5993 struct vnode *dvp; 5994 5995 a = ap; 5996 dvp = a->a_dvp; 5997 vn_seqc_write_begin(dvp); 5998 } 5999 6000 void 6001 vop_symlink_post(void *ap, int rc) 6002 { 6003 struct vop_symlink_args *a; 6004 struct vnode *dvp; 6005 6006 a = ap; 6007 dvp = a->a_dvp; 6008 vn_seqc_write_end(dvp); 6009 if (!rc) 6010 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6011 } 6012 6013 void 6014 vop_open_post(void *ap, int rc) 6015 { 6016 struct vop_open_args *a = ap; 6017 6018 if (!rc) 6019 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); 6020 } 6021 6022 void 6023 vop_close_post(void *ap, int rc) 6024 { 6025 struct vop_close_args *a = ap; 6026 6027 if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ 6028 !VN_IS_DOOMED(a->a_vp))) { 6029 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 6030 NOTE_CLOSE_WRITE : NOTE_CLOSE); 6031 } 6032 } 6033 6034 void 6035 vop_read_post(void *ap, int rc) 6036 { 6037 struct vop_read_args *a = ap; 6038 6039 if (!rc) 6040 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6041 } 6042 6043 void 6044 vop_readdir_post(void *ap, int rc) 6045 { 6046 struct vop_readdir_args *a = ap; 6047 6048 if (!rc) 6049 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6050 } 6051 6052 static struct knlist fs_knlist; 6053 6054 static void 6055 vfs_event_init(void *arg) 6056 { 6057 knlist_init_mtx(&fs_knlist, NULL); 6058 } 6059 /* XXX - correct order? */ 6060 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 6061 6062 void 6063 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 6064 { 6065 6066 KNOTE_UNLOCKED(&fs_knlist, event); 6067 } 6068 6069 static int filt_fsattach(struct knote *kn); 6070 static void filt_fsdetach(struct knote *kn); 6071 static int filt_fsevent(struct knote *kn, long hint); 6072 6073 struct filterops fs_filtops = { 6074 .f_isfd = 0, 6075 .f_attach = filt_fsattach, 6076 .f_detach = filt_fsdetach, 6077 .f_event = filt_fsevent 6078 }; 6079 6080 static int 6081 filt_fsattach(struct knote *kn) 6082 { 6083 6084 kn->kn_flags |= EV_CLEAR; 6085 knlist_add(&fs_knlist, kn, 0); 6086 return (0); 6087 } 6088 6089 static void 6090 filt_fsdetach(struct knote *kn) 6091 { 6092 6093 knlist_remove(&fs_knlist, kn, 0); 6094 } 6095 6096 static int 6097 filt_fsevent(struct knote *kn, long hint) 6098 { 6099 6100 kn->kn_fflags |= hint; 6101 return (kn->kn_fflags != 0); 6102 } 6103 6104 static int 6105 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 6106 { 6107 struct vfsidctl vc; 6108 int error; 6109 struct mount *mp; 6110 6111 error = SYSCTL_IN(req, &vc, sizeof(vc)); 6112 if (error) 6113 return (error); 6114 if (vc.vc_vers != VFS_CTL_VERS1) 6115 return (EINVAL); 6116 mp = vfs_getvfs(&vc.vc_fsid); 6117 if (mp == NULL) 6118 return (ENOENT); 6119 /* ensure that a specific sysctl goes to the right filesystem. 
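 * The wildcard type name "*" matches any mount; otherwise vc_fstypename
 * must match the mount's vfc_name or the request fails with EINVAL.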

static int
sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
{
	struct vfsidctl vc;
	int error;
	struct mount *mp;

	error = SYSCTL_IN(req, &vc, sizeof(vc));
	if (error)
		return (error);
	if (vc.vc_vers != VFS_CTL_VERS1)
		return (EINVAL);
	mp = vfs_getvfs(&vc.vc_fsid);
	if (mp == NULL)
		return (ENOENT);
	/* ensure that a specific sysctl goes to the right filesystem. */
	if (strcmp(vc.vc_fstypename, "*") != 0 &&
	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
		vfs_rel(mp);
		return (EINVAL);
	}
	VCTLTOREQ(&vc, req);
	error = VFS_SYSCTL(mp, vc.vc_op, req);
	vfs_rel(mp);
	return (error);
}

SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR,
    NULL, 0, sysctl_vfs_ctl, "",
    "Sysctl by fsid");

/*
 * Function to initialize a va_filerev field sensibly.
 * XXX: Wouldn't a random number make a lot more sense ??
 */
u_quad_t
init_va_filerev(void)
{
	struct bintime bt;

	getbinuptime(&bt);
	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
}

static int	filt_vfsread(struct knote *kn, long hint);
static int	filt_vfswrite(struct knote *kn, long hint);
static int	filt_vfsvnode(struct knote *kn, long hint);
static void	filt_vfsdetach(struct knote *kn);
static struct filterops vfsread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_vfsdetach,
	.f_event = filt_vfsread
};
static struct filterops vfswrite_filtops = {
	.f_isfd = 1,
	.f_detach = filt_vfsdetach,
	.f_event = filt_vfswrite
};
static struct filterops vfsvnode_filtops = {
	.f_isfd = 1,
	.f_detach = filt_vfsdetach,
	.f_event = filt_vfsvnode
};

static void
vfs_knllock(void *arg)
{
	struct vnode *vp = arg;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
}

static void
vfs_knlunlock(void *arg)
{
	struct vnode *vp = arg;

	VOP_UNLOCK(vp);
}

static void
vfs_knl_assert_locked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
	struct vnode *vp = arg;

	ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
#endif
}

static void
vfs_knl_assert_unlocked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
	struct vnode *vp = arg;

	ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
#endif
}

int
vfs_kqfilter(struct vop_kqfilter_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct knote *kn = ap->a_kn;
	struct knlist *knl;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &vfsread_filtops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &vfswrite_filtops;
		break;
	case EVFILT_VNODE:
		kn->kn_fop = &vfsvnode_filtops;
		break;
	default:
		return (EINVAL);
	}

	kn->kn_hook = (caddr_t)vp;

	v_addpollinfo(vp);
	if (vp->v_pollinfo == NULL)
		return (ENOMEM);
	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
	vhold(vp);
	knlist_add(knl, kn, 0);

	return (0);
}
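
/*
 * Illustrative sketch of a userland consumer of the EVFILT_VNODE filter
 * attached above, watching an open file for deletion, rename or writes
 * (error handling omitted):
 *
 *	struct kevent ev;
 *	int fd, kq;
 *
 *	fd = open(path, O_RDONLY);
 *	kq = kqueue();
 *	EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
 *	    NOTE_DELETE | NOTE_RENAME | NOTE_WRITE, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 *	kevent(kq, NULL, 0, &ev, 1, NULL);	(ev.fflags holds the NOTE_* seen)
 */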

/*
 * Detach knote from vnode
 */
static void
filt_vfsdetach(struct knote *kn)
{
	struct vnode *vp = (struct vnode *)kn->kn_hook;

	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
	vdrop(vp);
}

/*ARGSUSED*/
static int
filt_vfsread(struct knote *kn, long hint)
{
	struct vnode *vp = (struct vnode *)kn->kn_hook;
	struct vattr va;
	int res;

	/*
	 * filesystem is gone, so set the EOF flag and schedule
	 * the knote for deletion.
	 */
	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
		VI_LOCK(vp);
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		VI_UNLOCK(vp);
		return (1);
	}

	if (VOP_GETATTR(vp, &va, curthread->td_ucred))
		return (0);

	VI_LOCK(vp);
	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
	res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
	VI_UNLOCK(vp);
	return (res);
}

/*ARGSUSED*/
static int
filt_vfswrite(struct knote *kn, long hint)
{
	struct vnode *vp = (struct vnode *)kn->kn_hook;

	VI_LOCK(vp);

	/*
	 * filesystem is gone, so set the EOF flag and schedule
	 * the knote for deletion.
	 */
	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);

	kn->kn_data = 0;
	VI_UNLOCK(vp);
	return (1);
}

static int
filt_vfsvnode(struct knote *kn, long hint)
{
	struct vnode *vp = (struct vnode *)kn->kn_hook;
	int res;

	VI_LOCK(vp);
	if (kn->kn_sfflags & hint)
		kn->kn_fflags |= hint;
	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
		kn->kn_flags |= EV_EOF;
		VI_UNLOCK(vp);
		return (1);
	}
	res = (kn->kn_fflags != 0);
	VI_UNLOCK(vp);
	return (res);
}

/*
 * Returns whether the directory is empty or not.
 * If it is empty, the return value is 0; otherwise
 * the return value is an error value (which may
 * be ENOTEMPTY).
 */
int
vfs_emptydir(struct vnode *vp)
{
	struct uio uio;
	struct iovec iov;
	struct dirent *dirent, *dp, *endp;
	int error, eof;

	error = 0;
	eof = 0;

	ASSERT_VOP_LOCKED(vp, "vfs_emptydir");

	dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK);
	iov.iov_base = dirent;
	iov.iov_len = sizeof(struct dirent);

	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = 0;
	uio.uio_resid = sizeof(struct dirent);
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = curthread;

	while (eof == 0 && error == 0) {
		error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof,
		    NULL, NULL);
		if (error != 0)
			break;
		endp = (void *)((uint8_t *)dirent +
		    sizeof(struct dirent) - uio.uio_resid);
		for (dp = dirent; dp < endp;
		    dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) {
			if (dp->d_type == DT_WHT)
				continue;
			if (dp->d_namlen == 0)
				continue;
			if (dp->d_type != DT_DIR &&
			    dp->d_type != DT_UNKNOWN) {
				error = ENOTEMPTY;
				break;
			}
			if (dp->d_namlen > 2) {
				error = ENOTEMPTY;
				break;
			}
			if (dp->d_namlen == 1 &&
			    dp->d_name[0] != '.') {
				error = ENOTEMPTY;
				break;
			}
			if (dp->d_namlen == 2 &&
			    dp->d_name[1] != '.') {
				error = ENOTEMPTY;
				break;
			}
			uio.uio_resid = sizeof(struct dirent);
		}
	}
	free(dirent, M_TEMP);
	return (error);
}
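
/*
 * Illustrative sketch: a filesystem's rmdir or rename implementation could
 * use the helper above to refuse to remove a directory that still contains
 * entries other than "." and ".." (vp is assumed to be a locked directory
 * vnode):
 *
 *	error = vfs_emptydir(vp);
 *	if (error != 0)
 *		return (error);
 */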

int
vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
{
	int error;

	if (dp->d_reclen > ap->a_uio->uio_resid)
		return (ENAMETOOLONG);
	error = uiomove(dp, dp->d_reclen, ap->a_uio);
	if (error) {
		if (ap->a_ncookies != NULL) {
			if (ap->a_cookies != NULL)
				free(ap->a_cookies, M_TEMP);
			ap->a_cookies = NULL;
			*ap->a_ncookies = 0;
		}
		return (error);
	}
	if (ap->a_ncookies == NULL)
		return (0);

	KASSERT(ap->a_cookies,
	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));

	*ap->a_cookies = realloc(*ap->a_cookies,
	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
	(*ap->a_cookies)[*ap->a_ncookies] = off;
	*ap->a_ncookies += 1;
	return (0);
}

/*
 * The purpose of this routine is to remove granularity from accmode_t,
 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
 * VADMIN and VAPPEND.
 *
 * If it returns 0, the caller is supposed to continue with the usual
 * access checks using 'accmode' as modified by this routine.  If it
 * returns a nonzero value, the caller is supposed to return that value
 * as errno.
 *
 * Note that after this routine runs, accmode may be zero.
 */
int
vfs_unixify_accmode(accmode_t *accmode)
{
	/*
	 * There is no way to specify an explicit "deny" rule using
	 * file mode or POSIX.1e ACLs.
	 */
	if (*accmode & VEXPLICIT_DENY) {
		*accmode = 0;
		return (0);
	}

	/*
	 * None of these can be translated into usual access bits.
	 * Also, the common case for NFSv4 ACLs is to not contain
	 * either of these bits.  Caller should check for VWRITE
	 * on the containing directory instead.
	 */
	if (*accmode & (VDELETE_CHILD | VDELETE))
		return (EPERM);

	if (*accmode & VADMIN_PERMS) {
		*accmode &= ~VADMIN_PERMS;
		*accmode |= VADMIN;
	}

	/*
	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
	 */
	*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);

	return (0);
}
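
/*
 * Worked example, assuming the usual accmode_t definitions from
 * sys/vnode.h: a request for VREAD | VWRITE_ACL | VREAD_ATTRIBUTES is
 * reduced to VREAD | VADMIN, since VWRITE_ACL falls under VADMIN_PERMS
 * while VREAD_ATTRIBUTES is discarded along with the rest of VSTAT_PERMS.
 */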

/*
 * Clear out a doomed vnode (if any) and replace it with a new one as long
 * as the fs is not being unmounted. Return the root vnode to the caller.
 */
static int __noinline
vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp)
{
	struct vnode *vp;
	int error;

restart:
	if (mp->mnt_rootvnode != NULL) {
		MNT_ILOCK(mp);
		vp = mp->mnt_rootvnode;
		if (vp != NULL) {
			if (!VN_IS_DOOMED(vp)) {
				vrefact(vp);
				MNT_IUNLOCK(mp);
				error = vn_lock(vp, flags);
				if (error == 0) {
					*vpp = vp;
					return (0);
				}
				vrele(vp);
				goto restart;
			}
			/*
			 * Clear the old one.
			 */
			mp->mnt_rootvnode = NULL;
		}
		MNT_IUNLOCK(mp);
		if (vp != NULL) {
			vfs_op_barrier_wait(mp);
			vrele(vp);
		}
	}
	error = VFS_CACHEDROOT(mp, flags, vpp);
	if (error != 0)
		return (error);
	if (mp->mnt_vfs_ops == 0) {
		MNT_ILOCK(mp);
		if (mp->mnt_vfs_ops != 0) {
			MNT_IUNLOCK(mp);
			return (0);
		}
		if (mp->mnt_rootvnode == NULL) {
			vrefact(*vpp);
			mp->mnt_rootvnode = *vpp;
		} else {
			if (mp->mnt_rootvnode != *vpp) {
				if (!VN_IS_DOOMED(mp->mnt_rootvnode)) {
					panic("%s: mismatch between vnode returned"
					    " by VFS_CACHEDROOT and the one cached"
					    " (%p != %p)",
					    __func__, *vpp, mp->mnt_rootvnode);
				}
			}
		}
		MNT_IUNLOCK(mp);
	}
	return (0);
}

int
vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	if (!vfs_op_thread_enter(mp))
		return (vfs_cache_root_fallback(mp, flags, vpp));
	vp = atomic_load_ptr(&mp->mnt_rootvnode);
	if (vp == NULL || VN_IS_DOOMED(vp)) {
		vfs_op_thread_exit(mp);
		return (vfs_cache_root_fallback(mp, flags, vpp));
	}
	vrefact(vp);
	vfs_op_thread_exit(mp);
	error = vn_lock(vp, flags);
	if (error != 0) {
		vrele(vp);
		return (vfs_cache_root_fallback(mp, flags, vpp));
	}
	*vpp = vp;
	return (0);
}

struct vnode *
vfs_cache_root_clear(struct mount *mp)
{
	struct vnode *vp;

	/*
	 * ops > 0 guarantees there is nobody who can see this vnode.
	 */
	MPASS(mp->mnt_vfs_ops > 0);
	vp = mp->mnt_rootvnode;
	if (vp != NULL)
		vn_seqc_write_begin(vp);
	mp->mnt_rootvnode = NULL;
	return (vp);
}

void
vfs_cache_root_set(struct mount *mp, struct vnode *vp)
{

	MPASS(mp->mnt_vfs_ops > 0);
	vrefact(vp);
	mp->mnt_rootvnode = vp;
}
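
/*
 * Illustrative sketch: a caller that needs the root vnode of a mounted
 * filesystem, e.g. when crossing a mount point, would typically go through
 * the cache above rather than calling VFS_ROOT() directly:
 *
 *	error = vfs_cache_root(mp, LK_SHARED, &vp);
 *	if (error != 0)
 *		return (error);
 *	(use the locked, referenced vp, then vput() it)
 */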

/*
 * These are helper functions for filesystems to traverse all
 * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
 *
 * This interface replaces MNT_VNODE_FOREACH.
 */

struct vnode *
__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
{
	struct vnode *vp;

	if (should_yield())
		kern_yield(PRI_USER);
	MNT_ILOCK(mp);
	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
	for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL;
	    vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
		/* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
		if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
			continue;
		VI_LOCK(vp);
		if (VN_IS_DOOMED(vp)) {
			VI_UNLOCK(vp);
			continue;
		}
		break;
	}
	if (vp == NULL) {
		__mnt_vnode_markerfree_all(mvp, mp);
		/* MNT_IUNLOCK(mp); -- done in above function */
		mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
		return (NULL);
	}
	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
	MNT_IUNLOCK(mp);
	return (vp);
}

struct vnode *
__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
{
	struct vnode *vp;

	*mvp = vn_alloc_marker(mp);
	MNT_ILOCK(mp);
	MNT_REF(mp);

	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
		/* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
		if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
			continue;
		VI_LOCK(vp);
		if (VN_IS_DOOMED(vp)) {
			VI_UNLOCK(vp);
			continue;
		}
		break;
	}
	if (vp == NULL) {
		MNT_REL(mp);
		MNT_IUNLOCK(mp);
		vn_free_marker(*mvp);
		*mvp = NULL;
		return (NULL);
	}
	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
	MNT_IUNLOCK(mp);
	return (vp);
}

void
__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
{

	if (*mvp == NULL) {
		MNT_IUNLOCK(mp);
		return;
	}

	mtx_assert(MNT_MTX(mp), MA_OWNED);

	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
	vn_free_marker(*mvp);
	*mvp = NULL;
}
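
/*
 * Illustrative sketch of a consumer of the helpers above, using the
 * MNT_VNODE_FOREACH_ALL() wrapper from sys/mount.h.  Each vnode is returned
 * with its interlock held; an early exit from the loop is typically paired
 * with MNT_VNODE_FOREACH_ALL_ABORT():
 *
 *	struct vnode *vp, *mvp;
 *
 *	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 *		if (vp->v_type != VREG) {
 *			VI_UNLOCK(vp);
 *			continue;
 *		}
 *		(work on vp, dropping the interlock when done)
 *	}
 */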

/*
 * These are helper functions for filesystems to traverse their
 * lazy vnodes.  See MNT_VNODE_FOREACH_LAZY() in sys/mount.h.
 */
static void
mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
{

	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));

	MNT_ILOCK(mp);
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
	vn_free_marker(*mvp);
	*mvp = NULL;
}

/*
 * Relock the mp mount vnode list lock with the vp vnode interlock in the
 * conventional lock order during mnt_vnode_next_lazy iteration.
 *
 * On entry, the mount vnode list lock is held and the vnode interlock is not.
 * The list lock is dropped and reacquired.  On success, both locks are held.
 * On failure, the mount vnode list lock is held but the vnode interlock is
 * not, and the procedure may have yielded.
 */
static bool
mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp,
    struct vnode *vp)
{

	VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
	    TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp,
	    ("%s: bad marker", __func__));
	VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
	    ("%s: inappropriate vnode", __func__));
	ASSERT_VI_UNLOCKED(vp, __func__);
	mtx_assert(&mp->mnt_listmtx, MA_OWNED);

	TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist);
	TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist);

	/*
	 * Note that we may be racing against vdrop(), which transitioned the
	 * hold count to 0 and now waits for the ->mnt_listmtx lock.  This is
	 * fine; if we are the only user after we get the interlock, we will
	 * just vdrop.
	 */
	vhold(vp);
	mtx_unlock(&mp->mnt_listmtx);
	VI_LOCK(vp);
	if (VN_IS_DOOMED(vp)) {
		VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);
		goto out_lost;
	}
	VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
	/*
	 * There is nothing to do if we are the last user.
	 */
	if (!refcount_release_if_not_last(&vp->v_holdcnt))
		goto out_lost;
	mtx_lock(&mp->mnt_listmtx);
	return (true);
out_lost:
	vdropl(vp);
	maybe_yield();
	mtx_lock(&mp->mnt_listmtx);
	return (false);
}

static struct vnode *
mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
    void *cbarg)
{
	struct vnode *vp;

	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
restart:
	vp = TAILQ_NEXT(*mvp, v_lazylist);
	while (vp != NULL) {
		if (vp->v_type == VMARKER) {
			vp = TAILQ_NEXT(vp, v_lazylist);
			continue;
		}
		/*
		 * See if we want to process the vnode.  Note we may encounter
		 * a long string of vnodes we don't care about and hog the
		 * list as a result.  Check for it and requeue the marker.
		 */
		VNPASS(!VN_IS_DOOMED(vp), vp);
		if (!cb(vp, cbarg)) {
			if (!should_yield()) {
				vp = TAILQ_NEXT(vp, v_lazylist);
				continue;
			}
			TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp,
			    v_lazylist);
			TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp,
			    v_lazylist);
			mtx_unlock(&mp->mnt_listmtx);
			kern_yield(PRI_USER);
			mtx_lock(&mp->mnt_listmtx);
			goto restart;
		}
		/*
		 * Try-lock because this is the wrong lock order.
		 */
		if (!VI_TRYLOCK(vp) &&
		    !mnt_vnode_next_lazy_relock(*mvp, mp, vp))
			goto restart;
		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
		    ("alien vnode on the lazy list %p %p", vp, mp));
		VNPASS(vp->v_mount == mp, vp);
		VNPASS(!VN_IS_DOOMED(vp), vp);
		break;
	}
	TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);

	/* Check if we are done */
	if (vp == NULL) {
		mtx_unlock(&mp->mnt_listmtx);
		mnt_vnode_markerfree_lazy(mvp, mp);
		return (NULL);
	}
	TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist);
	mtx_unlock(&mp->mnt_listmtx);
	ASSERT_VI_LOCKED(vp, "lazy iter");
	return (vp);
}

struct vnode *
__mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
    void *cbarg)
{

	if (should_yield())
		kern_yield(PRI_USER);
	mtx_lock(&mp->mnt_listmtx);
	return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
}

struct vnode *
__mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
    void *cbarg)
{
	struct vnode *vp;

	if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist))
		return (NULL);

	*mvp = vn_alloc_marker(mp);
	MNT_ILOCK(mp);
	MNT_REF(mp);
	MNT_IUNLOCK(mp);

	mtx_lock(&mp->mnt_listmtx);
	vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist);
	if (vp == NULL) {
		mtx_unlock(&mp->mnt_listmtx);
		mnt_vnode_markerfree_lazy(mvp, mp);
		return (NULL);
	}
	TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist);
	return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
}

void
__mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
{

	if (*mvp == NULL)
		return;

	mtx_lock(&mp->mnt_listmtx);
	TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);
	mtx_unlock(&mp->mnt_listmtx);
	mnt_vnode_markerfree_lazy(mvp, mp);
}
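
/*
 * Illustrative sketch of a consumer of the lazy iterator above, using the
 * MNT_VNODE_FOREACH_LAZY() wrapper from sys/mount.h with a hypothetical
 * my_filter callback (its exact prototype is given by mnt_lazy_cb_t).  The
 * callback is invoked without the vnode interlock and acts as a cheap
 * filter; vnodes that pass are returned with the interlock held:
 *
 *	MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, my_filter, NULL) {
 *		(vp has its interlock held here)
 *		VI_UNLOCK(vp);
 *	}
 */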

int
vn_dir_check_exec(struct vnode *vp, struct componentname *cnp)
{

	if ((cnp->cn_flags & NOEXECCHECK) != 0) {
		cnp->cn_flags &= ~NOEXECCHECK;
		return (0);
	}

	return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, cnp->cn_thread));
}

/*
 * Do not use this variant unless you have means other than the hold count
 * to prevent the vnode from getting freed.
 */
void
vn_seqc_write_begin_unheld_locked(struct vnode *vp)
{

	ASSERT_VI_LOCKED(vp, __func__);
	VNPASS(vp->v_seqc_users >= 0, vp);
	vp->v_seqc_users++;
	if (vp->v_seqc_users == 1)
		seqc_sleepable_write_begin(&vp->v_seqc);
}

void
vn_seqc_write_begin_locked(struct vnode *vp)
{

	ASSERT_VI_LOCKED(vp, __func__);
	VNPASS(vp->v_holdcnt > 0, vp);
	vn_seqc_write_begin_unheld_locked(vp);
}

void
vn_seqc_write_begin(struct vnode *vp)
{

	VI_LOCK(vp);
	vn_seqc_write_begin_locked(vp);
	VI_UNLOCK(vp);
}

void
vn_seqc_write_begin_unheld(struct vnode *vp)
{

	VI_LOCK(vp);
	vn_seqc_write_begin_unheld_locked(vp);
	VI_UNLOCK(vp);
}

void
vn_seqc_write_end_locked(struct vnode *vp)
{

	ASSERT_VI_LOCKED(vp, __func__);
	VNPASS(vp->v_seqc_users > 0, vp);
	vp->v_seqc_users--;
	if (vp->v_seqc_users == 0)
		seqc_sleepable_write_end(&vp->v_seqc);
}

void
vn_seqc_write_end(struct vnode *vp)
{

	VI_LOCK(vp);
	vn_seqc_write_end_locked(vp);
	VI_UNLOCK(vp);
}
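
/*
 * Illustrative sketch of how the v_seqc write markers above pair with a
 * lockless reader; the reader-side helpers are assumed to be the usual
 * seqc wrappers (see sys/seqc.h):
 *
 *	writer:
 *		vn_seqc_write_begin(vp);
 *		(modify vnode state)
 *		vn_seqc_write_end(vp);
 *
 *	lockless reader:
 *		seq = vn_seqc_read_any(vp);
 *		if (seqc_in_modify(seq))
 *			(fall back to the locked path)
 *		(speculatively read vnode state)
 *		if (!vn_seqc_consistent(vp, seq))
 *			(retry or fall back to the locked path)
 */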