1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1989, 1993 5 * The Regents of the University of California. All rights reserved. 6 * (c) UNIX System Laboratories, Inc. 7 * All or some portions of this file are derived from material licensed 8 * to the University of California by American Telephone and Telegraph 9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 10 * the permission of UNIX System Laboratories, Inc. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 
35 * 36 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 37 */ 38 39 /* 40 * External virtual filesystem routines 41 */ 42 43 #include <sys/cdefs.h> 44 __FBSDID("$FreeBSD$"); 45 46 #include "opt_ddb.h" 47 #include "opt_watchdog.h" 48 49 #include <sys/param.h> 50 #include <sys/systm.h> 51 #include <sys/asan.h> 52 #include <sys/bio.h> 53 #include <sys/buf.h> 54 #include <sys/capsicum.h> 55 #include <sys/condvar.h> 56 #include <sys/conf.h> 57 #include <sys/counter.h> 58 #include <sys/dirent.h> 59 #include <sys/event.h> 60 #include <sys/eventhandler.h> 61 #include <sys/extattr.h> 62 #include <sys/file.h> 63 #include <sys/fcntl.h> 64 #include <sys/jail.h> 65 #include <sys/kdb.h> 66 #include <sys/kernel.h> 67 #include <sys/kthread.h> 68 #include <sys/ktr.h> 69 #include <sys/limits.h> 70 #include <sys/lockf.h> 71 #include <sys/malloc.h> 72 #include <sys/mount.h> 73 #include <sys/namei.h> 74 #include <sys/pctrie.h> 75 #include <sys/priv.h> 76 #include <sys/reboot.h> 77 #include <sys/refcount.h> 78 #include <sys/rwlock.h> 79 #include <sys/sched.h> 80 #include <sys/sleepqueue.h> 81 #include <sys/smr.h> 82 #include <sys/smp.h> 83 #include <sys/stat.h> 84 #include <sys/sysctl.h> 85 #include <sys/syslog.h> 86 #include <sys/vmmeter.h> 87 #include <sys/vnode.h> 88 #include <sys/watchdog.h> 89 90 #include <machine/stdarg.h> 91 92 #include <security/mac/mac_framework.h> 93 94 #include <vm/vm.h> 95 #include <vm/vm_object.h> 96 #include <vm/vm_extern.h> 97 #include <vm/pmap.h> 98 #include <vm/vm_map.h> 99 #include <vm/vm_page.h> 100 #include <vm/vm_kern.h> 101 #include <vm/uma.h> 102 103 #if defined(DEBUG_VFS_LOCKS) && (!defined(INVARIANTS) || !defined(WITNESS)) 104 #error DEBUG_VFS_LOCKS requires INVARIANTS and WITNESS 105 #endif 106 107 #ifdef DDB 108 #include <ddb/ddb.h> 109 #endif 110 111 static void delmntque(struct vnode *vp); 112 static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, 113 int slpflag, int slptimeo); 114 static void syncer_shutdown(void *arg, int howto); 115 static int vtryrecycle(struct vnode *vp); 116 static void v_init_counters(struct vnode *); 117 static void vn_seqc_init(struct vnode *); 118 static void vn_seqc_write_end_free(struct vnode *vp); 119 static void vgonel(struct vnode *); 120 static bool vhold_recycle_free(struct vnode *); 121 static void vdropl_recycle(struct vnode *vp); 122 static void vdrop_recycle(struct vnode *vp); 123 static void vfs_knllock(void *arg); 124 static void vfs_knlunlock(void *arg); 125 static void vfs_knl_assert_lock(void *arg, int what); 126 static void destroy_vpollinfo(struct vpollinfo *vi); 127 static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 128 daddr_t startlbn, daddr_t endlbn); 129 static void vnlru_recalc(void); 130 131 /* 132 * Number of vnodes in existence. Increased whenever getnewvnode() 133 * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode. 134 */ 135 static u_long __exclusive_cache_line numvnodes; 136 137 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, 138 "Number of vnodes in existence"); 139 140 static counter_u64_t vnodes_created; 141 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, 142 "Number of vnodes created by getnewvnode"); 143 144 /* 145 * Conversion tables for conversion from vnode types to inode formats 146 * and back. 
147 */ 148 enum vtype iftovt_tab[16] = { 149 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 150 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON 151 }; 152 int vttoif_tab[10] = { 153 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 154 S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT 155 }; 156 157 /* 158 * List of allocates vnodes in the system. 159 */ 160 static TAILQ_HEAD(freelst, vnode) vnode_list; 161 static struct vnode *vnode_list_free_marker; 162 static struct vnode *vnode_list_reclaim_marker; 163 164 /* 165 * "Free" vnode target. Free vnodes are rarely completely free, but are 166 * just ones that are cheap to recycle. Usually they are for files which 167 * have been stat'd but not read; these usually have inode and namecache 168 * data attached to them. This target is the preferred minimum size of a 169 * sub-cache consisting mostly of such files. The system balances the size 170 * of this sub-cache with its complement to try to prevent either from 171 * thrashing while the other is relatively inactive. The targets express 172 * a preference for the best balance. 173 * 174 * "Above" this target there are 2 further targets (watermarks) related 175 * to recyling of free vnodes. In the best-operating case, the cache is 176 * exactly full, the free list has size between vlowat and vhiwat above the 177 * free target, and recycling from it and normal use maintains this state. 178 * Sometimes the free list is below vlowat or even empty, but this state 179 * is even better for immediate use provided the cache is not full. 180 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free 181 * ones) to reach one of these states. The watermarks are currently hard- 182 * coded as 4% and 9% of the available space higher. These and the default 183 * of 25% for wantfreevnodes are too large if the memory size is large. 184 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim 185 * whenever vnlru_proc() becomes active. 186 */ 187 static long wantfreevnodes; 188 static long __exclusive_cache_line freevnodes; 189 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, 190 &freevnodes, 0, "Number of \"free\" vnodes"); 191 static long freevnodes_old; 192 193 static counter_u64_t recycles_count; 194 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 195 "Number of vnodes recycled to meet vnode cache targets"); 196 197 static counter_u64_t recycles_free_count; 198 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count, 199 "Number of free vnodes recycled to meet vnode cache targets"); 200 201 static counter_u64_t deferred_inact; 202 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact, 203 "Number of times inactive processing was deferred"); 204 205 /* To keep more than one thread at a time from running vfs_getnewfsid */ 206 static struct mtx mntid_mtx; 207 208 /* 209 * Lock for any access to the following: 210 * vnode_list 211 * numvnodes 212 * freevnodes 213 */ 214 static struct mtx __exclusive_cache_line vnode_list_mtx; 215 216 /* Publicly exported FS */ 217 struct nfs_public nfs_pub; 218 219 static uma_zone_t buf_trie_zone; 220 static smr_t buf_trie_smr; 221 222 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ 223 static uma_zone_t vnode_zone; 224 MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll"); 225 226 __read_frequently smr_t vfs_smr; 227 228 /* 229 * The workitem queue. 
230 * 231 * It is useful to delay writes of file data and filesystem metadata 232 * for tens of seconds so that quickly created and deleted files need 233 * not waste disk bandwidth being created and removed. To realize this, 234 * we append vnodes to a "workitem" queue. When running with a soft 235 * updates implementation, most pending metadata dependencies should 236 * not wait for more than a few seconds. Thus, mounted on block devices 237 * are delayed only about a half the time that file data is delayed. 238 * Similarly, directory updates are more critical, so are only delayed 239 * about a third the time that file data is delayed. Thus, there are 240 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of 241 * one each second (driven off the filesystem syncer process). The 242 * syncer_delayno variable indicates the next queue that is to be processed. 243 * Items that need to be processed soon are placed in this queue: 244 * 245 * syncer_workitem_pending[syncer_delayno] 246 * 247 * A delay of fifteen seconds is done by placing the request fifteen 248 * entries later in the queue: 249 * 250 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] 251 * 252 */ 253 static int syncer_delayno; 254 static long syncer_mask; 255 LIST_HEAD(synclist, bufobj); 256 static struct synclist *syncer_workitem_pending; 257 /* 258 * The sync_mtx protects: 259 * bo->bo_synclist 260 * sync_vnode_count 261 * syncer_delayno 262 * syncer_state 263 * syncer_workitem_pending 264 * syncer_worklist_len 265 * rushjob 266 */ 267 static struct mtx sync_mtx; 268 static struct cv sync_wakeup; 269 270 #define SYNCER_MAXDELAY 32 271 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ 272 static int syncdelay = 30; /* max time to delay syncing data */ 273 static int filedelay = 30; /* time to delay syncing files */ 274 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, 275 "Time to delay syncing files (in seconds)"); 276 static int dirdelay = 29; /* time to delay syncing directories */ 277 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, 278 "Time to delay syncing directories (in seconds)"); 279 static int metadelay = 28; /* time to delay syncing metadata */ 280 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, 281 "Time to delay syncing metadata (in seconds)"); 282 static int rushjob; /* number of slots to run ASAP */ 283 static int stat_rush_requests; /* number of times I/O speeded up */ 284 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, 285 "Number of times I/O speeded up (rush requests)"); 286 287 #define VDBATCH_SIZE 8 288 struct vdbatch { 289 u_int index; 290 long freevnodes; 291 struct mtx lock; 292 struct vnode *tab[VDBATCH_SIZE]; 293 }; 294 DPCPU_DEFINE_STATIC(struct vdbatch, vd); 295 296 static void vdbatch_dequeue(struct vnode *vp); 297 298 /* 299 * When shutting down the syncer, run it at four times normal speed. 300 */ 301 #define SYNCER_SHUTDOWN_SPEEDUP 4 302 static int sync_vnode_count; 303 static int syncer_worklist_len; 304 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } 305 syncer_state; 306 307 /* Target for maximum number of vnodes. 
*/ 308 u_long desiredvnodes; 309 static u_long gapvnodes; /* gap between wanted and desired */ 310 static u_long vhiwat; /* enough extras after expansion */ 311 static u_long vlowat; /* minimal extras before expansion */ 312 static u_long vstir; /* nonzero to stir non-free vnodes */ 313 static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ 314 315 static u_long vnlru_read_freevnodes(void); 316 317 /* 318 * Note that no attempt is made to sanitize these parameters. 319 */ 320 static int 321 sysctl_maxvnodes(SYSCTL_HANDLER_ARGS) 322 { 323 u_long val; 324 int error; 325 326 val = desiredvnodes; 327 error = sysctl_handle_long(oidp, &val, 0, req); 328 if (error != 0 || req->newptr == NULL) 329 return (error); 330 331 if (val == desiredvnodes) 332 return (0); 333 mtx_lock(&vnode_list_mtx); 334 desiredvnodes = val; 335 wantfreevnodes = desiredvnodes / 4; 336 vnlru_recalc(); 337 mtx_unlock(&vnode_list_mtx); 338 /* 339 * XXX There is no protection against multiple threads changing 340 * desiredvnodes at the same time. Locking above only helps vnlru and 341 * getnewvnode. 342 */ 343 vfs_hash_changesize(desiredvnodes); 344 cache_changesize(desiredvnodes); 345 return (0); 346 } 347 348 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, 349 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, 350 "LU", "Target for maximum number of vnodes"); 351 352 static int 353 sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS) 354 { 355 u_long val; 356 int error; 357 358 val = wantfreevnodes; 359 error = sysctl_handle_long(oidp, &val, 0, req); 360 if (error != 0 || req->newptr == NULL) 361 return (error); 362 363 if (val == wantfreevnodes) 364 return (0); 365 mtx_lock(&vnode_list_mtx); 366 wantfreevnodes = val; 367 vnlru_recalc(); 368 mtx_unlock(&vnode_list_mtx); 369 return (0); 370 } 371 372 SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes, 373 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, 374 "LU", "Target for minimum number of \"free\" vnodes"); 375 376 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, 377 &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); 378 static int vnlru_nowhere; 379 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW | CTLFLAG_STATS, 380 &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); 381 382 static int 383 sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) 384 { 385 struct vnode *vp; 386 struct nameidata nd; 387 char *buf; 388 unsigned long ndflags; 389 int error; 390 391 if (req->newptr == NULL) 392 return (EINVAL); 393 if (req->newlen >= PATH_MAX) 394 return (E2BIG); 395 396 buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); 397 error = SYSCTL_IN(req, buf, req->newlen); 398 if (error != 0) 399 goto out; 400 401 buf[req->newlen] = '\0'; 402 403 ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1; 404 NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf); 405 if ((error = namei(&nd)) != 0) 406 goto out; 407 vp = nd.ni_vp; 408 409 if (VN_IS_DOOMED(vp)) { 410 /* 411 * This vnode is being recycled. Return != 0 to let the caller 412 * know that the sysctl had no effect. 
Return EAGAIN because a 413 * subsequent call will likely succeed (since namei will create 414 * a new vnode if necessary) 415 */ 416 error = EAGAIN; 417 goto putvnode; 418 } 419 420 counter_u64_add(recycles_count, 1); 421 vgone(vp); 422 putvnode: 423 vput(vp); 424 NDFREE_PNBUF(&nd); 425 out: 426 free(buf, M_TEMP); 427 return (error); 428 } 429 430 static int 431 sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) 432 { 433 struct thread *td = curthread; 434 struct vnode *vp; 435 struct file *fp; 436 int error; 437 int fd; 438 439 if (req->newptr == NULL) 440 return (EBADF); 441 442 error = sysctl_handle_int(oidp, &fd, 0, req); 443 if (error != 0) 444 return (error); 445 error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); 446 if (error != 0) 447 return (error); 448 vp = fp->f_vnode; 449 450 error = vn_lock(vp, LK_EXCLUSIVE); 451 if (error != 0) 452 goto drop; 453 454 counter_u64_add(recycles_count, 1); 455 vgone(vp); 456 VOP_UNLOCK(vp); 457 drop: 458 fdrop(fp, td); 459 return (error); 460 } 461 462 SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, 463 CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 464 sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); 465 SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, 466 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 467 sysctl_ftry_reclaim_vnode, "I", 468 "Try to reclaim a vnode by its file descriptor"); 469 470 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ 471 #define vnsz2log 8 472 #ifndef DEBUG_LOCKS 473 _Static_assert(sizeof(struct vnode) >= 1UL << vnsz2log && 474 sizeof(struct vnode) < 1UL << (vnsz2log + 1), 475 "vnsz2log needs to be updated"); 476 #endif 477 478 /* 479 * Support for the bufobj clean & dirty pctrie. 480 */ 481 static void * 482 buf_trie_alloc(struct pctrie *ptree) 483 { 484 return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT)); 485 } 486 487 static void 488 buf_trie_free(struct pctrie *ptree, void *node) 489 { 490 uma_zfree_smr(buf_trie_zone, node); 491 } 492 PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free, 493 buf_trie_smr); 494 495 /* 496 * Initialize the vnode management data structures. 497 * 498 * Reevaluate the following cap on the number of vnodes after the physical 499 * memory size exceeds 512GB. In the limit, as the physical memory size 500 * grows, the ratio of the memory size in KB to vnodes approaches 64:1. 
501 */ 502 #ifndef MAXVNODES_MAX 503 #define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */ 504 #endif 505 506 static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); 507 508 static struct vnode * 509 vn_alloc_marker(struct mount *mp) 510 { 511 struct vnode *vp; 512 513 vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 514 vp->v_type = VMARKER; 515 vp->v_mount = mp; 516 517 return (vp); 518 } 519 520 static void 521 vn_free_marker(struct vnode *vp) 522 { 523 524 MPASS(vp->v_type == VMARKER); 525 free(vp, M_VNODE_MARKER); 526 } 527 528 #ifdef KASAN 529 static int 530 vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused) 531 { 532 kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0); 533 return (0); 534 } 535 536 static void 537 vnode_dtor(void *mem, int size, void *arg __unused) 538 { 539 size_t end1, end2, off1, off2; 540 541 _Static_assert(offsetof(struct vnode, v_vnodelist) < 542 offsetof(struct vnode, v_dbatchcpu), 543 "KASAN marks require updating"); 544 545 off1 = offsetof(struct vnode, v_vnodelist); 546 off2 = offsetof(struct vnode, v_dbatchcpu); 547 end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist); 548 end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu); 549 550 /* 551 * Access to the v_vnodelist and v_dbatchcpu fields are permitted even 552 * after the vnode has been freed. Try to get some KASAN coverage by 553 * marking everything except those two fields as invalid. Because 554 * KASAN's tracking is not byte-granular, any preceding fields sharing 555 * the same 8-byte aligned word must also be marked valid. 556 */ 557 558 /* Handle the area from the start until v_vnodelist... */ 559 off1 = rounddown2(off1, KASAN_SHADOW_SCALE); 560 kasan_mark(mem, off1, off1, KASAN_UMA_FREED); 561 562 /* ... then the area between v_vnodelist and v_dbatchcpu ... */ 563 off1 = roundup2(end1, KASAN_SHADOW_SCALE); 564 off2 = rounddown2(off2, KASAN_SHADOW_SCALE); 565 if (off2 > off1) 566 kasan_mark((void *)((char *)mem + off1), off2 - off1, 567 off2 - off1, KASAN_UMA_FREED); 568 569 /* ... and finally the area from v_dbatchcpu to the end. */ 570 off2 = roundup2(end2, KASAN_SHADOW_SCALE); 571 kasan_mark((void *)((char *)mem + off2), size - off2, size - off2, 572 KASAN_UMA_FREED); 573 } 574 #endif /* KASAN */ 575 576 /* 577 * Initialize a vnode as it first enters the zone. 578 */ 579 static int 580 vnode_init(void *mem, int size, int flags) 581 { 582 struct vnode *vp; 583 584 vp = mem; 585 bzero(vp, size); 586 /* 587 * Setup locks. 588 */ 589 vp->v_vnlock = &vp->v_lock; 590 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); 591 /* 592 * By default, don't allow shared locks unless filesystems opt-in. 593 */ 594 lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, 595 LK_NOSHARE | LK_IS_VNODE); 596 /* 597 * Initialize bufobj. 598 */ 599 bufobj_init(&vp->v_bufobj, vp); 600 /* 601 * Initialize namecache. 602 */ 603 cache_vnode_init(vp); 604 /* 605 * Initialize rangelocks. 606 */ 607 rangelock_init(&vp->v_rl); 608 609 vp->v_dbatchcpu = NOCPU; 610 611 vp->v_state = VSTATE_DEAD; 612 613 /* 614 * Check vhold_recycle_free for an explanation. 615 */ 616 vp->v_holdcnt = VHOLD_NO_SMR; 617 vp->v_type = VNON; 618 mtx_lock(&vnode_list_mtx); 619 TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist); 620 mtx_unlock(&vnode_list_mtx); 621 return (0); 622 } 623 624 /* 625 * Free a vnode when it is cleared from the zone. 
626 */ 627 static void 628 vnode_fini(void *mem, int size) 629 { 630 struct vnode *vp; 631 struct bufobj *bo; 632 633 vp = mem; 634 vdbatch_dequeue(vp); 635 mtx_lock(&vnode_list_mtx); 636 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 637 mtx_unlock(&vnode_list_mtx); 638 rangelock_destroy(&vp->v_rl); 639 lockdestroy(vp->v_vnlock); 640 mtx_destroy(&vp->v_interlock); 641 bo = &vp->v_bufobj; 642 rw_destroy(BO_LOCKPTR(bo)); 643 644 kasan_mark(mem, size, size, 0); 645 } 646 647 /* 648 * Provide the size of NFS nclnode and NFS fh for calculation of the 649 * vnode memory consumption. The size is specified directly to 650 * eliminate dependency on NFS-private header. 651 * 652 * Other filesystems may use bigger or smaller (like UFS and ZFS) 653 * private inode data, but the NFS-based estimation is ample enough. 654 * Still, we care about differences in the size between 64- and 32-bit 655 * platforms. 656 * 657 * Namecache structure size is heuristically 658 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. 659 */ 660 #ifdef _LP64 661 #define NFS_NCLNODE_SZ (528 + 64) 662 #define NC_SZ 148 663 #else 664 #define NFS_NCLNODE_SZ (360 + 32) 665 #define NC_SZ 92 666 #endif 667 668 static void 669 vntblinit(void *dummy __unused) 670 { 671 struct vdbatch *vd; 672 uma_ctor ctor; 673 uma_dtor dtor; 674 int cpu, physvnodes, virtvnodes; 675 676 /* 677 * Desiredvnodes is a function of the physical memory size and the 678 * kernel's heap size. Generally speaking, it scales with the 679 * physical memory size. The ratio of desiredvnodes to the physical 680 * memory size is 1:16 until desiredvnodes exceeds 98,304. 681 * Thereafter, the 682 * marginal ratio of desiredvnodes to the physical memory size is 683 * 1:64. However, desiredvnodes is limited by the kernel's heap 684 * size. The memory required by desiredvnodes vnodes and vm objects 685 * must not exceed 1/10th of the kernel's heap size. 686 */ 687 physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 688 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; 689 virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + 690 sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); 691 desiredvnodes = min(physvnodes, virtvnodes); 692 if (desiredvnodes > MAXVNODES_MAX) { 693 if (bootverbose) 694 printf("Reducing kern.maxvnodes %lu -> %lu\n", 695 desiredvnodes, MAXVNODES_MAX); 696 desiredvnodes = MAXVNODES_MAX; 697 } 698 wantfreevnodes = desiredvnodes / 4; 699 mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); 700 TAILQ_INIT(&vnode_list); 701 mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); 702 /* 703 * The lock is taken to appease WITNESS. 704 */ 705 mtx_lock(&vnode_list_mtx); 706 vnlru_recalc(); 707 mtx_unlock(&vnode_list_mtx); 708 vnode_list_free_marker = vn_alloc_marker(NULL); 709 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); 710 vnode_list_reclaim_marker = vn_alloc_marker(NULL); 711 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); 712 713 #ifdef KASAN 714 ctor = vnode_ctor; 715 dtor = vnode_dtor; 716 #else 717 ctor = NULL; 718 dtor = NULL; 719 #endif 720 vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor, 721 vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN); 722 uma_zone_set_smr(vnode_zone, vfs_smr); 723 724 /* 725 * Preallocate enough nodes to support one-per buf so that 726 * we can not fail an insert. reassignbuf() callers can not 727 * tolerate the insertion failure. 
728 */ 729 buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), 730 NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, 731 UMA_ZONE_NOFREE | UMA_ZONE_SMR); 732 buf_trie_smr = uma_zone_get_smr(buf_trie_zone); 733 uma_prealloc(buf_trie_zone, nbuf); 734 735 vnodes_created = counter_u64_alloc(M_WAITOK); 736 recycles_count = counter_u64_alloc(M_WAITOK); 737 recycles_free_count = counter_u64_alloc(M_WAITOK); 738 deferred_inact = counter_u64_alloc(M_WAITOK); 739 740 /* 741 * Initialize the filesystem syncer. 742 */ 743 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, 744 &syncer_mask); 745 syncer_maxdelay = syncer_mask + 1; 746 mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); 747 cv_init(&sync_wakeup, "syncer"); 748 749 CPU_FOREACH(cpu) { 750 vd = DPCPU_ID_PTR((cpu), vd); 751 bzero(vd, sizeof(*vd)); 752 mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF); 753 } 754 } 755 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); 756 757 /* 758 * Mark a mount point as busy. Used to synchronize access and to delay 759 * unmounting. Eventually, mountlist_mtx is not released on failure. 760 * 761 * vfs_busy() is a custom lock, it can block the caller. 762 * vfs_busy() only sleeps if the unmount is active on the mount point. 763 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any 764 * vnode belonging to mp. 765 * 766 * Lookup uses vfs_busy() to traverse mount points. 767 * root fs var fs 768 * / vnode lock A / vnode lock (/var) D 769 * /var vnode lock B /log vnode lock(/var/log) E 770 * vfs_busy lock C vfs_busy lock F 771 * 772 * Within each file system, the lock order is C->A->B and F->D->E. 773 * 774 * When traversing across mounts, the system follows that lock order: 775 * 776 * C->A->B 777 * | 778 * +->F->D->E 779 * 780 * The lookup() process for namei("/var") illustrates the process: 781 * 1. VOP_LOOKUP() obtains B while A is held 782 * 2. vfs_busy() obtains a shared lock on F while A and B are held 783 * 3. vput() releases lock on B 784 * 4. vput() releases lock on A 785 * 5. VFS_ROOT() obtains lock on D while shared lock on F is held 786 * 6. vfs_unbusy() releases shared lock on F 787 * 7. vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. 788 * Attempt to lock A (instead of vp_crossmp) while D is held would 789 * violate the global order, causing deadlocks. 790 * 791 * dounmount() locks B while F is drained. Note that for stacked 792 * filesystems, D and B in the example above may be the same lock, 793 * which introdues potential lock order reversal deadlock between 794 * dounmount() and step 5 above. These filesystems may avoid the LOR 795 * by setting VV_CROSSLOCK on the covered vnode so that lock B will 796 * remain held until after step 5. 797 */ 798 int 799 vfs_busy(struct mount *mp, int flags) 800 { 801 struct mount_pcpu *mpcpu; 802 803 MPASS((flags & ~MBF_MASK) == 0); 804 CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); 805 806 if (vfs_op_thread_enter(mp, mpcpu)) { 807 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 808 MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0); 809 MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0); 810 vfs_mp_count_add_pcpu(mpcpu, ref, 1); 811 vfs_mp_count_add_pcpu(mpcpu, lockref, 1); 812 vfs_op_thread_exit(mp, mpcpu); 813 if (flags & MBF_MNTLSTLOCK) 814 mtx_unlock(&mountlist_mtx); 815 return (0); 816 } 817 818 MNT_ILOCK(mp); 819 vfs_assert_mount_counters(mp); 820 MNT_REF(mp); 821 /* 822 * If mount point is currently being unmounted, sleep until the 823 * mount point fate is decided. 
If thread doing the unmounting fails, 824 * it will clear MNTK_UNMOUNT flag before waking us up, indicating 825 * that this mount point has survived the unmount attempt and vfs_busy 826 * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE 827 * flag in addition to MNTK_UNMOUNT, indicating that mount point is 828 * about to be really destroyed. vfs_busy needs to release its 829 * reference on the mount point in this case and return with ENOENT, 830 * telling the caller the mount it tried to busy is no longer valid. 831 */ 832 while (mp->mnt_kern_flag & MNTK_UNMOUNT) { 833 KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), 834 ("%s: non-empty upper mount list with pending unmount", 835 __func__)); 836 if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { 837 MNT_REL(mp); 838 MNT_IUNLOCK(mp); 839 CTR1(KTR_VFS, "%s: failed busying before sleeping", 840 __func__); 841 return (ENOENT); 842 } 843 if (flags & MBF_MNTLSTLOCK) 844 mtx_unlock(&mountlist_mtx); 845 mp->mnt_kern_flag |= MNTK_MWAIT; 846 msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); 847 if (flags & MBF_MNTLSTLOCK) 848 mtx_lock(&mountlist_mtx); 849 MNT_ILOCK(mp); 850 } 851 if (flags & MBF_MNTLSTLOCK) 852 mtx_unlock(&mountlist_mtx); 853 mp->mnt_lockref++; 854 MNT_IUNLOCK(mp); 855 return (0); 856 } 857 858 /* 859 * Free a busy filesystem. 860 */ 861 void 862 vfs_unbusy(struct mount *mp) 863 { 864 struct mount_pcpu *mpcpu; 865 int c; 866 867 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 868 869 if (vfs_op_thread_enter(mp, mpcpu)) { 870 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 871 vfs_mp_count_sub_pcpu(mpcpu, lockref, 1); 872 vfs_mp_count_sub_pcpu(mpcpu, ref, 1); 873 vfs_op_thread_exit(mp, mpcpu); 874 return; 875 } 876 877 MNT_ILOCK(mp); 878 vfs_assert_mount_counters(mp); 879 MNT_REL(mp); 880 c = --mp->mnt_lockref; 881 if (mp->mnt_vfs_ops == 0) { 882 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 883 MNT_IUNLOCK(mp); 884 return; 885 } 886 if (c < 0) 887 vfs_dump_mount_counters(mp); 888 if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { 889 MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); 890 CTR1(KTR_VFS, "%s: waking up waiters", __func__); 891 mp->mnt_kern_flag &= ~MNTK_DRAINING; 892 wakeup(&mp->mnt_lockref); 893 } 894 MNT_IUNLOCK(mp); 895 } 896 897 /* 898 * Lookup a mount point by filesystem identifier. 899 */ 900 struct mount * 901 vfs_getvfs(fsid_t *fsid) 902 { 903 struct mount *mp; 904 905 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 906 mtx_lock(&mountlist_mtx); 907 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 908 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { 909 vfs_ref(mp); 910 mtx_unlock(&mountlist_mtx); 911 return (mp); 912 } 913 } 914 mtx_unlock(&mountlist_mtx); 915 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 916 return ((struct mount *) 0); 917 } 918 919 /* 920 * Lookup a mount point by filesystem identifier, busying it before 921 * returning. 922 * 923 * To avoid congestion on mountlist_mtx, implement simple direct-mapped 924 * cache for popular filesystem identifiers. The cache is lockess, using 925 * the fact that struct mount's are never freed. In worst case we may 926 * get pointer to unmounted or even different filesystem, so we have to 927 * check what we got, and go slow way if so. 
928 */ 929 struct mount * 930 vfs_busyfs(fsid_t *fsid) 931 { 932 #define FSID_CACHE_SIZE 256 933 typedef struct mount * volatile vmp_t; 934 static vmp_t cache[FSID_CACHE_SIZE]; 935 struct mount *mp; 936 int error; 937 uint32_t hash; 938 939 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 940 hash = fsid->val[0] ^ fsid->val[1]; 941 hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); 942 mp = cache[hash]; 943 if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0) 944 goto slow; 945 if (vfs_busy(mp, 0) != 0) { 946 cache[hash] = NULL; 947 goto slow; 948 } 949 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) 950 return (mp); 951 else 952 vfs_unbusy(mp); 953 954 slow: 955 mtx_lock(&mountlist_mtx); 956 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 957 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { 958 error = vfs_busy(mp, MBF_MNTLSTLOCK); 959 if (error) { 960 cache[hash] = NULL; 961 mtx_unlock(&mountlist_mtx); 962 return (NULL); 963 } 964 cache[hash] = mp; 965 return (mp); 966 } 967 } 968 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 969 mtx_unlock(&mountlist_mtx); 970 return ((struct mount *) 0); 971 } 972 973 /* 974 * Check if a user can access privileged mount options. 975 */ 976 int 977 vfs_suser(struct mount *mp, struct thread *td) 978 { 979 int error; 980 981 if (jailed(td->td_ucred)) { 982 /* 983 * If the jail of the calling thread lacks permission for 984 * this type of file system, deny immediately. 985 */ 986 if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) 987 return (EPERM); 988 989 /* 990 * If the file system was mounted outside the jail of the 991 * calling thread, deny immediately. 992 */ 993 if (prison_check(td->td_ucred, mp->mnt_cred) != 0) 994 return (EPERM); 995 } 996 997 /* 998 * If file system supports delegated administration, we don't check 999 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified 1000 * by the file system itself. 1001 * If this is not the user that did original mount, we check for 1002 * the PRIV_VFS_MOUNT_OWNER privilege. 1003 */ 1004 if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && 1005 mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { 1006 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) 1007 return (error); 1008 } 1009 return (0); 1010 } 1011 1012 /* 1013 * Get a new unique fsid. Try to make its val[0] unique, since this value 1014 * will be used to create fake device numbers for stat(). Also try (but 1015 * not so hard) make its val[0] unique mod 2^16, since some emulators only 1016 * support 16-bit device numbers. We end up with unique val[0]'s for the 1017 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. 1018 * 1019 * Keep in mind that several mounts may be running in parallel. Starting 1020 * the search one past where the previous search terminated is both a 1021 * micro-optimization and a defense against returning the same fsid to 1022 * different mounts. 
1023 */ 1024 void 1025 vfs_getnewfsid(struct mount *mp) 1026 { 1027 static uint16_t mntid_base; 1028 struct mount *nmp; 1029 fsid_t tfsid; 1030 int mtype; 1031 1032 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 1033 mtx_lock(&mntid_mtx); 1034 mtype = mp->mnt_vfc->vfc_typenum; 1035 tfsid.val[1] = mtype; 1036 mtype = (mtype & 0xFF) << 24; 1037 for (;;) { 1038 tfsid.val[0] = makedev(255, 1039 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); 1040 mntid_base++; 1041 if ((nmp = vfs_getvfs(&tfsid)) == NULL) 1042 break; 1043 vfs_rel(nmp); 1044 } 1045 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; 1046 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; 1047 mtx_unlock(&mntid_mtx); 1048 } 1049 1050 /* 1051 * Knob to control the precision of file timestamps: 1052 * 1053 * 0 = seconds only; nanoseconds zeroed. 1054 * 1 = seconds and nanoseconds, accurate within 1/HZ. 1055 * 2 = seconds and nanoseconds, truncated to microseconds. 1056 * >=3 = seconds and nanoseconds, maximum precision. 1057 */ 1058 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; 1059 1060 static int timestamp_precision = TSP_USEC; 1061 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, 1062 ×tamp_precision, 0, "File timestamp precision (0: seconds, " 1063 "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " 1064 "3+: sec + ns (max. precision))"); 1065 1066 /* 1067 * Get a current timestamp. 1068 */ 1069 void 1070 vfs_timestamp(struct timespec *tsp) 1071 { 1072 struct timeval tv; 1073 1074 switch (timestamp_precision) { 1075 case TSP_SEC: 1076 tsp->tv_sec = time_second; 1077 tsp->tv_nsec = 0; 1078 break; 1079 case TSP_HZ: 1080 getnanotime(tsp); 1081 break; 1082 case TSP_USEC: 1083 microtime(&tv); 1084 TIMEVAL_TO_TIMESPEC(&tv, tsp); 1085 break; 1086 case TSP_NSEC: 1087 default: 1088 nanotime(tsp); 1089 break; 1090 } 1091 } 1092 1093 /* 1094 * Set vnode attributes to VNOVAL 1095 */ 1096 void 1097 vattr_null(struct vattr *vap) 1098 { 1099 1100 vap->va_type = VNON; 1101 vap->va_size = VNOVAL; 1102 vap->va_bytes = VNOVAL; 1103 vap->va_mode = VNOVAL; 1104 vap->va_nlink = VNOVAL; 1105 vap->va_uid = VNOVAL; 1106 vap->va_gid = VNOVAL; 1107 vap->va_fsid = VNOVAL; 1108 vap->va_fileid = VNOVAL; 1109 vap->va_blocksize = VNOVAL; 1110 vap->va_rdev = VNOVAL; 1111 vap->va_atime.tv_sec = VNOVAL; 1112 vap->va_atime.tv_nsec = VNOVAL; 1113 vap->va_mtime.tv_sec = VNOVAL; 1114 vap->va_mtime.tv_nsec = VNOVAL; 1115 vap->va_ctime.tv_sec = VNOVAL; 1116 vap->va_ctime.tv_nsec = VNOVAL; 1117 vap->va_birthtime.tv_sec = VNOVAL; 1118 vap->va_birthtime.tv_nsec = VNOVAL; 1119 vap->va_flags = VNOVAL; 1120 vap->va_gen = VNOVAL; 1121 vap->va_vaflags = 0; 1122 } 1123 1124 /* 1125 * Try to reduce the total number of vnodes. 1126 * 1127 * This routine (and its user) are buggy in at least the following ways: 1128 * - all parameters were picked years ago when RAM sizes were significantly 1129 * smaller 1130 * - it can pick vnodes based on pages used by the vm object, but filesystems 1131 * like ZFS don't use it making the pick broken 1132 * - since ZFS has its own aging policy it gets partially combated by this one 1133 * - a dedicated method should be provided for filesystems to let them decide 1134 * whether the vnode should be recycled 1135 * 1136 * This routine is called when we have too many vnodes. It attempts 1137 * to free <count> vnodes and will potentially free vnodes that still 1138 * have VM backing store (VM backing store is typically the cause 1139 * of a vnode blowout so we want to do this). Therefore, this operation 1140 * is not considered cheap. 
1141 * 1142 * A number of conditions may prevent a vnode from being reclaimed. 1143 * the buffer cache may have references on the vnode, a directory 1144 * vnode may still have references due to the namei cache representing 1145 * underlying files, or the vnode may be in active use. It is not 1146 * desirable to reuse such vnodes. These conditions may cause the 1147 * number of vnodes to reach some minimum value regardless of what 1148 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. 1149 * 1150 * @param reclaim_nc_src Only reclaim directories with outgoing namecache 1151 * entries if this argument is strue 1152 * @param trigger Only reclaim vnodes with fewer than this many resident 1153 * pages. 1154 * @param target How many vnodes to reclaim. 1155 * @return The number of vnodes that were reclaimed. 1156 */ 1157 static int 1158 vlrureclaim(bool reclaim_nc_src, int trigger, u_long target) 1159 { 1160 struct vnode *vp, *mvp; 1161 struct mount *mp; 1162 struct vm_object *object; 1163 u_long done; 1164 bool retried; 1165 1166 mtx_assert(&vnode_list_mtx, MA_OWNED); 1167 1168 retried = false; 1169 done = 0; 1170 1171 mvp = vnode_list_reclaim_marker; 1172 restart: 1173 vp = mvp; 1174 while (done < target) { 1175 vp = TAILQ_NEXT(vp, v_vnodelist); 1176 if (__predict_false(vp == NULL)) 1177 break; 1178 1179 if (__predict_false(vp->v_type == VMARKER)) 1180 continue; 1181 1182 /* 1183 * If it's been deconstructed already, it's still 1184 * referenced, or it exceeds the trigger, skip it. 1185 * Also skip free vnodes. We are trying to make space 1186 * to expand the free list, not reduce it. 1187 */ 1188 if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || 1189 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src))) 1190 goto next_iter; 1191 1192 if (vp->v_type == VBAD || vp->v_type == VNON) 1193 goto next_iter; 1194 1195 object = atomic_load_ptr(&vp->v_object); 1196 if (object == NULL || object->resident_page_count > trigger) { 1197 goto next_iter; 1198 } 1199 1200 /* 1201 * Handle races against vnode allocation. Filesystems lock the 1202 * vnode some time after it gets returned from getnewvnode, 1203 * despite type and hold count being manipulated earlier. 1204 * Resorting to checking v_mount restores guarantees present 1205 * before the global list was reworked to contain all vnodes. 
1206 */ 1207 if (!VI_TRYLOCK(vp)) 1208 goto next_iter; 1209 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { 1210 VI_UNLOCK(vp); 1211 goto next_iter; 1212 } 1213 if (vp->v_mount == NULL) { 1214 VI_UNLOCK(vp); 1215 goto next_iter; 1216 } 1217 vholdl(vp); 1218 VI_UNLOCK(vp); 1219 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1220 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1221 mtx_unlock(&vnode_list_mtx); 1222 1223 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 1224 vdrop_recycle(vp); 1225 goto next_iter_unlocked; 1226 } 1227 if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) { 1228 vdrop_recycle(vp); 1229 vn_finished_write(mp); 1230 goto next_iter_unlocked; 1231 } 1232 1233 VI_LOCK(vp); 1234 if (vp->v_usecount > 0 || 1235 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || 1236 (vp->v_object != NULL && vp->v_object->handle == vp && 1237 vp->v_object->resident_page_count > trigger)) { 1238 VOP_UNLOCK(vp); 1239 vdropl_recycle(vp); 1240 vn_finished_write(mp); 1241 goto next_iter_unlocked; 1242 } 1243 counter_u64_add(recycles_count, 1); 1244 vgonel(vp); 1245 VOP_UNLOCK(vp); 1246 vdropl_recycle(vp); 1247 vn_finished_write(mp); 1248 done++; 1249 next_iter_unlocked: 1250 maybe_yield(); 1251 mtx_lock(&vnode_list_mtx); 1252 goto restart; 1253 next_iter: 1254 MPASS(vp->v_type != VMARKER); 1255 if (!should_yield()) 1256 continue; 1257 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1258 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1259 mtx_unlock(&vnode_list_mtx); 1260 kern_yield(PRI_USER); 1261 mtx_lock(&vnode_list_mtx); 1262 goto restart; 1263 } 1264 if (done == 0 && !retried) { 1265 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1266 TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); 1267 retried = true; 1268 goto restart; 1269 } 1270 return (done); 1271 } 1272 1273 static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ 1274 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, 1275 0, 1276 "limit on vnode free requests per call to the vnlru_free routine"); 1277 1278 /* 1279 * Attempt to reduce the free list by the requested amount. 1280 */ 1281 static int 1282 vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp) 1283 { 1284 struct vnode *vp; 1285 struct mount *mp; 1286 int ocount; 1287 1288 mtx_assert(&vnode_list_mtx, MA_OWNED); 1289 if (count > max_vnlru_free) 1290 count = max_vnlru_free; 1291 ocount = count; 1292 vp = mvp; 1293 for (;;) { 1294 if (count == 0) { 1295 break; 1296 } 1297 vp = TAILQ_NEXT(vp, v_vnodelist); 1298 if (__predict_false(vp == NULL)) { 1299 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1300 TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist); 1301 break; 1302 } 1303 if (__predict_false(vp->v_type == VMARKER)) 1304 continue; 1305 if (vp->v_holdcnt > 0) 1306 continue; 1307 /* 1308 * Don't recycle if our vnode is from different type 1309 * of mount point. Note that mp is type-safe, the 1310 * check does not reach unmapped address even if 1311 * vnode is reclaimed. 1312 */ 1313 if (mnt_op != NULL && (mp = vp->v_mount) != NULL && 1314 mp->mnt_op != mnt_op) { 1315 continue; 1316 } 1317 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { 1318 continue; 1319 } 1320 if (!vhold_recycle_free(vp)) 1321 continue; 1322 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1323 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1324 mtx_unlock(&vnode_list_mtx); 1325 /* 1326 * FIXME: ignores the return value, meaning it may be nothing 1327 * got recycled but it claims otherwise to the caller. 
1328 * 1329 * Originally the value started being ignored in 2005 with 1330 * 114a1006a8204aa156e1f9ad6476cdff89cada7f . 1331 * 1332 * Respecting the value can run into significant stalls if most 1333 * vnodes belong to one file system and it has writes 1334 * suspended. In presence of many threads and millions of 1335 * vnodes they keep contending on the vnode_list_mtx lock only 1336 * to find vnodes they can't recycle. 1337 * 1338 * The solution would be to pre-check if the vnode is likely to 1339 * be recycle-able, but it needs to happen with the 1340 * vnode_list_mtx lock held. This runs into a problem where 1341 * VOP_GETWRITEMOUNT (currently needed to find out about if 1342 * writes are frozen) can take locks which LOR against it. 1343 * 1344 * Check nullfs for one example (null_getwritemount). 1345 */ 1346 vtryrecycle(vp); 1347 count--; 1348 mtx_lock(&vnode_list_mtx); 1349 vp = mvp; 1350 } 1351 return (ocount - count); 1352 } 1353 1354 static int 1355 vnlru_free_locked(int count) 1356 { 1357 1358 mtx_assert(&vnode_list_mtx, MA_OWNED); 1359 return (vnlru_free_impl(count, NULL, vnode_list_free_marker)); 1360 } 1361 1362 void 1363 vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp) 1364 { 1365 1366 MPASS(mnt_op != NULL); 1367 MPASS(mvp != NULL); 1368 VNPASS(mvp->v_type == VMARKER, mvp); 1369 mtx_lock(&vnode_list_mtx); 1370 vnlru_free_impl(count, mnt_op, mvp); 1371 mtx_unlock(&vnode_list_mtx); 1372 } 1373 1374 struct vnode * 1375 vnlru_alloc_marker(void) 1376 { 1377 struct vnode *mvp; 1378 1379 mvp = vn_alloc_marker(NULL); 1380 mtx_lock(&vnode_list_mtx); 1381 TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist); 1382 mtx_unlock(&vnode_list_mtx); 1383 return (mvp); 1384 } 1385 1386 void 1387 vnlru_free_marker(struct vnode *mvp) 1388 { 1389 mtx_lock(&vnode_list_mtx); 1390 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1391 mtx_unlock(&vnode_list_mtx); 1392 vn_free_marker(mvp); 1393 } 1394 1395 static void 1396 vnlru_recalc(void) 1397 { 1398 1399 mtx_assert(&vnode_list_mtx, MA_OWNED); 1400 gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); 1401 vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ 1402 vlowat = vhiwat / 2; 1403 } 1404 1405 /* 1406 * Attempt to recycle vnodes in a context that is always safe to block. 1407 * Calling vlrurecycle() from the bowels of filesystem code has some 1408 * interesting deadlock problems. 1409 */ 1410 static struct proc *vnlruproc; 1411 static int vnlruproc_sig; 1412 1413 /* 1414 * The main freevnodes counter is only updated when threads requeue their vnode 1415 * batches. CPUs are conditionally walked to compute a more accurate total. 1416 * 1417 * Limit how much of a slop are we willing to tolerate. Note: the actual value 1418 * at any given moment can still exceed slop, but it should not be by significant 1419 * margin in practice. 
1420 */ 1421 #define VNLRU_FREEVNODES_SLOP 128 1422 1423 static __inline void 1424 vfs_freevnodes_inc(void) 1425 { 1426 struct vdbatch *vd; 1427 1428 critical_enter(); 1429 vd = DPCPU_PTR(vd); 1430 vd->freevnodes++; 1431 critical_exit(); 1432 } 1433 1434 static __inline void 1435 vfs_freevnodes_dec(void) 1436 { 1437 struct vdbatch *vd; 1438 1439 critical_enter(); 1440 vd = DPCPU_PTR(vd); 1441 vd->freevnodes--; 1442 critical_exit(); 1443 } 1444 1445 static u_long 1446 vnlru_read_freevnodes(void) 1447 { 1448 struct vdbatch *vd; 1449 long slop; 1450 int cpu; 1451 1452 mtx_assert(&vnode_list_mtx, MA_OWNED); 1453 if (freevnodes > freevnodes_old) 1454 slop = freevnodes - freevnodes_old; 1455 else 1456 slop = freevnodes_old - freevnodes; 1457 if (slop < VNLRU_FREEVNODES_SLOP) 1458 return (freevnodes >= 0 ? freevnodes : 0); 1459 freevnodes_old = freevnodes; 1460 CPU_FOREACH(cpu) { 1461 vd = DPCPU_ID_PTR((cpu), vd); 1462 freevnodes_old += vd->freevnodes; 1463 } 1464 return (freevnodes_old >= 0 ? freevnodes_old : 0); 1465 } 1466 1467 static bool 1468 vnlru_under(u_long rnumvnodes, u_long limit) 1469 { 1470 u_long rfreevnodes, space; 1471 1472 if (__predict_false(rnumvnodes > desiredvnodes)) 1473 return (true); 1474 1475 space = desiredvnodes - rnumvnodes; 1476 if (space < limit) { 1477 rfreevnodes = vnlru_read_freevnodes(); 1478 if (rfreevnodes > wantfreevnodes) 1479 space += rfreevnodes - wantfreevnodes; 1480 } 1481 return (space < limit); 1482 } 1483 1484 static bool 1485 vnlru_under_unlocked(u_long rnumvnodes, u_long limit) 1486 { 1487 long rfreevnodes, space; 1488 1489 if (__predict_false(rnumvnodes > desiredvnodes)) 1490 return (true); 1491 1492 space = desiredvnodes - rnumvnodes; 1493 if (space < limit) { 1494 rfreevnodes = atomic_load_long(&freevnodes); 1495 if (rfreevnodes > wantfreevnodes) 1496 space += rfreevnodes - wantfreevnodes; 1497 } 1498 return (space < limit); 1499 } 1500 1501 static void 1502 vnlru_kick(void) 1503 { 1504 1505 mtx_assert(&vnode_list_mtx, MA_OWNED); 1506 if (vnlruproc_sig == 0) { 1507 vnlruproc_sig = 1; 1508 wakeup(vnlruproc); 1509 } 1510 } 1511 1512 static void 1513 vnlru_proc(void) 1514 { 1515 u_long rnumvnodes, rfreevnodes, target; 1516 unsigned long onumvnodes; 1517 int done, force, trigger, usevnodes; 1518 bool reclaim_nc_src, want_reread; 1519 1520 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, 1521 SHUTDOWN_PRI_FIRST); 1522 1523 force = 0; 1524 want_reread = false; 1525 for (;;) { 1526 kproc_suspend_check(vnlruproc); 1527 mtx_lock(&vnode_list_mtx); 1528 rnumvnodes = atomic_load_long(&numvnodes); 1529 1530 if (want_reread) { 1531 force = vnlru_under(numvnodes, vhiwat) ? 1 : 0; 1532 want_reread = false; 1533 } 1534 1535 /* 1536 * If numvnodes is too large (due to desiredvnodes being 1537 * adjusted using its sysctl, or emergency growth), first 1538 * try to reduce it by discarding from the free list. 1539 */ 1540 if (rnumvnodes > desiredvnodes) { 1541 vnlru_free_locked(rnumvnodes - desiredvnodes); 1542 rnumvnodes = atomic_load_long(&numvnodes); 1543 } 1544 /* 1545 * Sleep if the vnode cache is in a good state. This is 1546 * when it is not over-full and has space for about a 4% 1547 * or 9% expansion (by growing its size or inexcessively 1548 * reducing its free list). Otherwise, try to reclaim 1549 * space for a 10% expansion. 
1550 */ 1551 if (vstir && force == 0) { 1552 force = 1; 1553 vstir = 0; 1554 } 1555 if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { 1556 vnlruproc_sig = 0; 1557 wakeup(&vnlruproc_sig); 1558 msleep(vnlruproc, &vnode_list_mtx, 1559 PVFS|PDROP, "vlruwt", hz); 1560 continue; 1561 } 1562 rfreevnodes = vnlru_read_freevnodes(); 1563 1564 onumvnodes = rnumvnodes; 1565 /* 1566 * Calculate parameters for recycling. These are the same 1567 * throughout the loop to give some semblance of fairness. 1568 * The trigger point is to avoid recycling vnodes with lots 1569 * of resident pages. We aren't trying to free memory; we 1570 * are trying to recycle or at least free vnodes. 1571 */ 1572 if (rnumvnodes <= desiredvnodes) 1573 usevnodes = rnumvnodes - rfreevnodes; 1574 else 1575 usevnodes = rnumvnodes; 1576 if (usevnodes <= 0) 1577 usevnodes = 1; 1578 /* 1579 * The trigger value is chosen to give a conservatively 1580 * large value to ensure that it alone doesn't prevent 1581 * making progress. The value can easily be so large that 1582 * it is effectively infinite in some congested and 1583 * misconfigured cases, and this is necessary. Normally 1584 * it is about 8 to 100 (pages), which is quite large. 1585 */ 1586 trigger = vm_cnt.v_page_count * 2 / usevnodes; 1587 if (force < 2) 1588 trigger = vsmalltrigger; 1589 reclaim_nc_src = force >= 3; 1590 target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); 1591 target = target / 10 + 1; 1592 done = vlrureclaim(reclaim_nc_src, trigger, target); 1593 mtx_unlock(&vnode_list_mtx); 1594 if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) 1595 uma_reclaim(UMA_RECLAIM_DRAIN); 1596 if (done == 0) { 1597 if (force == 0 || force == 1) { 1598 force = 2; 1599 continue; 1600 } 1601 if (force == 2) { 1602 force = 3; 1603 continue; 1604 } 1605 want_reread = true; 1606 force = 0; 1607 vnlru_nowhere++; 1608 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); 1609 } else { 1610 want_reread = true; 1611 kern_yield(PRI_USER); 1612 } 1613 } 1614 } 1615 1616 static struct kproc_desc vnlru_kp = { 1617 "vnlru", 1618 vnlru_proc, 1619 &vnlruproc 1620 }; 1621 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, 1622 &vnlru_kp); 1623 1624 /* 1625 * Routines having to do with the management of the vnode table. 1626 */ 1627 1628 /* 1629 * Try to recycle a freed vnode. We abort if anyone picks up a reference 1630 * before we actually vgone(). This function must be called with the vnode 1631 * held to prevent the vnode from being returned to the free list midway 1632 * through vgone(). 1633 */ 1634 static int 1635 vtryrecycle(struct vnode *vp) 1636 { 1637 struct mount *vnmp; 1638 1639 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 1640 VNASSERT(vp->v_holdcnt, vp, 1641 ("vtryrecycle: Recycling vp %p without a reference.", vp)); 1642 /* 1643 * This vnode may found and locked via some other list, if so we 1644 * can't recycle it yet. 1645 */ 1646 if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 1647 CTR2(KTR_VFS, 1648 "%s: impossible to recycle, vp %p lock is already held", 1649 __func__, vp); 1650 vdrop_recycle(vp); 1651 return (EWOULDBLOCK); 1652 } 1653 /* 1654 * Don't recycle if its filesystem is being suspended. 
1655 */ 1656 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { 1657 VOP_UNLOCK(vp); 1658 CTR2(KTR_VFS, 1659 "%s: impossible to recycle, cannot start the write for %p", 1660 __func__, vp); 1661 vdrop_recycle(vp); 1662 return (EBUSY); 1663 } 1664 /* 1665 * If we got this far, we need to acquire the interlock and see if 1666 * anyone picked up this vnode from another list. If not, we will 1667 * mark it with DOOMED via vgonel() so that anyone who does find it 1668 * will skip over it. 1669 */ 1670 VI_LOCK(vp); 1671 if (vp->v_usecount) { 1672 VOP_UNLOCK(vp); 1673 vdropl_recycle(vp); 1674 vn_finished_write(vnmp); 1675 CTR2(KTR_VFS, 1676 "%s: impossible to recycle, %p is already referenced", 1677 __func__, vp); 1678 return (EBUSY); 1679 } 1680 if (!VN_IS_DOOMED(vp)) { 1681 counter_u64_add(recycles_free_count, 1); 1682 vgonel(vp); 1683 } 1684 VOP_UNLOCK(vp); 1685 vdropl_recycle(vp); 1686 vn_finished_write(vnmp); 1687 return (0); 1688 } 1689 1690 /* 1691 * Allocate a new vnode. 1692 * 1693 * The operation never returns an error. Returning an error was disabled 1694 * in r145385 (dated 2005) with the following comment: 1695 * 1696 * XXX Not all VFS_VGET/ffs_vget callers check returns. 1697 * 1698 * Given the age of this commit (almost 15 years at the time of writing this 1699 * comment) restoring the ability to fail requires a significant audit of 1700 * all codepaths. 1701 * 1702 * The routine can try to free a vnode or stall for up to 1 second waiting for 1703 * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation. 1704 */ 1705 static u_long vn_alloc_cyclecount; 1706 1707 static struct vnode * __noinline 1708 vn_alloc_hard(struct mount *mp) 1709 { 1710 u_long rnumvnodes, rfreevnodes; 1711 1712 mtx_lock(&vnode_list_mtx); 1713 rnumvnodes = atomic_load_long(&numvnodes); 1714 if (rnumvnodes + 1 < desiredvnodes) { 1715 vn_alloc_cyclecount = 0; 1716 goto alloc; 1717 } 1718 rfreevnodes = vnlru_read_freevnodes(); 1719 if (vn_alloc_cyclecount++ >= rfreevnodes) { 1720 vn_alloc_cyclecount = 0; 1721 vstir = 1; 1722 } 1723 /* 1724 * Grow the vnode cache if it will not be above its target max 1725 * after growing. Otherwise, if the free list is nonempty, try 1726 * to reclaim 1 item from it before growing the cache (possibly 1727 * above its target max if the reclamation failed or is delayed). 1728 * Otherwise, wait for some space. In all cases, schedule 1729 * vnlru_proc() if we are getting short of space. The watermarks 1730 * should be chosen so that we never wait or even reclaim from 1731 * the free list to below its target minimum. 1732 */ 1733 if (vnlru_free_locked(1) > 0) 1734 goto alloc; 1735 if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { 1736 /* 1737 * Wait for space for a new vnode. 
1738 */ 1739 vnlru_kick(); 1740 msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); 1741 if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && 1742 vnlru_read_freevnodes() > 1) 1743 vnlru_free_locked(1); 1744 } 1745 alloc: 1746 rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; 1747 if (vnlru_under(rnumvnodes, vlowat)) 1748 vnlru_kick(); 1749 mtx_unlock(&vnode_list_mtx); 1750 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 1751 } 1752 1753 static struct vnode * 1754 vn_alloc(struct mount *mp) 1755 { 1756 u_long rnumvnodes; 1757 1758 if (__predict_false(vn_alloc_cyclecount != 0)) 1759 return (vn_alloc_hard(mp)); 1760 rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; 1761 if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) { 1762 atomic_subtract_long(&numvnodes, 1); 1763 return (vn_alloc_hard(mp)); 1764 } 1765 1766 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 1767 } 1768 1769 static void 1770 vn_free(struct vnode *vp) 1771 { 1772 1773 atomic_subtract_long(&numvnodes, 1); 1774 uma_zfree_smr(vnode_zone, vp); 1775 } 1776 1777 /* 1778 * Return the next vnode from the free list. 1779 */ 1780 int 1781 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, 1782 struct vnode **vpp) 1783 { 1784 struct vnode *vp; 1785 struct thread *td; 1786 struct lock_object *lo; 1787 1788 CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); 1789 1790 KASSERT(vops->registered, 1791 ("%s: not registered vector op %p\n", __func__, vops)); 1792 1793 td = curthread; 1794 if (td->td_vp_reserved != NULL) { 1795 vp = td->td_vp_reserved; 1796 td->td_vp_reserved = NULL; 1797 } else { 1798 vp = vn_alloc(mp); 1799 } 1800 counter_u64_add(vnodes_created, 1); 1801 1802 vn_set_state(vp, VSTATE_UNINITIALIZED); 1803 1804 /* 1805 * Locks are given the generic name "vnode" when created. 1806 * Follow the historic practice of using the filesystem 1807 * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. 1808 * 1809 * Locks live in a witness group keyed on their name. Thus, 1810 * when a lock is renamed, it must also move from the witness 1811 * group of its old name to the witness group of its new name. 1812 * 1813 * The change only needs to be made when the vnode moves 1814 * from one filesystem type to another. We ensure that each 1815 * filesystem use a single static name pointer for its tag so 1816 * that we can compare pointers rather than doing a strcmp(). 1817 */ 1818 lo = &vp->v_vnlock->lock_object; 1819 #ifdef WITNESS 1820 if (lo->lo_name != tag) { 1821 #endif 1822 lo->lo_name = tag; 1823 #ifdef WITNESS 1824 WITNESS_DESTROY(lo); 1825 WITNESS_INIT(lo, tag); 1826 } 1827 #endif 1828 /* 1829 * By default, don't allow shared locks unless filesystems opt-in. 1830 */ 1831 vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; 1832 /* 1833 * Finalize various vnode identity bits. 
1834 */ 1835 KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); 1836 KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); 1837 KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); 1838 vp->v_type = VNON; 1839 vp->v_op = vops; 1840 vp->v_irflag = 0; 1841 v_init_counters(vp); 1842 vn_seqc_init(vp); 1843 vp->v_bufobj.bo_ops = &buf_ops_bio; 1844 #ifdef DIAGNOSTIC 1845 if (mp == NULL && vops != &dead_vnodeops) 1846 printf("NULL mp in getnewvnode(9), tag %s\n", tag); 1847 #endif 1848 #ifdef MAC 1849 mac_vnode_init(vp); 1850 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) 1851 mac_vnode_associate_singlelabel(mp, vp); 1852 #endif 1853 if (mp != NULL) { 1854 vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; 1855 } 1856 1857 /* 1858 * For the filesystems which do not use vfs_hash_insert(), 1859 * still initialize v_hash to have vfs_hash_index() useful. 1860 * E.g., nullfs uses vfs_hash_index() on the lower vnode for 1861 * its own hashing. 1862 */ 1863 vp->v_hash = (uintptr_t)vp >> vnsz2log; 1864 1865 *vpp = vp; 1866 return (0); 1867 } 1868 1869 void 1870 getnewvnode_reserve(void) 1871 { 1872 struct thread *td; 1873 1874 td = curthread; 1875 MPASS(td->td_vp_reserved == NULL); 1876 td->td_vp_reserved = vn_alloc(NULL); 1877 } 1878 1879 void 1880 getnewvnode_drop_reserve(void) 1881 { 1882 struct thread *td; 1883 1884 td = curthread; 1885 if (td->td_vp_reserved != NULL) { 1886 vn_free(td->td_vp_reserved); 1887 td->td_vp_reserved = NULL; 1888 } 1889 } 1890 1891 static void __noinline 1892 freevnode(struct vnode *vp) 1893 { 1894 struct bufobj *bo; 1895 1896 /* 1897 * The vnode has been marked for destruction, so free it. 1898 * 1899 * The vnode will be returned to the zone where it will 1900 * normally remain until it is needed for another vnode. We 1901 * need to cleanup (or verify that the cleanup has already 1902 * been done) any residual data left from its current use 1903 * so as not to contaminate the freshly allocated vnode. 1904 */ 1905 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); 1906 /* 1907 * Paired with vgone. 1908 */ 1909 vn_seqc_write_end_free(vp); 1910 1911 bo = &vp->v_bufobj; 1912 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); 1913 VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); 1914 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); 1915 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); 1916 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); 1917 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); 1918 VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, 1919 ("clean blk trie not empty")); 1920 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); 1921 VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, 1922 ("dirty blk trie not empty")); 1923 VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, 1924 ("Dangling rangelock waiters")); 1925 VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp, 1926 ("Leaked inactivation")); 1927 VI_UNLOCK(vp); 1928 cache_assert_no_entries(vp); 1929 1930 #ifdef MAC 1931 mac_vnode_destroy(vp); 1932 #endif 1933 if (vp->v_pollinfo != NULL) { 1934 /* 1935 * Use LK_NOWAIT to shut up witness about the lock. We may get 1936 * here while having another vnode locked when trying to 1937 * satisfy a lookup and needing to recycle. 
1938 */ 1939 VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT); 1940 destroy_vpollinfo(vp->v_pollinfo); 1941 VOP_UNLOCK(vp); 1942 vp->v_pollinfo = NULL; 1943 } 1944 vp->v_mountedhere = NULL; 1945 vp->v_unpcb = NULL; 1946 vp->v_rdev = NULL; 1947 vp->v_fifoinfo = NULL; 1948 vp->v_iflag = 0; 1949 vp->v_vflag = 0; 1950 bo->bo_flag = 0; 1951 vn_free(vp); 1952 } 1953 1954 /* 1955 * Delete from old mount point vnode list, if on one. 1956 */ 1957 static void 1958 delmntque(struct vnode *vp) 1959 { 1960 struct mount *mp; 1961 1962 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 1963 1964 mp = vp->v_mount; 1965 MNT_ILOCK(mp); 1966 VI_LOCK(vp); 1967 vp->v_mount = NULL; 1968 VNASSERT(mp->mnt_nvnodelistsize > 0, vp, 1969 ("bad mount point vnode list size")); 1970 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1971 mp->mnt_nvnodelistsize--; 1972 MNT_REL(mp); 1973 MNT_IUNLOCK(mp); 1974 /* 1975 * The caller expects the interlock to be still held. 1976 */ 1977 ASSERT_VI_LOCKED(vp, __func__); 1978 } 1979 1980 static int 1981 insmntque1_int(struct vnode *vp, struct mount *mp, bool dtr) 1982 { 1983 1984 KASSERT(vp->v_mount == NULL, 1985 ("insmntque: vnode already on per mount vnode list")); 1986 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); 1987 if ((mp->mnt_kern_flag & MNTK_UNLOCKED_INSMNTQUE) == 0) { 1988 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); 1989 } else { 1990 KASSERT(!dtr, 1991 ("%s: can't have MNTK_UNLOCKED_INSMNTQUE and cleanup", 1992 __func__)); 1993 } 1994 1995 /* 1996 * We acquire the vnode interlock early to ensure that the 1997 * vnode cannot be recycled by another process releasing a 1998 * holdcnt on it before we get it on both the vnode list 1999 * and the active vnode list. The mount mutex protects only 2000 * manipulation of the vnode list and the vnode freelist 2001 * mutex protects only manipulation of the active vnode list. 2002 * Hence the need to hold the vnode interlock throughout. 2003 */ 2004 MNT_ILOCK(mp); 2005 VI_LOCK(vp); 2006 if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && 2007 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || 2008 mp->mnt_nvnodelistsize == 0)) && 2009 (vp->v_vflag & VV_FORCEINSMQ) == 0) { 2010 VI_UNLOCK(vp); 2011 MNT_IUNLOCK(mp); 2012 if (dtr) { 2013 vp->v_data = NULL; 2014 vp->v_op = &dead_vnodeops; 2015 vgone(vp); 2016 vput(vp); 2017 } 2018 return (EBUSY); 2019 } 2020 vp->v_mount = mp; 2021 MNT_REF(mp); 2022 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2023 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, 2024 ("neg mount point vnode list size")); 2025 mp->mnt_nvnodelistsize++; 2026 VI_UNLOCK(vp); 2027 MNT_IUNLOCK(mp); 2028 return (0); 2029 } 2030 2031 /* 2032 * Insert into list of vnodes for the new mount point, if available. 2033 * insmntque() reclaims the vnode on insertion failure, insmntque1() 2034 * leaves handling of the vnode to the caller. 2035 */ 2036 int 2037 insmntque(struct vnode *vp, struct mount *mp) 2038 { 2039 return (insmntque1_int(vp, mp, true)); 2040 } 2041 2042 int 2043 insmntque1(struct vnode *vp, struct mount *mp) 2044 { 2045 return (insmntque1_int(vp, mp, false)); 2046 } 2047 2048 /* 2049 * Flush out and invalidate all buffers associated with a bufobj 2050 * Called with the underlying object locked. 
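 * If V_SAVE is specified, dirty buffers are synced out first (retrying
 * the sync on ERELOOKUP); the call fails with EBUSY if new dirty buffers
 * or writes show up while doing so.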
2051 */ 2052 int 2053 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) 2054 { 2055 int error; 2056 2057 BO_LOCK(bo); 2058 if (flags & V_SAVE) { 2059 error = bufobj_wwait(bo, slpflag, slptimeo); 2060 if (error) { 2061 BO_UNLOCK(bo); 2062 return (error); 2063 } 2064 if (bo->bo_dirty.bv_cnt > 0) { 2065 BO_UNLOCK(bo); 2066 do { 2067 error = BO_SYNC(bo, MNT_WAIT); 2068 } while (error == ERELOOKUP); 2069 if (error != 0) 2070 return (error); 2071 BO_LOCK(bo); 2072 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { 2073 BO_UNLOCK(bo); 2074 return (EBUSY); 2075 } 2076 } 2077 } 2078 /* 2079 * If you alter this loop please notice that interlock is dropped and 2080 * reacquired in flushbuflist. Special care is needed to ensure that 2081 * no race conditions occur from this. 2082 */ 2083 do { 2084 error = flushbuflist(&bo->bo_clean, 2085 flags, bo, slpflag, slptimeo); 2086 if (error == 0 && !(flags & V_CLEANONLY)) 2087 error = flushbuflist(&bo->bo_dirty, 2088 flags, bo, slpflag, slptimeo); 2089 if (error != 0 && error != EAGAIN) { 2090 BO_UNLOCK(bo); 2091 return (error); 2092 } 2093 } while (error != 0); 2094 2095 /* 2096 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 2097 * have write I/O in-progress but if there is a VM object then the 2098 * VM object can also have read-I/O in-progress. 2099 */ 2100 do { 2101 bufobj_wwait(bo, 0, 0); 2102 if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { 2103 BO_UNLOCK(bo); 2104 vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); 2105 BO_LOCK(bo); 2106 } 2107 } while (bo->bo_numoutput > 0); 2108 BO_UNLOCK(bo); 2109 2110 /* 2111 * Destroy the copy in the VM cache, too. 2112 */ 2113 if (bo->bo_object != NULL && 2114 (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { 2115 VM_OBJECT_WLOCK(bo->bo_object); 2116 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? 2117 OBJPR_CLEANONLY : 0); 2118 VM_OBJECT_WUNLOCK(bo->bo_object); 2119 } 2120 2121 #ifdef INVARIANTS 2122 BO_LOCK(bo); 2123 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | 2124 V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || 2125 bo->bo_clean.bv_cnt > 0)) 2126 panic("vinvalbuf: flush failed"); 2127 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && 2128 bo->bo_dirty.bv_cnt > 0) 2129 panic("vinvalbuf: flush dirty failed"); 2130 BO_UNLOCK(bo); 2131 #endif 2132 return (0); 2133 } 2134 2135 /* 2136 * Flush out and invalidate all buffers associated with a vnode. 2137 * Called with the underlying object locked. 2138 */ 2139 int 2140 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 2141 { 2142 2143 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 2144 ASSERT_VOP_LOCKED(vp, "vinvalbuf"); 2145 if (vp->v_object != NULL && vp->v_object->handle != vp) 2146 return (0); 2147 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); 2148 } 2149 2150 /* 2151 * Flush out buffers on the specified list. 2152 * 2153 */ 2154 static int 2155 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, 2156 int slptimeo) 2157 { 2158 struct buf *bp, *nbp; 2159 int retval, error; 2160 daddr_t lblkno; 2161 b_xflags_t xflags; 2162 2163 ASSERT_BO_WLOCKED(bo); 2164 2165 retval = 0; 2166 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { 2167 /* 2168 * If we are flushing both V_NORMAL and V_ALT buffers then 2169 * do not skip any buffers. If we are flushing only V_NORMAL 2170 * buffers then skip buffers marked as BX_ALTDATA. 
If we are 2171 * flushing only V_ALT buffers then skip buffers not marked 2172 * as BX_ALTDATA. 2173 */ 2174 if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && 2175 (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || 2176 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { 2177 continue; 2178 } 2179 if (nbp != NULL) { 2180 lblkno = nbp->b_lblkno; 2181 xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); 2182 } 2183 retval = EAGAIN; 2184 error = BUF_TIMELOCK(bp, 2185 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), 2186 "flushbuf", slpflag, slptimeo); 2187 if (error) { 2188 BO_LOCK(bo); 2189 return (error != ENOLCK ? error : EAGAIN); 2190 } 2191 KASSERT(bp->b_bufobj == bo, 2192 ("bp %p wrong b_bufobj %p should be %p", 2193 bp, bp->b_bufobj, bo)); 2194 /* 2195 * XXX Since there are no node locks for NFS, I 2196 * believe there is a slight chance that a delayed 2197 * write will occur while sleeping just above, so 2198 * check for it. 2199 */ 2200 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 2201 (flags & V_SAVE)) { 2202 bremfree(bp); 2203 bp->b_flags |= B_ASYNC; 2204 bwrite(bp); 2205 BO_LOCK(bo); 2206 return (EAGAIN); /* XXX: why not loop ? */ 2207 } 2208 bremfree(bp); 2209 bp->b_flags |= (B_INVAL | B_RELBUF); 2210 bp->b_flags &= ~B_ASYNC; 2211 brelse(bp); 2212 BO_LOCK(bo); 2213 if (nbp == NULL) 2214 break; 2215 nbp = gbincore(bo, lblkno); 2216 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2217 != xflags) 2218 break; /* nbp invalid */ 2219 } 2220 return (retval); 2221 } 2222 2223 int 2224 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) 2225 { 2226 struct buf *bp; 2227 int error; 2228 daddr_t lblkno; 2229 2230 ASSERT_BO_LOCKED(bo); 2231 2232 for (lblkno = startn;;) { 2233 again: 2234 bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); 2235 if (bp == NULL || bp->b_lblkno >= endn || 2236 bp->b_lblkno < startn) 2237 break; 2238 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 2239 LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); 2240 if (error != 0) { 2241 BO_RLOCK(bo); 2242 if (error == ENOLCK) 2243 goto again; 2244 return (error); 2245 } 2246 KASSERT(bp->b_bufobj == bo, 2247 ("bp %p wrong b_bufobj %p should be %p", 2248 bp, bp->b_bufobj, bo)); 2249 lblkno = bp->b_lblkno + 1; 2250 if ((bp->b_flags & B_MANAGED) == 0) 2251 bremfree(bp); 2252 bp->b_flags |= B_RELBUF; 2253 /* 2254 * In the VMIO case, use the B_NOREUSE flag to hint that the 2255 * pages backing each buffer in the range are unlikely to be 2256 * reused. Dirty buffers will have the hint applied once 2257 * they've been written. 2258 */ 2259 if ((bp->b_flags & B_VMIO) != 0) 2260 bp->b_flags |= B_NOREUSE; 2261 brelse(bp); 2262 BO_RLOCK(bo); 2263 } 2264 return (0); 2265 } 2266 2267 /* 2268 * Truncate a file's buffer and pages to a specified length. This 2269 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 2270 * sync activity. 2271 */ 2272 int 2273 vtruncbuf(struct vnode *vp, off_t length, int blksize) 2274 { 2275 struct buf *bp, *nbp; 2276 struct bufobj *bo; 2277 daddr_t startlbn; 2278 2279 CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, 2280 vp, blksize, (uintmax_t)length); 2281 2282 /* 2283 * Round up to the *next* lbn. 
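 * For example, with a blksize of 4096 any length from 1 to 4096 yields
 * startlbn 1 (block 0 still holds valid data), while a length of 0
 * yields startlbn 0 and makes every buffer eligible for invalidation.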
2284 */ 2285 startlbn = howmany(length, blksize); 2286 2287 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 2288 2289 bo = &vp->v_bufobj; 2290 restart_unlocked: 2291 BO_LOCK(bo); 2292 2293 while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) 2294 ; 2295 2296 if (length > 0) { 2297 restartsync: 2298 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2299 if (bp->b_lblkno > 0) 2300 continue; 2301 /* 2302 * Since we hold the vnode lock this should only 2303 * fail if we're racing with the buf daemon. 2304 */ 2305 if (BUF_LOCK(bp, 2306 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2307 BO_LOCKPTR(bo)) == ENOLCK) 2308 goto restart_unlocked; 2309 2310 VNASSERT((bp->b_flags & B_DELWRI), vp, 2311 ("buf(%p) on dirty queue without DELWRI", bp)); 2312 2313 bremfree(bp); 2314 bawrite(bp); 2315 BO_LOCK(bo); 2316 goto restartsync; 2317 } 2318 } 2319 2320 bufobj_wwait(bo, 0, 0); 2321 BO_UNLOCK(bo); 2322 vnode_pager_setsize(vp, length); 2323 2324 return (0); 2325 } 2326 2327 /* 2328 * Invalidate the cached pages of a file's buffer within the range of block 2329 * numbers [startlbn, endlbn). 2330 */ 2331 void 2332 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, 2333 int blksize) 2334 { 2335 struct bufobj *bo; 2336 off_t start, end; 2337 2338 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); 2339 2340 start = blksize * startlbn; 2341 end = blksize * endlbn; 2342 2343 bo = &vp->v_bufobj; 2344 BO_LOCK(bo); 2345 MPASS(blksize == bo->bo_bsize); 2346 2347 while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) 2348 ; 2349 2350 BO_UNLOCK(bo); 2351 vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); 2352 } 2353 2354 static int 2355 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 2356 daddr_t startlbn, daddr_t endlbn) 2357 { 2358 struct buf *bp, *nbp; 2359 bool anyfreed; 2360 2361 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); 2362 ASSERT_BO_LOCKED(bo); 2363 2364 do { 2365 anyfreed = false; 2366 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { 2367 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2368 continue; 2369 if (BUF_LOCK(bp, 2370 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2371 BO_LOCKPTR(bo)) == ENOLCK) { 2372 BO_LOCK(bo); 2373 return (EAGAIN); 2374 } 2375 2376 bremfree(bp); 2377 bp->b_flags |= B_INVAL | B_RELBUF; 2378 bp->b_flags &= ~B_ASYNC; 2379 brelse(bp); 2380 anyfreed = true; 2381 2382 BO_LOCK(bo); 2383 if (nbp != NULL && 2384 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 2385 nbp->b_vp != vp || 2386 (nbp->b_flags & B_DELWRI) != 0)) 2387 return (EAGAIN); 2388 } 2389 2390 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2391 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2392 continue; 2393 if (BUF_LOCK(bp, 2394 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2395 BO_LOCKPTR(bo)) == ENOLCK) { 2396 BO_LOCK(bo); 2397 return (EAGAIN); 2398 } 2399 bremfree(bp); 2400 bp->b_flags |= B_INVAL | B_RELBUF; 2401 bp->b_flags &= ~B_ASYNC; 2402 brelse(bp); 2403 anyfreed = true; 2404 2405 BO_LOCK(bo); 2406 if (nbp != NULL && 2407 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 2408 (nbp->b_vp != vp) || 2409 (nbp->b_flags & B_DELWRI) == 0)) 2410 return (EAGAIN); 2411 } 2412 } while (anyfreed); 2413 return (0); 2414 } 2415 2416 static void 2417 buf_vlist_remove(struct buf *bp) 2418 { 2419 struct bufv *bv; 2420 b_xflags_t flags; 2421 2422 flags = bp->b_xflags; 2423 2424 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 2425 ASSERT_BO_WLOCKED(bp->b_bufobj); 2426 KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && 
2427 (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), 2428 ("%s: buffer %p has invalid queue state", __func__, bp)); 2429 2430 if ((flags & BX_VNDIRTY) != 0) 2431 bv = &bp->b_bufobj->bo_dirty; 2432 else 2433 bv = &bp->b_bufobj->bo_clean; 2434 BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); 2435 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); 2436 bv->bv_cnt--; 2437 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 2438 } 2439 2440 /* 2441 * Add the buffer to the sorted clean or dirty block list. 2442 * 2443 * NOTE: xflags is passed as a constant, optimizing this inline function! 2444 */ 2445 static void 2446 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 2447 { 2448 struct bufv *bv; 2449 struct buf *n; 2450 int error; 2451 2452 ASSERT_BO_WLOCKED(bo); 2453 KASSERT((bo->bo_flag & BO_NOBUFS) == 0, 2454 ("buf_vlist_add: bo %p does not allow bufs", bo)); 2455 KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, 2456 ("dead bo %p", bo)); 2457 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, 2458 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); 2459 bp->b_xflags |= xflags; 2460 if (xflags & BX_VNDIRTY) 2461 bv = &bo->bo_dirty; 2462 else 2463 bv = &bo->bo_clean; 2464 2465 /* 2466 * Keep the list ordered. Optimize empty list insertion. Assume 2467 * we tend to grow at the tail so lookup_le should usually be cheaper 2468 * than _ge. 2469 */ 2470 if (bv->bv_cnt == 0 || 2471 bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) 2472 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); 2473 else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) 2474 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); 2475 else 2476 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); 2477 error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); 2478 if (error) 2479 panic("buf_vlist_add: Preallocated nodes insufficient."); 2480 bv->bv_cnt++; 2481 } 2482 2483 /* 2484 * Look up a buffer using the buffer tries. 2485 */ 2486 struct buf * 2487 gbincore(struct bufobj *bo, daddr_t lblkno) 2488 { 2489 struct buf *bp; 2490 2491 ASSERT_BO_LOCKED(bo); 2492 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); 2493 if (bp != NULL) 2494 return (bp); 2495 return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); 2496 } 2497 2498 /* 2499 * Look up a buf using the buffer tries, without the bufobj lock. This relies 2500 * on SMR for safe lookup, and bufs being in a no-free zone to provide type 2501 * stability of the result. Like other lockless lookups, the found buf may 2502 * already be invalid by the time this function returns. 2503 */ 2504 struct buf * 2505 gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) 2506 { 2507 struct buf *bp; 2508 2509 ASSERT_BO_UNLOCKED(bo); 2510 bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); 2511 if (bp != NULL) 2512 return (bp); 2513 return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); 2514 } 2515 2516 /* 2517 * Associate a buffer with a vnode. 2518 */ 2519 void 2520 bgetvp(struct vnode *vp, struct buf *bp) 2521 { 2522 struct bufobj *bo; 2523 2524 bo = &vp->v_bufobj; 2525 ASSERT_BO_WLOCKED(bo); 2526 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 2527 2528 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 2529 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 2530 ("bgetvp: bp already attached! %p", bp)); 2531 2532 vhold(vp); 2533 bp->b_vp = vp; 2534 bp->b_bufobj = bo; 2535 /* 2536 * Insert onto list for new vnode. 
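 * A freshly associated buffer always starts out on the clean list;
 * reassignbuf() moves it to the dirty list once it picks up
 * delayed-write data.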
2537 */ 2538 buf_vlist_add(bp, bo, BX_VNCLEAN); 2539 } 2540 2541 /* 2542 * Disassociate a buffer from a vnode. 2543 */ 2544 void 2545 brelvp(struct buf *bp) 2546 { 2547 struct bufobj *bo; 2548 struct vnode *vp; 2549 2550 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 2551 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 2552 2553 /* 2554 * Delete from old vnode list, if on one. 2555 */ 2556 vp = bp->b_vp; /* XXX */ 2557 bo = bp->b_bufobj; 2558 BO_LOCK(bo); 2559 buf_vlist_remove(bp); 2560 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2561 bo->bo_flag &= ~BO_ONWORKLST; 2562 mtx_lock(&sync_mtx); 2563 LIST_REMOVE(bo, bo_synclist); 2564 syncer_worklist_len--; 2565 mtx_unlock(&sync_mtx); 2566 } 2567 bp->b_vp = NULL; 2568 bp->b_bufobj = NULL; 2569 BO_UNLOCK(bo); 2570 vdrop(vp); 2571 } 2572 2573 /* 2574 * Add an item to the syncer work queue. 2575 */ 2576 static void 2577 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 2578 { 2579 int slot; 2580 2581 ASSERT_BO_WLOCKED(bo); 2582 2583 mtx_lock(&sync_mtx); 2584 if (bo->bo_flag & BO_ONWORKLST) 2585 LIST_REMOVE(bo, bo_synclist); 2586 else { 2587 bo->bo_flag |= BO_ONWORKLST; 2588 syncer_worklist_len++; 2589 } 2590 2591 if (delay > syncer_maxdelay - 2) 2592 delay = syncer_maxdelay - 2; 2593 slot = (syncer_delayno + delay) & syncer_mask; 2594 2595 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); 2596 mtx_unlock(&sync_mtx); 2597 } 2598 2599 static int 2600 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 2601 { 2602 int error, len; 2603 2604 mtx_lock(&sync_mtx); 2605 len = syncer_worklist_len - sync_vnode_count; 2606 mtx_unlock(&sync_mtx); 2607 error = SYSCTL_OUT(req, &len, sizeof(len)); 2608 return (error); 2609 } 2610 2611 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, 2612 CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, 2613 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 2614 2615 static struct proc *updateproc; 2616 static void sched_sync(void); 2617 static struct kproc_desc up_kp = { 2618 "syncer", 2619 sched_sync, 2620 &updateproc 2621 }; 2622 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 2623 2624 static int 2625 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 2626 { 2627 struct vnode *vp; 2628 struct mount *mp; 2629 2630 *bo = LIST_FIRST(slp); 2631 if (*bo == NULL) 2632 return (0); 2633 vp = bo2vnode(*bo); 2634 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 2635 return (1); 2636 /* 2637 * We use vhold in case the vnode does not 2638 * successfully sync. vhold prevents the vnode from 2639 * going away when we unlock the sync_mtx so that 2640 * we can acquire the vnode interlock. 2641 */ 2642 vholdl(vp); 2643 mtx_unlock(&sync_mtx); 2644 VI_UNLOCK(vp); 2645 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2646 vdrop(vp); 2647 mtx_lock(&sync_mtx); 2648 return (*bo == LIST_FIRST(slp)); 2649 } 2650 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2651 (void) VOP_FSYNC(vp, MNT_LAZY, td); 2652 VOP_UNLOCK(vp); 2653 vn_finished_write(mp); 2654 BO_LOCK(*bo); 2655 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 2656 /* 2657 * Put us back on the worklist. The worklist 2658 * routine will remove us from our current 2659 * position and then add us back in at a later 2660 * position. 2661 */ 2662 vn_syncer_add_to_worklist(*bo, syncdelay); 2663 } 2664 BO_UNLOCK(*bo); 2665 vdrop(vp); 2666 mtx_lock(&sync_mtx); 2667 return (0); 2668 } 2669 2670 static int first_printf = 1; 2671 2672 /* 2673 * System filesystem synchronizer daemon. 
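 * Roughly once per second the syncer advances to the next slot of the
 * syncer_workitem_pending wheel and issues a lazy VOP_FSYNC() on every
 * bufobj queued there; entries that still have dirty buffers are put
 * back on the worklist. During shutdown it speeds up and makes several
 * final passes over the worklist.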
2674 */ 2675 static void 2676 sched_sync(void) 2677 { 2678 struct synclist *next, *slp; 2679 struct bufobj *bo; 2680 long starttime; 2681 struct thread *td = curthread; 2682 int last_work_seen; 2683 int net_worklist_len; 2684 int syncer_final_iter; 2685 int error; 2686 2687 last_work_seen = 0; 2688 syncer_final_iter = 0; 2689 syncer_state = SYNCER_RUNNING; 2690 starttime = time_uptime; 2691 td->td_pflags |= TDP_NORUNNINGBUF; 2692 2693 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, 2694 SHUTDOWN_PRI_LAST); 2695 2696 mtx_lock(&sync_mtx); 2697 for (;;) { 2698 if (syncer_state == SYNCER_FINAL_DELAY && 2699 syncer_final_iter == 0) { 2700 mtx_unlock(&sync_mtx); 2701 kproc_suspend_check(td->td_proc); 2702 mtx_lock(&sync_mtx); 2703 } 2704 net_worklist_len = syncer_worklist_len - sync_vnode_count; 2705 if (syncer_state != SYNCER_RUNNING && 2706 starttime != time_uptime) { 2707 if (first_printf) { 2708 printf("\nSyncing disks, vnodes remaining... "); 2709 first_printf = 0; 2710 } 2711 printf("%d ", net_worklist_len); 2712 } 2713 starttime = time_uptime; 2714 2715 /* 2716 * Push files whose dirty time has expired. Be careful 2717 * of interrupt race on slp queue. 2718 * 2719 * Skip over empty worklist slots when shutting down. 2720 */ 2721 do { 2722 slp = &syncer_workitem_pending[syncer_delayno]; 2723 syncer_delayno += 1; 2724 if (syncer_delayno == syncer_maxdelay) 2725 syncer_delayno = 0; 2726 next = &syncer_workitem_pending[syncer_delayno]; 2727 /* 2728 * If the worklist has wrapped since 2729 * it was emptied of all but syncer vnodes, 2730 * switch to the FINAL_DELAY state and run 2731 * for one more second. 2732 */ 2733 if (syncer_state == SYNCER_SHUTTING_DOWN && 2734 net_worklist_len == 0 && 2735 last_work_seen == syncer_delayno) { 2736 syncer_state = SYNCER_FINAL_DELAY; 2737 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; 2738 } 2739 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && 2740 syncer_worklist_len > 0); 2741 2742 /* 2743 * Keep track of the last time there was anything 2744 * on the worklist other than syncer vnodes. 2745 * Return to the SHUTTING_DOWN state if any 2746 * new work appears. 2747 */ 2748 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) 2749 last_work_seen = syncer_delayno; 2750 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) 2751 syncer_state = SYNCER_SHUTTING_DOWN; 2752 while (!LIST_EMPTY(slp)) { 2753 error = sync_vnode(slp, &bo, td); 2754 if (error == 1) { 2755 LIST_REMOVE(bo, bo_synclist); 2756 LIST_INSERT_HEAD(next, bo, bo_synclist); 2757 continue; 2758 } 2759 2760 if (first_printf == 0) { 2761 /* 2762 * Drop the sync mutex, because some watchdog 2763 * drivers need to sleep while patting the watchdog. 2764 */ 2765 mtx_unlock(&sync_mtx); 2766 wdog_kern_pat(WD_LASTVAL); 2767 mtx_lock(&sync_mtx); 2768 } 2769 } 2770 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) 2771 syncer_final_iter--; 2772 /* 2773 * The variable rushjob allows the kernel to speed up the 2774 * processing of the filesystem syncer process. A rushjob 2775 * value of N tells the filesystem syncer to process the next 2776 * N seconds worth of work on its queue ASAP. Currently rushjob 2777 * is used by the soft update code to speed up the filesystem 2778 * syncer process when the incore state is getting so far 2779 * ahead of the disk that the kernel memory pool is being 2780 * threatened with exhaustion.
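 * speedup_syncer() bumps rushjob, but never beyond syncdelay / 2, so
 * the syncer cannot monopolize the CPU.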
2781 */ 2782 if (rushjob > 0) { 2783 rushjob -= 1; 2784 continue; 2785 } 2786 /* 2787 * Just sleep for a short period of time between 2788 * iterations when shutting down to allow some I/O 2789 * to happen. 2790 * 2791 * If it has taken us less than a second to process the 2792 * current work, then wait. Otherwise start right over 2793 * again. We can still lose time if any single round 2794 * takes more than two seconds, but it does not really 2795 * matter as we are just trying to generally pace the 2796 * filesystem activity. 2797 */ 2798 if (syncer_state != SYNCER_RUNNING || 2799 time_uptime == starttime) { 2800 thread_lock(td); 2801 sched_prio(td, PPAUSE); 2802 thread_unlock(td); 2803 } 2804 if (syncer_state != SYNCER_RUNNING) 2805 cv_timedwait(&sync_wakeup, &sync_mtx, 2806 hz / SYNCER_SHUTDOWN_SPEEDUP); 2807 else if (time_uptime == starttime) 2808 cv_timedwait(&sync_wakeup, &sync_mtx, hz); 2809 } 2810 } 2811 2812 /* 2813 * Request the syncer daemon to speed up its work. 2814 * We never push it to speed up more than half of its 2815 * normal turn time, otherwise it could take over the cpu. 2816 */ 2817 int 2818 speedup_syncer(void) 2819 { 2820 int ret = 0; 2821 2822 mtx_lock(&sync_mtx); 2823 if (rushjob < syncdelay / 2) { 2824 rushjob += 1; 2825 stat_rush_requests += 1; 2826 ret = 1; 2827 } 2828 mtx_unlock(&sync_mtx); 2829 cv_broadcast(&sync_wakeup); 2830 return (ret); 2831 } 2832 2833 /* 2834 * Tell the syncer to speed up its work and run though its work 2835 * list several times, then tell it to shut down. 2836 */ 2837 static void 2838 syncer_shutdown(void *arg, int howto) 2839 { 2840 2841 if (howto & RB_NOSYNC) 2842 return; 2843 mtx_lock(&sync_mtx); 2844 syncer_state = SYNCER_SHUTTING_DOWN; 2845 rushjob = 0; 2846 mtx_unlock(&sync_mtx); 2847 cv_broadcast(&sync_wakeup); 2848 kproc_shutdown(arg, howto); 2849 } 2850 2851 void 2852 syncer_suspend(void) 2853 { 2854 2855 syncer_shutdown(updateproc, 0); 2856 } 2857 2858 void 2859 syncer_resume(void) 2860 { 2861 2862 mtx_lock(&sync_mtx); 2863 first_printf = 1; 2864 syncer_state = SYNCER_RUNNING; 2865 mtx_unlock(&sync_mtx); 2866 cv_broadcast(&sync_wakeup); 2867 kproc_resume(updateproc); 2868 } 2869 2870 /* 2871 * Move the buffer between the clean and dirty lists of its vnode. 2872 */ 2873 void 2874 reassignbuf(struct buf *bp) 2875 { 2876 struct vnode *vp; 2877 struct bufobj *bo; 2878 int delay; 2879 #ifdef INVARIANTS 2880 struct bufv *bv; 2881 #endif 2882 2883 vp = bp->b_vp; 2884 bo = bp->b_bufobj; 2885 2886 KASSERT((bp->b_flags & B_PAGING) == 0, 2887 ("%s: cannot reassign paging buffer %p", __func__, bp)); 2888 2889 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", 2890 bp, bp->b_vp, bp->b_flags); 2891 2892 BO_LOCK(bo); 2893 buf_vlist_remove(bp); 2894 2895 /* 2896 * If dirty, put on list of dirty buffers; otherwise insert onto list 2897 * of clean buffers. 
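 * A dirty buffer also puts the bufobj on the syncer worklist, with a
 * delay that depends on the vnode type (directories, device metadata
 * and regular files each use their own delay; see the switch below).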
2898 */ 2899 if (bp->b_flags & B_DELWRI) { 2900 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 2901 switch (vp->v_type) { 2902 case VDIR: 2903 delay = dirdelay; 2904 break; 2905 case VCHR: 2906 delay = metadelay; 2907 break; 2908 default: 2909 delay = filedelay; 2910 } 2911 vn_syncer_add_to_worklist(bo, delay); 2912 } 2913 buf_vlist_add(bp, bo, BX_VNDIRTY); 2914 } else { 2915 buf_vlist_add(bp, bo, BX_VNCLEAN); 2916 2917 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2918 mtx_lock(&sync_mtx); 2919 LIST_REMOVE(bo, bo_synclist); 2920 syncer_worklist_len--; 2921 mtx_unlock(&sync_mtx); 2922 bo->bo_flag &= ~BO_ONWORKLST; 2923 } 2924 } 2925 #ifdef INVARIANTS 2926 bv = &bo->bo_clean; 2927 bp = TAILQ_FIRST(&bv->bv_hd); 2928 KASSERT(bp == NULL || bp->b_bufobj == bo, 2929 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2930 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2931 KASSERT(bp == NULL || bp->b_bufobj == bo, 2932 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2933 bv = &bo->bo_dirty; 2934 bp = TAILQ_FIRST(&bv->bv_hd); 2935 KASSERT(bp == NULL || bp->b_bufobj == bo, 2936 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2937 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2938 KASSERT(bp == NULL || bp->b_bufobj == bo, 2939 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2940 #endif 2941 BO_UNLOCK(bo); 2942 } 2943 2944 static void 2945 v_init_counters(struct vnode *vp) 2946 { 2947 2948 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, 2949 vp, ("%s called for an initialized vnode", __FUNCTION__)); 2950 ASSERT_VI_UNLOCKED(vp, __FUNCTION__); 2951 2952 refcount_init(&vp->v_holdcnt, 1); 2953 refcount_init(&vp->v_usecount, 1); 2954 } 2955 2956 /* 2957 * Grab a particular vnode from the free list, increment its 2958 * reference count and lock it. VIRF_DOOMED is set if the vnode 2959 * is being destroyed. Only callers who specify LK_RETRY will 2960 * see doomed vnodes. If inactive processing was delayed in 2961 * vput, try to do it here. 2962 * 2963 * usecount is manipulated using atomics without holding any locks. 2964 * 2965 * holdcnt can be manipulated using atomics without holding any locks, 2966 * except when transitioning 1<->0, in which case the interlock is held. 2967 * 2968 * Consumers which don't guarantee liveness of the vnode can use SMR to 2969 * try to get a reference. Note this operation can fail since the vnode 2970 * may already be on its way to being freed by the time they get to it.
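 * An illustrative (not prescriptive) lockless consumer looks roughly
 * like the following, where <lockless lookup> stands in for whatever
 * SMR-safe data structure produced the vnode pointer and lkflags are
 * the desired vn_lock() flags:
 *
 *	vfs_smr_enter();
 *	vp = <lockless lookup>;
 *	vs = vget_prep_smr(vp);
 *	vfs_smr_exit();
 *	if (vs == VGET_NONE)
 *		<retry or fall back to a locked lookup>;
 *	else
 *		error = vget_finish(vp, lkflags, vs);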
2971 */ 2972 enum vgetstate 2973 vget_prep_smr(struct vnode *vp) 2974 { 2975 enum vgetstate vs; 2976 2977 VFS_SMR_ASSERT_ENTERED(); 2978 2979 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2980 vs = VGET_USECOUNT; 2981 } else { 2982 if (vhold_smr(vp)) 2983 vs = VGET_HOLDCNT; 2984 else 2985 vs = VGET_NONE; 2986 } 2987 return (vs); 2988 } 2989 2990 enum vgetstate 2991 vget_prep(struct vnode *vp) 2992 { 2993 enum vgetstate vs; 2994 2995 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2996 vs = VGET_USECOUNT; 2997 } else { 2998 vhold(vp); 2999 vs = VGET_HOLDCNT; 3000 } 3001 return (vs); 3002 } 3003 3004 void 3005 vget_abort(struct vnode *vp, enum vgetstate vs) 3006 { 3007 3008 switch (vs) { 3009 case VGET_USECOUNT: 3010 vrele(vp); 3011 break; 3012 case VGET_HOLDCNT: 3013 vdrop(vp); 3014 break; 3015 default: 3016 __assert_unreachable(); 3017 } 3018 } 3019 3020 int 3021 vget(struct vnode *vp, int flags) 3022 { 3023 enum vgetstate vs; 3024 3025 vs = vget_prep(vp); 3026 return (vget_finish(vp, flags, vs)); 3027 } 3028 3029 int 3030 vget_finish(struct vnode *vp, int flags, enum vgetstate vs) 3031 { 3032 int error; 3033 3034 if ((flags & LK_INTERLOCK) != 0) 3035 ASSERT_VI_LOCKED(vp, __func__); 3036 else 3037 ASSERT_VI_UNLOCKED(vp, __func__); 3038 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 3039 VNPASS(vp->v_holdcnt > 0, vp); 3040 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 3041 3042 error = vn_lock(vp, flags); 3043 if (__predict_false(error != 0)) { 3044 vget_abort(vp, vs); 3045 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, 3046 vp); 3047 return (error); 3048 } 3049 3050 vget_finish_ref(vp, vs); 3051 return (0); 3052 } 3053 3054 void 3055 vget_finish_ref(struct vnode *vp, enum vgetstate vs) 3056 { 3057 int old; 3058 3059 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 3060 VNPASS(vp->v_holdcnt > 0, vp); 3061 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 3062 3063 if (vs == VGET_USECOUNT) 3064 return; 3065 3066 /* 3067 * We hold the vnode. If the usecount is 0 it will be utilized to keep 3068 * the vnode around. Otherwise someone else lended their hold count and 3069 * we have to drop ours. 3070 */ 3071 old = atomic_fetchadd_int(&vp->v_usecount, 1); 3072 VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); 3073 if (old != 0) { 3074 #ifdef INVARIANTS 3075 old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 3076 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); 3077 #else 3078 refcount_release(&vp->v_holdcnt); 3079 #endif 3080 } 3081 } 3082 3083 void 3084 vref(struct vnode *vp) 3085 { 3086 enum vgetstate vs; 3087 3088 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3089 vs = vget_prep(vp); 3090 vget_finish_ref(vp, vs); 3091 } 3092 3093 void 3094 vrefact(struct vnode *vp) 3095 { 3096 3097 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3098 #ifdef INVARIANTS 3099 int old = atomic_fetchadd_int(&vp->v_usecount, 1); 3100 VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); 3101 #else 3102 refcount_acquire(&vp->v_usecount); 3103 #endif 3104 } 3105 3106 void 3107 vlazy(struct vnode *vp) 3108 { 3109 struct mount *mp; 3110 3111 VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); 3112 3113 if ((vp->v_mflag & VMP_LAZYLIST) != 0) 3114 return; 3115 /* 3116 * We may get here for inactive routines after the vnode got doomed. 
3117 */ 3118 if (VN_IS_DOOMED(vp)) 3119 return; 3120 mp = vp->v_mount; 3121 mtx_lock(&mp->mnt_listmtx); 3122 if ((vp->v_mflag & VMP_LAZYLIST) == 0) { 3123 vp->v_mflag |= VMP_LAZYLIST; 3124 TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3125 mp->mnt_lazyvnodelistsize++; 3126 } 3127 mtx_unlock(&mp->mnt_listmtx); 3128 } 3129 3130 static void 3131 vunlazy(struct vnode *vp) 3132 { 3133 struct mount *mp; 3134 3135 ASSERT_VI_LOCKED(vp, __func__); 3136 VNPASS(!VN_IS_DOOMED(vp), vp); 3137 3138 mp = vp->v_mount; 3139 mtx_lock(&mp->mnt_listmtx); 3140 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3141 /* 3142 * Don't remove the vnode from the lazy list if another thread 3143 * has increased the hold count. It may have re-enqueued the 3144 * vnode to the lazy list and is now responsible for its 3145 * removal. 3146 */ 3147 if (vp->v_holdcnt == 0) { 3148 vp->v_mflag &= ~VMP_LAZYLIST; 3149 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3150 mp->mnt_lazyvnodelistsize--; 3151 } 3152 mtx_unlock(&mp->mnt_listmtx); 3153 } 3154 3155 /* 3156 * This routine is only meant to be called from vgonel prior to dooming 3157 * the vnode. 3158 */ 3159 static void 3160 vunlazy_gone(struct vnode *vp) 3161 { 3162 struct mount *mp; 3163 3164 ASSERT_VOP_ELOCKED(vp, __func__); 3165 ASSERT_VI_LOCKED(vp, __func__); 3166 VNPASS(!VN_IS_DOOMED(vp), vp); 3167 3168 if (vp->v_mflag & VMP_LAZYLIST) { 3169 mp = vp->v_mount; 3170 mtx_lock(&mp->mnt_listmtx); 3171 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3172 vp->v_mflag &= ~VMP_LAZYLIST; 3173 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3174 mp->mnt_lazyvnodelistsize--; 3175 mtx_unlock(&mp->mnt_listmtx); 3176 } 3177 } 3178 3179 static void 3180 vdefer_inactive(struct vnode *vp) 3181 { 3182 3183 ASSERT_VI_LOCKED(vp, __func__); 3184 VNASSERT(vp->v_holdcnt > 0, vp, 3185 ("%s: vnode without hold count", __func__)); 3186 if (VN_IS_DOOMED(vp)) { 3187 vdropl(vp); 3188 return; 3189 } 3190 if (vp->v_iflag & VI_DEFINACT) { 3191 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); 3192 vdropl(vp); 3193 return; 3194 } 3195 if (vp->v_usecount > 0) { 3196 vp->v_iflag &= ~VI_OWEINACT; 3197 vdropl(vp); 3198 return; 3199 } 3200 vlazy(vp); 3201 vp->v_iflag |= VI_DEFINACT; 3202 VI_UNLOCK(vp); 3203 counter_u64_add(deferred_inact, 1); 3204 } 3205 3206 static void 3207 vdefer_inactive_unlocked(struct vnode *vp) 3208 { 3209 3210 VI_LOCK(vp); 3211 if ((vp->v_iflag & VI_OWEINACT) == 0) { 3212 vdropl(vp); 3213 return; 3214 } 3215 vdefer_inactive(vp); 3216 } 3217 3218 enum vput_op { VRELE, VPUT, VUNREF }; 3219 3220 /* 3221 * Handle ->v_usecount transitioning to 0. 3222 * 3223 * By releasing the last usecount we take ownership of the hold count which 3224 * provides liveness of the vnode, meaning we have to vdrop. 3225 * 3226 * For all vnodes we may need to perform inactive processing. It requires an 3227 * exclusive lock on the vnode, while it is legal to call here with only a 3228 * shared lock (or no locks). If locking the vnode in an expected manner fails, 3229 * inactive processing gets deferred to the syncer. 3230 * 3231 * XXX Some filesystems pass in an exclusively locked vnode and strongly depend 3232 * on the lock being held all the way until VOP_INACTIVE. This in particular 3233 * happens with UFS which adds half-constructed vnodes to the hash, where they 3234 * can be found by other code. 
3235 */ 3236 static void 3237 vput_final(struct vnode *vp, enum vput_op func) 3238 { 3239 int error; 3240 bool want_unlock; 3241 3242 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3243 VNPASS(vp->v_holdcnt > 0, vp); 3244 3245 VI_LOCK(vp); 3246 3247 /* 3248 * By the time we got here someone else might have transitioned 3249 * the count back to > 0. 3250 */ 3251 if (vp->v_usecount > 0) 3252 goto out; 3253 3254 /* 3255 * If the vnode is doomed vgone already performed inactive processing 3256 * (if needed). 3257 */ 3258 if (VN_IS_DOOMED(vp)) 3259 goto out; 3260 3261 if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) 3262 goto out; 3263 3264 if (vp->v_iflag & VI_DOINGINACT) 3265 goto out; 3266 3267 /* 3268 * Locking operations here will drop the interlock and possibly the 3269 * vnode lock, opening a window where the vnode can get doomed all the 3270 * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to 3271 * perform inactive. 3272 */ 3273 vp->v_iflag |= VI_OWEINACT; 3274 want_unlock = false; 3275 error = 0; 3276 switch (func) { 3277 case VRELE: 3278 switch (VOP_ISLOCKED(vp)) { 3279 case LK_EXCLUSIVE: 3280 break; 3281 case LK_EXCLOTHER: 3282 case 0: 3283 want_unlock = true; 3284 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 3285 VI_LOCK(vp); 3286 break; 3287 default: 3288 /* 3289 * The lock has at least one sharer, but we have no way 3290 * to conclude whether this is us. Play it safe and 3291 * defer processing. 3292 */ 3293 error = EAGAIN; 3294 break; 3295 } 3296 break; 3297 case VPUT: 3298 want_unlock = true; 3299 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3300 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | 3301 LK_NOWAIT); 3302 VI_LOCK(vp); 3303 } 3304 break; 3305 case VUNREF: 3306 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3307 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); 3308 VI_LOCK(vp); 3309 } 3310 break; 3311 } 3312 if (error == 0) { 3313 if (func == VUNREF) { 3314 VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp, 3315 ("recursive vunref")); 3316 vp->v_vflag |= VV_UNREF; 3317 } 3318 for (;;) { 3319 error = vinactive(vp); 3320 if (want_unlock) 3321 VOP_UNLOCK(vp); 3322 if (error != ERELOOKUP || !want_unlock) 3323 break; 3324 VOP_LOCK(vp, LK_EXCLUSIVE); 3325 } 3326 if (func == VUNREF) 3327 vp->v_vflag &= ~VV_UNREF; 3328 vdropl(vp); 3329 } else { 3330 vdefer_inactive(vp); 3331 } 3332 return; 3333 out: 3334 if (func == VPUT) 3335 VOP_UNLOCK(vp); 3336 vdropl(vp); 3337 } 3338 3339 /* 3340 * Decrement ->v_usecount for a vnode. 3341 * 3342 * Releasing the last use count requires additional processing, see vput_final 3343 * above for details. 3344 * 3345 * Comment above each variant denotes lock state on entry and exit. 
3346 */ 3347 3348 /* 3349 * in: any 3350 * out: same as passed in 3351 */ 3352 void 3353 vrele(struct vnode *vp) 3354 { 3355 3356 ASSERT_VI_UNLOCKED(vp, __func__); 3357 if (!refcount_release(&vp->v_usecount)) 3358 return; 3359 vput_final(vp, VRELE); 3360 } 3361 3362 /* 3363 * in: locked 3364 * out: unlocked 3365 */ 3366 void 3367 vput(struct vnode *vp) 3368 { 3369 3370 ASSERT_VOP_LOCKED(vp, __func__); 3371 ASSERT_VI_UNLOCKED(vp, __func__); 3372 if (!refcount_release(&vp->v_usecount)) { 3373 VOP_UNLOCK(vp); 3374 return; 3375 } 3376 vput_final(vp, VPUT); 3377 } 3378 3379 /* 3380 * in: locked 3381 * out: locked 3382 */ 3383 void 3384 vunref(struct vnode *vp) 3385 { 3386 3387 ASSERT_VOP_LOCKED(vp, __func__); 3388 ASSERT_VI_UNLOCKED(vp, __func__); 3389 if (!refcount_release(&vp->v_usecount)) 3390 return; 3391 vput_final(vp, VUNREF); 3392 } 3393 3394 void 3395 vhold(struct vnode *vp) 3396 { 3397 int old; 3398 3399 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3400 old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3401 VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3402 ("%s: wrong hold count %d", __func__, old)); 3403 if (old == 0) 3404 vfs_freevnodes_dec(); 3405 } 3406 3407 void 3408 vholdnz(struct vnode *vp) 3409 { 3410 3411 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3412 #ifdef INVARIANTS 3413 int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3414 VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3415 ("%s: wrong hold count %d", __func__, old)); 3416 #else 3417 atomic_add_int(&vp->v_holdcnt, 1); 3418 #endif 3419 } 3420 3421 /* 3422 * Grab a hold count unless the vnode is freed. 3423 * 3424 * Only use this routine if vfs smr is the only protection you have against 3425 * freeing the vnode. 3426 * 3427 * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag 3428 * is not set. After the flag is set the vnode becomes immutable to anyone but 3429 * the thread which managed to set the flag. 3430 * 3431 * It may be tempting to replace the loop with: 3432 * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3433 * if (count & VHOLD_NO_SMR) { 3434 * backpedal and error out; 3435 * } 3436 * 3437 * However, while this is more performant, it hinders debugging by eliminating 3438 * the previously mentioned invariant. 3439 */ 3440 bool 3441 vhold_smr(struct vnode *vp) 3442 { 3443 int count; 3444 3445 VFS_SMR_ASSERT_ENTERED(); 3446 3447 count = atomic_load_int(&vp->v_holdcnt); 3448 for (;;) { 3449 if (count & VHOLD_NO_SMR) { 3450 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3451 ("non-zero hold count with flags %d\n", count)); 3452 return (false); 3453 } 3454 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3455 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3456 if (count == 0) 3457 vfs_freevnodes_dec(); 3458 return (true); 3459 } 3460 } 3461 } 3462 3463 /* 3464 * Hold a free vnode for recycling. 3465 * 3466 * Note: vnode_init references this comment. 3467 * 3468 * Attempts to recycle only need the global vnode list lock and have no use for 3469 * SMR. 3470 * 3471 * However, vnodes get inserted into the global list before they get fully 3472 * initialized and stay there until UMA decides to free the memory. This in 3473 * particular means the target can be found before it becomes usable and after 3474 * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to 3475 * VHOLD_NO_SMR. 3476 * 3477 * Note: the vnode may gain more references after we transition the count 0->1. 
3478 */ 3479 static bool 3480 vhold_recycle_free(struct vnode *vp) 3481 { 3482 int count; 3483 3484 mtx_assert(&vnode_list_mtx, MA_OWNED); 3485 3486 count = atomic_load_int(&vp->v_holdcnt); 3487 for (;;) { 3488 if (count & VHOLD_NO_SMR) { 3489 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3490 ("non-zero hold count with flags %d\n", count)); 3491 return (false); 3492 } 3493 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3494 if (count > 0) { 3495 return (false); 3496 } 3497 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3498 vfs_freevnodes_dec(); 3499 return (true); 3500 } 3501 } 3502 } 3503 3504 static void __noinline 3505 vdbatch_process(struct vdbatch *vd) 3506 { 3507 struct vnode *vp; 3508 int i; 3509 3510 mtx_assert(&vd->lock, MA_OWNED); 3511 MPASS(curthread->td_pinned > 0); 3512 MPASS(vd->index == VDBATCH_SIZE); 3513 3514 mtx_lock(&vnode_list_mtx); 3515 critical_enter(); 3516 freevnodes += vd->freevnodes; 3517 for (i = 0; i < VDBATCH_SIZE; i++) { 3518 vp = vd->tab[i]; 3519 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 3520 TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); 3521 MPASS(vp->v_dbatchcpu != NOCPU); 3522 vp->v_dbatchcpu = NOCPU; 3523 } 3524 mtx_unlock(&vnode_list_mtx); 3525 vd->freevnodes = 0; 3526 bzero(vd->tab, sizeof(vd->tab)); 3527 vd->index = 0; 3528 critical_exit(); 3529 } 3530 3531 static void 3532 vdbatch_enqueue(struct vnode *vp) 3533 { 3534 struct vdbatch *vd; 3535 3536 ASSERT_VI_LOCKED(vp, __func__); 3537 VNASSERT(!VN_IS_DOOMED(vp), vp, 3538 ("%s: deferring requeue of a doomed vnode", __func__)); 3539 3540 if (vp->v_dbatchcpu != NOCPU) { 3541 VI_UNLOCK(vp); 3542 return; 3543 } 3544 3545 sched_pin(); 3546 vd = DPCPU_PTR(vd); 3547 mtx_lock(&vd->lock); 3548 MPASS(vd->index < VDBATCH_SIZE); 3549 MPASS(vd->tab[vd->index] == NULL); 3550 /* 3551 * A hack: we depend on being pinned so that we know what to put in 3552 * ->v_dbatchcpu. 3553 */ 3554 vp->v_dbatchcpu = curcpu; 3555 vd->tab[vd->index] = vp; 3556 vd->index++; 3557 VI_UNLOCK(vp); 3558 if (vd->index == VDBATCH_SIZE) 3559 vdbatch_process(vd); 3560 mtx_unlock(&vd->lock); 3561 sched_unpin(); 3562 } 3563 3564 /* 3565 * This routine must only be called for vnodes which are about to be 3566 * deallocated. Supporting dequeue for arbitrary vndoes would require 3567 * validating that the locked batch matches. 3568 */ 3569 static void 3570 vdbatch_dequeue(struct vnode *vp) 3571 { 3572 struct vdbatch *vd; 3573 int i; 3574 short cpu; 3575 3576 VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp, 3577 ("%s: called for a used vnode\n", __func__)); 3578 3579 cpu = vp->v_dbatchcpu; 3580 if (cpu == NOCPU) 3581 return; 3582 3583 vd = DPCPU_ID_PTR(cpu, vd); 3584 mtx_lock(&vd->lock); 3585 for (i = 0; i < vd->index; i++) { 3586 if (vd->tab[i] != vp) 3587 continue; 3588 vp->v_dbatchcpu = NOCPU; 3589 vd->index--; 3590 vd->tab[i] = vd->tab[vd->index]; 3591 vd->tab[vd->index] = NULL; 3592 break; 3593 } 3594 mtx_unlock(&vd->lock); 3595 /* 3596 * Either we dequeued the vnode above or the target CPU beat us to it. 3597 */ 3598 MPASS(vp->v_dbatchcpu == NOCPU); 3599 } 3600 3601 /* 3602 * Drop the hold count of the vnode. If this is the last reference to 3603 * the vnode we place it on the free list unless it has been vgone'd 3604 * (marked VIRF_DOOMED) in which case we will free it. 3605 * 3606 * Because the vnode vm object keeps a hold reference on the vnode if 3607 * there is at least one resident non-cached page, the vnode cannot 3608 * leave the active list without the page cleanup done. 
3609 */ 3610 static void __noinline 3611 vdropl_final(struct vnode *vp) 3612 { 3613 3614 ASSERT_VI_LOCKED(vp, __func__); 3615 VNPASS(VN_IS_DOOMED(vp), vp); 3616 /* 3617 * Set the VHOLD_NO_SMR flag. 3618 * 3619 * We may be racing against vhold_smr. If they win we can just pretend 3620 * we never got this far, they will vdrop later. 3621 */ 3622 if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) { 3623 vfs_freevnodes_inc(); 3624 VI_UNLOCK(vp); 3625 /* 3626 * We lost the aforementioned race. Any subsequent access is 3627 * invalid as they might have managed to vdropl on their own. 3628 */ 3629 return; 3630 } 3631 /* 3632 * Don't bump freevnodes as this one is going away. 3633 */ 3634 freevnode(vp); 3635 } 3636 3637 void 3638 vdrop(struct vnode *vp) 3639 { 3640 3641 ASSERT_VI_UNLOCKED(vp, __func__); 3642 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3643 if (refcount_release_if_not_last(&vp->v_holdcnt)) 3644 return; 3645 VI_LOCK(vp); 3646 vdropl(vp); 3647 } 3648 3649 static void __always_inline 3650 vdropl_impl(struct vnode *vp, bool enqueue) 3651 { 3652 3653 ASSERT_VI_LOCKED(vp, __func__); 3654 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3655 if (!refcount_release(&vp->v_holdcnt)) { 3656 VI_UNLOCK(vp); 3657 return; 3658 } 3659 VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp); 3660 VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); 3661 if (VN_IS_DOOMED(vp)) { 3662 vdropl_final(vp); 3663 return; 3664 } 3665 3666 vfs_freevnodes_inc(); 3667 if (vp->v_mflag & VMP_LAZYLIST) { 3668 vunlazy(vp); 3669 } 3670 3671 if (!enqueue) { 3672 VI_UNLOCK(vp); 3673 return; 3674 } 3675 3676 /* 3677 * Also unlocks the interlock. We can't assert on it as we 3678 * released our hold and by now the vnode might have been 3679 * freed. 3680 */ 3681 vdbatch_enqueue(vp); 3682 } 3683 3684 void 3685 vdropl(struct vnode *vp) 3686 { 3687 3688 vdropl_impl(vp, true); 3689 } 3690 3691 /* 3692 * vdrop a vnode when recycling. 3693 * 3694 * This is a special case routine only to be used when recycling; it differs from 3695 * regular vdrop by not requeueing the vnode on the LRU. 3696 * 3697 * Consider a case where vtryrecycle continuously fails with all vnodes (due to, 3698 * e.g., frozen writes on the filesystem), filling the batch and causing it to 3699 * be requeued. Then vnlru will end up revisiting the same vnodes. This is a 3700 * loop which can last for as long as writes are frozen. 3701 */ 3702 static void 3703 vdropl_recycle(struct vnode *vp) 3704 { 3705 3706 vdropl_impl(vp, false); 3707 } 3708 3709 static void 3710 vdrop_recycle(struct vnode *vp) 3711 { 3712 3713 VI_LOCK(vp); 3714 vdropl_recycle(vp); 3715 } 3716 3717 /* 3718 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 3719 * flags. DOINGINACT prevents us from recursing in calls to vinactive. 3720 */ 3721 static int 3722 vinactivef(struct vnode *vp) 3723 { 3724 struct vm_object *obj; 3725 int error; 3726 3727 ASSERT_VOP_ELOCKED(vp, "vinactive"); 3728 ASSERT_VI_LOCKED(vp, "vinactive"); 3729 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, 3730 ("vinactive: recursed on VI_DOINGINACT")); 3731 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3732 vp->v_iflag |= VI_DOINGINACT; 3733 vp->v_iflag &= ~VI_OWEINACT; 3734 VI_UNLOCK(vp); 3735 /* 3736 * Before moving off the active list, we must be sure that any 3737 * modified pages are converted into the vnode's dirty 3738 * buffers, since these will no longer be checked once the 3739 * vnode is on the inactive list. 3740 * 3741 * The write-out of the dirty pages is asynchronous.
At the 3742 * point that VOP_INACTIVE() is called, there could still be 3743 * pending I/O and dirty pages in the object. 3744 */ 3745 if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 3746 vm_object_mightbedirty(obj)) { 3747 VM_OBJECT_WLOCK(obj); 3748 vm_object_page_clean(obj, 0, 0, 0); 3749 VM_OBJECT_WUNLOCK(obj); 3750 } 3751 error = VOP_INACTIVE(vp); 3752 VI_LOCK(vp); 3753 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, 3754 ("vinactive: lost VI_DOINGINACT")); 3755 vp->v_iflag &= ~VI_DOINGINACT; 3756 return (error); 3757 } 3758 3759 int 3760 vinactive(struct vnode *vp) 3761 { 3762 3763 ASSERT_VOP_ELOCKED(vp, "vinactive"); 3764 ASSERT_VI_LOCKED(vp, "vinactive"); 3765 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3766 3767 if ((vp->v_iflag & VI_OWEINACT) == 0) 3768 return (0); 3769 if (vp->v_iflag & VI_DOINGINACT) 3770 return (0); 3771 if (vp->v_usecount > 0) { 3772 vp->v_iflag &= ~VI_OWEINACT; 3773 return (0); 3774 } 3775 return (vinactivef(vp)); 3776 } 3777 3778 /* 3779 * Remove any vnodes in the vnode table belonging to mount point mp. 3780 * 3781 * If FORCECLOSE is not specified, there should not be any active ones, 3782 * return error if any are found (nb: this is a user error, not a 3783 * system error). If FORCECLOSE is specified, detach any active vnodes 3784 * that are found. 3785 * 3786 * If WRITECLOSE is set, only flush out regular file vnodes open for 3787 * writing. 3788 * 3789 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 3790 * 3791 * `rootrefs' specifies the base reference count for the root vnode 3792 * of this filesystem. The root vnode is considered busy if its 3793 * v_usecount exceeds this value. On a successful return, vflush(, td) 3794 * will call vrele() on the root vnode exactly rootrefs times. 3795 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 3796 * be zero. 3797 */ 3798 #ifdef DIAGNOSTIC 3799 static int busyprt = 0; /* print out busy vnodes */ 3800 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); 3801 #endif 3802 3803 int 3804 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) 3805 { 3806 struct vnode *vp, *mvp, *rootvp = NULL; 3807 struct vattr vattr; 3808 int busy = 0, error; 3809 3810 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, 3811 rootrefs, flags); 3812 if (rootrefs > 0) { 3813 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 3814 ("vflush: bad args")); 3815 /* 3816 * Get the filesystem root vnode. We can vput() it 3817 * immediately, since with rootrefs > 0, it won't go away. 3818 */ 3819 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { 3820 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", 3821 __func__, error); 3822 return (error); 3823 } 3824 vput(rootvp); 3825 } 3826 loop: 3827 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 3828 vholdl(vp); 3829 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); 3830 if (error) { 3831 vdrop(vp); 3832 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 3833 goto loop; 3834 } 3835 /* 3836 * Skip over a vnodes marked VV_SYSTEM. 3837 */ 3838 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 3839 VOP_UNLOCK(vp); 3840 vdrop(vp); 3841 continue; 3842 } 3843 /* 3844 * If WRITECLOSE is set, flush out unlinked but still open 3845 * files (even if open only for reading) and regular file 3846 * vnodes open for writing. 
3847 */ 3848 if (flags & WRITECLOSE) { 3849 if (vp->v_object != NULL) { 3850 VM_OBJECT_WLOCK(vp->v_object); 3851 vm_object_page_clean(vp->v_object, 0, 0, 0); 3852 VM_OBJECT_WUNLOCK(vp->v_object); 3853 } 3854 do { 3855 error = VOP_FSYNC(vp, MNT_WAIT, td); 3856 } while (error == ERELOOKUP); 3857 if (error != 0) { 3858 VOP_UNLOCK(vp); 3859 vdrop(vp); 3860 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 3861 return (error); 3862 } 3863 error = VOP_GETATTR(vp, &vattr, td->td_ucred); 3864 VI_LOCK(vp); 3865 3866 if ((vp->v_type == VNON || 3867 (error == 0 && vattr.va_nlink > 0)) && 3868 (vp->v_writecount <= 0 || vp->v_type != VREG)) { 3869 VOP_UNLOCK(vp); 3870 vdropl(vp); 3871 continue; 3872 } 3873 } else 3874 VI_LOCK(vp); 3875 /* 3876 * With v_usecount == 0, all we need to do is clear out the 3877 * vnode data structures and we are done. 3878 * 3879 * If FORCECLOSE is set, forcibly close the vnode. 3880 */ 3881 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 3882 vgonel(vp); 3883 } else { 3884 busy++; 3885 #ifdef DIAGNOSTIC 3886 if (busyprt) 3887 vn_printf(vp, "vflush: busy vnode "); 3888 #endif 3889 } 3890 VOP_UNLOCK(vp); 3891 vdropl(vp); 3892 } 3893 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 3894 /* 3895 * If just the root vnode is busy, and if its refcount 3896 * is equal to `rootrefs', then go ahead and kill it. 3897 */ 3898 VI_LOCK(rootvp); 3899 KASSERT(busy > 0, ("vflush: not busy")); 3900 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 3901 ("vflush: usecount %d < rootrefs %d", 3902 rootvp->v_usecount, rootrefs)); 3903 if (busy == 1 && rootvp->v_usecount == rootrefs) { 3904 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 3905 vgone(rootvp); 3906 VOP_UNLOCK(rootvp); 3907 busy = 0; 3908 } else 3909 VI_UNLOCK(rootvp); 3910 } 3911 if (busy) { 3912 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 3913 busy); 3914 return (EBUSY); 3915 } 3916 for (; rootrefs > 0; rootrefs--) 3917 vrele(rootvp); 3918 return (0); 3919 } 3920 3921 /* 3922 * Recycle an unused vnode to the front of the free list. 3923 */ 3924 int 3925 vrecycle(struct vnode *vp) 3926 { 3927 int recycled; 3928 3929 VI_LOCK(vp); 3930 recycled = vrecyclel(vp); 3931 VI_UNLOCK(vp); 3932 return (recycled); 3933 } 3934 3935 /* 3936 * vrecycle, with the vp interlock held. 3937 */ 3938 int 3939 vrecyclel(struct vnode *vp) 3940 { 3941 int recycled; 3942 3943 ASSERT_VOP_ELOCKED(vp, __func__); 3944 ASSERT_VI_LOCKED(vp, __func__); 3945 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3946 recycled = 0; 3947 if (vp->v_usecount == 0) { 3948 recycled = 1; 3949 vgonel(vp); 3950 } 3951 return (recycled); 3952 } 3953 3954 /* 3955 * Eliminate all activity associated with a vnode 3956 * in preparation for reuse. 3957 */ 3958 void 3959 vgone(struct vnode *vp) 3960 { 3961 VI_LOCK(vp); 3962 vgonel(vp); 3963 VI_UNLOCK(vp); 3964 } 3965 3966 /* 3967 * Notify upper mounts about reclaimed or unlinked vnode. 
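 * Stacked filesystems such as nullfs register for these notifications so
 * they can invalidate their own vnodes sitting on top of the lower vnode.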
3968 */ 3969 void 3970 vfs_notify_upper(struct vnode *vp, enum vfs_notify_upper_type event) 3971 { 3972 struct mount *mp; 3973 struct mount_upper_node *ump; 3974 3975 mp = atomic_load_ptr(&vp->v_mount); 3976 if (mp == NULL) 3977 return; 3978 if (TAILQ_EMPTY(&mp->mnt_notify)) 3979 return; 3980 3981 MNT_ILOCK(mp); 3982 mp->mnt_upper_pending++; 3983 KASSERT(mp->mnt_upper_pending > 0, 3984 ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending)); 3985 TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) { 3986 MNT_IUNLOCK(mp); 3987 switch (event) { 3988 case VFS_NOTIFY_UPPER_RECLAIM: 3989 VFS_RECLAIM_LOWERVP(ump->mp, vp); 3990 break; 3991 case VFS_NOTIFY_UPPER_UNLINK: 3992 VFS_UNLINK_LOWERVP(ump->mp, vp); 3993 break; 3994 } 3995 MNT_ILOCK(mp); 3996 } 3997 mp->mnt_upper_pending--; 3998 if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && 3999 mp->mnt_upper_pending == 0) { 4000 mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; 4001 wakeup(&mp->mnt_uppers); 4002 } 4003 MNT_IUNLOCK(mp); 4004 } 4005 4006 /* 4007 * vgone, with the vp interlock held. 4008 */ 4009 static void 4010 vgonel(struct vnode *vp) 4011 { 4012 struct thread *td; 4013 struct mount *mp; 4014 vm_object_t object; 4015 bool active, doinginact, oweinact; 4016 4017 ASSERT_VOP_ELOCKED(vp, "vgonel"); 4018 ASSERT_VI_LOCKED(vp, "vgonel"); 4019 VNASSERT(vp->v_holdcnt, vp, 4020 ("vgonel: vp %p has no reference.", vp)); 4021 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4022 td = curthread; 4023 4024 /* 4025 * Don't vgonel if we're already doomed. 4026 */ 4027 if (VN_IS_DOOMED(vp)) { 4028 VNPASS(vn_get_state(vp) == VSTATE_DESTROYING || \ 4029 vn_get_state(vp) == VSTATE_DEAD, vp); 4030 return; 4031 } 4032 /* 4033 * Paired with freevnode. 4034 */ 4035 vn_seqc_write_begin_locked(vp); 4036 vunlazy_gone(vp); 4037 vn_irflag_set_locked(vp, VIRF_DOOMED); 4038 vn_set_state(vp, VSTATE_DESTROYING); 4039 4040 /* 4041 * Check to see if the vnode is in use. If so, we have to 4042 * call VOP_CLOSE() and VOP_INACTIVE(). 4043 * 4044 * It could be that VOP_INACTIVE() requested reclamation, in 4045 * which case we should avoid recursion, so check 4046 * VI_DOINGINACT. This is not precise but good enough. 4047 */ 4048 active = vp->v_usecount > 0; 4049 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4050 doinginact = (vp->v_iflag & VI_DOINGINACT) != 0; 4051 4052 /* 4053 * If we need to do inactive VI_OWEINACT will be set. 4054 */ 4055 if (vp->v_iflag & VI_DEFINACT) { 4056 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); 4057 vp->v_iflag &= ~VI_DEFINACT; 4058 vdropl(vp); 4059 } else { 4060 VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); 4061 VI_UNLOCK(vp); 4062 } 4063 cache_purge_vgone(vp); 4064 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 4065 4066 /* 4067 * If purging an active vnode, it must be closed and 4068 * deactivated before being reclaimed. 4069 */ 4070 if (active) 4071 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 4072 if (!doinginact) { 4073 do { 4074 if (oweinact || active) { 4075 VI_LOCK(vp); 4076 vinactivef(vp); 4077 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4078 VI_UNLOCK(vp); 4079 } 4080 } while (oweinact); 4081 } 4082 if (vp->v_type == VSOCK) 4083 vfs_unp_reclaim(vp); 4084 4085 /* 4086 * Clean out any buffers associated with the vnode. 4087 * If the flush fails, just toss the buffers. 
4088 */ 4089 mp = NULL; 4090 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 4091 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 4092 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { 4093 while (vinvalbuf(vp, 0, 0, 0) != 0) 4094 ; 4095 } 4096 4097 BO_LOCK(&vp->v_bufobj); 4098 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && 4099 vp->v_bufobj.bo_dirty.bv_cnt == 0 && 4100 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && 4101 vp->v_bufobj.bo_clean.bv_cnt == 0, 4102 ("vp %p bufobj not invalidated", vp)); 4103 4104 /* 4105 * For VMIO bufobj, BO_DEAD is set later, or in 4106 * vm_object_terminate() after the object's page queue is 4107 * flushed. 4108 */ 4109 object = vp->v_bufobj.bo_object; 4110 if (object == NULL) 4111 vp->v_bufobj.bo_flag |= BO_DEAD; 4112 BO_UNLOCK(&vp->v_bufobj); 4113 4114 /* 4115 * Handle the VM part. Tmpfs handles v_object on its own (the 4116 * OBJT_VNODE check). Nullfs or other bypassing filesystems 4117 * should not touch the object borrowed from the lower vnode 4118 * (the handle check). 4119 */ 4120 if (object != NULL && object->type == OBJT_VNODE && 4121 object->handle == vp) 4122 vnode_destroy_vobject(vp); 4123 4124 /* 4125 * Reclaim the vnode. 4126 */ 4127 if (VOP_RECLAIM(vp)) 4128 panic("vgone: cannot reclaim"); 4129 if (mp != NULL) 4130 vn_finished_secondary_write(mp); 4131 VNASSERT(vp->v_object == NULL, vp, 4132 ("vop_reclaim left v_object vp=%p", vp)); 4133 /* 4134 * Clear the advisory locks and wake up waiting threads. 4135 */ 4136 if (vp->v_lockf != NULL) { 4137 (void)VOP_ADVLOCKPURGE(vp); 4138 vp->v_lockf = NULL; 4139 } 4140 /* 4141 * Delete from old mount point vnode list. 4142 */ 4143 if (vp->v_mount == NULL) { 4144 VI_LOCK(vp); 4145 } else { 4146 delmntque(vp); 4147 ASSERT_VI_LOCKED(vp, "vgonel 2"); 4148 } 4149 /* 4150 * Done with purge, reset to the standard lock and invalidate 4151 * the vnode. 4152 */ 4153 vp->v_vnlock = &vp->v_lock; 4154 vp->v_op = &dead_vnodeops; 4155 vp->v_type = VBAD; 4156 vn_set_state(vp, VSTATE_DEAD); 4157 } 4158 4159 /* 4160 * Print out a description of a vnode. 4161 */ 4162 static const char *const vtypename[] = { 4163 [VNON] = "VNON", 4164 [VREG] = "VREG", 4165 [VDIR] = "VDIR", 4166 [VBLK] = "VBLK", 4167 [VCHR] = "VCHR", 4168 [VLNK] = "VLNK", 4169 [VSOCK] = "VSOCK", 4170 [VFIFO] = "VFIFO", 4171 [VBAD] = "VBAD", 4172 [VMARKER] = "VMARKER", 4173 }; 4174 _Static_assert(nitems(vtypename) == VLASTTYPE + 1, 4175 "vnode type name not added to vtypename"); 4176 4177 static const char *const vstatename[] = { 4178 [VSTATE_UNINITIALIZED] = "VSTATE_UNINITIALIZED", 4179 [VSTATE_CONSTRUCTED] = "VSTATE_CONSTRUCTED", 4180 [VSTATE_DESTROYING] = "VSTATE_DESTROYING", 4181 [VSTATE_DEAD] = "VSTATE_DEAD", 4182 }; 4183 _Static_assert(nitems(vstatename) == VLASTSTATE + 1, 4184 "vnode state name not added to vstatename"); 4185 4186 _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, 4187 "new hold count flag not added to vn_printf"); 4188 4189 void 4190 vn_printf(struct vnode *vp, const char *fmt, ...) 
4191 { 4192 va_list ap; 4193 char buf[256], buf2[16]; 4194 u_long flags; 4195 u_int holdcnt; 4196 short irflag; 4197 4198 va_start(ap, fmt); 4199 vprintf(fmt, ap); 4200 va_end(ap); 4201 printf("%p: ", (void *)vp); 4202 printf("type %s state %s\n", vtypename[vp->v_type], vstatename[vp->v_state]); 4203 holdcnt = atomic_load_int(&vp->v_holdcnt); 4204 printf(" usecount %d, writecount %d, refcount %d seqc users %d", 4205 vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, 4206 vp->v_seqc_users); 4207 switch (vp->v_type) { 4208 case VDIR: 4209 printf(" mountedhere %p\n", vp->v_mountedhere); 4210 break; 4211 case VCHR: 4212 printf(" rdev %p\n", vp->v_rdev); 4213 break; 4214 case VSOCK: 4215 printf(" socket %p\n", vp->v_unpcb); 4216 break; 4217 case VFIFO: 4218 printf(" fifoinfo %p\n", vp->v_fifoinfo); 4219 break; 4220 default: 4221 printf("\n"); 4222 break; 4223 } 4224 buf[0] = '\0'; 4225 buf[1] = '\0'; 4226 if (holdcnt & VHOLD_NO_SMR) 4227 strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); 4228 printf(" hold count flags (%s)\n", buf + 1); 4229 4230 buf[0] = '\0'; 4231 buf[1] = '\0'; 4232 irflag = vn_irflag_read(vp); 4233 if (irflag & VIRF_DOOMED) 4234 strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); 4235 if (irflag & VIRF_PGREAD) 4236 strlcat(buf, "|VIRF_PGREAD", sizeof(buf)); 4237 if (irflag & VIRF_MOUNTPOINT) 4238 strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf)); 4239 if (irflag & VIRF_TEXT_REF) 4240 strlcat(buf, "|VIRF_TEXT_REF", sizeof(buf)); 4241 flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT | VIRF_TEXT_REF); 4242 if (flags != 0) { 4243 snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); 4244 strlcat(buf, buf2, sizeof(buf)); 4245 } 4246 if (vp->v_vflag & VV_ROOT) 4247 strlcat(buf, "|VV_ROOT", sizeof(buf)); 4248 if (vp->v_vflag & VV_ISTTY) 4249 strlcat(buf, "|VV_ISTTY", sizeof(buf)); 4250 if (vp->v_vflag & VV_NOSYNC) 4251 strlcat(buf, "|VV_NOSYNC", sizeof(buf)); 4252 if (vp->v_vflag & VV_ETERNALDEV) 4253 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); 4254 if (vp->v_vflag & VV_CACHEDLABEL) 4255 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); 4256 if (vp->v_vflag & VV_VMSIZEVNLOCK) 4257 strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); 4258 if (vp->v_vflag & VV_COPYONWRITE) 4259 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); 4260 if (vp->v_vflag & VV_SYSTEM) 4261 strlcat(buf, "|VV_SYSTEM", sizeof(buf)); 4262 if (vp->v_vflag & VV_PROCDEP) 4263 strlcat(buf, "|VV_PROCDEP", sizeof(buf)); 4264 if (vp->v_vflag & VV_DELETED) 4265 strlcat(buf, "|VV_DELETED", sizeof(buf)); 4266 if (vp->v_vflag & VV_MD) 4267 strlcat(buf, "|VV_MD", sizeof(buf)); 4268 if (vp->v_vflag & VV_FORCEINSMQ) 4269 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); 4270 if (vp->v_vflag & VV_READLINK) 4271 strlcat(buf, "|VV_READLINK", sizeof(buf)); 4272 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | 4273 VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM | 4274 VV_PROCDEP | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK); 4275 if (flags != 0) { 4276 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); 4277 strlcat(buf, buf2, sizeof(buf)); 4278 } 4279 if (vp->v_iflag & VI_MOUNT) 4280 strlcat(buf, "|VI_MOUNT", sizeof(buf)); 4281 if (vp->v_iflag & VI_DOINGINACT) 4282 strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); 4283 if (vp->v_iflag & VI_OWEINACT) 4284 strlcat(buf, "|VI_OWEINACT", sizeof(buf)); 4285 if (vp->v_iflag & VI_DEFINACT) 4286 strlcat(buf, "|VI_DEFINACT", sizeof(buf)); 4287 if (vp->v_iflag & VI_FOPENING) 4288 strlcat(buf, "|VI_FOPENING", sizeof(buf)); 4289 flags = vp->v_iflag & ~(VI_MOUNT | 
VI_DOINGINACT | 4290 VI_OWEINACT | VI_DEFINACT | VI_FOPENING); 4291 if (flags != 0) { 4292 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); 4293 strlcat(buf, buf2, sizeof(buf)); 4294 } 4295 if (vp->v_mflag & VMP_LAZYLIST) 4296 strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); 4297 flags = vp->v_mflag & ~(VMP_LAZYLIST); 4298 if (flags != 0) { 4299 snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); 4300 strlcat(buf, buf2, sizeof(buf)); 4301 } 4302 printf(" flags (%s)", buf + 1); 4303 if (mtx_owned(VI_MTX(vp))) 4304 printf(" VI_LOCKed"); 4305 printf("\n"); 4306 if (vp->v_object != NULL) 4307 printf(" v_object %p ref %d pages %d " 4308 "cleanbuf %d dirtybuf %d\n", 4309 vp->v_object, vp->v_object->ref_count, 4310 vp->v_object->resident_page_count, 4311 vp->v_bufobj.bo_clean.bv_cnt, 4312 vp->v_bufobj.bo_dirty.bv_cnt); 4313 printf(" "); 4314 lockmgr_printinfo(vp->v_vnlock); 4315 if (vp->v_data != NULL) 4316 VOP_PRINT(vp); 4317 } 4318 4319 #ifdef DDB 4320 /* 4321 * List all of the locked vnodes in the system. 4322 * Called when debugging the kernel. 4323 */ 4324 DB_SHOW_COMMAND_FLAGS(lockedvnods, lockedvnodes, DB_CMD_MEMSAFE) 4325 { 4326 struct mount *mp; 4327 struct vnode *vp; 4328 4329 /* 4330 * Note: because this is DDB, we can't obey the locking semantics 4331 * for these structures, which means we could catch an inconsistent 4332 * state and dereference a nasty pointer. Not much to be done 4333 * about that. 4334 */ 4335 db_printf("Locked vnodes\n"); 4336 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4337 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4338 if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) 4339 vn_printf(vp, "vnode "); 4340 } 4341 } 4342 } 4343 4344 /* 4345 * Show details about the given vnode. 4346 */ 4347 DB_SHOW_COMMAND(vnode, db_show_vnode) 4348 { 4349 struct vnode *vp; 4350 4351 if (!have_addr) 4352 return; 4353 vp = (struct vnode *)addr; 4354 vn_printf(vp, "vnode "); 4355 } 4356 4357 /* 4358 * Show details about the given mount point. 4359 */ 4360 DB_SHOW_COMMAND(mount, db_show_mount) 4361 { 4362 struct mount *mp; 4363 struct vfsopt *opt; 4364 struct statfs *sp; 4365 struct vnode *vp; 4366 char buf[512]; 4367 uint64_t mflags; 4368 u_int flags; 4369 4370 if (!have_addr) { 4371 /* No address given, print short info about all mount points. 
*/ 4372 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4373 db_printf("%p %s on %s (%s)\n", mp, 4374 mp->mnt_stat.f_mntfromname, 4375 mp->mnt_stat.f_mntonname, 4376 mp->mnt_stat.f_fstypename); 4377 if (db_pager_quit) 4378 break; 4379 } 4380 db_printf("\nMore info: show mount <addr>\n"); 4381 return; 4382 } 4383 4384 mp = (struct mount *)addr; 4385 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 4386 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 4387 4388 buf[0] = '\0'; 4389 mflags = mp->mnt_flag; 4390 #define MNT_FLAG(flag) do { \ 4391 if (mflags & (flag)) { \ 4392 if (buf[0] != '\0') \ 4393 strlcat(buf, ", ", sizeof(buf)); \ 4394 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 4395 mflags &= ~(flag); \ 4396 } \ 4397 } while (0) 4398 MNT_FLAG(MNT_RDONLY); 4399 MNT_FLAG(MNT_SYNCHRONOUS); 4400 MNT_FLAG(MNT_NOEXEC); 4401 MNT_FLAG(MNT_NOSUID); 4402 MNT_FLAG(MNT_NFS4ACLS); 4403 MNT_FLAG(MNT_UNION); 4404 MNT_FLAG(MNT_ASYNC); 4405 MNT_FLAG(MNT_SUIDDIR); 4406 MNT_FLAG(MNT_SOFTDEP); 4407 MNT_FLAG(MNT_NOSYMFOLLOW); 4408 MNT_FLAG(MNT_GJOURNAL); 4409 MNT_FLAG(MNT_MULTILABEL); 4410 MNT_FLAG(MNT_ACLS); 4411 MNT_FLAG(MNT_NOATIME); 4412 MNT_FLAG(MNT_NOCLUSTERR); 4413 MNT_FLAG(MNT_NOCLUSTERW); 4414 MNT_FLAG(MNT_SUJ); 4415 MNT_FLAG(MNT_EXRDONLY); 4416 MNT_FLAG(MNT_EXPORTED); 4417 MNT_FLAG(MNT_DEFEXPORTED); 4418 MNT_FLAG(MNT_EXPORTANON); 4419 MNT_FLAG(MNT_EXKERB); 4420 MNT_FLAG(MNT_EXPUBLIC); 4421 MNT_FLAG(MNT_LOCAL); 4422 MNT_FLAG(MNT_QUOTA); 4423 MNT_FLAG(MNT_ROOTFS); 4424 MNT_FLAG(MNT_USER); 4425 MNT_FLAG(MNT_IGNORE); 4426 MNT_FLAG(MNT_UPDATE); 4427 MNT_FLAG(MNT_DELEXPORT); 4428 MNT_FLAG(MNT_RELOAD); 4429 MNT_FLAG(MNT_FORCE); 4430 MNT_FLAG(MNT_SNAPSHOT); 4431 MNT_FLAG(MNT_BYFSID); 4432 #undef MNT_FLAG 4433 if (mflags != 0) { 4434 if (buf[0] != '\0') 4435 strlcat(buf, ", ", sizeof(buf)); 4436 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4437 "0x%016jx", mflags); 4438 } 4439 db_printf(" mnt_flag = %s\n", buf); 4440 4441 buf[0] = '\0'; 4442 flags = mp->mnt_kern_flag; 4443 #define MNT_KERN_FLAG(flag) do { \ 4444 if (flags & (flag)) { \ 4445 if (buf[0] != '\0') \ 4446 strlcat(buf, ", ", sizeof(buf)); \ 4447 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 4448 flags &= ~(flag); \ 4449 } \ 4450 } while (0) 4451 MNT_KERN_FLAG(MNTK_UNMOUNTF); 4452 MNT_KERN_FLAG(MNTK_ASYNC); 4453 MNT_KERN_FLAG(MNTK_SOFTDEP); 4454 MNT_KERN_FLAG(MNTK_NOMSYNC); 4455 MNT_KERN_FLAG(MNTK_DRAINING); 4456 MNT_KERN_FLAG(MNTK_REFEXPIRE); 4457 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 4458 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 4459 MNT_KERN_FLAG(MNTK_NO_IOPF); 4460 MNT_KERN_FLAG(MNTK_RECURSE); 4461 MNT_KERN_FLAG(MNTK_UPPER_WAITER); 4462 MNT_KERN_FLAG(MNTK_UNLOCKED_INSMNTQUE); 4463 MNT_KERN_FLAG(MNTK_USES_BCACHE); 4464 MNT_KERN_FLAG(MNTK_VMSETSIZE_BUG); 4465 MNT_KERN_FLAG(MNTK_FPLOOKUP); 4466 MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER); 4467 MNT_KERN_FLAG(MNTK_NOASYNC); 4468 MNT_KERN_FLAG(MNTK_UNMOUNT); 4469 MNT_KERN_FLAG(MNTK_MWAIT); 4470 MNT_KERN_FLAG(MNTK_SUSPEND); 4471 MNT_KERN_FLAG(MNTK_SUSPEND2); 4472 MNT_KERN_FLAG(MNTK_SUSPENDED); 4473 MNT_KERN_FLAG(MNTK_NULL_NOCACHE); 4474 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 4475 #undef MNT_KERN_FLAG 4476 if (flags != 0) { 4477 if (buf[0] != '\0') 4478 strlcat(buf, ", ", sizeof(buf)); 4479 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4480 "0x%08x", flags); 4481 } 4482 db_printf(" mnt_kern_flag = %s\n", buf); 4483 4484 db_printf(" mnt_opt = "); 4485 opt = TAILQ_FIRST(mp->mnt_opt); 4486 if (opt != NULL) { 4487 db_printf("%s", opt->name); 4488 opt = TAILQ_NEXT(opt, link); 4489 while (opt != 
NULL) { 4490 db_printf(", %s", opt->name); 4491 opt = TAILQ_NEXT(opt, link); 4492 } 4493 } 4494 db_printf("\n"); 4495 4496 sp = &mp->mnt_stat; 4497 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " 4498 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " 4499 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " 4500 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", 4501 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, 4502 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, 4503 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, 4504 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, 4505 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, 4506 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 4507 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, 4508 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); 4509 4510 db_printf(" mnt_cred = { uid=%u ruid=%u", 4511 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); 4512 if (jailed(mp->mnt_cred)) 4513 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); 4514 db_printf(" }\n"); 4515 db_printf(" mnt_ref = %d (with %d in the struct)\n", 4516 vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); 4517 db_printf(" mnt_gen = %d\n", mp->mnt_gen); 4518 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); 4519 db_printf(" mnt_lazyvnodelistsize = %d\n", 4520 mp->mnt_lazyvnodelistsize); 4521 db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", 4522 vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); 4523 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 4524 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 4525 db_printf(" mnt_lockref = %d (with %d in the struct)\n", 4526 vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); 4527 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 4528 db_printf(" mnt_secondary_accwrites = %d\n", 4529 mp->mnt_secondary_accwrites); 4530 db_printf(" mnt_gjprovider = %s\n", 4531 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); 4532 db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); 4533 4534 db_printf("\n\nList of active vnodes\n"); 4535 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4536 if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { 4537 vn_printf(vp, "vnode "); 4538 if (db_pager_quit) 4539 break; 4540 } 4541 } 4542 db_printf("\n\nList of inactive vnodes\n"); 4543 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4544 if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { 4545 vn_printf(vp, "vnode "); 4546 if (db_pager_quit) 4547 break; 4548 } 4549 } 4550 } 4551 #endif /* DDB */ 4552 4553 /* 4554 * Fill in a struct xvfsconf based on a struct vfsconf. 4555 */ 4556 static int 4557 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) 4558 { 4559 struct xvfsconf xvfsp; 4560 4561 bzero(&xvfsp, sizeof(xvfsp)); 4562 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4563 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4564 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4565 xvfsp.vfc_flags = vfsp->vfc_flags; 4566 /* 4567 * These are unused in userland, we keep them 4568 * to not break binary compatibility. 
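 *
 * (Illustrative userland sketch, not part of the original comment: the
 * records built here are exported through the vfs.conflist sysctl declared
 * further below and can be consumed roughly as follows; error handling is
 * omitted.)
 *
 *	size_t len = 0;
 *	sysctlbyname("vfs.conflist", NULL, &len, NULL, 0);
 *	struct xvfsconf *xvf = malloc(len);
 *	sysctlbyname("vfs.conflist", xvf, &len, NULL, 0);
 *	for (size_t i = 0; i < len / sizeof(*xvf); i++)
 *		printf("%s type %d refcount %d\n", xvf[i].vfc_name,
 *		    xvf[i].vfc_typenum, xvf[i].vfc_refcount);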
4569 */ 4570 xvfsp.vfc_vfsops = NULL; 4571 xvfsp.vfc_next = NULL; 4572 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4573 } 4574 4575 #ifdef COMPAT_FREEBSD32 4576 struct xvfsconf32 { 4577 uint32_t vfc_vfsops; 4578 char vfc_name[MFSNAMELEN]; 4579 int32_t vfc_typenum; 4580 int32_t vfc_refcount; 4581 int32_t vfc_flags; 4582 uint32_t vfc_next; 4583 }; 4584 4585 static int 4586 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) 4587 { 4588 struct xvfsconf32 xvfsp; 4589 4590 bzero(&xvfsp, sizeof(xvfsp)); 4591 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4592 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4593 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4594 xvfsp.vfc_flags = vfsp->vfc_flags; 4595 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4596 } 4597 #endif 4598 4599 /* 4600 * Top level filesystem related information gathering. 4601 */ 4602 static int 4603 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 4604 { 4605 struct vfsconf *vfsp; 4606 int error; 4607 4608 error = 0; 4609 vfsconf_slock(); 4610 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4611 #ifdef COMPAT_FREEBSD32 4612 if (req->flags & SCTL_MASK32) 4613 error = vfsconf2x32(req, vfsp); 4614 else 4615 #endif 4616 error = vfsconf2x(req, vfsp); 4617 if (error) 4618 break; 4619 } 4620 vfsconf_sunlock(); 4621 return (error); 4622 } 4623 4624 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | 4625 CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, 4626 "S,xvfsconf", "List of all configured filesystems"); 4627 4628 #ifndef BURN_BRIDGES 4629 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 4630 4631 static int 4632 vfs_sysctl(SYSCTL_HANDLER_ARGS) 4633 { 4634 int *name = (int *)arg1 - 1; /* XXX */ 4635 u_int namelen = arg2 + 1; /* XXX */ 4636 struct vfsconf *vfsp; 4637 4638 log(LOG_WARNING, "userland calling deprecated sysctl, " 4639 "please rebuild world\n"); 4640 4641 #if 1 || defined(COMPAT_PRELITE2) 4642 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ 4643 if (namelen == 1) 4644 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 4645 #endif 4646 4647 switch (name[1]) { 4648 case VFS_MAXTYPENUM: 4649 if (namelen != 2) 4650 return (ENOTDIR); 4651 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 4652 case VFS_CONF: 4653 if (namelen != 3) 4654 return (ENOTDIR); /* overloaded */ 4655 vfsconf_slock(); 4656 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4657 if (vfsp->vfc_typenum == name[2]) 4658 break; 4659 } 4660 vfsconf_sunlock(); 4661 if (vfsp == NULL) 4662 return (EOPNOTSUPP); 4663 #ifdef COMPAT_FREEBSD32 4664 if (req->flags & SCTL_MASK32) 4665 return (vfsconf2x32(req, vfsp)); 4666 else 4667 #endif 4668 return (vfsconf2x(req, vfsp)); 4669 } 4670 return (EOPNOTSUPP); 4671 } 4672 4673 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | 4674 CTLFLAG_MPSAFE, vfs_sysctl, 4675 "Generic filesystem"); 4676 4677 #if 1 || defined(COMPAT_PRELITE2) 4678 4679 static int 4680 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 4681 { 4682 int error; 4683 struct vfsconf *vfsp; 4684 struct ovfsconf ovfs; 4685 4686 vfsconf_slock(); 4687 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4688 bzero(&ovfs, sizeof(ovfs)); 4689 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 4690 strcpy(ovfs.vfc_name, vfsp->vfc_name); 4691 ovfs.vfc_index = vfsp->vfc_typenum; 4692 ovfs.vfc_refcount = vfsp->vfc_refcount; 4693 ovfs.vfc_flags = vfsp->vfc_flags; 4694 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 4695 if (error != 0) { 4696 vfsconf_sunlock(); 4697 return (error); 4698 } 4699 } 4700 vfsconf_sunlock(); 4701 return (0); 4702 } 4703 4704 #endif /* 1 || COMPAT_PRELITE2 */ 4705 #endif /* !BURN_BRIDGES */ 4706 4707 #define KINFO_VNODESLOP 10 4708 #ifdef notyet 4709 /* 4710 * Dump vnode list (via sysctl). 4711 */ 4712 /* ARGSUSED */ 4713 static int 4714 sysctl_vnode(SYSCTL_HANDLER_ARGS) 4715 { 4716 struct xvnode *xvn; 4717 struct mount *mp; 4718 struct vnode *vp; 4719 int error, len, n; 4720 4721 /* 4722 * Stale numvnodes access is not fatal here. 4723 */ 4724 req->lock = 0; 4725 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; 4726 if (!req->oldptr) 4727 /* Make an estimate */ 4728 return (SYSCTL_OUT(req, 0, len)); 4729 4730 error = sysctl_wire_old_buffer(req, 0); 4731 if (error != 0) 4732 return (error); 4733 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); 4734 n = 0; 4735 mtx_lock(&mountlist_mtx); 4736 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4737 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 4738 continue; 4739 MNT_ILOCK(mp); 4740 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4741 if (n == len) 4742 break; 4743 vref(vp); 4744 xvn[n].xv_size = sizeof *xvn; 4745 xvn[n].xv_vnode = vp; 4746 xvn[n].xv_id = 0; /* XXX compat */ 4747 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field 4748 XV_COPY(usecount); 4749 XV_COPY(writecount); 4750 XV_COPY(holdcnt); 4751 XV_COPY(mount); 4752 XV_COPY(numoutput); 4753 XV_COPY(type); 4754 #undef XV_COPY 4755 xvn[n].xv_flag = vp->v_vflag; 4756 4757 switch (vp->v_type) { 4758 case VREG: 4759 case VDIR: 4760 case VLNK: 4761 break; 4762 case VBLK: 4763 case VCHR: 4764 if (vp->v_rdev == NULL) { 4765 vrele(vp); 4766 continue; 4767 } 4768 xvn[n].xv_dev = dev2udev(vp->v_rdev); 4769 break; 4770 case VSOCK: 4771 xvn[n].xv_socket = vp->v_socket; 4772 break; 4773 case VFIFO: 4774 xvn[n].xv_fifo = vp->v_fifoinfo; 4775 break; 4776 case VNON: 4777 case VBAD: 4778 default: 4779 /* shouldn't happen? 
*/ 4780 vrele(vp); 4781 continue; 4782 } 4783 vrele(vp); 4784 ++n; 4785 } 4786 MNT_IUNLOCK(mp); 4787 mtx_lock(&mountlist_mtx); 4788 vfs_unbusy(mp); 4789 if (n == len) 4790 break; 4791 } 4792 mtx_unlock(&mountlist_mtx); 4793 4794 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); 4795 free(xvn, M_TEMP); 4796 return (error); 4797 } 4798 4799 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | 4800 CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", 4801 ""); 4802 #endif 4803 4804 static void 4805 unmount_or_warn(struct mount *mp) 4806 { 4807 int error; 4808 4809 error = dounmount(mp, MNT_FORCE, curthread); 4810 if (error != 0) { 4811 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); 4812 if (error == EBUSY) 4813 printf("BUSY)\n"); 4814 else 4815 printf("%d)\n", error); 4816 } 4817 } 4818 4819 /* 4820 * Unmount all filesystems. The list is traversed in reverse order 4821 * of mounting to avoid dependencies. 4822 */ 4823 void 4824 vfs_unmountall(void) 4825 { 4826 struct mount *mp, *tmp; 4827 4828 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); 4829 4830 /* 4831 * Since this only runs when rebooting, it is not interlocked. 4832 */ 4833 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { 4834 vfs_ref(mp); 4835 4836 /* 4837 * Forcibly unmounting "/dev" before "/" would prevent clean 4838 * unmount of the latter. 4839 */ 4840 if (mp == rootdevmp) 4841 continue; 4842 4843 unmount_or_warn(mp); 4844 } 4845 4846 if (rootdevmp != NULL) 4847 unmount_or_warn(rootdevmp); 4848 } 4849 4850 static void 4851 vfs_deferred_inactive(struct vnode *vp, int lkflags) 4852 { 4853 4854 ASSERT_VI_LOCKED(vp, __func__); 4855 VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set")); 4856 if ((vp->v_iflag & VI_OWEINACT) == 0) { 4857 vdropl(vp); 4858 return; 4859 } 4860 if (vn_lock(vp, lkflags) == 0) { 4861 VI_LOCK(vp); 4862 vinactive(vp); 4863 VOP_UNLOCK(vp); 4864 vdropl(vp); 4865 return; 4866 } 4867 vdefer_inactive_unlocked(vp); 4868 } 4869 4870 static int 4871 vfs_periodic_inactive_filter(struct vnode *vp, void *arg) 4872 { 4873 4874 return (vp->v_iflag & VI_DEFINACT); 4875 } 4876 4877 static void __noinline 4878 vfs_periodic_inactive(struct mount *mp, int flags) 4879 { 4880 struct vnode *vp, *mvp; 4881 int lkflags; 4882 4883 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 4884 if (flags != MNT_WAIT) 4885 lkflags |= LK_NOWAIT; 4886 4887 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { 4888 if ((vp->v_iflag & VI_DEFINACT) == 0) { 4889 VI_UNLOCK(vp); 4890 continue; 4891 } 4892 vp->v_iflag &= ~VI_DEFINACT; 4893 vfs_deferred_inactive(vp, lkflags); 4894 } 4895 } 4896 4897 static inline bool 4898 vfs_want_msync(struct vnode *vp) 4899 { 4900 struct vm_object *obj; 4901 4902 /* 4903 * This test may be performed without any locks held. 4904 * We rely on vm_object's type stability. 
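 *
 * (Added note, an inference rather than original text: a stale answer here
 * is tolerable because vfs_periodic_msync_inactive() below re-evaluates the
 * vnode after taking the interlock and the vnode lock, so the worst case is
 * an unnecessary pass or a sync deferred to a later periodic run.)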
4905 */ 4906 if (vp->v_vflag & VV_NOSYNC) 4907 return (false); 4908 obj = vp->v_object; 4909 return (obj != NULL && vm_object_mightbedirty(obj)); 4910 } 4911 4912 static int 4913 vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) 4914 { 4915 4916 if (vp->v_vflag & VV_NOSYNC) 4917 return (false); 4918 if (vp->v_iflag & VI_DEFINACT) 4919 return (true); 4920 return (vfs_want_msync(vp)); 4921 } 4922 4923 static void __noinline 4924 vfs_periodic_msync_inactive(struct mount *mp, int flags) 4925 { 4926 struct vnode *vp, *mvp; 4927 struct vm_object *obj; 4928 int lkflags, objflags; 4929 bool seen_defer; 4930 4931 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 4932 if (flags != MNT_WAIT) { 4933 lkflags |= LK_NOWAIT; 4934 objflags = OBJPC_NOSYNC; 4935 } else { 4936 objflags = OBJPC_SYNC; 4937 } 4938 4939 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { 4940 seen_defer = false; 4941 if (vp->v_iflag & VI_DEFINACT) { 4942 vp->v_iflag &= ~VI_DEFINACT; 4943 seen_defer = true; 4944 } 4945 if (!vfs_want_msync(vp)) { 4946 if (seen_defer) 4947 vfs_deferred_inactive(vp, lkflags); 4948 else 4949 VI_UNLOCK(vp); 4950 continue; 4951 } 4952 if (vget(vp, lkflags) == 0) { 4953 obj = vp->v_object; 4954 if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) { 4955 VM_OBJECT_WLOCK(obj); 4956 vm_object_page_clean(obj, 0, 0, objflags); 4957 VM_OBJECT_WUNLOCK(obj); 4958 } 4959 vput(vp); 4960 if (seen_defer) 4961 vdrop(vp); 4962 } else { 4963 if (seen_defer) 4964 vdefer_inactive_unlocked(vp); 4965 } 4966 } 4967 } 4968 4969 void 4970 vfs_periodic(struct mount *mp, int flags) 4971 { 4972 4973 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 4974 4975 if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0) 4976 vfs_periodic_inactive(mp, flags); 4977 else 4978 vfs_periodic_msync_inactive(mp, flags); 4979 } 4980 4981 static void 4982 destroy_vpollinfo_free(struct vpollinfo *vi) 4983 { 4984 4985 knlist_destroy(&vi->vpi_selinfo.si_note); 4986 mtx_destroy(&vi->vpi_lock); 4987 free(vi, M_VNODEPOLL); 4988 } 4989 4990 static void 4991 destroy_vpollinfo(struct vpollinfo *vi) 4992 { 4993 4994 knlist_clear(&vi->vpi_selinfo.si_note, 1); 4995 seldrain(&vi->vpi_selinfo); 4996 destroy_vpollinfo_free(vi); 4997 } 4998 4999 /* 5000 * Initialize per-vnode helper structure to hold poll-related state. 5001 */ 5002 void 5003 v_addpollinfo(struct vnode *vp) 5004 { 5005 struct vpollinfo *vi; 5006 5007 if (vp->v_pollinfo != NULL) 5008 return; 5009 vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO); 5010 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); 5011 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, 5012 vfs_knlunlock, vfs_knl_assert_lock); 5013 VI_LOCK(vp); 5014 if (vp->v_pollinfo != NULL) { 5015 VI_UNLOCK(vp); 5016 destroy_vpollinfo_free(vi); 5017 return; 5018 } 5019 vp->v_pollinfo = vi; 5020 VI_UNLOCK(vp); 5021 } 5022 5023 /* 5024 * Record a process's interest in events which might happen to 5025 * a vnode. Because poll uses the historic select-style interface 5026 * internally, this routine serves as both the ``check for any 5027 * pending events'' and the ``record my interest in future events'' 5028 * functions. (These are done together, while the lock is held, 5029 * to avoid race conditions.) 
5030 */ 5031 int 5032 vn_pollrecord(struct vnode *vp, struct thread *td, int events) 5033 { 5034 5035 v_addpollinfo(vp); 5036 mtx_lock(&vp->v_pollinfo->vpi_lock); 5037 if (vp->v_pollinfo->vpi_revents & events) { 5038 /* 5039 * This leaves events we are not interested 5040 * in available for the other process which 5041 * which presumably had requested them 5042 * (otherwise they would never have been 5043 * recorded). 5044 */ 5045 events &= vp->v_pollinfo->vpi_revents; 5046 vp->v_pollinfo->vpi_revents &= ~events; 5047 5048 mtx_unlock(&vp->v_pollinfo->vpi_lock); 5049 return (events); 5050 } 5051 vp->v_pollinfo->vpi_events |= events; 5052 selrecord(td, &vp->v_pollinfo->vpi_selinfo); 5053 mtx_unlock(&vp->v_pollinfo->vpi_lock); 5054 return (0); 5055 } 5056 5057 /* 5058 * Routine to create and manage a filesystem syncer vnode. 5059 */ 5060 #define sync_close ((int (*)(struct vop_close_args *))nullop) 5061 static int sync_fsync(struct vop_fsync_args *); 5062 static int sync_inactive(struct vop_inactive_args *); 5063 static int sync_reclaim(struct vop_reclaim_args *); 5064 5065 static struct vop_vector sync_vnodeops = { 5066 .vop_bypass = VOP_EOPNOTSUPP, 5067 .vop_close = sync_close, /* close */ 5068 .vop_fsync = sync_fsync, /* fsync */ 5069 .vop_inactive = sync_inactive, /* inactive */ 5070 .vop_need_inactive = vop_stdneed_inactive, /* need_inactive */ 5071 .vop_reclaim = sync_reclaim, /* reclaim */ 5072 .vop_lock1 = vop_stdlock, /* lock */ 5073 .vop_unlock = vop_stdunlock, /* unlock */ 5074 .vop_islocked = vop_stdislocked, /* islocked */ 5075 }; 5076 VFS_VOP_VECTOR_REGISTER(sync_vnodeops); 5077 5078 /* 5079 * Create a new filesystem syncer vnode for the specified mount point. 5080 */ 5081 void 5082 vfs_allocate_syncvnode(struct mount *mp) 5083 { 5084 struct vnode *vp; 5085 struct bufobj *bo; 5086 static long start, incr, next; 5087 int error; 5088 5089 /* Allocate a new vnode */ 5090 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); 5091 if (error != 0) 5092 panic("vfs_allocate_syncvnode: getnewvnode() failed"); 5093 vp->v_type = VNON; 5094 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 5095 vp->v_vflag |= VV_FORCEINSMQ; 5096 error = insmntque1(vp, mp); 5097 if (error != 0) 5098 panic("vfs_allocate_syncvnode: insmntque() failed"); 5099 vp->v_vflag &= ~VV_FORCEINSMQ; 5100 vn_set_state(vp, VSTATE_CONSTRUCTED); 5101 VOP_UNLOCK(vp); 5102 /* 5103 * Place the vnode onto the syncer worklist. We attempt to 5104 * scatter them about on the list so that they will go off 5105 * at evenly distributed times even if all the filesystems 5106 * are mounted at once. 5107 */ 5108 next += incr; 5109 if (next == 0 || next > syncer_maxdelay) { 5110 start /= 2; 5111 incr /= 2; 5112 if (start == 0) { 5113 start = syncer_maxdelay / 2; 5114 incr = syncer_maxdelay; 5115 } 5116 next = start; 5117 } 5118 bo = &vp->v_bufobj; 5119 BO_LOCK(bo); 5120 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); 5121 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. 
*/
5122 mtx_lock(&sync_mtx);
5123 sync_vnode_count++;
5124 if (mp->mnt_syncer == NULL) {
5125 mp->mnt_syncer = vp;
5126 vp = NULL;
5127 }
5128 mtx_unlock(&sync_mtx);
5129 BO_UNLOCK(bo);
5130 if (vp != NULL) {
5131 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
5132 vgone(vp);
5133 vput(vp);
5134 }
5135 }
5136
5137 void
5138 vfs_deallocate_syncvnode(struct mount *mp)
5139 {
5140 struct vnode *vp;
5141
5142 mtx_lock(&sync_mtx);
5143 vp = mp->mnt_syncer;
5144 if (vp != NULL)
5145 mp->mnt_syncer = NULL;
5146 mtx_unlock(&sync_mtx);
5147 if (vp != NULL)
5148 vrele(vp);
5149 }
5150
5151 /*
5152 * Do a lazy sync of the filesystem.
5153 */
5154 static int
5155 sync_fsync(struct vop_fsync_args *ap)
5156 {
5157 struct vnode *syncvp = ap->a_vp;
5158 struct mount *mp = syncvp->v_mount;
5159 int error, save;
5160 struct bufobj *bo;
5161
5162 /*
5163 * We only need to do something if this is a lazy evaluation.
5164 */
5165 if (ap->a_waitfor != MNT_LAZY)
5166 return (0);
5167
5168 /*
5169 * Move ourselves to the back of the sync list.
5170 */
5171 bo = &syncvp->v_bufobj;
5172 BO_LOCK(bo);
5173 vn_syncer_add_to_worklist(bo, syncdelay);
5174 BO_UNLOCK(bo);
5175
5176 /*
5177 * Walk the list of vnodes pushing all that are dirty and
5178 * not already on the sync list.
5179 */
5180 if (vfs_busy(mp, MBF_NOWAIT) != 0)
5181 return (0);
5182 VOP_UNLOCK(syncvp);
5183 save = curthread_pflags_set(TDP_SYNCIO);
5184 /*
5185 * The filesystem at hand may be idle with free vnodes stored in the
5186 * batch. Return them instead of letting them stay there indefinitely.
5187 */
5188 vfs_periodic(mp, MNT_NOWAIT);
5189 error = VFS_SYNC(mp, MNT_LAZY);
5190 curthread_pflags_restore(save);
5191 vn_lock(syncvp, LK_EXCLUSIVE | LK_RETRY);
5192 vfs_unbusy(mp);
5193 return (error);
5194 }
5195
5196 /*
5197 * The syncer vnode is no longer referenced.
5198 */
5199 static int
5200 sync_inactive(struct vop_inactive_args *ap)
5201 {
5202
5203 vgone(ap->a_vp);
5204 return (0);
5205 }
5206
5207 /*
5208 * The syncer vnode is no longer needed and is being decommissioned.
5209 *
5210 * Modifications to the worklist must be protected by sync_mtx.
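 *
 * (Added note: this path is normally reached when vfs_deallocate_syncvnode()
 * above drops the last reference, sync_inactive() then calls vgone(), and
 * reclamation detaches the vnode from both mnt_syncer and the worklist.)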
5211 */ 5212 static int 5213 sync_reclaim(struct vop_reclaim_args *ap) 5214 { 5215 struct vnode *vp = ap->a_vp; 5216 struct bufobj *bo; 5217 5218 bo = &vp->v_bufobj; 5219 BO_LOCK(bo); 5220 mtx_lock(&sync_mtx); 5221 if (vp->v_mount->mnt_syncer == vp) 5222 vp->v_mount->mnt_syncer = NULL; 5223 if (bo->bo_flag & BO_ONWORKLST) { 5224 LIST_REMOVE(bo, bo_synclist); 5225 syncer_worklist_len--; 5226 sync_vnode_count--; 5227 bo->bo_flag &= ~BO_ONWORKLST; 5228 } 5229 mtx_unlock(&sync_mtx); 5230 BO_UNLOCK(bo); 5231 5232 return (0); 5233 } 5234 5235 int 5236 vn_need_pageq_flush(struct vnode *vp) 5237 { 5238 struct vm_object *obj; 5239 5240 obj = vp->v_object; 5241 return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 5242 vm_object_mightbedirty(obj)); 5243 } 5244 5245 /* 5246 * Check if vnode represents a disk device 5247 */ 5248 bool 5249 vn_isdisk_error(struct vnode *vp, int *errp) 5250 { 5251 int error; 5252 5253 if (vp->v_type != VCHR) { 5254 error = ENOTBLK; 5255 goto out; 5256 } 5257 error = 0; 5258 dev_lock(); 5259 if (vp->v_rdev == NULL) 5260 error = ENXIO; 5261 else if (vp->v_rdev->si_devsw == NULL) 5262 error = ENXIO; 5263 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 5264 error = ENOTBLK; 5265 dev_unlock(); 5266 out: 5267 *errp = error; 5268 return (error == 0); 5269 } 5270 5271 bool 5272 vn_isdisk(struct vnode *vp) 5273 { 5274 int error; 5275 5276 return (vn_isdisk_error(vp, &error)); 5277 } 5278 5279 /* 5280 * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see 5281 * the comment above cache_fplookup for details. 5282 */ 5283 int 5284 vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) 5285 { 5286 int error; 5287 5288 VFS_SMR_ASSERT_ENTERED(); 5289 5290 /* Check the owner. */ 5291 if (cred->cr_uid == file_uid) { 5292 if (file_mode & S_IXUSR) 5293 return (0); 5294 goto out_error; 5295 } 5296 5297 /* Otherwise, check the groups (first match) */ 5298 if (groupmember(file_gid, cred)) { 5299 if (file_mode & S_IXGRP) 5300 return (0); 5301 goto out_error; 5302 } 5303 5304 /* Otherwise, check everyone else. */ 5305 if (file_mode & S_IXOTH) 5306 return (0); 5307 out_error: 5308 /* 5309 * Permission check failed, but it is possible denial will get overwritten 5310 * (e.g., when root is traversing through a 700 directory owned by someone 5311 * else). 5312 * 5313 * vaccess() calls priv_check_cred which in turn can descent into MAC 5314 * modules overriding this result. It's quite unclear what semantics 5315 * are allowed for them to operate, thus for safety we don't call them 5316 * from within the SMR section. This also means if any such modules 5317 * are present, we have to let the regular lookup decide. 5318 */ 5319 error = priv_check_cred_vfs_lookup_nomac(cred); 5320 switch (error) { 5321 case 0: 5322 return (0); 5323 case EAGAIN: 5324 /* 5325 * MAC modules present. 5326 */ 5327 return (EAGAIN); 5328 case EPERM: 5329 return (EACCES); 5330 default: 5331 return (error); 5332 } 5333 } 5334 5335 /* 5336 * Common filesystem object access control check routine. Accepts a 5337 * vnode's type, "mode", uid and gid, requested access mode, and credentials. 5338 * Returns 0 on success, or an errno on failure. 
5339 */ 5340 int 5341 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 5342 accmode_t accmode, struct ucred *cred) 5343 { 5344 accmode_t dac_granted; 5345 accmode_t priv_granted; 5346 5347 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 5348 ("invalid bit in accmode")); 5349 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 5350 ("VAPPEND without VWRITE")); 5351 5352 /* 5353 * Look for a normal, non-privileged way to access the file/directory 5354 * as requested. If it exists, go with that. 5355 */ 5356 5357 dac_granted = 0; 5358 5359 /* Check the owner. */ 5360 if (cred->cr_uid == file_uid) { 5361 dac_granted |= VADMIN; 5362 if (file_mode & S_IXUSR) 5363 dac_granted |= VEXEC; 5364 if (file_mode & S_IRUSR) 5365 dac_granted |= VREAD; 5366 if (file_mode & S_IWUSR) 5367 dac_granted |= (VWRITE | VAPPEND); 5368 5369 if ((accmode & dac_granted) == accmode) 5370 return (0); 5371 5372 goto privcheck; 5373 } 5374 5375 /* Otherwise, check the groups (first match) */ 5376 if (groupmember(file_gid, cred)) { 5377 if (file_mode & S_IXGRP) 5378 dac_granted |= VEXEC; 5379 if (file_mode & S_IRGRP) 5380 dac_granted |= VREAD; 5381 if (file_mode & S_IWGRP) 5382 dac_granted |= (VWRITE | VAPPEND); 5383 5384 if ((accmode & dac_granted) == accmode) 5385 return (0); 5386 5387 goto privcheck; 5388 } 5389 5390 /* Otherwise, check everyone else. */ 5391 if (file_mode & S_IXOTH) 5392 dac_granted |= VEXEC; 5393 if (file_mode & S_IROTH) 5394 dac_granted |= VREAD; 5395 if (file_mode & S_IWOTH) 5396 dac_granted |= (VWRITE | VAPPEND); 5397 if ((accmode & dac_granted) == accmode) 5398 return (0); 5399 5400 privcheck: 5401 /* 5402 * Build a privilege mask to determine if the set of privileges 5403 * satisfies the requirements when combined with the granted mask 5404 * from above. For each privilege, if the privilege is required, 5405 * bitwise or the request type onto the priv_granted mask. 5406 */ 5407 priv_granted = 0; 5408 5409 if (type == VDIR) { 5410 /* 5411 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 5412 * requests, instead of PRIV_VFS_EXEC. 5413 */ 5414 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5415 !priv_check_cred(cred, PRIV_VFS_LOOKUP)) 5416 priv_granted |= VEXEC; 5417 } else { 5418 /* 5419 * Ensure that at least one execute bit is on. Otherwise, 5420 * a privileged user will always succeed, and we don't want 5421 * this to happen unless the file really is executable. 5422 */ 5423 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5424 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 5425 !priv_check_cred(cred, PRIV_VFS_EXEC)) 5426 priv_granted |= VEXEC; 5427 } 5428 5429 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 5430 !priv_check_cred(cred, PRIV_VFS_READ)) 5431 priv_granted |= VREAD; 5432 5433 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 5434 !priv_check_cred(cred, PRIV_VFS_WRITE)) 5435 priv_granted |= (VWRITE | VAPPEND); 5436 5437 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 5438 !priv_check_cred(cred, PRIV_VFS_ADMIN)) 5439 priv_granted |= VADMIN; 5440 5441 if ((accmode & (priv_granted | dac_granted)) == accmode) { 5442 return (0); 5443 } 5444 5445 return ((accmode & VADMIN) ? EPERM : EACCES); 5446 } 5447 5448 /* 5449 * Credential check based on process requesting service, and per-attribute 5450 * permissions. 
5451 */ 5452 int 5453 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 5454 struct thread *td, accmode_t accmode) 5455 { 5456 5457 /* 5458 * Kernel-invoked always succeeds. 5459 */ 5460 if (cred == NOCRED) 5461 return (0); 5462 5463 /* 5464 * Do not allow privileged processes in jail to directly manipulate 5465 * system attributes. 5466 */ 5467 switch (attrnamespace) { 5468 case EXTATTR_NAMESPACE_SYSTEM: 5469 /* Potentially should be: return (EPERM); */ 5470 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); 5471 case EXTATTR_NAMESPACE_USER: 5472 return (VOP_ACCESS(vp, accmode, cred, td)); 5473 default: 5474 return (EPERM); 5475 } 5476 } 5477 5478 #ifdef DEBUG_VFS_LOCKS 5479 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 5480 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, 5481 "Drop into debugger on lock violation"); 5482 5483 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 5484 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 5485 0, "Check for interlock across VOPs"); 5486 5487 int vfs_badlock_print = 1; /* Print lock violations. */ 5488 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 5489 0, "Print lock violations"); 5490 5491 int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ 5492 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 5493 0, "Print vnode details on lock violations"); 5494 5495 #ifdef KDB 5496 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ 5497 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, 5498 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); 5499 #endif 5500 5501 static void 5502 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 5503 { 5504 5505 #ifdef KDB 5506 if (vfs_badlock_backtrace) 5507 kdb_backtrace(); 5508 #endif 5509 if (vfs_badlock_vnode) 5510 vn_printf(vp, "vnode "); 5511 if (vfs_badlock_print) 5512 printf("%s: %p %s\n", str, (void *)vp, msg); 5513 if (vfs_badlock_ddb) 5514 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5515 } 5516 5517 void 5518 assert_vi_locked(struct vnode *vp, const char *str) 5519 { 5520 5521 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 5522 vfs_badlock("interlock is not locked but should be", str, vp); 5523 } 5524 5525 void 5526 assert_vi_unlocked(struct vnode *vp, const char *str) 5527 { 5528 5529 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 5530 vfs_badlock("interlock is locked but should not be", str, vp); 5531 } 5532 5533 void 5534 assert_vop_locked(struct vnode *vp, const char *str) 5535 { 5536 int locked; 5537 5538 if (KERNEL_PANICKED() || vp == NULL) 5539 return; 5540 5541 locked = VOP_ISLOCKED(vp); 5542 if (locked == 0 || locked == LK_EXCLOTHER) 5543 vfs_badlock("is not locked but should be", str, vp); 5544 } 5545 5546 void 5547 assert_vop_unlocked(struct vnode *vp, const char *str) 5548 { 5549 if (KERNEL_PANICKED() || vp == NULL) 5550 return; 5551 5552 if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 5553 vfs_badlock("is locked but should not be", str, vp); 5554 } 5555 5556 void 5557 assert_vop_elocked(struct vnode *vp, const char *str) 5558 { 5559 if (KERNEL_PANICKED() || vp == NULL) 5560 return; 5561 5562 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 5563 vfs_badlock("is not exclusive locked but should be", str, vp); 5564 } 5565 #endif /* DEBUG_VFS_LOCKS */ 5566 5567 void 5568 vop_rename_fail(struct vop_rename_args *ap) 5569 { 5570 5571 if (ap->a_tvp != 
NULL) 5572 vput(ap->a_tvp); 5573 if (ap->a_tdvp == ap->a_tvp) 5574 vrele(ap->a_tdvp); 5575 else 5576 vput(ap->a_tdvp); 5577 vrele(ap->a_fdvp); 5578 vrele(ap->a_fvp); 5579 } 5580 5581 void 5582 vop_rename_pre(void *ap) 5583 { 5584 struct vop_rename_args *a = ap; 5585 5586 #ifdef DEBUG_VFS_LOCKS 5587 if (a->a_tvp) 5588 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 5589 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 5590 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 5591 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 5592 5593 /* Check the source (from). */ 5594 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && 5595 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) 5596 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 5597 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) 5598 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); 5599 5600 /* Check the target. */ 5601 if (a->a_tvp) 5602 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 5603 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 5604 #endif 5605 /* 5606 * It may be tempting to add vn_seqc_write_begin/end calls here and 5607 * in vop_rename_post but that's not going to work out since some 5608 * filesystems relookup vnodes mid-rename. This is probably a bug. 5609 * 5610 * For now filesystems are expected to do the relevant calls after they 5611 * decide what vnodes to operate on. 5612 */ 5613 if (a->a_tdvp != a->a_fdvp) 5614 vhold(a->a_fdvp); 5615 if (a->a_tvp != a->a_fvp) 5616 vhold(a->a_fvp); 5617 vhold(a->a_tdvp); 5618 if (a->a_tvp) 5619 vhold(a->a_tvp); 5620 } 5621 5622 #ifdef DEBUG_VFS_LOCKS 5623 void 5624 vop_fplookup_vexec_debugpre(void *ap __unused) 5625 { 5626 5627 VFS_SMR_ASSERT_ENTERED(); 5628 } 5629 5630 void 5631 vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused) 5632 { 5633 5634 VFS_SMR_ASSERT_ENTERED(); 5635 } 5636 5637 void 5638 vop_fplookup_symlink_debugpre(void *ap __unused) 5639 { 5640 5641 VFS_SMR_ASSERT_ENTERED(); 5642 } 5643 5644 void 5645 vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused) 5646 { 5647 5648 VFS_SMR_ASSERT_ENTERED(); 5649 } 5650 5651 static void 5652 vop_fsync_debugprepost(struct vnode *vp, const char *name) 5653 { 5654 if (vp->v_type == VCHR) 5655 ; 5656 else if (MNT_EXTENDED_SHARED(vp->v_mount)) 5657 ASSERT_VOP_LOCKED(vp, name); 5658 else 5659 ASSERT_VOP_ELOCKED(vp, name); 5660 } 5661 5662 void 5663 vop_fsync_debugpre(void *a) 5664 { 5665 struct vop_fsync_args *ap; 5666 5667 ap = a; 5668 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5669 } 5670 5671 void 5672 vop_fsync_debugpost(void *a, int rc __unused) 5673 { 5674 struct vop_fsync_args *ap; 5675 5676 ap = a; 5677 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5678 } 5679 5680 void 5681 vop_fdatasync_debugpre(void *a) 5682 { 5683 struct vop_fdatasync_args *ap; 5684 5685 ap = a; 5686 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5687 } 5688 5689 void 5690 vop_fdatasync_debugpost(void *a, int rc __unused) 5691 { 5692 struct vop_fdatasync_args *ap; 5693 5694 ap = a; 5695 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5696 } 5697 5698 void 5699 vop_strategy_debugpre(void *ap) 5700 { 5701 struct vop_strategy_args *a; 5702 struct buf *bp; 5703 5704 a = ap; 5705 bp = a->a_bp; 5706 5707 /* 5708 * Cluster ops lock their component buffers but not the IO container. 
5709 */ 5710 if ((bp->b_flags & B_CLUSTER) != 0) 5711 return; 5712 5713 if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { 5714 if (vfs_badlock_print) 5715 printf( 5716 "VOP_STRATEGY: bp is not locked but should be\n"); 5717 if (vfs_badlock_ddb) 5718 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5719 } 5720 } 5721 5722 void 5723 vop_lock_debugpre(void *ap) 5724 { 5725 struct vop_lock1_args *a = ap; 5726 5727 if ((a->a_flags & LK_INTERLOCK) == 0) 5728 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 5729 else 5730 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 5731 } 5732 5733 void 5734 vop_lock_debugpost(void *ap, int rc) 5735 { 5736 struct vop_lock1_args *a = ap; 5737 5738 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 5739 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) 5740 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 5741 } 5742 5743 void 5744 vop_unlock_debugpre(void *ap) 5745 { 5746 struct vop_unlock_args *a = ap; 5747 struct vnode *vp = a->a_vp; 5748 5749 VNPASS(vn_get_state(vp) != VSTATE_UNINITIALIZED, vp); 5750 ASSERT_VOP_LOCKED(vp, "VOP_UNLOCK"); 5751 } 5752 5753 void 5754 vop_need_inactive_debugpre(void *ap) 5755 { 5756 struct vop_need_inactive_args *a = ap; 5757 5758 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 5759 } 5760 5761 void 5762 vop_need_inactive_debugpost(void *ap, int rc) 5763 { 5764 struct vop_need_inactive_args *a = ap; 5765 5766 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 5767 } 5768 #endif 5769 5770 void 5771 vop_create_pre(void *ap) 5772 { 5773 struct vop_create_args *a; 5774 struct vnode *dvp; 5775 5776 a = ap; 5777 dvp = a->a_dvp; 5778 vn_seqc_write_begin(dvp); 5779 } 5780 5781 void 5782 vop_create_post(void *ap, int rc) 5783 { 5784 struct vop_create_args *a; 5785 struct vnode *dvp; 5786 5787 a = ap; 5788 dvp = a->a_dvp; 5789 vn_seqc_write_end(dvp); 5790 if (!rc) 5791 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5792 } 5793 5794 void 5795 vop_whiteout_pre(void *ap) 5796 { 5797 struct vop_whiteout_args *a; 5798 struct vnode *dvp; 5799 5800 a = ap; 5801 dvp = a->a_dvp; 5802 vn_seqc_write_begin(dvp); 5803 } 5804 5805 void 5806 vop_whiteout_post(void *ap, int rc) 5807 { 5808 struct vop_whiteout_args *a; 5809 struct vnode *dvp; 5810 5811 a = ap; 5812 dvp = a->a_dvp; 5813 vn_seqc_write_end(dvp); 5814 } 5815 5816 void 5817 vop_deleteextattr_pre(void *ap) 5818 { 5819 struct vop_deleteextattr_args *a; 5820 struct vnode *vp; 5821 5822 a = ap; 5823 vp = a->a_vp; 5824 vn_seqc_write_begin(vp); 5825 } 5826 5827 void 5828 vop_deleteextattr_post(void *ap, int rc) 5829 { 5830 struct vop_deleteextattr_args *a; 5831 struct vnode *vp; 5832 5833 a = ap; 5834 vp = a->a_vp; 5835 vn_seqc_write_end(vp); 5836 if (!rc) 5837 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 5838 } 5839 5840 void 5841 vop_link_pre(void *ap) 5842 { 5843 struct vop_link_args *a; 5844 struct vnode *vp, *tdvp; 5845 5846 a = ap; 5847 vp = a->a_vp; 5848 tdvp = a->a_tdvp; 5849 vn_seqc_write_begin(vp); 5850 vn_seqc_write_begin(tdvp); 5851 } 5852 5853 void 5854 vop_link_post(void *ap, int rc) 5855 { 5856 struct vop_link_args *a; 5857 struct vnode *vp, *tdvp; 5858 5859 a = ap; 5860 vp = a->a_vp; 5861 tdvp = a->a_tdvp; 5862 vn_seqc_write_end(vp); 5863 vn_seqc_write_end(tdvp); 5864 if (!rc) { 5865 VFS_KNOTE_LOCKED(vp, NOTE_LINK); 5866 VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); 5867 } 5868 } 5869 5870 void 5871 vop_mkdir_pre(void *ap) 5872 { 5873 struct vop_mkdir_args *a; 5874 struct vnode *dvp; 5875 5876 a = ap; 5877 dvp = a->a_dvp; 5878 vn_seqc_write_begin(dvp); 5879 } 5880 5881 void 5882 vop_mkdir_post(void *ap, int rc) 5883 { 5884 struct vop_mkdir_args *a; 5885 
struct vnode *dvp; 5886 5887 a = ap; 5888 dvp = a->a_dvp; 5889 vn_seqc_write_end(dvp); 5890 if (!rc) 5891 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 5892 } 5893 5894 #ifdef DEBUG_VFS_LOCKS 5895 void 5896 vop_mkdir_debugpost(void *ap, int rc) 5897 { 5898 struct vop_mkdir_args *a; 5899 5900 a = ap; 5901 if (!rc) 5902 cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp); 5903 } 5904 #endif 5905 5906 void 5907 vop_mknod_pre(void *ap) 5908 { 5909 struct vop_mknod_args *a; 5910 struct vnode *dvp; 5911 5912 a = ap; 5913 dvp = a->a_dvp; 5914 vn_seqc_write_begin(dvp); 5915 } 5916 5917 void 5918 vop_mknod_post(void *ap, int rc) 5919 { 5920 struct vop_mknod_args *a; 5921 struct vnode *dvp; 5922 5923 a = ap; 5924 dvp = a->a_dvp; 5925 vn_seqc_write_end(dvp); 5926 if (!rc) 5927 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5928 } 5929 5930 void 5931 vop_reclaim_post(void *ap, int rc) 5932 { 5933 struct vop_reclaim_args *a; 5934 struct vnode *vp; 5935 5936 a = ap; 5937 vp = a->a_vp; 5938 ASSERT_VOP_IN_SEQC(vp); 5939 if (!rc) 5940 VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); 5941 } 5942 5943 void 5944 vop_remove_pre(void *ap) 5945 { 5946 struct vop_remove_args *a; 5947 struct vnode *dvp, *vp; 5948 5949 a = ap; 5950 dvp = a->a_dvp; 5951 vp = a->a_vp; 5952 vn_seqc_write_begin(dvp); 5953 vn_seqc_write_begin(vp); 5954 } 5955 5956 void 5957 vop_remove_post(void *ap, int rc) 5958 { 5959 struct vop_remove_args *a; 5960 struct vnode *dvp, *vp; 5961 5962 a = ap; 5963 dvp = a->a_dvp; 5964 vp = a->a_vp; 5965 vn_seqc_write_end(dvp); 5966 vn_seqc_write_end(vp); 5967 if (!rc) { 5968 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5969 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 5970 } 5971 } 5972 5973 void 5974 vop_rename_post(void *ap, int rc) 5975 { 5976 struct vop_rename_args *a = ap; 5977 long hint; 5978 5979 if (!rc) { 5980 hint = NOTE_WRITE; 5981 if (a->a_fdvp == a->a_tdvp) { 5982 if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) 5983 hint |= NOTE_LINK; 5984 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 5985 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 5986 } else { 5987 hint |= NOTE_EXTEND; 5988 if (a->a_fvp->v_type == VDIR) 5989 hint |= NOTE_LINK; 5990 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 5991 5992 if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && 5993 a->a_tvp->v_type == VDIR) 5994 hint &= ~NOTE_LINK; 5995 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 5996 } 5997 5998 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 5999 if (a->a_tvp) 6000 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 6001 } 6002 if (a->a_tdvp != a->a_fdvp) 6003 vdrop(a->a_fdvp); 6004 if (a->a_tvp != a->a_fvp) 6005 vdrop(a->a_fvp); 6006 vdrop(a->a_tdvp); 6007 if (a->a_tvp) 6008 vdrop(a->a_tvp); 6009 } 6010 6011 void 6012 vop_rmdir_pre(void *ap) 6013 { 6014 struct vop_rmdir_args *a; 6015 struct vnode *dvp, *vp; 6016 6017 a = ap; 6018 dvp = a->a_dvp; 6019 vp = a->a_vp; 6020 vn_seqc_write_begin(dvp); 6021 vn_seqc_write_begin(vp); 6022 } 6023 6024 void 6025 vop_rmdir_post(void *ap, int rc) 6026 { 6027 struct vop_rmdir_args *a; 6028 struct vnode *dvp, *vp; 6029 6030 a = ap; 6031 dvp = a->a_dvp; 6032 vp = a->a_vp; 6033 vn_seqc_write_end(dvp); 6034 vn_seqc_write_end(vp); 6035 if (!rc) { 6036 vp->v_vflag |= VV_UNLINKED; 6037 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 6038 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 6039 } 6040 } 6041 6042 void 6043 vop_setattr_pre(void *ap) 6044 { 6045 struct vop_setattr_args *a; 6046 struct vnode *vp; 6047 6048 a = ap; 6049 vp = a->a_vp; 6050 vn_seqc_write_begin(vp); 6051 } 6052 6053 void 6054 vop_setattr_post(void *ap, int rc) 6055 { 6056 struct vop_setattr_args *a; 6057 struct vnode *vp; 6058 6059 
a = ap; 6060 vp = a->a_vp; 6061 vn_seqc_write_end(vp); 6062 if (!rc) 6063 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 6064 } 6065 6066 void 6067 vop_setacl_pre(void *ap) 6068 { 6069 struct vop_setacl_args *a; 6070 struct vnode *vp; 6071 6072 a = ap; 6073 vp = a->a_vp; 6074 vn_seqc_write_begin(vp); 6075 } 6076 6077 void 6078 vop_setacl_post(void *ap, int rc __unused) 6079 { 6080 struct vop_setacl_args *a; 6081 struct vnode *vp; 6082 6083 a = ap; 6084 vp = a->a_vp; 6085 vn_seqc_write_end(vp); 6086 } 6087 6088 void 6089 vop_setextattr_pre(void *ap) 6090 { 6091 struct vop_setextattr_args *a; 6092 struct vnode *vp; 6093 6094 a = ap; 6095 vp = a->a_vp; 6096 vn_seqc_write_begin(vp); 6097 } 6098 6099 void 6100 vop_setextattr_post(void *ap, int rc) 6101 { 6102 struct vop_setextattr_args *a; 6103 struct vnode *vp; 6104 6105 a = ap; 6106 vp = a->a_vp; 6107 vn_seqc_write_end(vp); 6108 if (!rc) 6109 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 6110 } 6111 6112 void 6113 vop_symlink_pre(void *ap) 6114 { 6115 struct vop_symlink_args *a; 6116 struct vnode *dvp; 6117 6118 a = ap; 6119 dvp = a->a_dvp; 6120 vn_seqc_write_begin(dvp); 6121 } 6122 6123 void 6124 vop_symlink_post(void *ap, int rc) 6125 { 6126 struct vop_symlink_args *a; 6127 struct vnode *dvp; 6128 6129 a = ap; 6130 dvp = a->a_dvp; 6131 vn_seqc_write_end(dvp); 6132 if (!rc) 6133 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6134 } 6135 6136 void 6137 vop_open_post(void *ap, int rc) 6138 { 6139 struct vop_open_args *a = ap; 6140 6141 if (!rc) 6142 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); 6143 } 6144 6145 void 6146 vop_close_post(void *ap, int rc) 6147 { 6148 struct vop_close_args *a = ap; 6149 6150 if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ 6151 !VN_IS_DOOMED(a->a_vp))) { 6152 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 6153 NOTE_CLOSE_WRITE : NOTE_CLOSE); 6154 } 6155 } 6156 6157 void 6158 vop_read_post(void *ap, int rc) 6159 { 6160 struct vop_read_args *a = ap; 6161 6162 if (!rc) 6163 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6164 } 6165 6166 void 6167 vop_read_pgcache_post(void *ap, int rc) 6168 { 6169 struct vop_read_pgcache_args *a = ap; 6170 6171 if (!rc) 6172 VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ); 6173 } 6174 6175 void 6176 vop_readdir_post(void *ap, int rc) 6177 { 6178 struct vop_readdir_args *a = ap; 6179 6180 if (!rc) 6181 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6182 } 6183 6184 static struct knlist fs_knlist; 6185 6186 static void 6187 vfs_event_init(void *arg) 6188 { 6189 knlist_init_mtx(&fs_knlist, NULL); 6190 } 6191 /* XXX - correct order? 
*/ 6192 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 6193 6194 void 6195 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 6196 { 6197 6198 KNOTE_UNLOCKED(&fs_knlist, event); 6199 } 6200 6201 static int filt_fsattach(struct knote *kn); 6202 static void filt_fsdetach(struct knote *kn); 6203 static int filt_fsevent(struct knote *kn, long hint); 6204 6205 struct filterops fs_filtops = { 6206 .f_isfd = 0, 6207 .f_attach = filt_fsattach, 6208 .f_detach = filt_fsdetach, 6209 .f_event = filt_fsevent 6210 }; 6211 6212 static int 6213 filt_fsattach(struct knote *kn) 6214 { 6215 6216 kn->kn_flags |= EV_CLEAR; 6217 knlist_add(&fs_knlist, kn, 0); 6218 return (0); 6219 } 6220 6221 static void 6222 filt_fsdetach(struct knote *kn) 6223 { 6224 6225 knlist_remove(&fs_knlist, kn, 0); 6226 } 6227 6228 static int 6229 filt_fsevent(struct knote *kn, long hint) 6230 { 6231 6232 kn->kn_fflags |= kn->kn_sfflags & hint; 6233 6234 return (kn->kn_fflags != 0); 6235 } 6236 6237 static int 6238 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 6239 { 6240 struct vfsidctl vc; 6241 int error; 6242 struct mount *mp; 6243 6244 error = SYSCTL_IN(req, &vc, sizeof(vc)); 6245 if (error) 6246 return (error); 6247 if (vc.vc_vers != VFS_CTL_VERS1) 6248 return (EINVAL); 6249 mp = vfs_getvfs(&vc.vc_fsid); 6250 if (mp == NULL) 6251 return (ENOENT); 6252 /* ensure that a specific sysctl goes to the right filesystem. */ 6253 if (strcmp(vc.vc_fstypename, "*") != 0 && 6254 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 6255 vfs_rel(mp); 6256 return (EINVAL); 6257 } 6258 VCTLTOREQ(&vc, req); 6259 error = VFS_SYSCTL(mp, vc.vc_op, req); 6260 vfs_rel(mp); 6261 return (error); 6262 } 6263 6264 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, 6265 NULL, 0, sysctl_vfs_ctl, "", 6266 "Sysctl by fsid"); 6267 6268 /* 6269 * Function to initialize a va_filerev field sensibly. 6270 * XXX: Wouldn't a random number make a lot more sense ?? 
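 * (Editor's note, added for illustration: getbinuptime() returns the time
 * since boot as a struct bintime.  The value built below packs the uptime
 * seconds into the upper 32 bits and the top 32 bits of the fraction into
 * the lower 32 bits, so an uptime of 2.5 seconds yields roughly
 * ((u_quad_t)2 << 32) | 0x80000000, and successive calls within one boot
 * produce non-decreasing values.)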
6271 */ 6272 u_quad_t 6273 init_va_filerev(void) 6274 { 6275 struct bintime bt; 6276 6277 getbinuptime(&bt); 6278 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 6279 } 6280 6281 static int filt_vfsread(struct knote *kn, long hint); 6282 static int filt_vfswrite(struct knote *kn, long hint); 6283 static int filt_vfsvnode(struct knote *kn, long hint); 6284 static void filt_vfsdetach(struct knote *kn); 6285 static struct filterops vfsread_filtops = { 6286 .f_isfd = 1, 6287 .f_detach = filt_vfsdetach, 6288 .f_event = filt_vfsread 6289 }; 6290 static struct filterops vfswrite_filtops = { 6291 .f_isfd = 1, 6292 .f_detach = filt_vfsdetach, 6293 .f_event = filt_vfswrite 6294 }; 6295 static struct filterops vfsvnode_filtops = { 6296 .f_isfd = 1, 6297 .f_detach = filt_vfsdetach, 6298 .f_event = filt_vfsvnode 6299 }; 6300 6301 static void 6302 vfs_knllock(void *arg) 6303 { 6304 struct vnode *vp = arg; 6305 6306 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 6307 } 6308 6309 static void 6310 vfs_knlunlock(void *arg) 6311 { 6312 struct vnode *vp = arg; 6313 6314 VOP_UNLOCK(vp); 6315 } 6316 6317 static void 6318 vfs_knl_assert_lock(void *arg, int what) 6319 { 6320 #ifdef DEBUG_VFS_LOCKS 6321 struct vnode *vp = arg; 6322 6323 if (what == LA_LOCKED) 6324 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); 6325 else 6326 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); 6327 #endif 6328 } 6329 6330 int 6331 vfs_kqfilter(struct vop_kqfilter_args *ap) 6332 { 6333 struct vnode *vp = ap->a_vp; 6334 struct knote *kn = ap->a_kn; 6335 struct knlist *knl; 6336 6337 KASSERT(vp->v_type != VFIFO || (kn->kn_filter != EVFILT_READ && 6338 kn->kn_filter != EVFILT_WRITE), 6339 ("READ/WRITE filter on a FIFO leaked through")); 6340 switch (kn->kn_filter) { 6341 case EVFILT_READ: 6342 kn->kn_fop = &vfsread_filtops; 6343 break; 6344 case EVFILT_WRITE: 6345 kn->kn_fop = &vfswrite_filtops; 6346 break; 6347 case EVFILT_VNODE: 6348 kn->kn_fop = &vfsvnode_filtops; 6349 break; 6350 default: 6351 return (EINVAL); 6352 } 6353 6354 kn->kn_hook = (caddr_t)vp; 6355 6356 v_addpollinfo(vp); 6357 if (vp->v_pollinfo == NULL) 6358 return (ENOMEM); 6359 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 6360 vhold(vp); 6361 knlist_add(knl, kn, 0); 6362 6363 return (0); 6364 } 6365 6366 /* 6367 * Detach knote from vnode 6368 */ 6369 static void 6370 filt_vfsdetach(struct knote *kn) 6371 { 6372 struct vnode *vp = (struct vnode *)kn->kn_hook; 6373 6374 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 6375 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 6376 vdrop(vp); 6377 } 6378 6379 /*ARGSUSED*/ 6380 static int 6381 filt_vfsread(struct knote *kn, long hint) 6382 { 6383 struct vnode *vp = (struct vnode *)kn->kn_hook; 6384 off_t size; 6385 int res; 6386 6387 /* 6388 * filesystem is gone, so set the EOF flag and schedule 6389 * the knote for deletion. 
6390 */ 6391 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6392 VI_LOCK(vp); 6393 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6394 VI_UNLOCK(vp); 6395 return (1); 6396 } 6397 6398 if (vn_getsize_locked(vp, &size, curthread->td_ucred) != 0) 6399 return (0); 6400 6401 VI_LOCK(vp); 6402 kn->kn_data = size - kn->kn_fp->f_offset; 6403 res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; 6404 VI_UNLOCK(vp); 6405 return (res); 6406 } 6407 6408 /*ARGSUSED*/ 6409 static int 6410 filt_vfswrite(struct knote *kn, long hint) 6411 { 6412 struct vnode *vp = (struct vnode *)kn->kn_hook; 6413 6414 VI_LOCK(vp); 6415 6416 /* 6417 * filesystem is gone, so set the EOF flag and schedule 6418 * the knote for deletion. 6419 */ 6420 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) 6421 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6422 6423 kn->kn_data = 0; 6424 VI_UNLOCK(vp); 6425 return (1); 6426 } 6427 6428 static int 6429 filt_vfsvnode(struct knote *kn, long hint) 6430 { 6431 struct vnode *vp = (struct vnode *)kn->kn_hook; 6432 int res; 6433 6434 VI_LOCK(vp); 6435 if (kn->kn_sfflags & hint) 6436 kn->kn_fflags |= hint; 6437 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6438 kn->kn_flags |= EV_EOF; 6439 VI_UNLOCK(vp); 6440 return (1); 6441 } 6442 res = (kn->kn_fflags != 0); 6443 VI_UNLOCK(vp); 6444 return (res); 6445 } 6446 6447 /* 6448 * Returns whether the directory is empty or not. 6449 * If it is empty, the return value is 0; otherwise 6450 * the return value is an error value (which may 6451 * be ENOTEMPTY). 6452 */ 6453 int 6454 vfs_emptydir(struct vnode *vp) 6455 { 6456 struct uio uio; 6457 struct iovec iov; 6458 struct dirent *dirent, *dp, *endp; 6459 int error, eof; 6460 6461 error = 0; 6462 eof = 0; 6463 6464 ASSERT_VOP_LOCKED(vp, "vfs_emptydir"); 6465 VNASSERT(vp->v_type == VDIR, vp, ("vp is not a directory")); 6466 6467 dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK); 6468 iov.iov_base = dirent; 6469 iov.iov_len = sizeof(struct dirent); 6470 6471 uio.uio_iov = &iov; 6472 uio.uio_iovcnt = 1; 6473 uio.uio_offset = 0; 6474 uio.uio_resid = sizeof(struct dirent); 6475 uio.uio_segflg = UIO_SYSSPACE; 6476 uio.uio_rw = UIO_READ; 6477 uio.uio_td = curthread; 6478 6479 while (eof == 0 && error == 0) { 6480 error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof, 6481 NULL, NULL); 6482 if (error != 0) 6483 break; 6484 endp = (void *)((uint8_t *)dirent + 6485 sizeof(struct dirent) - uio.uio_resid); 6486 for (dp = dirent; dp < endp; 6487 dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) { 6488 if (dp->d_type == DT_WHT) 6489 continue; 6490 if (dp->d_namlen == 0) 6491 continue; 6492 if (dp->d_type != DT_DIR && 6493 dp->d_type != DT_UNKNOWN) { 6494 error = ENOTEMPTY; 6495 break; 6496 } 6497 if (dp->d_namlen > 2) { 6498 error = ENOTEMPTY; 6499 break; 6500 } 6501 if (dp->d_namlen == 1 && 6502 dp->d_name[0] != '.') { 6503 error = ENOTEMPTY; 6504 break; 6505 } 6506 if (dp->d_namlen == 2 && 6507 dp->d_name[1] != '.') { 6508 error = ENOTEMPTY; 6509 break; 6510 } 6511 uio.uio_resid = sizeof(struct dirent); 6512 } 6513 } 6514 free(dirent, M_TEMP); 6515 return (error); 6516 } 6517 6518 int 6519 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) 6520 { 6521 int error; 6522 6523 if (dp->d_reclen > ap->a_uio->uio_resid) 6524 return (ENAMETOOLONG); 6525 error = uiomove(dp, dp->d_reclen, ap->a_uio); 6526 if (error) { 6527 if (ap->a_ncookies != NULL) { 6528 if (ap->a_cookies != NULL) 6529 free(ap->a_cookies, M_TEMP); 6530 ap->a_cookies = NULL; 6531 
*ap->a_ncookies = 0; 6532 } 6533 return (error); 6534 } 6535 if (ap->a_ncookies == NULL) 6536 return (0); 6537 6538 KASSERT(ap->a_cookies, 6539 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); 6540 6541 *ap->a_cookies = realloc(*ap->a_cookies, 6542 (*ap->a_ncookies + 1) * sizeof(uint64_t), M_TEMP, M_WAITOK | M_ZERO); 6543 (*ap->a_cookies)[*ap->a_ncookies] = off; 6544 *ap->a_ncookies += 1; 6545 return (0); 6546 } 6547 6548 /* 6549 * The purpose of this routine is to remove granularity from accmode_t, 6550 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, 6551 * VADMIN and VAPPEND. 6552 * 6553 * If it returns 0, the caller is supposed to continue with the usual 6554 * access checks using 'accmode' as modified by this routine. If it 6555 * returns nonzero value, the caller is supposed to return that value 6556 * as errno. 6557 * 6558 * Note that after this routine runs, accmode may be zero. 6559 */ 6560 int 6561 vfs_unixify_accmode(accmode_t *accmode) 6562 { 6563 /* 6564 * There is no way to specify explicit "deny" rule using 6565 * file mode or POSIX.1e ACLs. 6566 */ 6567 if (*accmode & VEXPLICIT_DENY) { 6568 *accmode = 0; 6569 return (0); 6570 } 6571 6572 /* 6573 * None of these can be translated into usual access bits. 6574 * Also, the common case for NFSv4 ACLs is to not contain 6575 * either of these bits. Caller should check for VWRITE 6576 * on the containing directory instead. 6577 */ 6578 if (*accmode & (VDELETE_CHILD | VDELETE)) 6579 return (EPERM); 6580 6581 if (*accmode & VADMIN_PERMS) { 6582 *accmode &= ~VADMIN_PERMS; 6583 *accmode |= VADMIN; 6584 } 6585 6586 /* 6587 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL 6588 * or VSYNCHRONIZE using file mode or POSIX.1e ACL. 6589 */ 6590 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); 6591 6592 return (0); 6593 } 6594 6595 /* 6596 * Clear out a doomed vnode (if any) and replace it with a new one as long 6597 * as the fs is not being unmounted. Return the root vnode to the caller. 6598 */ 6599 static int __noinline 6600 vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) 6601 { 6602 struct vnode *vp; 6603 int error; 6604 6605 restart: 6606 if (mp->mnt_rootvnode != NULL) { 6607 MNT_ILOCK(mp); 6608 vp = mp->mnt_rootvnode; 6609 if (vp != NULL) { 6610 if (!VN_IS_DOOMED(vp)) { 6611 vrefact(vp); 6612 MNT_IUNLOCK(mp); 6613 error = vn_lock(vp, flags); 6614 if (error == 0) { 6615 *vpp = vp; 6616 return (0); 6617 } 6618 vrele(vp); 6619 goto restart; 6620 } 6621 /* 6622 * Clear the old one. 
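 * (Editor's note, added for clarity: the stale vnode is released a few
 * lines below, after vfs_op_barrier_wait() has let any lockless
 * vfs_cache_root() readers still using the old pointer drain, and a
 * replacement root is then requested from VFS_CACHEDROOT().)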
6623 */ 6624 mp->mnt_rootvnode = NULL; 6625 } 6626 MNT_IUNLOCK(mp); 6627 if (vp != NULL) { 6628 vfs_op_barrier_wait(mp); 6629 vrele(vp); 6630 } 6631 } 6632 error = VFS_CACHEDROOT(mp, flags, vpp); 6633 if (error != 0) 6634 return (error); 6635 if (mp->mnt_vfs_ops == 0) { 6636 MNT_ILOCK(mp); 6637 if (mp->mnt_vfs_ops != 0) { 6638 MNT_IUNLOCK(mp); 6639 return (0); 6640 } 6641 if (mp->mnt_rootvnode == NULL) { 6642 vrefact(*vpp); 6643 mp->mnt_rootvnode = *vpp; 6644 } else { 6645 if (mp->mnt_rootvnode != *vpp) { 6646 if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { 6647 panic("%s: mismatch between vnode returned " 6648 " by VFS_CACHEDROOT and the one cached " 6649 " (%p != %p)", 6650 __func__, *vpp, mp->mnt_rootvnode); 6651 } 6652 } 6653 } 6654 MNT_IUNLOCK(mp); 6655 } 6656 return (0); 6657 } 6658 6659 int 6660 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) 6661 { 6662 struct mount_pcpu *mpcpu; 6663 struct vnode *vp; 6664 int error; 6665 6666 if (!vfs_op_thread_enter(mp, mpcpu)) 6667 return (vfs_cache_root_fallback(mp, flags, vpp)); 6668 vp = atomic_load_ptr(&mp->mnt_rootvnode); 6669 if (vp == NULL || VN_IS_DOOMED(vp)) { 6670 vfs_op_thread_exit(mp, mpcpu); 6671 return (vfs_cache_root_fallback(mp, flags, vpp)); 6672 } 6673 vrefact(vp); 6674 vfs_op_thread_exit(mp, mpcpu); 6675 error = vn_lock(vp, flags); 6676 if (error != 0) { 6677 vrele(vp); 6678 return (vfs_cache_root_fallback(mp, flags, vpp)); 6679 } 6680 *vpp = vp; 6681 return (0); 6682 } 6683 6684 struct vnode * 6685 vfs_cache_root_clear(struct mount *mp) 6686 { 6687 struct vnode *vp; 6688 6689 /* 6690 * ops > 0 guarantees there is nobody who can see this vnode 6691 */ 6692 MPASS(mp->mnt_vfs_ops > 0); 6693 vp = mp->mnt_rootvnode; 6694 if (vp != NULL) 6695 vn_seqc_write_begin(vp); 6696 mp->mnt_rootvnode = NULL; 6697 return (vp); 6698 } 6699 6700 void 6701 vfs_cache_root_set(struct mount *mp, struct vnode *vp) 6702 { 6703 6704 MPASS(mp->mnt_vfs_ops > 0); 6705 vrefact(vp); 6706 mp->mnt_rootvnode = vp; 6707 } 6708 6709 /* 6710 * These are helper functions for filesystems to traverse all 6711 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. 6712 * 6713 * This interface replaces MNT_VNODE_FOREACH. 6714 */ 6715 6716 struct vnode * 6717 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) 6718 { 6719 struct vnode *vp; 6720 6721 maybe_yield(); 6722 MNT_ILOCK(mp); 6723 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6724 for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; 6725 vp = TAILQ_NEXT(vp, v_nmntvnodes)) { 6726 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ 6727 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 6728 continue; 6729 VI_LOCK(vp); 6730 if (VN_IS_DOOMED(vp)) { 6731 VI_UNLOCK(vp); 6732 continue; 6733 } 6734 break; 6735 } 6736 if (vp == NULL) { 6737 __mnt_vnode_markerfree_all(mvp, mp); 6738 /* MNT_IUNLOCK(mp); -- done in above function */ 6739 mtx_assert(MNT_MTX(mp), MA_NOTOWNED); 6740 return (NULL); 6741 } 6742 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 6743 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 6744 MNT_IUNLOCK(mp); 6745 return (vp); 6746 } 6747 6748 struct vnode * 6749 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) 6750 { 6751 struct vnode *vp; 6752 6753 *mvp = vn_alloc_marker(mp); 6754 MNT_ILOCK(mp); 6755 MNT_REF(mp); 6756 6757 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 6758 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. 
*/ 6759 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 6760 continue; 6761 VI_LOCK(vp); 6762 if (VN_IS_DOOMED(vp)) { 6763 VI_UNLOCK(vp); 6764 continue; 6765 } 6766 break; 6767 } 6768 if (vp == NULL) { 6769 MNT_REL(mp); 6770 MNT_IUNLOCK(mp); 6771 vn_free_marker(*mvp); 6772 *mvp = NULL; 6773 return (NULL); 6774 } 6775 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 6776 MNT_IUNLOCK(mp); 6777 return (vp); 6778 } 6779 6780 void 6781 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) 6782 { 6783 6784 if (*mvp == NULL) { 6785 MNT_IUNLOCK(mp); 6786 return; 6787 } 6788 6789 mtx_assert(MNT_MTX(mp), MA_OWNED); 6790 6791 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6792 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 6793 MNT_REL(mp); 6794 MNT_IUNLOCK(mp); 6795 vn_free_marker(*mvp); 6796 *mvp = NULL; 6797 } 6798 6799 /* 6800 * These are helper functions for filesystems to traverse their 6801 * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h 6802 */ 6803 static void 6804 mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 6805 { 6806 6807 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6808 6809 MNT_ILOCK(mp); 6810 MNT_REL(mp); 6811 MNT_IUNLOCK(mp); 6812 vn_free_marker(*mvp); 6813 *mvp = NULL; 6814 } 6815 6816 /* 6817 * Relock the mp mount vnode list lock with the vp vnode interlock in the 6818 * conventional lock order during mnt_vnode_next_lazy iteration. 6819 * 6820 * On entry, the mount vnode list lock is held and the vnode interlock is not. 6821 * The list lock is dropped and reacquired. On success, both locks are held. 6822 * On failure, the mount vnode list lock is held but the vnode interlock is 6823 * not, and the procedure may have yielded. 6824 */ 6825 static bool 6826 mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, 6827 struct vnode *vp) 6828 { 6829 6830 VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && 6831 TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, 6832 ("%s: bad marker", __func__)); 6833 VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, 6834 ("%s: inappropriate vnode", __func__)); 6835 ASSERT_VI_UNLOCKED(vp, __func__); 6836 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 6837 6838 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); 6839 TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); 6840 6841 /* 6842 * Note we may be racing against vdrop which transitioned the hold 6843 * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, 6844 * if we are the only user after we get the interlock we will just 6845 * vdrop. 6846 */ 6847 vhold(vp); 6848 mtx_unlock(&mp->mnt_listmtx); 6849 VI_LOCK(vp); 6850 if (VN_IS_DOOMED(vp)) { 6851 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 6852 goto out_lost; 6853 } 6854 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 6855 /* 6856 * There is nothing to do if we are the last user. 
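 * (Editor's note, added for clarity: refcount_release_if_not_last() drops
 * our hold only when it is not the last one; if we are the last user it
 * leaves the count untouched and we fall through to out_lost, where
 * vdropl() releases the final hold and the caller restarts the scan from
 * the marker.)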
6857 */ 6858 if (!refcount_release_if_not_last(&vp->v_holdcnt)) 6859 goto out_lost; 6860 mtx_lock(&mp->mnt_listmtx); 6861 return (true); 6862 out_lost: 6863 vdropl(vp); 6864 maybe_yield(); 6865 mtx_lock(&mp->mnt_listmtx); 6866 return (false); 6867 } 6868 6869 static struct vnode * 6870 mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 6871 void *cbarg) 6872 { 6873 struct vnode *vp; 6874 6875 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 6876 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6877 restart: 6878 vp = TAILQ_NEXT(*mvp, v_lazylist); 6879 while (vp != NULL) { 6880 if (vp->v_type == VMARKER) { 6881 vp = TAILQ_NEXT(vp, v_lazylist); 6882 continue; 6883 } 6884 /* 6885 * See if we want to process the vnode. Note we may encounter a 6886 * long string of vnodes we don't care about and hog the list 6887 * as a result. Check for it and requeue the marker. 6888 */ 6889 VNPASS(!VN_IS_DOOMED(vp), vp); 6890 if (!cb(vp, cbarg)) { 6891 if (!should_yield()) { 6892 vp = TAILQ_NEXT(vp, v_lazylist); 6893 continue; 6894 } 6895 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, 6896 v_lazylist); 6897 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, 6898 v_lazylist); 6899 mtx_unlock(&mp->mnt_listmtx); 6900 kern_yield(PRI_USER); 6901 mtx_lock(&mp->mnt_listmtx); 6902 goto restart; 6903 } 6904 /* 6905 * Try-lock because this is the wrong lock order. 6906 */ 6907 if (!VI_TRYLOCK(vp) && 6908 !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) 6909 goto restart; 6910 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); 6911 KASSERT(vp->v_mount == mp || vp->v_mount == NULL, 6912 ("alien vnode on the lazy list %p %p", vp, mp)); 6913 VNPASS(vp->v_mount == mp, vp); 6914 VNPASS(!VN_IS_DOOMED(vp), vp); 6915 break; 6916 } 6917 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 6918 6919 /* Check if we are done */ 6920 if (vp == NULL) { 6921 mtx_unlock(&mp->mnt_listmtx); 6922 mnt_vnode_markerfree_lazy(mvp, mp); 6923 return (NULL); 6924 } 6925 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); 6926 mtx_unlock(&mp->mnt_listmtx); 6927 ASSERT_VI_LOCKED(vp, "lazy iter"); 6928 return (vp); 6929 } 6930 6931 struct vnode * 6932 __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 6933 void *cbarg) 6934 { 6935 6936 maybe_yield(); 6937 mtx_lock(&mp->mnt_listmtx); 6938 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 6939 } 6940 6941 struct vnode * 6942 __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 6943 void *cbarg) 6944 { 6945 struct vnode *vp; 6946 6947 if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) 6948 return (NULL); 6949 6950 *mvp = vn_alloc_marker(mp); 6951 MNT_ILOCK(mp); 6952 MNT_REF(mp); 6953 MNT_IUNLOCK(mp); 6954 6955 mtx_lock(&mp->mnt_listmtx); 6956 vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); 6957 if (vp == NULL) { 6958 mtx_unlock(&mp->mnt_listmtx); 6959 mnt_vnode_markerfree_lazy(mvp, mp); 6960 return (NULL); 6961 } 6962 TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); 6963 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 6964 } 6965 6966 void 6967 __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 6968 { 6969 6970 if (*mvp == NULL) 6971 return; 6972 6973 mtx_lock(&mp->mnt_listmtx); 6974 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 6975 mtx_unlock(&mp->mnt_listmtx); 6976 mnt_vnode_markerfree_lazy(mvp, mp); 6977 } 6978 6979 int 6980 vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) 6981 { 6982 6983 if ((cnp->cn_flags & NOEXECCHECK) != 0) { 6984 cnp->cn_flags &= ~NOEXECCHECK; 6985 
return (0); 6986 } 6987 6988 return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, curthread)); 6989 } 6990 6991 /* 6992 * Do not use this variant unless you have means other than the hold count 6993 * to prevent the vnode from getting freed. 6994 */ 6995 void 6996 vn_seqc_write_begin_locked(struct vnode *vp) 6997 { 6998 6999 ASSERT_VI_LOCKED(vp, __func__); 7000 VNPASS(vp->v_holdcnt > 0, vp); 7001 VNPASS(vp->v_seqc_users >= 0, vp); 7002 vp->v_seqc_users++; 7003 if (vp->v_seqc_users == 1) 7004 seqc_sleepable_write_begin(&vp->v_seqc); 7005 } 7006 7007 void 7008 vn_seqc_write_begin(struct vnode *vp) 7009 { 7010 7011 VI_LOCK(vp); 7012 vn_seqc_write_begin_locked(vp); 7013 VI_UNLOCK(vp); 7014 } 7015 7016 void 7017 vn_seqc_write_end_locked(struct vnode *vp) 7018 { 7019 7020 ASSERT_VI_LOCKED(vp, __func__); 7021 VNPASS(vp->v_seqc_users > 0, vp); 7022 vp->v_seqc_users--; 7023 if (vp->v_seqc_users == 0) 7024 seqc_sleepable_write_end(&vp->v_seqc); 7025 } 7026 7027 void 7028 vn_seqc_write_end(struct vnode *vp) 7029 { 7030 7031 VI_LOCK(vp); 7032 vn_seqc_write_end_locked(vp); 7033 VI_UNLOCK(vp); 7034 } 7035 7036 /* 7037 * Special case handling for allocating and freeing vnodes. 7038 * 7039 * The counter remains unchanged on free so that a doomed vnode will 7040 * keep testing as in-modify (seqc_in_modify()) for as long as it is accessible with SMR. 7041 */ 7042 static void 7043 vn_seqc_init(struct vnode *vp) 7044 { 7045 7046 vp->v_seqc = 0; 7047 vp->v_seqc_users = 0; 7048 } 7049 7050 static void 7051 vn_seqc_write_end_free(struct vnode *vp) 7052 { 7053 7054 VNPASS(seqc_in_modify(vp->v_seqc), vp); 7055 VNPASS(vp->v_seqc_users == 1, vp); 7056 } 7057 7058 void 7059 vn_irflag_set_locked(struct vnode *vp, short toset) 7060 { 7061 short flags; 7062 7063 ASSERT_VI_LOCKED(vp, __func__); 7064 flags = vn_irflag_read(vp); 7065 VNASSERT((flags & toset) == 0, vp, 7066 ("%s: some of the passed flags already set (have %d, passed %d)\n", 7067 __func__, flags, toset)); 7068 atomic_store_short(&vp->v_irflag, flags | toset); 7069 } 7070 7071 void 7072 vn_irflag_set(struct vnode *vp, short toset) 7073 { 7074 7075 VI_LOCK(vp); 7076 vn_irflag_set_locked(vp, toset); 7077 VI_UNLOCK(vp); 7078 } 7079 7080 void 7081 vn_irflag_set_cond_locked(struct vnode *vp, short toset) 7082 { 7083 short flags; 7084 7085 ASSERT_VI_LOCKED(vp, __func__); 7086 flags = vn_irflag_read(vp); 7087 atomic_store_short(&vp->v_irflag, flags | toset); 7088 } 7089 7090 void 7091 vn_irflag_set_cond(struct vnode *vp, short toset) 7092 { 7093 7094 VI_LOCK(vp); 7095 vn_irflag_set_cond_locked(vp, toset); 7096 VI_UNLOCK(vp); 7097 } 7098 7099 void 7100 vn_irflag_unset_locked(struct vnode *vp, short tounset) 7101 { 7102 short flags; 7103 7104 ASSERT_VI_LOCKED(vp, __func__); 7105 flags = vn_irflag_read(vp); 7106 VNASSERT((flags & tounset) == tounset, vp, 7107 ("%s: some of the passed flags not set (have %d, passed %d)\n", 7108 __func__, flags, tounset)); 7109 atomic_store_short(&vp->v_irflag, flags & ~tounset); 7110 } 7111 7112 void 7113 vn_irflag_unset(struct vnode *vp, short tounset) 7114 { 7115 7116 VI_LOCK(vp); 7117 vn_irflag_unset_locked(vp, tounset); 7118 VI_UNLOCK(vp); 7119 } 7120 7121 int 7122 vn_getsize_locked(struct vnode *vp, off_t *size, struct ucred *cred) 7123 { 7124 struct vattr vattr; 7125 int error; 7126 7127 ASSERT_VOP_LOCKED(vp, __func__); 7128 error = VOP_GETATTR(vp, &vattr, cred); 7129 if (__predict_true(error == 0)) { 7130 if (vattr.va_size <= OFF_MAX) 7131 *size = vattr.va_size; 7132 else 7133 error = EFBIG; 7134 } 7135 return (error); 7136 } 7137 7138 int 7139
vn_getsize(struct vnode *vp, off_t *size, struct ucred *cred) 7140 { 7141 int error; 7142 7143 VOP_LOCK(vp, LK_SHARED); 7144 error = vn_getsize_locked(vp, size, cred); 7145 VOP_UNLOCK(vp); 7146 return (error); 7147 } 7148 7149 #ifdef INVARIANTS 7150 void 7151 vn_set_state_validate(struct vnode *vp, enum vstate state) 7152 { 7153 7154 switch (vp->v_state) { 7155 case VSTATE_UNINITIALIZED: 7156 switch (state) { 7157 case VSTATE_CONSTRUCTED: 7158 case VSTATE_DESTROYING: 7159 return; 7160 default: 7161 break; 7162 } 7163 break; 7164 case VSTATE_CONSTRUCTED: 7165 ASSERT_VOP_ELOCKED(vp, __func__); 7166 switch (state) { 7167 case VSTATE_DESTROYING: 7168 return; 7169 default: 7170 break; 7171 } 7172 break; 7173 case VSTATE_DESTROYING: 7174 ASSERT_VOP_ELOCKED(vp, __func__); 7175 switch (state) { 7176 case VSTATE_DEAD: 7177 return; 7178 default: 7179 break; 7180 } 7181 break; 7182 case VSTATE_DEAD: 7183 switch (state) { 7184 case VSTATE_UNINITIALIZED: 7185 return; 7186 default: 7187 break; 7188 } 7189 break; 7190 } 7191 7192 vn_printf(vp, "invalid state transition %d -> %d\n", vp->v_state, state); 7193 panic("invalid state transition %d -> %d\n", vp->v_state, state); 7194 } 7195 #endif 7196
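/*
 * Editor's addendum (illustrative sketch only, not part of the original
 * source): one way a filesystem might drive the MNT_VNODE_FOREACH_ALL()
 * iterator backed by the __mnt_vnode_*_all() helpers above.  The function
 * name "myfs_count_regular" is hypothetical.  The iterator hands back each
 * vnode with its interlock held, so the loop body must either drop the
 * interlock or pass LK_INTERLOCK to vget(); a caller that leaves the loop
 * early must use MNT_VNODE_FOREACH_ALL_ABORT() to free the marker.
 *
 *	static int
 *	myfs_count_regular(struct mount *mp)
 *	{
 *		struct vnode *vp, *mvp;
 *		int count;
 *
 *		count = 0;
 *		MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 *			if (vp->v_type != VREG) {
 *				VI_UNLOCK(vp);
 *				continue;
 *			}
 *			if (vget(vp, LK_SHARED | LK_INTERLOCK) != 0)
 *				continue;
 *			count++;
 *			vput(vp);
 *		}
 *		return (count);
 *	}
 */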