/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California. All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
35 * 36 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 37 */ 38 39 /* 40 * External virtual filesystem routines 41 */ 42 43 #include <sys/cdefs.h> 44 #include "opt_ddb.h" 45 #include "opt_watchdog.h" 46 47 #include <sys/param.h> 48 #include <sys/systm.h> 49 #include <sys/asan.h> 50 #include <sys/bio.h> 51 #include <sys/buf.h> 52 #include <sys/capsicum.h> 53 #include <sys/condvar.h> 54 #include <sys/conf.h> 55 #include <sys/counter.h> 56 #include <sys/dirent.h> 57 #include <sys/event.h> 58 #include <sys/eventhandler.h> 59 #include <sys/extattr.h> 60 #include <sys/file.h> 61 #include <sys/fcntl.h> 62 #include <sys/jail.h> 63 #include <sys/kdb.h> 64 #include <sys/kernel.h> 65 #include <sys/kthread.h> 66 #include <sys/ktr.h> 67 #include <sys/limits.h> 68 #include <sys/lockf.h> 69 #include <sys/malloc.h> 70 #include <sys/mount.h> 71 #include <sys/namei.h> 72 #include <sys/pctrie.h> 73 #include <sys/priv.h> 74 #include <sys/reboot.h> 75 #include <sys/refcount.h> 76 #include <sys/rwlock.h> 77 #include <sys/sched.h> 78 #include <sys/sleepqueue.h> 79 #include <sys/smr.h> 80 #include <sys/smp.h> 81 #include <sys/stat.h> 82 #include <sys/sysctl.h> 83 #include <sys/syslog.h> 84 #include <sys/vmmeter.h> 85 #include <sys/vnode.h> 86 #include <sys/watchdog.h> 87 88 #include <machine/stdarg.h> 89 90 #include <security/mac/mac_framework.h> 91 92 #include <vm/vm.h> 93 #include <vm/vm_object.h> 94 #include <vm/vm_extern.h> 95 #include <vm/pmap.h> 96 #include <vm/vm_map.h> 97 #include <vm/vm_page.h> 98 #include <vm/vm_kern.h> 99 #include <vm/uma.h> 100 101 #if defined(DEBUG_VFS_LOCKS) && (!defined(INVARIANTS) || !defined(WITNESS)) 102 #error DEBUG_VFS_LOCKS requires INVARIANTS and WITNESS 103 #endif 104 105 #ifdef DDB 106 #include <ddb/ddb.h> 107 #endif 108 109 static void delmntque(struct vnode *vp); 110 static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, 111 int slpflag, int slptimeo); 112 static void syncer_shutdown(void *arg, int howto); 113 static int vtryrecycle(struct vnode *vp); 114 static void v_init_counters(struct vnode *); 115 static void vn_seqc_init(struct vnode *); 116 static void vn_seqc_write_end_free(struct vnode *vp); 117 static void vgonel(struct vnode *); 118 static bool vhold_recycle_free(struct vnode *); 119 static void vdropl_recycle(struct vnode *vp); 120 static void vdrop_recycle(struct vnode *vp); 121 static void vfs_knllock(void *arg); 122 static void vfs_knlunlock(void *arg); 123 static void vfs_knl_assert_lock(void *arg, int what); 124 static void destroy_vpollinfo(struct vpollinfo *vi); 125 static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 126 daddr_t startlbn, daddr_t endlbn); 127 static void vnlru_recalc(void); 128 129 /* 130 * Number of vnodes in existence. Increased whenever getnewvnode() 131 * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode. 132 */ 133 static u_long __exclusive_cache_line numvnodes; 134 135 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, 136 "Number of vnodes in existence"); 137 138 static counter_u64_t vnodes_created; 139 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, 140 "Number of vnodes created by getnewvnode"); 141 142 /* 143 * Conversion tables for conversion from vnode types to inode formats 144 * and back. 
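 *
 * A minimal illustration (sketch): the IFTOVT() and VTTOIF() macros in
 * sys/vnode.h index these tables, so, assuming the standard S_IF* mode
 * bits, one would expect for example:
 *
 *	IFTOVT(S_IFDIR | 0755) == VDIR		(iftovt_tab[S_IFDIR >> 12])
 *	VTTOIF(VREG) == S_IFREG			(vttoif_tab[VREG])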
 */
__enum_uint8(vtype) iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};
int vttoif_tab[10] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
};

/*
 * List of allocated vnodes in the system.
 */
static TAILQ_HEAD(freelst, vnode) vnode_list;
static struct vnode *vnode_list_free_marker;
static struct vnode *vnode_list_reclaim_marker;

/*
 * "Free" vnode target. Free vnodes are rarely completely free, but are
 * just ones that are cheap to recycle. Usually they are for files which
 * have been stat'd but not read; these usually have inode and namecache
 * data attached to them. This target is the preferred minimum size of a
 * sub-cache consisting mostly of such files. The system balances the size
 * of this sub-cache with its complement to try to prevent either from
 * thrashing while the other is relatively inactive. The targets express
 * a preference for the best balance.
 *
 * "Above" this target there are 2 further targets (watermarks) related
 * to recycling of free vnodes. In the best-operating case, the cache is
 * exactly full, the free list has size between vlowat and vhiwat above the
 * free target, and recycling from it and normal use maintains this state.
 * Sometimes the free list is below vlowat or even empty, but this state
 * is even better for immediate use provided the cache is not full.
 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
 * ones) to reach one of these states. The watermarks are currently hard-
 * coded as 4% and 9% of the available space higher. These and the default
 * of 25% for wantfreevnodes are too large if the memory size is large.
 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
 * whenever vnlru_proc() becomes active.
 */
static long wantfreevnodes;
static long __exclusive_cache_line freevnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
    &freevnodes, 0, "Number of \"free\" vnodes");
static long freevnodes_old;

static counter_u64_t recycles_count;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
    "Number of vnodes recycled to meet vnode cache targets");

static counter_u64_t recycles_free_count;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count,
    "Number of free vnodes recycled to meet vnode cache targets");

static u_long deferred_inact;
SYSCTL_ULONG(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD,
    &deferred_inact, 0, "Number of times inactive processing was deferred");

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/*
 * Lock for any access to the following:
 *	vnode_list
 *	numvnodes
 *	freevnodes
 */
static struct mtx __exclusive_cache_line vnode_list_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

static uma_zone_t buf_trie_zone;
static smr_t buf_trie_smr;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll");

__read_frequently smr_t vfs_smr;

/*
 * The workitem queue.
228 * 229 * It is useful to delay writes of file data and filesystem metadata 230 * for tens of seconds so that quickly created and deleted files need 231 * not waste disk bandwidth being created and removed. To realize this, 232 * we append vnodes to a "workitem" queue. When running with a soft 233 * updates implementation, most pending metadata dependencies should 234 * not wait for more than a few seconds. Thus, mounted on block devices 235 * are delayed only about a half the time that file data is delayed. 236 * Similarly, directory updates are more critical, so are only delayed 237 * about a third the time that file data is delayed. Thus, there are 238 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of 239 * one each second (driven off the filesystem syncer process). The 240 * syncer_delayno variable indicates the next queue that is to be processed. 241 * Items that need to be processed soon are placed in this queue: 242 * 243 * syncer_workitem_pending[syncer_delayno] 244 * 245 * A delay of fifteen seconds is done by placing the request fifteen 246 * entries later in the queue: 247 * 248 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] 249 * 250 */ 251 static int syncer_delayno; 252 static long syncer_mask; 253 LIST_HEAD(synclist, bufobj); 254 static struct synclist *syncer_workitem_pending; 255 /* 256 * The sync_mtx protects: 257 * bo->bo_synclist 258 * sync_vnode_count 259 * syncer_delayno 260 * syncer_state 261 * syncer_workitem_pending 262 * syncer_worklist_len 263 * rushjob 264 */ 265 static struct mtx sync_mtx; 266 static struct cv sync_wakeup; 267 268 #define SYNCER_MAXDELAY 32 269 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ 270 static int syncdelay = 30; /* max time to delay syncing data */ 271 static int filedelay = 30; /* time to delay syncing files */ 272 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, 273 "Time to delay syncing files (in seconds)"); 274 static int dirdelay = 29; /* time to delay syncing directories */ 275 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, 276 "Time to delay syncing directories (in seconds)"); 277 static int metadelay = 28; /* time to delay syncing metadata */ 278 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, 279 "Time to delay syncing metadata (in seconds)"); 280 static int rushjob; /* number of slots to run ASAP */ 281 static int stat_rush_requests; /* number of times I/O speeded up */ 282 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, 283 "Number of times I/O speeded up (rush requests)"); 284 285 #define VDBATCH_SIZE 8 286 struct vdbatch { 287 u_int index; 288 struct mtx lock; 289 struct vnode *tab[VDBATCH_SIZE]; 290 }; 291 DPCPU_DEFINE_STATIC(struct vdbatch, vd); 292 293 static void vdbatch_dequeue(struct vnode *vp); 294 295 /* 296 * When shutting down the syncer, run it at four times normal speed. 297 */ 298 #define SYNCER_SHUTDOWN_SPEEDUP 4 299 static int sync_vnode_count; 300 static int syncer_worklist_len; 301 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } 302 syncer_state; 303 304 /* Target for maximum number of vnodes. 
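 * (Tunable at runtime via the kern.maxvnodes sysctl handled by
 * sysctl_maxvnodes() below; e.g. "sysctl kern.maxvnodes=1000000" would,
 * if accepted, also reset wantfreevnodes to a quarter of the new value.)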
*/ 305 u_long desiredvnodes; 306 static u_long gapvnodes; /* gap between wanted and desired */ 307 static u_long vhiwat; /* enough extras after expansion */ 308 static u_long vlowat; /* minimal extras before expansion */ 309 static u_long vstir; /* nonzero to stir non-free vnodes */ 310 static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ 311 312 static u_long vnlru_read_freevnodes(void); 313 314 /* 315 * Note that no attempt is made to sanitize these parameters. 316 */ 317 static int 318 sysctl_maxvnodes(SYSCTL_HANDLER_ARGS) 319 { 320 u_long val; 321 int error; 322 323 val = desiredvnodes; 324 error = sysctl_handle_long(oidp, &val, 0, req); 325 if (error != 0 || req->newptr == NULL) 326 return (error); 327 328 if (val == desiredvnodes) 329 return (0); 330 mtx_lock(&vnode_list_mtx); 331 desiredvnodes = val; 332 wantfreevnodes = desiredvnodes / 4; 333 vnlru_recalc(); 334 mtx_unlock(&vnode_list_mtx); 335 /* 336 * XXX There is no protection against multiple threads changing 337 * desiredvnodes at the same time. Locking above only helps vnlru and 338 * getnewvnode. 339 */ 340 vfs_hash_changesize(desiredvnodes); 341 cache_changesize(desiredvnodes); 342 return (0); 343 } 344 345 SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, 346 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, 347 "LU", "Target for maximum number of vnodes"); 348 349 static int 350 sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS) 351 { 352 u_long val; 353 int error; 354 355 val = wantfreevnodes; 356 error = sysctl_handle_long(oidp, &val, 0, req); 357 if (error != 0 || req->newptr == NULL) 358 return (error); 359 360 if (val == wantfreevnodes) 361 return (0); 362 mtx_lock(&vnode_list_mtx); 363 wantfreevnodes = val; 364 vnlru_recalc(); 365 mtx_unlock(&vnode_list_mtx); 366 return (0); 367 } 368 369 SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes, 370 CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, 371 "LU", "Target for minimum number of \"free\" vnodes"); 372 373 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, 374 &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); 375 static int vnlru_nowhere; 376 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW | CTLFLAG_STATS, 377 &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); 378 379 static int 380 sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) 381 { 382 struct vnode *vp; 383 struct nameidata nd; 384 char *buf; 385 unsigned long ndflags; 386 int error; 387 388 if (req->newptr == NULL) 389 return (EINVAL); 390 if (req->newlen >= PATH_MAX) 391 return (E2BIG); 392 393 buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); 394 error = SYSCTL_IN(req, buf, req->newlen); 395 if (error != 0) 396 goto out; 397 398 buf[req->newlen] = '\0'; 399 400 ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1; 401 NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf); 402 if ((error = namei(&nd)) != 0) 403 goto out; 404 vp = nd.ni_vp; 405 406 if (VN_IS_DOOMED(vp)) { 407 /* 408 * This vnode is being recycled. Return != 0 to let the caller 409 * know that the sysctl had no effect. 
Return EAGAIN because a 410 * subsequent call will likely succeed (since namei will create 411 * a new vnode if necessary) 412 */ 413 error = EAGAIN; 414 goto putvnode; 415 } 416 417 counter_u64_add(recycles_count, 1); 418 vgone(vp); 419 putvnode: 420 vput(vp); 421 NDFREE_PNBUF(&nd); 422 out: 423 free(buf, M_TEMP); 424 return (error); 425 } 426 427 static int 428 sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) 429 { 430 struct thread *td = curthread; 431 struct vnode *vp; 432 struct file *fp; 433 int error; 434 int fd; 435 436 if (req->newptr == NULL) 437 return (EBADF); 438 439 error = sysctl_handle_int(oidp, &fd, 0, req); 440 if (error != 0) 441 return (error); 442 error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); 443 if (error != 0) 444 return (error); 445 vp = fp->f_vnode; 446 447 error = vn_lock(vp, LK_EXCLUSIVE); 448 if (error != 0) 449 goto drop; 450 451 counter_u64_add(recycles_count, 1); 452 vgone(vp); 453 VOP_UNLOCK(vp); 454 drop: 455 fdrop(fp, td); 456 return (error); 457 } 458 459 SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, 460 CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 461 sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); 462 SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, 463 CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, 464 sysctl_ftry_reclaim_vnode, "I", 465 "Try to reclaim a vnode by its file descriptor"); 466 467 /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ 468 #define vnsz2log 8 469 #ifndef DEBUG_LOCKS 470 _Static_assert(sizeof(struct vnode) >= 1UL << vnsz2log && 471 sizeof(struct vnode) < 1UL << (vnsz2log + 1), 472 "vnsz2log needs to be updated"); 473 #endif 474 475 /* 476 * Support for the bufobj clean & dirty pctrie. 477 */ 478 static void * 479 buf_trie_alloc(struct pctrie *ptree) 480 { 481 return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT)); 482 } 483 484 static void 485 buf_trie_free(struct pctrie *ptree, void *node) 486 { 487 uma_zfree_smr(buf_trie_zone, node); 488 } 489 PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free, 490 buf_trie_smr); 491 492 /* 493 * Initialize the vnode management data structures. 494 * 495 * Reevaluate the following cap on the number of vnodes after the physical 496 * memory size exceeds 512GB. In the limit, as the physical memory size 497 * grows, the ratio of the memory size in KB to vnodes approaches 64:1. 
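 *
 * Worked out, that ratio is where the default below comes from:
 *
 *	512 GB = 512 * 1024 * 1024 KB
 *	MAXVNODES_MAX = 512 * 1024 * 1024 / 64 = 8,388,608 (about 8M vnodes)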
498 */ 499 #ifndef MAXVNODES_MAX 500 #define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */ 501 #endif 502 503 static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); 504 505 static struct vnode * 506 vn_alloc_marker(struct mount *mp) 507 { 508 struct vnode *vp; 509 510 vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 511 vp->v_type = VMARKER; 512 vp->v_mount = mp; 513 514 return (vp); 515 } 516 517 static void 518 vn_free_marker(struct vnode *vp) 519 { 520 521 MPASS(vp->v_type == VMARKER); 522 free(vp, M_VNODE_MARKER); 523 } 524 525 #ifdef KASAN 526 static int 527 vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused) 528 { 529 kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0); 530 return (0); 531 } 532 533 static void 534 vnode_dtor(void *mem, int size, void *arg __unused) 535 { 536 size_t end1, end2, off1, off2; 537 538 _Static_assert(offsetof(struct vnode, v_vnodelist) < 539 offsetof(struct vnode, v_dbatchcpu), 540 "KASAN marks require updating"); 541 542 off1 = offsetof(struct vnode, v_vnodelist); 543 off2 = offsetof(struct vnode, v_dbatchcpu); 544 end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist); 545 end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu); 546 547 /* 548 * Access to the v_vnodelist and v_dbatchcpu fields are permitted even 549 * after the vnode has been freed. Try to get some KASAN coverage by 550 * marking everything except those two fields as invalid. Because 551 * KASAN's tracking is not byte-granular, any preceding fields sharing 552 * the same 8-byte aligned word must also be marked valid. 553 */ 554 555 /* Handle the area from the start until v_vnodelist... */ 556 off1 = rounddown2(off1, KASAN_SHADOW_SCALE); 557 kasan_mark(mem, off1, off1, KASAN_UMA_FREED); 558 559 /* ... then the area between v_vnodelist and v_dbatchcpu ... */ 560 off1 = roundup2(end1, KASAN_SHADOW_SCALE); 561 off2 = rounddown2(off2, KASAN_SHADOW_SCALE); 562 if (off2 > off1) 563 kasan_mark((void *)((char *)mem + off1), off2 - off1, 564 off2 - off1, KASAN_UMA_FREED); 565 566 /* ... and finally the area from v_dbatchcpu to the end. */ 567 off2 = roundup2(end2, KASAN_SHADOW_SCALE); 568 kasan_mark((void *)((char *)mem + off2), size - off2, size - off2, 569 KASAN_UMA_FREED); 570 } 571 #endif /* KASAN */ 572 573 /* 574 * Initialize a vnode as it first enters the zone. 575 */ 576 static int 577 vnode_init(void *mem, int size, int flags) 578 { 579 struct vnode *vp; 580 581 vp = mem; 582 bzero(vp, size); 583 /* 584 * Setup locks. 585 */ 586 vp->v_vnlock = &vp->v_lock; 587 mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); 588 /* 589 * By default, don't allow shared locks unless filesystems opt-in. 590 */ 591 lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, 592 LK_NOSHARE | LK_IS_VNODE); 593 /* 594 * Initialize bufobj. 595 */ 596 bufobj_init(&vp->v_bufobj, vp); 597 /* 598 * Initialize namecache. 599 */ 600 cache_vnode_init(vp); 601 /* 602 * Initialize rangelocks. 603 */ 604 rangelock_init(&vp->v_rl); 605 606 vp->v_dbatchcpu = NOCPU; 607 608 vp->v_state = VSTATE_DEAD; 609 610 /* 611 * Check vhold_recycle_free for an explanation. 612 */ 613 vp->v_holdcnt = VHOLD_NO_SMR; 614 vp->v_type = VNON; 615 mtx_lock(&vnode_list_mtx); 616 TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist); 617 mtx_unlock(&vnode_list_mtx); 618 return (0); 619 } 620 621 /* 622 * Free a vnode when it is cleared from the zone. 
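 * This mirrors vnode_init(): the vnode is pulled out of any pending
 * per-CPU vdbatch, unlinked from vnode_list, and the locks and the
 * rangelock set up at init time are destroyed.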
623 */ 624 static void 625 vnode_fini(void *mem, int size) 626 { 627 struct vnode *vp; 628 struct bufobj *bo; 629 630 vp = mem; 631 vdbatch_dequeue(vp); 632 mtx_lock(&vnode_list_mtx); 633 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 634 mtx_unlock(&vnode_list_mtx); 635 rangelock_destroy(&vp->v_rl); 636 lockdestroy(vp->v_vnlock); 637 mtx_destroy(&vp->v_interlock); 638 bo = &vp->v_bufobj; 639 rw_destroy(BO_LOCKPTR(bo)); 640 641 kasan_mark(mem, size, size, 0); 642 } 643 644 /* 645 * Provide the size of NFS nclnode and NFS fh for calculation of the 646 * vnode memory consumption. The size is specified directly to 647 * eliminate dependency on NFS-private header. 648 * 649 * Other filesystems may use bigger or smaller (like UFS and ZFS) 650 * private inode data, but the NFS-based estimation is ample enough. 651 * Still, we care about differences in the size between 64- and 32-bit 652 * platforms. 653 * 654 * Namecache structure size is heuristically 655 * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. 656 */ 657 #ifdef _LP64 658 #define NFS_NCLNODE_SZ (528 + 64) 659 #define NC_SZ 148 660 #else 661 #define NFS_NCLNODE_SZ (360 + 32) 662 #define NC_SZ 92 663 #endif 664 665 static void 666 vntblinit(void *dummy __unused) 667 { 668 struct vdbatch *vd; 669 uma_ctor ctor; 670 uma_dtor dtor; 671 int cpu, physvnodes, virtvnodes; 672 673 /* 674 * Desiredvnodes is a function of the physical memory size and the 675 * kernel's heap size. Generally speaking, it scales with the 676 * physical memory size. The ratio of desiredvnodes to the physical 677 * memory size is 1:16 until desiredvnodes exceeds 98,304. 678 * Thereafter, the 679 * marginal ratio of desiredvnodes to the physical memory size is 680 * 1:64. However, desiredvnodes is limited by the kernel's heap 681 * size. The memory required by desiredvnodes vnodes and vm objects 682 * must not exceed 1/10th of the kernel's heap size. 683 */ 684 physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 685 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; 686 virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + 687 sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); 688 desiredvnodes = min(physvnodes, virtvnodes); 689 if (desiredvnodes > MAXVNODES_MAX) { 690 if (bootverbose) 691 printf("Reducing kern.maxvnodes %lu -> %lu\n", 692 desiredvnodes, MAXVNODES_MAX); 693 desiredvnodes = MAXVNODES_MAX; 694 } 695 wantfreevnodes = desiredvnodes / 4; 696 mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); 697 TAILQ_INIT(&vnode_list); 698 mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); 699 /* 700 * The lock is taken to appease WITNESS. 701 */ 702 mtx_lock(&vnode_list_mtx); 703 vnlru_recalc(); 704 mtx_unlock(&vnode_list_mtx); 705 vnode_list_free_marker = vn_alloc_marker(NULL); 706 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); 707 vnode_list_reclaim_marker = vn_alloc_marker(NULL); 708 TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); 709 710 #ifdef KASAN 711 ctor = vnode_ctor; 712 dtor = vnode_dtor; 713 #else 714 ctor = NULL; 715 dtor = NULL; 716 #endif 717 vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor, 718 vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN); 719 uma_zone_set_smr(vnode_zone, vfs_smr); 720 721 /* 722 * Preallocate enough nodes to support one-per buf so that 723 * we can not fail an insert. reassignbuf() callers can not 724 * tolerate the insertion failure. 
 */
	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
	    UMA_ZONE_NOFREE | UMA_ZONE_SMR);
	buf_trie_smr = uma_zone_get_smr(buf_trie_zone);
	uma_prealloc(buf_trie_zone, nbuf);

	vnodes_created = counter_u64_alloc(M_WAITOK);
	recycles_count = counter_u64_alloc(M_WAITOK);
	recycles_free_count = counter_u64_alloc(M_WAITOK);

	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
	cv_init(&sync_wakeup, "syncer");

	CPU_FOREACH(cpu) {
		vd = DPCPU_ID_PTR((cpu), vd);
		bzero(vd, sizeof(*vd));
		mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
	}
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. Note that mountlist_mtx is not released on failure.
 *
 * vfs_busy() is a custom lock; it can block the caller.
 * vfs_busy() only sleeps if the unmount is active on the mount point.
 * For a mountpoint mp, the vfs_busy-enforced lock is ordered before the
 * lock of any vnode belonging to mp.
 *
 * Lookup uses vfs_busy() to traverse mount points.
 *
 *	root fs				var fs
 *	/ vnode lock		A	/ vnode lock (/var)		D
 *	/var vnode lock		B	/log vnode lock(/var/log)	E
 *	vfs_busy lock		C	vfs_busy lock			F
 *
 * Within each file system, the lock order is C->A->B and F->D->E.
 *
 * When traversing across mounts, the system follows that lock order:
 *
 *	C->A->B
 *	       |
 *	       +->F->D->E
 *
 * The lookup() process for namei("/var") illustrates it:
 *  1. VOP_LOOKUP() obtains B while A is held
 *  2. vfs_busy() obtains a shared lock on F while A and B are held
 *  3. vput() releases lock on B
 *  4. vput() releases lock on A
 *  5. VFS_ROOT() obtains lock on D while shared lock on F is held
 *  6. vfs_unbusy() releases shared lock on F
 *  7. vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
 *     An attempt to lock A (instead of vp_crossmp) while D is held would
 *     violate the global order, causing deadlocks.
 *
 * dounmount() locks B while F is drained. Note that for stacked
 * filesystems, D and B in the example above may be the same lock,
 * which introduces a potential lock order reversal deadlock between
 * dounmount() and step 5 above. These filesystems may avoid the LOR
 * by setting VV_CROSSLOCK on the covered vnode so that lock B will
 * remain held until after step 5.
 */
int
vfs_busy(struct mount *mp, int flags)
{
	struct mount_pcpu *mpcpu;

	MPASS((flags & ~MBF_MASK) == 0);
	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);

	if (vfs_op_thread_enter(mp, mpcpu)) {
		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
		MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
		vfs_mp_count_add_pcpu(mpcpu, ref, 1);
		vfs_mp_count_add_pcpu(mpcpu, lockref, 1);
		vfs_op_thread_exit(mp, mpcpu);
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		return (0);
	}

	MNT_ILOCK(mp);
	vfs_assert_mount_counters(mp);
	MNT_REF(mp);
	/*
	 * If mount point is currently being unmounted, sleep until the
	 * mount point fate is decided.
If thread doing the unmounting fails, 820 * it will clear MNTK_UNMOUNT flag before waking us up, indicating 821 * that this mount point has survived the unmount attempt and vfs_busy 822 * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE 823 * flag in addition to MNTK_UNMOUNT, indicating that mount point is 824 * about to be really destroyed. vfs_busy needs to release its 825 * reference on the mount point in this case and return with ENOENT, 826 * telling the caller the mount it tried to busy is no longer valid. 827 */ 828 while (mp->mnt_kern_flag & MNTK_UNMOUNT) { 829 KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), 830 ("%s: non-empty upper mount list with pending unmount", 831 __func__)); 832 if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { 833 MNT_REL(mp); 834 MNT_IUNLOCK(mp); 835 CTR1(KTR_VFS, "%s: failed busying before sleeping", 836 __func__); 837 return (ENOENT); 838 } 839 if (flags & MBF_MNTLSTLOCK) 840 mtx_unlock(&mountlist_mtx); 841 mp->mnt_kern_flag |= MNTK_MWAIT; 842 msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); 843 if (flags & MBF_MNTLSTLOCK) 844 mtx_lock(&mountlist_mtx); 845 MNT_ILOCK(mp); 846 } 847 if (flags & MBF_MNTLSTLOCK) 848 mtx_unlock(&mountlist_mtx); 849 mp->mnt_lockref++; 850 MNT_IUNLOCK(mp); 851 return (0); 852 } 853 854 /* 855 * Free a busy filesystem. 856 */ 857 void 858 vfs_unbusy(struct mount *mp) 859 { 860 struct mount_pcpu *mpcpu; 861 int c; 862 863 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 864 865 if (vfs_op_thread_enter(mp, mpcpu)) { 866 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 867 vfs_mp_count_sub_pcpu(mpcpu, lockref, 1); 868 vfs_mp_count_sub_pcpu(mpcpu, ref, 1); 869 vfs_op_thread_exit(mp, mpcpu); 870 return; 871 } 872 873 MNT_ILOCK(mp); 874 vfs_assert_mount_counters(mp); 875 MNT_REL(mp); 876 c = --mp->mnt_lockref; 877 if (mp->mnt_vfs_ops == 0) { 878 MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); 879 MNT_IUNLOCK(mp); 880 return; 881 } 882 if (c < 0) 883 vfs_dump_mount_counters(mp); 884 if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { 885 MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); 886 CTR1(KTR_VFS, "%s: waking up waiters", __func__); 887 mp->mnt_kern_flag &= ~MNTK_DRAINING; 888 wakeup(&mp->mnt_lockref); 889 } 890 MNT_IUNLOCK(mp); 891 } 892 893 /* 894 * Lookup a mount point by filesystem identifier. 895 */ 896 struct mount * 897 vfs_getvfs(fsid_t *fsid) 898 { 899 struct mount *mp; 900 901 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 902 mtx_lock(&mountlist_mtx); 903 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 904 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { 905 vfs_ref(mp); 906 mtx_unlock(&mountlist_mtx); 907 return (mp); 908 } 909 } 910 mtx_unlock(&mountlist_mtx); 911 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 912 return ((struct mount *) 0); 913 } 914 915 /* 916 * Lookup a mount point by filesystem identifier, busying it before 917 * returning. 918 * 919 * To avoid congestion on mountlist_mtx, implement simple direct-mapped 920 * cache for popular filesystem identifiers. The cache is lockess, using 921 * the fact that struct mount's are never freed. In worst case we may 922 * get pointer to unmounted or even different filesystem, so we have to 923 * check what we got, and go slow way if so. 
924 */ 925 struct mount * 926 vfs_busyfs(fsid_t *fsid) 927 { 928 #define FSID_CACHE_SIZE 256 929 typedef struct mount * volatile vmp_t; 930 static vmp_t cache[FSID_CACHE_SIZE]; 931 struct mount *mp; 932 int error; 933 uint32_t hash; 934 935 CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); 936 hash = fsid->val[0] ^ fsid->val[1]; 937 hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); 938 mp = cache[hash]; 939 if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0) 940 goto slow; 941 if (vfs_busy(mp, 0) != 0) { 942 cache[hash] = NULL; 943 goto slow; 944 } 945 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) 946 return (mp); 947 else 948 vfs_unbusy(mp); 949 950 slow: 951 mtx_lock(&mountlist_mtx); 952 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 953 if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { 954 error = vfs_busy(mp, MBF_MNTLSTLOCK); 955 if (error) { 956 cache[hash] = NULL; 957 mtx_unlock(&mountlist_mtx); 958 return (NULL); 959 } 960 cache[hash] = mp; 961 return (mp); 962 } 963 } 964 CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); 965 mtx_unlock(&mountlist_mtx); 966 return ((struct mount *) 0); 967 } 968 969 /* 970 * Check if a user can access privileged mount options. 971 */ 972 int 973 vfs_suser(struct mount *mp, struct thread *td) 974 { 975 int error; 976 977 if (jailed(td->td_ucred)) { 978 /* 979 * If the jail of the calling thread lacks permission for 980 * this type of file system, deny immediately. 981 */ 982 if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) 983 return (EPERM); 984 985 /* 986 * If the file system was mounted outside the jail of the 987 * calling thread, deny immediately. 988 */ 989 if (prison_check(td->td_ucred, mp->mnt_cred) != 0) 990 return (EPERM); 991 } 992 993 /* 994 * If file system supports delegated administration, we don't check 995 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified 996 * by the file system itself. 997 * If this is not the user that did original mount, we check for 998 * the PRIV_VFS_MOUNT_OWNER privilege. 999 */ 1000 if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && 1001 mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { 1002 if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) 1003 return (error); 1004 } 1005 return (0); 1006 } 1007 1008 /* 1009 * Get a new unique fsid. Try to make its val[0] unique, since this value 1010 * will be used to create fake device numbers for stat(). Also try (but 1011 * not so hard) make its val[0] unique mod 2^16, since some emulators only 1012 * support 16-bit device numbers. We end up with unique val[0]'s for the 1013 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. 1014 * 1015 * Keep in mind that several mounts may be running in parallel. Starting 1016 * the search one past where the previous search terminated is both a 1017 * micro-optimization and a defense against returning the same fsid to 1018 * different mounts. 
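 *
 * Illustrative layout of the generated fsid (see the code below):
 *
 *	val[1] = vfc_typenum of the filesystem
 *	val[0] = makedev(255, minor), where the minor number packs the low
 *		 byte of the type into bits 24..31 and the 16-bit mntid_base
 *		 counter into bits 16..23 (high byte) and 0..7 (low byte).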
1019 */ 1020 void 1021 vfs_getnewfsid(struct mount *mp) 1022 { 1023 static uint16_t mntid_base; 1024 struct mount *nmp; 1025 fsid_t tfsid; 1026 int mtype; 1027 1028 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 1029 mtx_lock(&mntid_mtx); 1030 mtype = mp->mnt_vfc->vfc_typenum; 1031 tfsid.val[1] = mtype; 1032 mtype = (mtype & 0xFF) << 24; 1033 for (;;) { 1034 tfsid.val[0] = makedev(255, 1035 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); 1036 mntid_base++; 1037 if ((nmp = vfs_getvfs(&tfsid)) == NULL) 1038 break; 1039 vfs_rel(nmp); 1040 } 1041 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; 1042 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; 1043 mtx_unlock(&mntid_mtx); 1044 } 1045 1046 /* 1047 * Knob to control the precision of file timestamps: 1048 * 1049 * 0 = seconds only; nanoseconds zeroed. 1050 * 1 = seconds and nanoseconds, accurate within 1/HZ. 1051 * 2 = seconds and nanoseconds, truncated to microseconds. 1052 * >=3 = seconds and nanoseconds, maximum precision. 1053 */ 1054 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; 1055 1056 static int timestamp_precision = TSP_USEC; 1057 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, 1058 ×tamp_precision, 0, "File timestamp precision (0: seconds, " 1059 "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " 1060 "3+: sec + ns (max. precision))"); 1061 1062 /* 1063 * Get a current timestamp. 1064 */ 1065 void 1066 vfs_timestamp(struct timespec *tsp) 1067 { 1068 struct timeval tv; 1069 1070 switch (timestamp_precision) { 1071 case TSP_SEC: 1072 tsp->tv_sec = time_second; 1073 tsp->tv_nsec = 0; 1074 break; 1075 case TSP_HZ: 1076 getnanotime(tsp); 1077 break; 1078 case TSP_USEC: 1079 microtime(&tv); 1080 TIMEVAL_TO_TIMESPEC(&tv, tsp); 1081 break; 1082 case TSP_NSEC: 1083 default: 1084 nanotime(tsp); 1085 break; 1086 } 1087 } 1088 1089 /* 1090 * Set vnode attributes to VNOVAL 1091 */ 1092 void 1093 vattr_null(struct vattr *vap) 1094 { 1095 1096 vap->va_type = VNON; 1097 vap->va_size = VNOVAL; 1098 vap->va_bytes = VNOVAL; 1099 vap->va_mode = VNOVAL; 1100 vap->va_nlink = VNOVAL; 1101 vap->va_uid = VNOVAL; 1102 vap->va_gid = VNOVAL; 1103 vap->va_fsid = VNOVAL; 1104 vap->va_fileid = VNOVAL; 1105 vap->va_blocksize = VNOVAL; 1106 vap->va_rdev = VNOVAL; 1107 vap->va_atime.tv_sec = VNOVAL; 1108 vap->va_atime.tv_nsec = VNOVAL; 1109 vap->va_mtime.tv_sec = VNOVAL; 1110 vap->va_mtime.tv_nsec = VNOVAL; 1111 vap->va_ctime.tv_sec = VNOVAL; 1112 vap->va_ctime.tv_nsec = VNOVAL; 1113 vap->va_birthtime.tv_sec = VNOVAL; 1114 vap->va_birthtime.tv_nsec = VNOVAL; 1115 vap->va_flags = VNOVAL; 1116 vap->va_gen = VNOVAL; 1117 vap->va_vaflags = 0; 1118 } 1119 1120 /* 1121 * Try to reduce the total number of vnodes. 1122 * 1123 * This routine (and its user) are buggy in at least the following ways: 1124 * - all parameters were picked years ago when RAM sizes were significantly 1125 * smaller 1126 * - it can pick vnodes based on pages used by the vm object, but filesystems 1127 * like ZFS don't use it making the pick broken 1128 * - since ZFS has its own aging policy it gets partially combated by this one 1129 * - a dedicated method should be provided for filesystems to let them decide 1130 * whether the vnode should be recycled 1131 * 1132 * This routine is called when we have too many vnodes. It attempts 1133 * to free <count> vnodes and will potentially free vnodes that still 1134 * have VM backing store (VM backing store is typically the cause 1135 * of a vnode blowout so we want to do this). Therefore, this operation 1136 * is not considered cheap. 
 *
 * A number of conditions may prevent a vnode from being reclaimed:
 * the buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use. It is not
 * desirable to reuse such vnodes. These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
 *
 * @param reclaim_nc_src Only reclaim directories with outgoing namecache
 *        entries if this argument is true
 * @param trigger Only reclaim vnodes with no more than this many resident
 *        pages.
 * @param target How many vnodes to reclaim.
 * @return The number of vnodes that were reclaimed.
 */
static int
vlrureclaim(bool reclaim_nc_src, int trigger, u_long target)
{
	struct vnode *vp, *mvp;
	struct mount *mp;
	struct vm_object *object;
	u_long done;
	bool retried;

	mtx_assert(&vnode_list_mtx, MA_OWNED);

	retried = false;
	done = 0;

	mvp = vnode_list_reclaim_marker;
restart:
	vp = mvp;
	while (done < target) {
		vp = TAILQ_NEXT(vp, v_vnodelist);
		if (__predict_false(vp == NULL))
			break;

		if (__predict_false(vp->v_type == VMARKER))
			continue;

		/*
		 * If it's been deconstructed already, it's still
		 * referenced, or it exceeds the trigger, skip it.
		 * Also skip free vnodes. We are trying to make space
		 * to expand the free list, not reduce it.
		 */
		if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)))
			goto next_iter;

		if (vp->v_type == VBAD || vp->v_type == VNON)
			goto next_iter;

		object = atomic_load_ptr(&vp->v_object);
		if (object == NULL || object->resident_page_count > trigger) {
			goto next_iter;
		}

		/*
		 * Handle races against vnode allocation. Filesystems lock the
		 * vnode some time after it gets returned from getnewvnode,
		 * despite type and hold count being manipulated earlier.
		 * Resorting to checking v_mount restores guarantees present
		 * before the global list was reworked to contain all vnodes.
1202 */ 1203 if (!VI_TRYLOCK(vp)) 1204 goto next_iter; 1205 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { 1206 VI_UNLOCK(vp); 1207 goto next_iter; 1208 } 1209 if (vp->v_mount == NULL) { 1210 VI_UNLOCK(vp); 1211 goto next_iter; 1212 } 1213 vholdl(vp); 1214 VI_UNLOCK(vp); 1215 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1216 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1217 mtx_unlock(&vnode_list_mtx); 1218 1219 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 1220 vdrop_recycle(vp); 1221 goto next_iter_unlocked; 1222 } 1223 if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) { 1224 vdrop_recycle(vp); 1225 vn_finished_write(mp); 1226 goto next_iter_unlocked; 1227 } 1228 1229 VI_LOCK(vp); 1230 if (vp->v_usecount > 0 || 1231 (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || 1232 (vp->v_object != NULL && vp->v_object->handle == vp && 1233 vp->v_object->resident_page_count > trigger)) { 1234 VOP_UNLOCK(vp); 1235 vdropl_recycle(vp); 1236 vn_finished_write(mp); 1237 goto next_iter_unlocked; 1238 } 1239 counter_u64_add(recycles_count, 1); 1240 vgonel(vp); 1241 VOP_UNLOCK(vp); 1242 vdropl_recycle(vp); 1243 vn_finished_write(mp); 1244 done++; 1245 next_iter_unlocked: 1246 maybe_yield(); 1247 mtx_lock(&vnode_list_mtx); 1248 goto restart; 1249 next_iter: 1250 MPASS(vp->v_type != VMARKER); 1251 if (!should_yield()) 1252 continue; 1253 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1254 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1255 mtx_unlock(&vnode_list_mtx); 1256 kern_yield(PRI_USER); 1257 mtx_lock(&vnode_list_mtx); 1258 goto restart; 1259 } 1260 if (done == 0 && !retried) { 1261 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1262 TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); 1263 retried = true; 1264 goto restart; 1265 } 1266 return (done); 1267 } 1268 1269 static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ 1270 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, 1271 0, 1272 "limit on vnode free requests per call to the vnlru_free routine"); 1273 1274 /* 1275 * Attempt to reduce the free list by the requested amount. 1276 */ 1277 static int 1278 vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp) 1279 { 1280 struct vnode *vp; 1281 struct mount *mp; 1282 int ocount; 1283 1284 mtx_assert(&vnode_list_mtx, MA_OWNED); 1285 if (count > max_vnlru_free) 1286 count = max_vnlru_free; 1287 ocount = count; 1288 vp = mvp; 1289 for (;;) { 1290 if (count == 0) { 1291 break; 1292 } 1293 vp = TAILQ_NEXT(vp, v_vnodelist); 1294 if (__predict_false(vp == NULL)) { 1295 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1296 TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist); 1297 break; 1298 } 1299 if (__predict_false(vp->v_type == VMARKER)) 1300 continue; 1301 if (vp->v_holdcnt > 0) 1302 continue; 1303 /* 1304 * Don't recycle if our vnode is from different type 1305 * of mount point. Note that mp is type-safe, the 1306 * check does not reach unmapped address even if 1307 * vnode is reclaimed. 1308 */ 1309 if (mnt_op != NULL && (mp = vp->v_mount) != NULL && 1310 mp->mnt_op != mnt_op) { 1311 continue; 1312 } 1313 if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { 1314 continue; 1315 } 1316 if (!vhold_recycle_free(vp)) 1317 continue; 1318 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1319 TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); 1320 mtx_unlock(&vnode_list_mtx); 1321 /* 1322 * FIXME: ignores the return value, meaning it may be nothing 1323 * got recycled but it claims otherwise to the caller. 
1324 * 1325 * Originally the value started being ignored in 2005 with 1326 * 114a1006a8204aa156e1f9ad6476cdff89cada7f . 1327 * 1328 * Respecting the value can run into significant stalls if most 1329 * vnodes belong to one file system and it has writes 1330 * suspended. In presence of many threads and millions of 1331 * vnodes they keep contending on the vnode_list_mtx lock only 1332 * to find vnodes they can't recycle. 1333 * 1334 * The solution would be to pre-check if the vnode is likely to 1335 * be recycle-able, but it needs to happen with the 1336 * vnode_list_mtx lock held. This runs into a problem where 1337 * VOP_GETWRITEMOUNT (currently needed to find out about if 1338 * writes are frozen) can take locks which LOR against it. 1339 * 1340 * Check nullfs for one example (null_getwritemount). 1341 */ 1342 vtryrecycle(vp); 1343 count--; 1344 mtx_lock(&vnode_list_mtx); 1345 vp = mvp; 1346 } 1347 return (ocount - count); 1348 } 1349 1350 static int 1351 vnlru_free_locked(int count) 1352 { 1353 1354 mtx_assert(&vnode_list_mtx, MA_OWNED); 1355 return (vnlru_free_impl(count, NULL, vnode_list_free_marker)); 1356 } 1357 1358 void 1359 vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp) 1360 { 1361 1362 MPASS(mnt_op != NULL); 1363 MPASS(mvp != NULL); 1364 VNPASS(mvp->v_type == VMARKER, mvp); 1365 mtx_lock(&vnode_list_mtx); 1366 vnlru_free_impl(count, mnt_op, mvp); 1367 mtx_unlock(&vnode_list_mtx); 1368 } 1369 1370 struct vnode * 1371 vnlru_alloc_marker(void) 1372 { 1373 struct vnode *mvp; 1374 1375 mvp = vn_alloc_marker(NULL); 1376 mtx_lock(&vnode_list_mtx); 1377 TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist); 1378 mtx_unlock(&vnode_list_mtx); 1379 return (mvp); 1380 } 1381 1382 void 1383 vnlru_free_marker(struct vnode *mvp) 1384 { 1385 mtx_lock(&vnode_list_mtx); 1386 TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); 1387 mtx_unlock(&vnode_list_mtx); 1388 vn_free_marker(mvp); 1389 } 1390 1391 static void 1392 vnlru_recalc(void) 1393 { 1394 1395 mtx_assert(&vnode_list_mtx, MA_OWNED); 1396 gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); 1397 vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ 1398 vlowat = vhiwat / 2; 1399 } 1400 1401 /* 1402 * Attempt to recycle vnodes in a context that is always safe to block. 1403 * Calling vlrurecycle() from the bowels of filesystem code has some 1404 * interesting deadlock problems. 1405 */ 1406 static struct proc *vnlruproc; 1407 static int vnlruproc_sig; 1408 1409 /* 1410 * The main freevnodes counter is only updated when threads requeue their vnode 1411 * batches. CPUs are conditionally walked to compute a more accurate total. 1412 * 1413 * Limit how much of a slop are we willing to tolerate. Note: the actual value 1414 * at any given moment can still exceed slop, but it should not be by significant 1415 * margin in practice. 
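 *
 * As a rough bound (sketch, based on the rollup in vfs_freevnodes_inc()
 * and vfs_freevnodes_dec() below): each CPU can sit on at most
 * VNLRU_FREEVNODES_SLOP - 1 = 125 unflushed increments or decrements, so
 * on e.g. a 64-CPU machine the global counter can lag the true value by
 * up to about 64 * 125 = 8000 vnodes.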
1416 */ 1417 #define VNLRU_FREEVNODES_SLOP 126 1418 1419 static void __noinline 1420 vfs_freevnodes_rollup(int8_t *lfreevnodes) 1421 { 1422 1423 atomic_add_long(&freevnodes, *lfreevnodes); 1424 *lfreevnodes = 0; 1425 critical_exit(); 1426 } 1427 1428 static __inline void 1429 vfs_freevnodes_inc(void) 1430 { 1431 int8_t *lfreevnodes; 1432 1433 critical_enter(); 1434 lfreevnodes = PCPU_PTR(vfs_freevnodes); 1435 (*lfreevnodes)++; 1436 if (__predict_false(*lfreevnodes == VNLRU_FREEVNODES_SLOP)) 1437 vfs_freevnodes_rollup(lfreevnodes); 1438 else 1439 critical_exit(); 1440 } 1441 1442 static __inline void 1443 vfs_freevnodes_dec(void) 1444 { 1445 int8_t *lfreevnodes; 1446 1447 critical_enter(); 1448 lfreevnodes = PCPU_PTR(vfs_freevnodes); 1449 (*lfreevnodes)--; 1450 if (__predict_false(*lfreevnodes == -VNLRU_FREEVNODES_SLOP)) 1451 vfs_freevnodes_rollup(lfreevnodes); 1452 else 1453 critical_exit(); 1454 } 1455 1456 static u_long 1457 vnlru_read_freevnodes(void) 1458 { 1459 long slop, rfreevnodes; 1460 int cpu; 1461 1462 rfreevnodes = atomic_load_long(&freevnodes); 1463 1464 if (rfreevnodes > freevnodes_old) 1465 slop = rfreevnodes - freevnodes_old; 1466 else 1467 slop = freevnodes_old - rfreevnodes; 1468 if (slop < VNLRU_FREEVNODES_SLOP) 1469 return (rfreevnodes >= 0 ? rfreevnodes : 0); 1470 freevnodes_old = rfreevnodes; 1471 CPU_FOREACH(cpu) { 1472 freevnodes_old += cpuid_to_pcpu[cpu]->pc_vfs_freevnodes; 1473 } 1474 return (freevnodes_old >= 0 ? freevnodes_old : 0); 1475 } 1476 1477 static bool 1478 vnlru_under(u_long rnumvnodes, u_long limit) 1479 { 1480 u_long rfreevnodes, space; 1481 1482 if (__predict_false(rnumvnodes > desiredvnodes)) 1483 return (true); 1484 1485 space = desiredvnodes - rnumvnodes; 1486 if (space < limit) { 1487 rfreevnodes = vnlru_read_freevnodes(); 1488 if (rfreevnodes > wantfreevnodes) 1489 space += rfreevnodes - wantfreevnodes; 1490 } 1491 return (space < limit); 1492 } 1493 1494 static bool 1495 vnlru_under_unlocked(u_long rnumvnodes, u_long limit) 1496 { 1497 long rfreevnodes, space; 1498 1499 if (__predict_false(rnumvnodes > desiredvnodes)) 1500 return (true); 1501 1502 space = desiredvnodes - rnumvnodes; 1503 if (space < limit) { 1504 rfreevnodes = atomic_load_long(&freevnodes); 1505 if (rfreevnodes > wantfreevnodes) 1506 space += rfreevnodes - wantfreevnodes; 1507 } 1508 return (space < limit); 1509 } 1510 1511 static void 1512 vnlru_kick(void) 1513 { 1514 1515 mtx_assert(&vnode_list_mtx, MA_OWNED); 1516 if (vnlruproc_sig == 0) { 1517 vnlruproc_sig = 1; 1518 wakeup(vnlruproc); 1519 } 1520 } 1521 1522 static void 1523 vnlru_proc(void) 1524 { 1525 u_long rnumvnodes, rfreevnodes, target; 1526 unsigned long onumvnodes; 1527 int done, force, trigger, usevnodes; 1528 bool reclaim_nc_src, want_reread; 1529 1530 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, 1531 SHUTDOWN_PRI_FIRST); 1532 1533 force = 0; 1534 want_reread = false; 1535 for (;;) { 1536 kproc_suspend_check(vnlruproc); 1537 mtx_lock(&vnode_list_mtx); 1538 rnumvnodes = atomic_load_long(&numvnodes); 1539 1540 if (want_reread) { 1541 force = vnlru_under(numvnodes, vhiwat) ? 1 : 0; 1542 want_reread = false; 1543 } 1544 1545 /* 1546 * If numvnodes is too large (due to desiredvnodes being 1547 * adjusted using its sysctl, or emergency growth), first 1548 * try to reduce it by discarding from the free list. 
1549 */ 1550 if (rnumvnodes > desiredvnodes) { 1551 vnlru_free_locked(rnumvnodes - desiredvnodes); 1552 rnumvnodes = atomic_load_long(&numvnodes); 1553 } 1554 /* 1555 * Sleep if the vnode cache is in a good state. This is 1556 * when it is not over-full and has space for about a 4% 1557 * or 9% expansion (by growing its size or inexcessively 1558 * reducing its free list). Otherwise, try to reclaim 1559 * space for a 10% expansion. 1560 */ 1561 if (vstir && force == 0) { 1562 force = 1; 1563 vstir = 0; 1564 } 1565 if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { 1566 vnlruproc_sig = 0; 1567 wakeup(&vnlruproc_sig); 1568 msleep(vnlruproc, &vnode_list_mtx, 1569 PVFS|PDROP, "vlruwt", hz); 1570 continue; 1571 } 1572 rfreevnodes = vnlru_read_freevnodes(); 1573 1574 onumvnodes = rnumvnodes; 1575 /* 1576 * Calculate parameters for recycling. These are the same 1577 * throughout the loop to give some semblance of fairness. 1578 * The trigger point is to avoid recycling vnodes with lots 1579 * of resident pages. We aren't trying to free memory; we 1580 * are trying to recycle or at least free vnodes. 1581 */ 1582 if (rnumvnodes <= desiredvnodes) 1583 usevnodes = rnumvnodes - rfreevnodes; 1584 else 1585 usevnodes = rnumvnodes; 1586 if (usevnodes <= 0) 1587 usevnodes = 1; 1588 /* 1589 * The trigger value is chosen to give a conservatively 1590 * large value to ensure that it alone doesn't prevent 1591 * making progress. The value can easily be so large that 1592 * it is effectively infinite in some congested and 1593 * misconfigured cases, and this is necessary. Normally 1594 * it is about 8 to 100 (pages), which is quite large. 1595 */ 1596 trigger = vm_cnt.v_page_count * 2 / usevnodes; 1597 if (force < 2) 1598 trigger = vsmalltrigger; 1599 reclaim_nc_src = force >= 3; 1600 target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); 1601 target = target / 10 + 1; 1602 done = vlrureclaim(reclaim_nc_src, trigger, target); 1603 mtx_unlock(&vnode_list_mtx); 1604 if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) 1605 uma_reclaim(UMA_RECLAIM_DRAIN); 1606 if (done == 0) { 1607 if (force == 0 || force == 1) { 1608 force = 2; 1609 continue; 1610 } 1611 if (force == 2) { 1612 force = 3; 1613 continue; 1614 } 1615 want_reread = true; 1616 force = 0; 1617 vnlru_nowhere++; 1618 tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); 1619 } else { 1620 want_reread = true; 1621 kern_yield(PRI_USER); 1622 } 1623 } 1624 } 1625 1626 static struct kproc_desc vnlru_kp = { 1627 "vnlru", 1628 vnlru_proc, 1629 &vnlruproc 1630 }; 1631 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, 1632 &vnlru_kp); 1633 1634 /* 1635 * Routines having to do with the management of the vnode table. 1636 */ 1637 1638 /* 1639 * Try to recycle a freed vnode. We abort if anyone picks up a reference 1640 * before we actually vgone(). This function must be called with the vnode 1641 * held to prevent the vnode from being returned to the free list midway 1642 * through vgone(). 1643 */ 1644 static int 1645 vtryrecycle(struct vnode *vp) 1646 { 1647 struct mount *vnmp; 1648 1649 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 1650 VNPASS(vp->v_holdcnt > 0, vp); 1651 /* 1652 * This vnode may found and locked via some other list, if so we 1653 * can't recycle it yet. 
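 * In that case the non-blocking VOP_LOCK() below fails, we drop the hold
 * acquired by the caller and return EWOULDBLOCK so that the caller can
 * simply move on to another candidate.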
1654 */ 1655 if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 1656 CTR2(KTR_VFS, 1657 "%s: impossible to recycle, vp %p lock is already held", 1658 __func__, vp); 1659 vdrop_recycle(vp); 1660 return (EWOULDBLOCK); 1661 } 1662 /* 1663 * Don't recycle if its filesystem is being suspended. 1664 */ 1665 if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { 1666 VOP_UNLOCK(vp); 1667 CTR2(KTR_VFS, 1668 "%s: impossible to recycle, cannot start the write for %p", 1669 __func__, vp); 1670 vdrop_recycle(vp); 1671 return (EBUSY); 1672 } 1673 /* 1674 * If we got this far, we need to acquire the interlock and see if 1675 * anyone picked up this vnode from another list. If not, we will 1676 * mark it with DOOMED via vgonel() so that anyone who does find it 1677 * will skip over it. 1678 */ 1679 VI_LOCK(vp); 1680 if (vp->v_usecount) { 1681 VOP_UNLOCK(vp); 1682 vdropl_recycle(vp); 1683 vn_finished_write(vnmp); 1684 CTR2(KTR_VFS, 1685 "%s: impossible to recycle, %p is already referenced", 1686 __func__, vp); 1687 return (EBUSY); 1688 } 1689 if (!VN_IS_DOOMED(vp)) { 1690 counter_u64_add(recycles_free_count, 1); 1691 vgonel(vp); 1692 } 1693 VOP_UNLOCK(vp); 1694 vdropl_recycle(vp); 1695 vn_finished_write(vnmp); 1696 return (0); 1697 } 1698 1699 /* 1700 * Allocate a new vnode. 1701 * 1702 * The operation never returns an error. Returning an error was disabled 1703 * in r145385 (dated 2005) with the following comment: 1704 * 1705 * XXX Not all VFS_VGET/ffs_vget callers check returns. 1706 * 1707 * Given the age of this commit (almost 15 years at the time of writing this 1708 * comment) restoring the ability to fail requires a significant audit of 1709 * all codepaths. 1710 * 1711 * The routine can try to free a vnode or stall for up to 1 second waiting for 1712 * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation. 1713 */ 1714 static u_long vn_alloc_cyclecount; 1715 static u_long vn_alloc_sleeps; 1716 1717 SYSCTL_ULONG(_vfs, OID_AUTO, vnode_alloc_sleeps, CTLFLAG_RD, &vn_alloc_sleeps, 0, 1718 "Number of times vnode allocation blocked waiting on vnlru"); 1719 1720 static struct vnode * __noinline 1721 vn_alloc_hard(struct mount *mp) 1722 { 1723 u_long rnumvnodes, rfreevnodes; 1724 1725 mtx_lock(&vnode_list_mtx); 1726 rnumvnodes = atomic_load_long(&numvnodes); 1727 if (rnumvnodes + 1 < desiredvnodes) { 1728 vn_alloc_cyclecount = 0; 1729 goto alloc; 1730 } 1731 rfreevnodes = vnlru_read_freevnodes(); 1732 if (vn_alloc_cyclecount++ >= rfreevnodes) { 1733 vn_alloc_cyclecount = 0; 1734 vstir = 1; 1735 } 1736 /* 1737 * Grow the vnode cache if it will not be above its target max 1738 * after growing. Otherwise, if the free list is nonempty, try 1739 * to reclaim 1 item from it before growing the cache (possibly 1740 * above its target max if the reclamation failed or is delayed). 1741 * Otherwise, wait for some space. In all cases, schedule 1742 * vnlru_proc() if we are getting short of space. The watermarks 1743 * should be chosen so that we never wait or even reclaim from 1744 * the free list to below its target minimum. 1745 */ 1746 if (vnlru_free_locked(1) > 0) 1747 goto alloc; 1748 if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { 1749 /* 1750 * Wait for space for a new vnode. 
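 * The sleep below is bounded to roughly one second (the hz timeout to
 * msleep()), matching the "stall for up to 1 second" behaviour described
 * above; vfs.vnode_alloc_sleeps counts how often this path is taken.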
1751 */ 1752 vnlru_kick(); 1753 vn_alloc_sleeps++; 1754 msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); 1755 if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && 1756 vnlru_read_freevnodes() > 1) 1757 vnlru_free_locked(1); 1758 } 1759 alloc: 1760 rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; 1761 if (vnlru_under(rnumvnodes, vlowat)) 1762 vnlru_kick(); 1763 mtx_unlock(&vnode_list_mtx); 1764 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 1765 } 1766 1767 static struct vnode * 1768 vn_alloc(struct mount *mp) 1769 { 1770 u_long rnumvnodes; 1771 1772 if (__predict_false(vn_alloc_cyclecount != 0)) 1773 return (vn_alloc_hard(mp)); 1774 rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; 1775 if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) { 1776 atomic_subtract_long(&numvnodes, 1); 1777 return (vn_alloc_hard(mp)); 1778 } 1779 1780 return (uma_zalloc_smr(vnode_zone, M_WAITOK)); 1781 } 1782 1783 static void 1784 vn_free(struct vnode *vp) 1785 { 1786 1787 atomic_subtract_long(&numvnodes, 1); 1788 uma_zfree_smr(vnode_zone, vp); 1789 } 1790 1791 /* 1792 * Return the next vnode from the free list. 1793 */ 1794 int 1795 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, 1796 struct vnode **vpp) 1797 { 1798 struct vnode *vp; 1799 struct thread *td; 1800 struct lock_object *lo; 1801 1802 CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); 1803 1804 KASSERT(vops->registered, 1805 ("%s: not registered vector op %p\n", __func__, vops)); 1806 cache_validate_vop_vector(mp, vops); 1807 1808 td = curthread; 1809 if (td->td_vp_reserved != NULL) { 1810 vp = td->td_vp_reserved; 1811 td->td_vp_reserved = NULL; 1812 } else { 1813 vp = vn_alloc(mp); 1814 } 1815 counter_u64_add(vnodes_created, 1); 1816 1817 vn_set_state(vp, VSTATE_UNINITIALIZED); 1818 1819 /* 1820 * Locks are given the generic name "vnode" when created. 1821 * Follow the historic practice of using the filesystem 1822 * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. 1823 * 1824 * Locks live in a witness group keyed on their name. Thus, 1825 * when a lock is renamed, it must also move from the witness 1826 * group of its old name to the witness group of its new name. 1827 * 1828 * The change only needs to be made when the vnode moves 1829 * from one filesystem type to another. We ensure that each 1830 * filesystem use a single static name pointer for its tag so 1831 * that we can compare pointers rather than doing a strcmp(). 1832 */ 1833 lo = &vp->v_vnlock->lock_object; 1834 #ifdef WITNESS 1835 if (lo->lo_name != tag) { 1836 #endif 1837 lo->lo_name = tag; 1838 #ifdef WITNESS 1839 WITNESS_DESTROY(lo); 1840 WITNESS_INIT(lo, tag); 1841 } 1842 #endif 1843 /* 1844 * By default, don't allow shared locks unless filesystems opt-in. 1845 */ 1846 vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; 1847 /* 1848 * Finalize various vnode identity bits. 
1849 */ 1850 KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); 1851 KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); 1852 KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); 1853 vp->v_type = VNON; 1854 vp->v_op = vops; 1855 vp->v_irflag = 0; 1856 v_init_counters(vp); 1857 vn_seqc_init(vp); 1858 vp->v_bufobj.bo_ops = &buf_ops_bio; 1859 #ifdef DIAGNOSTIC 1860 if (mp == NULL && vops != &dead_vnodeops) 1861 printf("NULL mp in getnewvnode(9), tag %s\n", tag); 1862 #endif 1863 #ifdef MAC 1864 mac_vnode_init(vp); 1865 if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) 1866 mac_vnode_associate_singlelabel(mp, vp); 1867 #endif 1868 if (mp != NULL) { 1869 vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; 1870 } 1871 1872 /* 1873 * For the filesystems which do not use vfs_hash_insert(), 1874 * still initialize v_hash to have vfs_hash_index() useful. 1875 * E.g., nullfs uses vfs_hash_index() on the lower vnode for 1876 * its own hashing. 1877 */ 1878 vp->v_hash = (uintptr_t)vp >> vnsz2log; 1879 1880 *vpp = vp; 1881 return (0); 1882 } 1883 1884 void 1885 getnewvnode_reserve(void) 1886 { 1887 struct thread *td; 1888 1889 td = curthread; 1890 MPASS(td->td_vp_reserved == NULL); 1891 td->td_vp_reserved = vn_alloc(NULL); 1892 } 1893 1894 void 1895 getnewvnode_drop_reserve(void) 1896 { 1897 struct thread *td; 1898 1899 td = curthread; 1900 if (td->td_vp_reserved != NULL) { 1901 vn_free(td->td_vp_reserved); 1902 td->td_vp_reserved = NULL; 1903 } 1904 } 1905 1906 static void __noinline 1907 freevnode(struct vnode *vp) 1908 { 1909 struct bufobj *bo; 1910 1911 /* 1912 * The vnode has been marked for destruction, so free it. 1913 * 1914 * The vnode will be returned to the zone where it will 1915 * normally remain until it is needed for another vnode. We 1916 * need to cleanup (or verify that the cleanup has already 1917 * been done) any residual data left from its current use 1918 * so as not to contaminate the freshly allocated vnode. 1919 */ 1920 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); 1921 /* 1922 * Paired with vgone. 1923 */ 1924 vn_seqc_write_end_free(vp); 1925 1926 bo = &vp->v_bufobj; 1927 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); 1928 VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); 1929 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); 1930 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); 1931 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); 1932 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); 1933 VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, 1934 ("clean blk trie not empty")); 1935 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); 1936 VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, 1937 ("dirty blk trie not empty")); 1938 VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, 1939 ("Dangling rangelock waiters")); 1940 VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp, 1941 ("Leaked inactivation")); 1942 VI_UNLOCK(vp); 1943 cache_assert_no_entries(vp); 1944 1945 #ifdef MAC 1946 mac_vnode_destroy(vp); 1947 #endif 1948 if (vp->v_pollinfo != NULL) { 1949 /* 1950 * Use LK_NOWAIT to shut up witness about the lock. We may get 1951 * here while having another vnode locked when trying to 1952 * satisfy a lookup and needing to recycle. 
1953 */ 1954 VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT); 1955 destroy_vpollinfo(vp->v_pollinfo); 1956 VOP_UNLOCK(vp); 1957 vp->v_pollinfo = NULL; 1958 } 1959 vp->v_mountedhere = NULL; 1960 vp->v_unpcb = NULL; 1961 vp->v_rdev = NULL; 1962 vp->v_fifoinfo = NULL; 1963 vp->v_iflag = 0; 1964 vp->v_vflag = 0; 1965 bo->bo_flag = 0; 1966 vn_free(vp); 1967 } 1968 1969 /* 1970 * Delete from old mount point vnode list, if on one. 1971 */ 1972 static void 1973 delmntque(struct vnode *vp) 1974 { 1975 struct mount *mp; 1976 1977 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 1978 1979 mp = vp->v_mount; 1980 MNT_ILOCK(mp); 1981 VI_LOCK(vp); 1982 vp->v_mount = NULL; 1983 VNASSERT(mp->mnt_nvnodelistsize > 0, vp, 1984 ("bad mount point vnode list size")); 1985 TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 1986 mp->mnt_nvnodelistsize--; 1987 MNT_REL(mp); 1988 MNT_IUNLOCK(mp); 1989 /* 1990 * The caller expects the interlock to be still held. 1991 */ 1992 ASSERT_VI_LOCKED(vp, __func__); 1993 } 1994 1995 static int 1996 insmntque1_int(struct vnode *vp, struct mount *mp, bool dtr) 1997 { 1998 1999 KASSERT(vp->v_mount == NULL, 2000 ("insmntque: vnode already on per mount vnode list")); 2001 VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); 2002 if ((mp->mnt_kern_flag & MNTK_UNLOCKED_INSMNTQUE) == 0) { 2003 ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); 2004 } else { 2005 KASSERT(!dtr, 2006 ("%s: can't have MNTK_UNLOCKED_INSMNTQUE and cleanup", 2007 __func__)); 2008 } 2009 2010 /* 2011 * We acquire the vnode interlock early to ensure that the 2012 * vnode cannot be recycled by another process releasing a 2013 * holdcnt on it before we get it on both the vnode list 2014 * and the active vnode list. The mount mutex protects only 2015 * manipulation of the vnode list and the vnode freelist 2016 * mutex protects only manipulation of the active vnode list. 2017 * Hence the need to hold the vnode interlock throughout. 2018 */ 2019 MNT_ILOCK(mp); 2020 VI_LOCK(vp); 2021 if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && 2022 ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || 2023 mp->mnt_nvnodelistsize == 0)) && 2024 (vp->v_vflag & VV_FORCEINSMQ) == 0) { 2025 VI_UNLOCK(vp); 2026 MNT_IUNLOCK(mp); 2027 if (dtr) { 2028 vp->v_data = NULL; 2029 vp->v_op = &dead_vnodeops; 2030 vgone(vp); 2031 vput(vp); 2032 } 2033 return (EBUSY); 2034 } 2035 vp->v_mount = mp; 2036 MNT_REF(mp); 2037 TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); 2038 VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, 2039 ("neg mount point vnode list size")); 2040 mp->mnt_nvnodelistsize++; 2041 VI_UNLOCK(vp); 2042 MNT_IUNLOCK(mp); 2043 return (0); 2044 } 2045 2046 /* 2047 * Insert into list of vnodes for the new mount point, if available. 2048 * insmntque() reclaims the vnode on insertion failure, insmntque1() 2049 * leaves handling of the vnode to the caller. 2050 */ 2051 int 2052 insmntque(struct vnode *vp, struct mount *mp) 2053 { 2054 return (insmntque1_int(vp, mp, true)); 2055 } 2056 2057 int 2058 insmntque1(struct vnode *vp, struct mount *mp) 2059 { 2060 return (insmntque1_int(vp, mp, false)); 2061 } 2062 2063 /* 2064 * Flush out and invalidate all buffers associated with a bufobj 2065 * Called with the underlying object locked. 
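 *
 * (Added note, not part of the original source: the usual consumer is
 * vinvalbuf() just below, called with the vnode locked.  For example,
 * vgonel() later in this file discards a dying vnode's buffers with:
 *
 *      if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
 *              while (vinvalbuf(vp, 0, 0, 0) != 0)
 *                      ;
 *      }
 *
 * i.e. first attempt to write the dirty buffers out (V_SAVE) and only
 * fall back to discarding them if that fails.)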
2066 */ 2067 int 2068 bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) 2069 { 2070 int error; 2071 2072 BO_LOCK(bo); 2073 if (flags & V_SAVE) { 2074 error = bufobj_wwait(bo, slpflag, slptimeo); 2075 if (error) { 2076 BO_UNLOCK(bo); 2077 return (error); 2078 } 2079 if (bo->bo_dirty.bv_cnt > 0) { 2080 BO_UNLOCK(bo); 2081 do { 2082 error = BO_SYNC(bo, MNT_WAIT); 2083 } while (error == ERELOOKUP); 2084 if (error != 0) 2085 return (error); 2086 BO_LOCK(bo); 2087 if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { 2088 BO_UNLOCK(bo); 2089 return (EBUSY); 2090 } 2091 } 2092 } 2093 /* 2094 * If you alter this loop please notice that interlock is dropped and 2095 * reacquired in flushbuflist. Special care is needed to ensure that 2096 * no race conditions occur from this. 2097 */ 2098 do { 2099 error = flushbuflist(&bo->bo_clean, 2100 flags, bo, slpflag, slptimeo); 2101 if (error == 0 && !(flags & V_CLEANONLY)) 2102 error = flushbuflist(&bo->bo_dirty, 2103 flags, bo, slpflag, slptimeo); 2104 if (error != 0 && error != EAGAIN) { 2105 BO_UNLOCK(bo); 2106 return (error); 2107 } 2108 } while (error != 0); 2109 2110 /* 2111 * Wait for I/O to complete. XXX needs cleaning up. The vnode can 2112 * have write I/O in-progress but if there is a VM object then the 2113 * VM object can also have read-I/O in-progress. 2114 */ 2115 do { 2116 bufobj_wwait(bo, 0, 0); 2117 if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { 2118 BO_UNLOCK(bo); 2119 vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); 2120 BO_LOCK(bo); 2121 } 2122 } while (bo->bo_numoutput > 0); 2123 BO_UNLOCK(bo); 2124 2125 /* 2126 * Destroy the copy in the VM cache, too. 2127 */ 2128 if (bo->bo_object != NULL && 2129 (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { 2130 VM_OBJECT_WLOCK(bo->bo_object); 2131 vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? 2132 OBJPR_CLEANONLY : 0); 2133 VM_OBJECT_WUNLOCK(bo->bo_object); 2134 } 2135 2136 #ifdef INVARIANTS 2137 BO_LOCK(bo); 2138 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | 2139 V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || 2140 bo->bo_clean.bv_cnt > 0)) 2141 panic("vinvalbuf: flush failed"); 2142 if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && 2143 bo->bo_dirty.bv_cnt > 0) 2144 panic("vinvalbuf: flush dirty failed"); 2145 BO_UNLOCK(bo); 2146 #endif 2147 return (0); 2148 } 2149 2150 /* 2151 * Flush out and invalidate all buffers associated with a vnode. 2152 * Called with the underlying object locked. 2153 */ 2154 int 2155 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) 2156 { 2157 2158 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 2159 ASSERT_VOP_LOCKED(vp, "vinvalbuf"); 2160 if (vp->v_object != NULL && vp->v_object->handle != vp) 2161 return (0); 2162 return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); 2163 } 2164 2165 /* 2166 * Flush out buffers on the specified list. 2167 * 2168 */ 2169 static int 2170 flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, 2171 int slptimeo) 2172 { 2173 struct buf *bp, *nbp; 2174 int retval, error; 2175 daddr_t lblkno; 2176 b_xflags_t xflags; 2177 2178 ASSERT_BO_WLOCKED(bo); 2179 2180 retval = 0; 2181 TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { 2182 /* 2183 * If we are flushing both V_NORMAL and V_ALT buffers then 2184 * do not skip any buffers. If we are flushing only V_NORMAL 2185 * buffers then skip buffers marked as BX_ALTDATA. 
If we are 2186 * flushing only V_ALT buffers then skip buffers not marked 2187 * as BX_ALTDATA. 2188 */ 2189 if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && 2190 (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || 2191 ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { 2192 continue; 2193 } 2194 if (nbp != NULL) { 2195 lblkno = nbp->b_lblkno; 2196 xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); 2197 } 2198 retval = EAGAIN; 2199 error = BUF_TIMELOCK(bp, 2200 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), 2201 "flushbuf", slpflag, slptimeo); 2202 if (error) { 2203 BO_LOCK(bo); 2204 return (error != ENOLCK ? error : EAGAIN); 2205 } 2206 KASSERT(bp->b_bufobj == bo, 2207 ("bp %p wrong b_bufobj %p should be %p", 2208 bp, bp->b_bufobj, bo)); 2209 /* 2210 * XXX Since there are no node locks for NFS, I 2211 * believe there is a slight chance that a delayed 2212 * write will occur while sleeping just above, so 2213 * check for it. 2214 */ 2215 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && 2216 (flags & V_SAVE)) { 2217 bremfree(bp); 2218 bp->b_flags |= B_ASYNC; 2219 bwrite(bp); 2220 BO_LOCK(bo); 2221 return (EAGAIN); /* XXX: why not loop ? */ 2222 } 2223 bremfree(bp); 2224 bp->b_flags |= (B_INVAL | B_RELBUF); 2225 bp->b_flags &= ~B_ASYNC; 2226 brelse(bp); 2227 BO_LOCK(bo); 2228 if (nbp == NULL) 2229 break; 2230 nbp = gbincore(bo, lblkno); 2231 if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2232 != xflags) 2233 break; /* nbp invalid */ 2234 } 2235 return (retval); 2236 } 2237 2238 int 2239 bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) 2240 { 2241 struct buf *bp; 2242 int error; 2243 daddr_t lblkno; 2244 2245 ASSERT_BO_LOCKED(bo); 2246 2247 for (lblkno = startn;;) { 2248 again: 2249 bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); 2250 if (bp == NULL || bp->b_lblkno >= endn || 2251 bp->b_lblkno < startn) 2252 break; 2253 error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | 2254 LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); 2255 if (error != 0) { 2256 BO_RLOCK(bo); 2257 if (error == ENOLCK) 2258 goto again; 2259 return (error); 2260 } 2261 KASSERT(bp->b_bufobj == bo, 2262 ("bp %p wrong b_bufobj %p should be %p", 2263 bp, bp->b_bufobj, bo)); 2264 lblkno = bp->b_lblkno + 1; 2265 if ((bp->b_flags & B_MANAGED) == 0) 2266 bremfree(bp); 2267 bp->b_flags |= B_RELBUF; 2268 /* 2269 * In the VMIO case, use the B_NOREUSE flag to hint that the 2270 * pages backing each buffer in the range are unlikely to be 2271 * reused. Dirty buffers will have the hint applied once 2272 * they've been written. 2273 */ 2274 if ((bp->b_flags & B_VMIO) != 0) 2275 bp->b_flags |= B_NOREUSE; 2276 brelse(bp); 2277 BO_RLOCK(bo); 2278 } 2279 return (0); 2280 } 2281 2282 /* 2283 * Truncate a file's buffer and pages to a specified length. This 2284 * is in lieu of the old vinvalbuf mechanism, which performed unneeded 2285 * sync activity. 2286 */ 2287 int 2288 vtruncbuf(struct vnode *vp, off_t length, int blksize) 2289 { 2290 struct buf *bp, *nbp; 2291 struct bufobj *bo; 2292 daddr_t startlbn; 2293 2294 CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, 2295 vp, blksize, (uintmax_t)length); 2296 2297 /* 2298 * Round up to the *next* lbn. 
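 *
 * (Worked example added for illustration: howmany() rounds up, so with
 * blksize 4096 a new length of 4096 yields startlbn 1, keeping block 0
 * which still holds valid data, while a new length of 4097 yields
 * startlbn 2, keeping block 1 which now holds the byte at offset 4096.
 * Blocks from startlbn onward lie wholly past the new end of file and
 * are invalidated below.)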
2299 */ 2300 startlbn = howmany(length, blksize); 2301 2302 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 2303 2304 bo = &vp->v_bufobj; 2305 restart_unlocked: 2306 BO_LOCK(bo); 2307 2308 while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) 2309 ; 2310 2311 if (length > 0) { 2312 restartsync: 2313 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2314 if (bp->b_lblkno > 0) 2315 continue; 2316 /* 2317 * Since we hold the vnode lock this should only 2318 * fail if we're racing with the buf daemon. 2319 */ 2320 if (BUF_LOCK(bp, 2321 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2322 BO_LOCKPTR(bo)) == ENOLCK) 2323 goto restart_unlocked; 2324 2325 VNASSERT((bp->b_flags & B_DELWRI), vp, 2326 ("buf(%p) on dirty queue without DELWRI", bp)); 2327 2328 bremfree(bp); 2329 bawrite(bp); 2330 BO_LOCK(bo); 2331 goto restartsync; 2332 } 2333 } 2334 2335 bufobj_wwait(bo, 0, 0); 2336 BO_UNLOCK(bo); 2337 vnode_pager_setsize(vp, length); 2338 2339 return (0); 2340 } 2341 2342 /* 2343 * Invalidate the cached pages of a file's buffer within the range of block 2344 * numbers [startlbn, endlbn). 2345 */ 2346 void 2347 v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, 2348 int blksize) 2349 { 2350 struct bufobj *bo; 2351 off_t start, end; 2352 2353 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); 2354 2355 start = blksize * startlbn; 2356 end = blksize * endlbn; 2357 2358 bo = &vp->v_bufobj; 2359 BO_LOCK(bo); 2360 MPASS(blksize == bo->bo_bsize); 2361 2362 while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) 2363 ; 2364 2365 BO_UNLOCK(bo); 2366 vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); 2367 } 2368 2369 static int 2370 v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, 2371 daddr_t startlbn, daddr_t endlbn) 2372 { 2373 struct buf *bp, *nbp; 2374 bool anyfreed; 2375 2376 ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); 2377 ASSERT_BO_LOCKED(bo); 2378 2379 do { 2380 anyfreed = false; 2381 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { 2382 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2383 continue; 2384 if (BUF_LOCK(bp, 2385 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2386 BO_LOCKPTR(bo)) == ENOLCK) { 2387 BO_LOCK(bo); 2388 return (EAGAIN); 2389 } 2390 2391 bremfree(bp); 2392 bp->b_flags |= B_INVAL | B_RELBUF; 2393 bp->b_flags &= ~B_ASYNC; 2394 brelse(bp); 2395 anyfreed = true; 2396 2397 BO_LOCK(bo); 2398 if (nbp != NULL && 2399 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 2400 nbp->b_vp != vp || 2401 (nbp->b_flags & B_DELWRI) != 0)) 2402 return (EAGAIN); 2403 } 2404 2405 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 2406 if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) 2407 continue; 2408 if (BUF_LOCK(bp, 2409 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 2410 BO_LOCKPTR(bo)) == ENOLCK) { 2411 BO_LOCK(bo); 2412 return (EAGAIN); 2413 } 2414 bremfree(bp); 2415 bp->b_flags |= B_INVAL | B_RELBUF; 2416 bp->b_flags &= ~B_ASYNC; 2417 brelse(bp); 2418 anyfreed = true; 2419 2420 BO_LOCK(bo); 2421 if (nbp != NULL && 2422 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 2423 (nbp->b_vp != vp) || 2424 (nbp->b_flags & B_DELWRI) == 0)) 2425 return (EAGAIN); 2426 } 2427 } while (anyfreed); 2428 return (0); 2429 } 2430 2431 static void 2432 buf_vlist_remove(struct buf *bp) 2433 { 2434 struct bufv *bv; 2435 b_xflags_t flags; 2436 2437 flags = bp->b_xflags; 2438 2439 KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); 2440 ASSERT_BO_WLOCKED(bp->b_bufobj); 2441 KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && 
2442 (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), 2443 ("%s: buffer %p has invalid queue state", __func__, bp)); 2444 2445 if ((flags & BX_VNDIRTY) != 0) 2446 bv = &bp->b_bufobj->bo_dirty; 2447 else 2448 bv = &bp->b_bufobj->bo_clean; 2449 BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); 2450 TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); 2451 bv->bv_cnt--; 2452 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); 2453 } 2454 2455 /* 2456 * Add the buffer to the sorted clean or dirty block list. 2457 * 2458 * NOTE: xflags is passed as a constant, optimizing this inline function! 2459 */ 2460 static void 2461 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) 2462 { 2463 struct bufv *bv; 2464 struct buf *n; 2465 int error; 2466 2467 ASSERT_BO_WLOCKED(bo); 2468 KASSERT((bo->bo_flag & BO_NOBUFS) == 0, 2469 ("buf_vlist_add: bo %p does not allow bufs", bo)); 2470 KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, 2471 ("dead bo %p", bo)); 2472 KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, 2473 ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); 2474 bp->b_xflags |= xflags; 2475 if (xflags & BX_VNDIRTY) 2476 bv = &bo->bo_dirty; 2477 else 2478 bv = &bo->bo_clean; 2479 2480 /* 2481 * Keep the list ordered. Optimize empty list insertion. Assume 2482 * we tend to grow at the tail so lookup_le should usually be cheaper 2483 * than _ge. 2484 */ 2485 if (bv->bv_cnt == 0 || 2486 bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) 2487 TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); 2488 else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) 2489 TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); 2490 else 2491 TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); 2492 error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); 2493 if (error) 2494 panic("buf_vlist_add: Preallocated nodes insufficient."); 2495 bv->bv_cnt++; 2496 } 2497 2498 /* 2499 * Look up a buffer using the buffer tries. 2500 */ 2501 struct buf * 2502 gbincore(struct bufobj *bo, daddr_t lblkno) 2503 { 2504 struct buf *bp; 2505 2506 ASSERT_BO_LOCKED(bo); 2507 bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); 2508 if (bp != NULL) 2509 return (bp); 2510 return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); 2511 } 2512 2513 /* 2514 * Look up a buf using the buffer tries, without the bufobj lock. This relies 2515 * on SMR for safe lookup, and bufs being in a no-free zone to provide type 2516 * stability of the result. Like other lockless lookups, the found buf may 2517 * already be invalid by the time this function returns. 2518 */ 2519 struct buf * 2520 gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) 2521 { 2522 struct buf *bp; 2523 2524 ASSERT_BO_UNLOCKED(bo); 2525 bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); 2526 if (bp != NULL) 2527 return (bp); 2528 return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); 2529 } 2530 2531 /* 2532 * Associate a buffer with a vnode. 2533 */ 2534 void 2535 bgetvp(struct vnode *vp, struct buf *bp) 2536 { 2537 struct bufobj *bo; 2538 2539 bo = &vp->v_bufobj; 2540 ASSERT_BO_WLOCKED(bo); 2541 VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); 2542 2543 CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); 2544 VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, 2545 ("bgetvp: bp already attached! %p", bp)); 2546 2547 vhold(vp); 2548 bp->b_vp = vp; 2549 bp->b_bufobj = bo; 2550 /* 2551 * Insert onto list for new vnode. 
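 *
 * (Illustrative sketch, not part of the original source: a buffer is
 * attached while the bufobj is write-locked and is detached again with
 * brelvp() when torn down, e.g.:
 *
 *      BO_LOCK(bo);
 *      bgetvp(vp, bp);
 *      BO_UNLOCK(bo);
 *      ...
 *      brelvp(bp);     takes the bufobj lock itself, ends in vdrop()
 *
 * The vhold() above keeps the vnode from being recycled for as long as
 * the buffer references it.)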
2552 */ 2553 buf_vlist_add(bp, bo, BX_VNCLEAN); 2554 } 2555 2556 /* 2557 * Disassociate a buffer from a vnode. 2558 */ 2559 void 2560 brelvp(struct buf *bp) 2561 { 2562 struct bufobj *bo; 2563 struct vnode *vp; 2564 2565 CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); 2566 KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); 2567 2568 /* 2569 * Delete from old vnode list, if on one. 2570 */ 2571 vp = bp->b_vp; /* XXX */ 2572 bo = bp->b_bufobj; 2573 BO_LOCK(bo); 2574 buf_vlist_remove(bp); 2575 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2576 bo->bo_flag &= ~BO_ONWORKLST; 2577 mtx_lock(&sync_mtx); 2578 LIST_REMOVE(bo, bo_synclist); 2579 syncer_worklist_len--; 2580 mtx_unlock(&sync_mtx); 2581 } 2582 bp->b_vp = NULL; 2583 bp->b_bufobj = NULL; 2584 BO_UNLOCK(bo); 2585 vdrop(vp); 2586 } 2587 2588 /* 2589 * Add an item to the syncer work queue. 2590 */ 2591 static void 2592 vn_syncer_add_to_worklist(struct bufobj *bo, int delay) 2593 { 2594 int slot; 2595 2596 ASSERT_BO_WLOCKED(bo); 2597 2598 mtx_lock(&sync_mtx); 2599 if (bo->bo_flag & BO_ONWORKLST) 2600 LIST_REMOVE(bo, bo_synclist); 2601 else { 2602 bo->bo_flag |= BO_ONWORKLST; 2603 syncer_worklist_len++; 2604 } 2605 2606 if (delay > syncer_maxdelay - 2) 2607 delay = syncer_maxdelay - 2; 2608 slot = (syncer_delayno + delay) & syncer_mask; 2609 2610 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); 2611 mtx_unlock(&sync_mtx); 2612 } 2613 2614 static int 2615 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) 2616 { 2617 int error, len; 2618 2619 mtx_lock(&sync_mtx); 2620 len = syncer_worklist_len - sync_vnode_count; 2621 mtx_unlock(&sync_mtx); 2622 error = SYSCTL_OUT(req, &len, sizeof(len)); 2623 return (error); 2624 } 2625 2626 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, 2627 CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, 2628 sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); 2629 2630 static struct proc *updateproc; 2631 static void sched_sync(void); 2632 static struct kproc_desc up_kp = { 2633 "syncer", 2634 sched_sync, 2635 &updateproc 2636 }; 2637 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); 2638 2639 static int 2640 sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) 2641 { 2642 struct vnode *vp; 2643 struct mount *mp; 2644 2645 *bo = LIST_FIRST(slp); 2646 if (*bo == NULL) 2647 return (0); 2648 vp = bo2vnode(*bo); 2649 if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) 2650 return (1); 2651 /* 2652 * We use vhold in case the vnode does not 2653 * successfully sync. vhold prevents the vnode from 2654 * going away when we unlock the sync_mtx so that 2655 * we can acquire the vnode interlock. 2656 */ 2657 vholdl(vp); 2658 mtx_unlock(&sync_mtx); 2659 VI_UNLOCK(vp); 2660 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2661 vdrop(vp); 2662 mtx_lock(&sync_mtx); 2663 return (*bo == LIST_FIRST(slp)); 2664 } 2665 MPASSERT(mp == NULL || (curthread->td_pflags & TDP_IGNSUSP) != 0 || 2666 (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0, mp, 2667 ("suspended mp syncing vp %p", vp)); 2668 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2669 (void) VOP_FSYNC(vp, MNT_LAZY, td); 2670 VOP_UNLOCK(vp); 2671 vn_finished_write(mp); 2672 BO_LOCK(*bo); 2673 if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { 2674 /* 2675 * Put us back on the worklist. The worklist 2676 * routine will remove us from our current 2677 * position and then add us back in at a later 2678 * position. 
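 *
 * (Added note: worklist membership is normally driven by reassignbuf()
 * further down in this file, which calls vn_syncer_add_to_worklist()
 * when a vnode gains a dirty buffer and takes the bufobj back off the
 * list once bo_dirty drains.)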
2679 */ 2680 vn_syncer_add_to_worklist(*bo, syncdelay); 2681 } 2682 BO_UNLOCK(*bo); 2683 vdrop(vp); 2684 mtx_lock(&sync_mtx); 2685 return (0); 2686 } 2687 2688 static int first_printf = 1; 2689 2690 /* 2691 * System filesystem synchronizer daemon. 2692 */ 2693 static void 2694 sched_sync(void) 2695 { 2696 struct synclist *next, *slp; 2697 struct bufobj *bo; 2698 long starttime; 2699 struct thread *td = curthread; 2700 int last_work_seen; 2701 int net_worklist_len; 2702 int syncer_final_iter; 2703 int error; 2704 2705 last_work_seen = 0; 2706 syncer_final_iter = 0; 2707 syncer_state = SYNCER_RUNNING; 2708 starttime = time_uptime; 2709 td->td_pflags |= TDP_NORUNNINGBUF; 2710 2711 EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, 2712 SHUTDOWN_PRI_LAST); 2713 2714 mtx_lock(&sync_mtx); 2715 for (;;) { 2716 if (syncer_state == SYNCER_FINAL_DELAY && 2717 syncer_final_iter == 0) { 2718 mtx_unlock(&sync_mtx); 2719 kproc_suspend_check(td->td_proc); 2720 mtx_lock(&sync_mtx); 2721 } 2722 net_worklist_len = syncer_worklist_len - sync_vnode_count; 2723 if (syncer_state != SYNCER_RUNNING && 2724 starttime != time_uptime) { 2725 if (first_printf) { 2726 printf("\nSyncing disks, vnodes remaining... "); 2727 first_printf = 0; 2728 } 2729 printf("%d ", net_worklist_len); 2730 } 2731 starttime = time_uptime; 2732 2733 /* 2734 * Push files whose dirty time has expired. Be careful 2735 * of interrupt race on slp queue. 2736 * 2737 * Skip over empty worklist slots when shutting down. 2738 */ 2739 do { 2740 slp = &syncer_workitem_pending[syncer_delayno]; 2741 syncer_delayno += 1; 2742 if (syncer_delayno == syncer_maxdelay) 2743 syncer_delayno = 0; 2744 next = &syncer_workitem_pending[syncer_delayno]; 2745 /* 2746 * If the worklist has wrapped since the 2747 * it was emptied of all but syncer vnodes, 2748 * switch to the FINAL_DELAY state and run 2749 * for one more second. 2750 */ 2751 if (syncer_state == SYNCER_SHUTTING_DOWN && 2752 net_worklist_len == 0 && 2753 last_work_seen == syncer_delayno) { 2754 syncer_state = SYNCER_FINAL_DELAY; 2755 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; 2756 } 2757 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && 2758 syncer_worklist_len > 0); 2759 2760 /* 2761 * Keep track of the last time there was anything 2762 * on the worklist other than syncer vnodes. 2763 * Return to the SHUTTING_DOWN state if any 2764 * new work appears. 2765 */ 2766 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) 2767 last_work_seen = syncer_delayno; 2768 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) 2769 syncer_state = SYNCER_SHUTTING_DOWN; 2770 while (!LIST_EMPTY(slp)) { 2771 error = sync_vnode(slp, &bo, td); 2772 if (error == 1) { 2773 LIST_REMOVE(bo, bo_synclist); 2774 LIST_INSERT_HEAD(next, bo, bo_synclist); 2775 continue; 2776 } 2777 2778 if (first_printf == 0) { 2779 /* 2780 * Drop the sync mutex, because some watchdog 2781 * drivers need to sleep while patting 2782 */ 2783 mtx_unlock(&sync_mtx); 2784 wdog_kern_pat(WD_LASTVAL); 2785 mtx_lock(&sync_mtx); 2786 } 2787 } 2788 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) 2789 syncer_final_iter--; 2790 /* 2791 * The variable rushjob allows the kernel to speed up the 2792 * processing of the filesystem syncer process. A rushjob 2793 * value of N tells the filesystem syncer to process the next 2794 * N seconds worth of work on its queue ASAP. 
Currently rushjob 2795 * is used by the soft update code to speed up the filesystem 2796 * syncer process when the incore state is getting so far 2797 * ahead of the disk that the kernel memory pool is being 2798 * threatened with exhaustion. 2799 */ 2800 if (rushjob > 0) { 2801 rushjob -= 1; 2802 continue; 2803 } 2804 /* 2805 * Just sleep for a short period of time between 2806 * iterations when shutting down to allow some I/O 2807 * to happen. 2808 * 2809 * If it has taken us less than a second to process the 2810 * current work, then wait. Otherwise start right over 2811 * again. We can still lose time if any single round 2812 * takes more than two seconds, but it does not really 2813 * matter as we are just trying to generally pace the 2814 * filesystem activity. 2815 */ 2816 if (syncer_state != SYNCER_RUNNING || 2817 time_uptime == starttime) { 2818 thread_lock(td); 2819 sched_prio(td, PPAUSE); 2820 thread_unlock(td); 2821 } 2822 if (syncer_state != SYNCER_RUNNING) 2823 cv_timedwait(&sync_wakeup, &sync_mtx, 2824 hz / SYNCER_SHUTDOWN_SPEEDUP); 2825 else if (time_uptime == starttime) 2826 cv_timedwait(&sync_wakeup, &sync_mtx, hz); 2827 } 2828 } 2829 2830 /* 2831 * Request the syncer daemon to speed up its work. 2832 * We never push it to speed up more than half of its 2833 * normal turn time, otherwise it could take over the cpu. 2834 */ 2835 int 2836 speedup_syncer(void) 2837 { 2838 int ret = 0; 2839 2840 mtx_lock(&sync_mtx); 2841 if (rushjob < syncdelay / 2) { 2842 rushjob += 1; 2843 stat_rush_requests += 1; 2844 ret = 1; 2845 } 2846 mtx_unlock(&sync_mtx); 2847 cv_broadcast(&sync_wakeup); 2848 return (ret); 2849 } 2850 2851 /* 2852 * Tell the syncer to speed up its work and run though its work 2853 * list several times, then tell it to shut down. 2854 */ 2855 static void 2856 syncer_shutdown(void *arg, int howto) 2857 { 2858 2859 if (howto & RB_NOSYNC) 2860 return; 2861 mtx_lock(&sync_mtx); 2862 syncer_state = SYNCER_SHUTTING_DOWN; 2863 rushjob = 0; 2864 mtx_unlock(&sync_mtx); 2865 cv_broadcast(&sync_wakeup); 2866 kproc_shutdown(arg, howto); 2867 } 2868 2869 void 2870 syncer_suspend(void) 2871 { 2872 2873 syncer_shutdown(updateproc, 0); 2874 } 2875 2876 void 2877 syncer_resume(void) 2878 { 2879 2880 mtx_lock(&sync_mtx); 2881 first_printf = 1; 2882 syncer_state = SYNCER_RUNNING; 2883 mtx_unlock(&sync_mtx); 2884 cv_broadcast(&sync_wakeup); 2885 kproc_resume(updateproc); 2886 } 2887 2888 /* 2889 * Move the buffer between the clean and dirty lists of its vnode. 2890 */ 2891 void 2892 reassignbuf(struct buf *bp) 2893 { 2894 struct vnode *vp; 2895 struct bufobj *bo; 2896 int delay; 2897 #ifdef INVARIANTS 2898 struct bufv *bv; 2899 #endif 2900 2901 vp = bp->b_vp; 2902 bo = bp->b_bufobj; 2903 2904 KASSERT((bp->b_flags & B_PAGING) == 0, 2905 ("%s: cannot reassign paging buffer %p", __func__, bp)); 2906 2907 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", 2908 bp, bp->b_vp, bp->b_flags); 2909 2910 BO_LOCK(bo); 2911 buf_vlist_remove(bp); 2912 2913 /* 2914 * If dirty, put on list of dirty buffers; otherwise insert onto list 2915 * of clean buffers. 
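 *
 * (Added note, not part of the original source: the delay selected
 * below staggers writeback by vnode type, so with the default tunables
 * a newly dirtied directory buffer is scheduled roughly dirdelay
 * seconds out, a character-device vnode metadelay seconds out and a
 * regular file filedelay seconds out via vn_syncer_add_to_worklist().)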
2916 */ 2917 if (bp->b_flags & B_DELWRI) { 2918 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 2919 switch (vp->v_type) { 2920 case VDIR: 2921 delay = dirdelay; 2922 break; 2923 case VCHR: 2924 delay = metadelay; 2925 break; 2926 default: 2927 delay = filedelay; 2928 } 2929 vn_syncer_add_to_worklist(bo, delay); 2930 } 2931 buf_vlist_add(bp, bo, BX_VNDIRTY); 2932 } else { 2933 buf_vlist_add(bp, bo, BX_VNCLEAN); 2934 2935 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2936 mtx_lock(&sync_mtx); 2937 LIST_REMOVE(bo, bo_synclist); 2938 syncer_worklist_len--; 2939 mtx_unlock(&sync_mtx); 2940 bo->bo_flag &= ~BO_ONWORKLST; 2941 } 2942 } 2943 #ifdef INVARIANTS 2944 bv = &bo->bo_clean; 2945 bp = TAILQ_FIRST(&bv->bv_hd); 2946 KASSERT(bp == NULL || bp->b_bufobj == bo, 2947 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2948 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2949 KASSERT(bp == NULL || bp->b_bufobj == bo, 2950 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2951 bv = &bo->bo_dirty; 2952 bp = TAILQ_FIRST(&bv->bv_hd); 2953 KASSERT(bp == NULL || bp->b_bufobj == bo, 2954 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2955 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2956 KASSERT(bp == NULL || bp->b_bufobj == bo, 2957 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2958 #endif 2959 BO_UNLOCK(bo); 2960 } 2961 2962 static void 2963 v_init_counters(struct vnode *vp) 2964 { 2965 2966 VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, 2967 vp, ("%s called for an initialized vnode", __FUNCTION__)); 2968 ASSERT_VI_UNLOCKED(vp, __FUNCTION__); 2969 2970 refcount_init(&vp->v_holdcnt, 1); 2971 refcount_init(&vp->v_usecount, 1); 2972 } 2973 2974 /* 2975 * Grab a particular vnode from the free list, increment its 2976 * reference count and lock it. VIRF_DOOMED is set if the vnode 2977 * is being destroyed. Only callers who specify LK_RETRY will 2978 * see doomed vnodes. If inactive processing was delayed in 2979 * vput try to do it here. 2980 * 2981 * usecount is manipulated using atomics without holding any locks. 2982 * 2983 * holdcnt can be manipulated using atomics without holding any locks, 2984 * except when transitioning 1<->0, in which case the interlock is held. 2985 * 2986 * Consumers which don't guarantee liveness of the vnode can use SMR to 2987 * try to get a reference. Note this operation can fail since the vnode 2988 * may be awaiting getting freed by the time they get to it. 
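 *
 * An illustrative sketch of the SMR-protected path follows; it is not
 * part of the original source and the vfs_smr_enter() and
 * vfs_smr_exit() wrappers are assumed here, the rest being routines
 * defined below:
 *
 *      vfs_smr_enter();
 *      vp = ... lockless lookup, e.g. from a pctrie or hash ...;
 *      vs = vget_prep_smr(vp);
 *      vfs_smr_exit();
 *      if (vs == VGET_NONE)
 *              return (EAGAIN);        the vnode was being freed
 *      error = vget_finish(vp, LK_SHARED, vs);
 *
 * vget_finish() either converts the reference into a use count or, on
 * lock failure, releases it via vget_abort().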
2989 */ 2990 enum vgetstate 2991 vget_prep_smr(struct vnode *vp) 2992 { 2993 enum vgetstate vs; 2994 2995 VFS_SMR_ASSERT_ENTERED(); 2996 2997 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 2998 vs = VGET_USECOUNT; 2999 } else { 3000 if (vhold_smr(vp)) 3001 vs = VGET_HOLDCNT; 3002 else 3003 vs = VGET_NONE; 3004 } 3005 return (vs); 3006 } 3007 3008 enum vgetstate 3009 vget_prep(struct vnode *vp) 3010 { 3011 enum vgetstate vs; 3012 3013 if (refcount_acquire_if_not_zero(&vp->v_usecount)) { 3014 vs = VGET_USECOUNT; 3015 } else { 3016 vhold(vp); 3017 vs = VGET_HOLDCNT; 3018 } 3019 return (vs); 3020 } 3021 3022 void 3023 vget_abort(struct vnode *vp, enum vgetstate vs) 3024 { 3025 3026 switch (vs) { 3027 case VGET_USECOUNT: 3028 vrele(vp); 3029 break; 3030 case VGET_HOLDCNT: 3031 vdrop(vp); 3032 break; 3033 default: 3034 __assert_unreachable(); 3035 } 3036 } 3037 3038 int 3039 vget(struct vnode *vp, int flags) 3040 { 3041 enum vgetstate vs; 3042 3043 vs = vget_prep(vp); 3044 return (vget_finish(vp, flags, vs)); 3045 } 3046 3047 int 3048 vget_finish(struct vnode *vp, int flags, enum vgetstate vs) 3049 { 3050 int error; 3051 3052 if ((flags & LK_INTERLOCK) != 0) 3053 ASSERT_VI_LOCKED(vp, __func__); 3054 else 3055 ASSERT_VI_UNLOCKED(vp, __func__); 3056 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 3057 VNPASS(vp->v_holdcnt > 0, vp); 3058 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 3059 3060 error = vn_lock(vp, flags); 3061 if (__predict_false(error != 0)) { 3062 vget_abort(vp, vs); 3063 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, 3064 vp); 3065 return (error); 3066 } 3067 3068 vget_finish_ref(vp, vs); 3069 return (0); 3070 } 3071 3072 void 3073 vget_finish_ref(struct vnode *vp, enum vgetstate vs) 3074 { 3075 int old; 3076 3077 VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); 3078 VNPASS(vp->v_holdcnt > 0, vp); 3079 VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); 3080 3081 if (vs == VGET_USECOUNT) 3082 return; 3083 3084 /* 3085 * We hold the vnode. If the usecount is 0 it will be utilized to keep 3086 * the vnode around. Otherwise someone else lended their hold count and 3087 * we have to drop ours. 3088 */ 3089 old = atomic_fetchadd_int(&vp->v_usecount, 1); 3090 VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); 3091 if (old != 0) { 3092 #ifdef INVARIANTS 3093 old = atomic_fetchadd_int(&vp->v_holdcnt, -1); 3094 VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); 3095 #else 3096 refcount_release(&vp->v_holdcnt); 3097 #endif 3098 } 3099 } 3100 3101 void 3102 vref(struct vnode *vp) 3103 { 3104 enum vgetstate vs; 3105 3106 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3107 vs = vget_prep(vp); 3108 vget_finish_ref(vp, vs); 3109 } 3110 3111 void 3112 vrefact(struct vnode *vp) 3113 { 3114 3115 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3116 #ifdef INVARIANTS 3117 int old = atomic_fetchadd_int(&vp->v_usecount, 1); 3118 VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); 3119 #else 3120 refcount_acquire(&vp->v_usecount); 3121 #endif 3122 } 3123 3124 void 3125 vlazy(struct vnode *vp) 3126 { 3127 struct mount *mp; 3128 3129 VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); 3130 3131 if ((vp->v_mflag & VMP_LAZYLIST) != 0) 3132 return; 3133 /* 3134 * We may get here for inactive routines after the vnode got doomed. 
3135 */ 3136 if (VN_IS_DOOMED(vp)) 3137 return; 3138 mp = vp->v_mount; 3139 mtx_lock(&mp->mnt_listmtx); 3140 if ((vp->v_mflag & VMP_LAZYLIST) == 0) { 3141 vp->v_mflag |= VMP_LAZYLIST; 3142 TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3143 mp->mnt_lazyvnodelistsize++; 3144 } 3145 mtx_unlock(&mp->mnt_listmtx); 3146 } 3147 3148 static void 3149 vunlazy(struct vnode *vp) 3150 { 3151 struct mount *mp; 3152 3153 ASSERT_VI_LOCKED(vp, __func__); 3154 VNPASS(!VN_IS_DOOMED(vp), vp); 3155 3156 mp = vp->v_mount; 3157 mtx_lock(&mp->mnt_listmtx); 3158 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3159 /* 3160 * Don't remove the vnode from the lazy list if another thread 3161 * has increased the hold count. It may have re-enqueued the 3162 * vnode to the lazy list and is now responsible for its 3163 * removal. 3164 */ 3165 if (vp->v_holdcnt == 0) { 3166 vp->v_mflag &= ~VMP_LAZYLIST; 3167 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3168 mp->mnt_lazyvnodelistsize--; 3169 } 3170 mtx_unlock(&mp->mnt_listmtx); 3171 } 3172 3173 /* 3174 * This routine is only meant to be called from vgonel prior to dooming 3175 * the vnode. 3176 */ 3177 static void 3178 vunlazy_gone(struct vnode *vp) 3179 { 3180 struct mount *mp; 3181 3182 ASSERT_VOP_ELOCKED(vp, __func__); 3183 ASSERT_VI_LOCKED(vp, __func__); 3184 VNPASS(!VN_IS_DOOMED(vp), vp); 3185 3186 if (vp->v_mflag & VMP_LAZYLIST) { 3187 mp = vp->v_mount; 3188 mtx_lock(&mp->mnt_listmtx); 3189 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 3190 vp->v_mflag &= ~VMP_LAZYLIST; 3191 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); 3192 mp->mnt_lazyvnodelistsize--; 3193 mtx_unlock(&mp->mnt_listmtx); 3194 } 3195 } 3196 3197 static void 3198 vdefer_inactive(struct vnode *vp) 3199 { 3200 3201 ASSERT_VI_LOCKED(vp, __func__); 3202 VNPASS(vp->v_holdcnt > 0, vp); 3203 if (VN_IS_DOOMED(vp)) { 3204 vdropl(vp); 3205 return; 3206 } 3207 if (vp->v_iflag & VI_DEFINACT) { 3208 VNPASS(vp->v_holdcnt > 1, vp); 3209 vdropl(vp); 3210 return; 3211 } 3212 if (vp->v_usecount > 0) { 3213 vp->v_iflag &= ~VI_OWEINACT; 3214 vdropl(vp); 3215 return; 3216 } 3217 vlazy(vp); 3218 vp->v_iflag |= VI_DEFINACT; 3219 VI_UNLOCK(vp); 3220 atomic_add_long(&deferred_inact, 1); 3221 } 3222 3223 static void 3224 vdefer_inactive_unlocked(struct vnode *vp) 3225 { 3226 3227 VI_LOCK(vp); 3228 if ((vp->v_iflag & VI_OWEINACT) == 0) { 3229 vdropl(vp); 3230 return; 3231 } 3232 vdefer_inactive(vp); 3233 } 3234 3235 enum vput_op { VRELE, VPUT, VUNREF }; 3236 3237 /* 3238 * Handle ->v_usecount transitioning to 0. 3239 * 3240 * By releasing the last usecount we take ownership of the hold count which 3241 * provides liveness of the vnode, meaning we have to vdrop. 3242 * 3243 * For all vnodes we may need to perform inactive processing. It requires an 3244 * exclusive lock on the vnode, while it is legal to call here with only a 3245 * shared lock (or no locks). If locking the vnode in an expected manner fails, 3246 * inactive processing gets deferred to the syncer. 3247 * 3248 * XXX Some filesystems pass in an exclusively locked vnode and strongly depend 3249 * on the lock being held all the way until VOP_INACTIVE. This in particular 3250 * happens with UFS which adds half-constructed vnodes to the hash, where they 3251 * can be found by other code. 
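 *
 * (Illustrative example, not part of the original source: a caller
 * holding only a shared lock, e.g.
 *
 *      vget(vp, LK_SHARED);
 *      ...
 *      vput(vp);
 *
 * can only get inactive processing done synchronously if the
 * LK_UPGRADE attempt below succeeds; otherwise VI_OWEINACT stays set
 * and the work is deferred to the syncer via vdefer_inactive().)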
3252 */ 3253 static void 3254 vput_final(struct vnode *vp, enum vput_op func) 3255 { 3256 int error; 3257 bool want_unlock; 3258 3259 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3260 VNPASS(vp->v_holdcnt > 0, vp); 3261 3262 VI_LOCK(vp); 3263 3264 /* 3265 * By the time we got here someone else might have transitioned 3266 * the count back to > 0. 3267 */ 3268 if (vp->v_usecount > 0) 3269 goto out; 3270 3271 /* 3272 * If the vnode is doomed vgone already performed inactive processing 3273 * (if needed). 3274 */ 3275 if (VN_IS_DOOMED(vp)) 3276 goto out; 3277 3278 if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) 3279 goto out; 3280 3281 if (vp->v_iflag & VI_DOINGINACT) 3282 goto out; 3283 3284 /* 3285 * Locking operations here will drop the interlock and possibly the 3286 * vnode lock, opening a window where the vnode can get doomed all the 3287 * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to 3288 * perform inactive. 3289 */ 3290 vp->v_iflag |= VI_OWEINACT; 3291 want_unlock = false; 3292 error = 0; 3293 switch (func) { 3294 case VRELE: 3295 switch (VOP_ISLOCKED(vp)) { 3296 case LK_EXCLUSIVE: 3297 break; 3298 case LK_EXCLOTHER: 3299 case 0: 3300 want_unlock = true; 3301 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 3302 VI_LOCK(vp); 3303 break; 3304 default: 3305 /* 3306 * The lock has at least one sharer, but we have no way 3307 * to conclude whether this is us. Play it safe and 3308 * defer processing. 3309 */ 3310 error = EAGAIN; 3311 break; 3312 } 3313 break; 3314 case VPUT: 3315 want_unlock = true; 3316 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3317 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | 3318 LK_NOWAIT); 3319 VI_LOCK(vp); 3320 } 3321 break; 3322 case VUNREF: 3323 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 3324 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); 3325 VI_LOCK(vp); 3326 } 3327 break; 3328 } 3329 if (error == 0) { 3330 if (func == VUNREF) { 3331 VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp, 3332 ("recursive vunref")); 3333 vp->v_vflag |= VV_UNREF; 3334 } 3335 for (;;) { 3336 error = vinactive(vp); 3337 if (want_unlock) 3338 VOP_UNLOCK(vp); 3339 if (error != ERELOOKUP || !want_unlock) 3340 break; 3341 VOP_LOCK(vp, LK_EXCLUSIVE); 3342 } 3343 if (func == VUNREF) 3344 vp->v_vflag &= ~VV_UNREF; 3345 vdropl(vp); 3346 } else { 3347 vdefer_inactive(vp); 3348 } 3349 return; 3350 out: 3351 if (func == VPUT) 3352 VOP_UNLOCK(vp); 3353 vdropl(vp); 3354 } 3355 3356 /* 3357 * Decrement ->v_usecount for a vnode. 3358 * 3359 * Releasing the last use count requires additional processing, see vput_final 3360 * above for details. 3361 * 3362 * Comment above each variant denotes lock state on entry and exit. 
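 *
 * A usage sketch, added for illustration and not part of the original
 * source:
 *
 *      vget(vp, LK_EXCLUSIVE);         use count + exclusive lock
 *      ...
 *      vput(vp);                       drops the use count, unlocks
 *
 *      vref(vp);                       use count only
 *      ...
 *      vrele(vp);                      drops the use count, leaves the
 *                                      lock state alone
 *
 *      vunref(vp);                     like vrele() but the caller
 *                                      keeps the vnode lock held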
3363 */ 3364 3365 /* 3366 * in: any 3367 * out: same as passed in 3368 */ 3369 void 3370 vrele(struct vnode *vp) 3371 { 3372 3373 ASSERT_VI_UNLOCKED(vp, __func__); 3374 if (!refcount_release(&vp->v_usecount)) 3375 return; 3376 vput_final(vp, VRELE); 3377 } 3378 3379 /* 3380 * in: locked 3381 * out: unlocked 3382 */ 3383 void 3384 vput(struct vnode *vp) 3385 { 3386 3387 ASSERT_VOP_LOCKED(vp, __func__); 3388 ASSERT_VI_UNLOCKED(vp, __func__); 3389 if (!refcount_release(&vp->v_usecount)) { 3390 VOP_UNLOCK(vp); 3391 return; 3392 } 3393 vput_final(vp, VPUT); 3394 } 3395 3396 /* 3397 * in: locked 3398 * out: locked 3399 */ 3400 void 3401 vunref(struct vnode *vp) 3402 { 3403 3404 ASSERT_VOP_LOCKED(vp, __func__); 3405 ASSERT_VI_UNLOCKED(vp, __func__); 3406 if (!refcount_release(&vp->v_usecount)) 3407 return; 3408 vput_final(vp, VUNREF); 3409 } 3410 3411 void 3412 vhold(struct vnode *vp) 3413 { 3414 int old; 3415 3416 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3417 old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3418 VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3419 ("%s: wrong hold count %d", __func__, old)); 3420 if (old == 0) 3421 vfs_freevnodes_dec(); 3422 } 3423 3424 void 3425 vholdnz(struct vnode *vp) 3426 { 3427 3428 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3429 #ifdef INVARIANTS 3430 int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3431 VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, 3432 ("%s: wrong hold count %d", __func__, old)); 3433 #else 3434 atomic_add_int(&vp->v_holdcnt, 1); 3435 #endif 3436 } 3437 3438 /* 3439 * Grab a hold count unless the vnode is freed. 3440 * 3441 * Only use this routine if vfs smr is the only protection you have against 3442 * freeing the vnode. 3443 * 3444 * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag 3445 * is not set. After the flag is set the vnode becomes immutable to anyone but 3446 * the thread which managed to set the flag. 3447 * 3448 * It may be tempting to replace the loop with: 3449 * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); 3450 * if (count & VHOLD_NO_SMR) { 3451 * backpedal and error out; 3452 * } 3453 * 3454 * However, while this is more performant, it hinders debugging by eliminating 3455 * the previously mentioned invariant. 3456 */ 3457 bool 3458 vhold_smr(struct vnode *vp) 3459 { 3460 int count; 3461 3462 VFS_SMR_ASSERT_ENTERED(); 3463 3464 count = atomic_load_int(&vp->v_holdcnt); 3465 for (;;) { 3466 if (count & VHOLD_NO_SMR) { 3467 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3468 ("non-zero hold count with flags %d\n", count)); 3469 return (false); 3470 } 3471 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3472 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3473 if (count == 0) 3474 vfs_freevnodes_dec(); 3475 return (true); 3476 } 3477 } 3478 } 3479 3480 /* 3481 * Hold a free vnode for recycling. 3482 * 3483 * Note: vnode_init references this comment. 3484 * 3485 * Attempts to recycle only need the global vnode list lock and have no use for 3486 * SMR. 3487 * 3488 * However, vnodes get inserted into the global list before they get fully 3489 * initialized and stay there until UMA decides to free the memory. This in 3490 * particular means the target can be found before it becomes usable and after 3491 * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to 3492 * VHOLD_NO_SMR. 3493 * 3494 * Note: the vnode may gain more references after we transition the count 0->1. 
3495 */ 3496 static bool 3497 vhold_recycle_free(struct vnode *vp) 3498 { 3499 int count; 3500 3501 mtx_assert(&vnode_list_mtx, MA_OWNED); 3502 3503 count = atomic_load_int(&vp->v_holdcnt); 3504 for (;;) { 3505 if (count & VHOLD_NO_SMR) { 3506 VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, 3507 ("non-zero hold count with flags %d\n", count)); 3508 return (false); 3509 } 3510 VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); 3511 if (count > 0) { 3512 return (false); 3513 } 3514 if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { 3515 vfs_freevnodes_dec(); 3516 return (true); 3517 } 3518 } 3519 } 3520 3521 static void __noinline 3522 vdbatch_process(struct vdbatch *vd) 3523 { 3524 struct vnode *vp; 3525 int i; 3526 3527 mtx_assert(&vd->lock, MA_OWNED); 3528 MPASS(curthread->td_pinned > 0); 3529 MPASS(vd->index == VDBATCH_SIZE); 3530 3531 critical_enter(); 3532 if (mtx_trylock(&vnode_list_mtx)) { 3533 for (i = 0; i < VDBATCH_SIZE; i++) { 3534 vp = vd->tab[i]; 3535 vd->tab[i] = NULL; 3536 TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); 3537 TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); 3538 MPASS(vp->v_dbatchcpu != NOCPU); 3539 vp->v_dbatchcpu = NOCPU; 3540 } 3541 mtx_unlock(&vnode_list_mtx); 3542 } else { 3543 for (i = 0; i < VDBATCH_SIZE; i++) { 3544 vp = vd->tab[i]; 3545 vd->tab[i] = NULL; 3546 MPASS(vp->v_dbatchcpu != NOCPU); 3547 vp->v_dbatchcpu = NOCPU; 3548 } 3549 } 3550 vd->index = 0; 3551 critical_exit(); 3552 } 3553 3554 static void 3555 vdbatch_enqueue(struct vnode *vp) 3556 { 3557 struct vdbatch *vd; 3558 3559 ASSERT_VI_LOCKED(vp, __func__); 3560 VNPASS(!VN_IS_DOOMED(vp), vp); 3561 3562 if (vp->v_dbatchcpu != NOCPU) { 3563 VI_UNLOCK(vp); 3564 return; 3565 } 3566 3567 sched_pin(); 3568 vd = DPCPU_PTR(vd); 3569 mtx_lock(&vd->lock); 3570 MPASS(vd->index < VDBATCH_SIZE); 3571 MPASS(vd->tab[vd->index] == NULL); 3572 /* 3573 * A hack: we depend on being pinned so that we know what to put in 3574 * ->v_dbatchcpu. 3575 */ 3576 vp->v_dbatchcpu = curcpu; 3577 vd->tab[vd->index] = vp; 3578 vd->index++; 3579 VI_UNLOCK(vp); 3580 if (vd->index == VDBATCH_SIZE) 3581 vdbatch_process(vd); 3582 mtx_unlock(&vd->lock); 3583 sched_unpin(); 3584 } 3585 3586 /* 3587 * This routine must only be called for vnodes which are about to be 3588 * deallocated. Supporting dequeue for arbitrary vndoes would require 3589 * validating that the locked batch matches. 3590 */ 3591 static void 3592 vdbatch_dequeue(struct vnode *vp) 3593 { 3594 struct vdbatch *vd; 3595 int i; 3596 short cpu; 3597 3598 VNPASS(vp->v_type == VBAD || vp->v_type == VNON, vp); 3599 3600 cpu = vp->v_dbatchcpu; 3601 if (cpu == NOCPU) 3602 return; 3603 3604 vd = DPCPU_ID_PTR(cpu, vd); 3605 mtx_lock(&vd->lock); 3606 for (i = 0; i < vd->index; i++) { 3607 if (vd->tab[i] != vp) 3608 continue; 3609 vp->v_dbatchcpu = NOCPU; 3610 vd->index--; 3611 vd->tab[i] = vd->tab[vd->index]; 3612 vd->tab[vd->index] = NULL; 3613 break; 3614 } 3615 mtx_unlock(&vd->lock); 3616 /* 3617 * Either we dequeued the vnode above or the target CPU beat us to it. 3618 */ 3619 MPASS(vp->v_dbatchcpu == NOCPU); 3620 } 3621 3622 /* 3623 * Drop the hold count of the vnode. If this is the last reference to 3624 * the vnode we place it on the free list unless it has been vgone'd 3625 * (marked VIRF_DOOMED) in which case we will free it. 3626 * 3627 * Because the vnode vm object keeps a hold reference on the vnode if 3628 * there is at least one resident non-cached page, the vnode cannot 3629 * leave the active list without the page cleanup done. 
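 *
 * (Illustrative sketch, not part of the original source: the hold
 * count is what keeps the vnode memory from being reused, so code that
 * must let go of the interlock typically follows the pattern
 *
 *      VI_LOCK(vp);
 *      vholdl(vp);
 *      VI_UNLOCK(vp);
 *      vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *      ... use vp, which may meanwhile have been doomed ...
 *      VOP_UNLOCK(vp);
 *      vdrop(vp);
 *
 * similar to what sync_vnode() and vflush() do elsewhere in this
 * file.)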
3630 */ 3631 static void __noinline 3632 vdropl_final(struct vnode *vp) 3633 { 3634 3635 ASSERT_VI_LOCKED(vp, __func__); 3636 VNPASS(VN_IS_DOOMED(vp), vp); 3637 /* 3638 * Set the VHOLD_NO_SMR flag. 3639 * 3640 * We may be racing against vhold_smr. If they win we can just pretend 3641 * we never got this far, they will vdrop later. 3642 */ 3643 if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) { 3644 vfs_freevnodes_inc(); 3645 VI_UNLOCK(vp); 3646 /* 3647 * We lost the aforementioned race. Any subsequent access is 3648 * invalid as they might have managed to vdropl on their own. 3649 */ 3650 return; 3651 } 3652 /* 3653 * Don't bump freevnodes as this one is going away. 3654 */ 3655 freevnode(vp); 3656 } 3657 3658 void 3659 vdrop(struct vnode *vp) 3660 { 3661 3662 ASSERT_VI_UNLOCKED(vp, __func__); 3663 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3664 if (refcount_release_if_not_last(&vp->v_holdcnt)) 3665 return; 3666 VI_LOCK(vp); 3667 vdropl(vp); 3668 } 3669 3670 static void __always_inline 3671 vdropl_impl(struct vnode *vp, bool enqueue) 3672 { 3673 3674 ASSERT_VI_LOCKED(vp, __func__); 3675 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3676 if (!refcount_release(&vp->v_holdcnt)) { 3677 VI_UNLOCK(vp); 3678 return; 3679 } 3680 VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp); 3681 VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); 3682 if (VN_IS_DOOMED(vp)) { 3683 vdropl_final(vp); 3684 return; 3685 } 3686 3687 vfs_freevnodes_inc(); 3688 if (vp->v_mflag & VMP_LAZYLIST) { 3689 vunlazy(vp); 3690 } 3691 3692 if (!enqueue) { 3693 VI_UNLOCK(vp); 3694 return; 3695 } 3696 3697 /* 3698 * Also unlocks the interlock. We can't assert on it as we 3699 * released our hold and by now the vnode might have been 3700 * freed. 3701 */ 3702 vdbatch_enqueue(vp); 3703 } 3704 3705 void 3706 vdropl(struct vnode *vp) 3707 { 3708 3709 vdropl_impl(vp, true); 3710 } 3711 3712 /* 3713 * vdrop a vnode when recycling 3714 * 3715 * This is a special case routine only to be used when recycling, differs from 3716 * regular vdrop by not requeieing the vnode on LRU. 3717 * 3718 * Consider a case where vtryrecycle continuously fails with all vnodes (due to 3719 * e.g., frozen writes on the filesystem), filling the batch and causing it to 3720 * be requeued. Then vnlru will end up revisiting the same vnodes. This is a 3721 * loop which can last for as long as writes are frozen. 3722 */ 3723 static void 3724 vdropl_recycle(struct vnode *vp) 3725 { 3726 3727 vdropl_impl(vp, false); 3728 } 3729 3730 static void 3731 vdrop_recycle(struct vnode *vp) 3732 { 3733 3734 VI_LOCK(vp); 3735 vdropl_recycle(vp); 3736 } 3737 3738 /* 3739 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 3740 * flags. DOINGINACT prevents us from recursing in calls to vinactive. 3741 */ 3742 static int 3743 vinactivef(struct vnode *vp) 3744 { 3745 struct vm_object *obj; 3746 int error; 3747 3748 ASSERT_VOP_ELOCKED(vp, "vinactive"); 3749 ASSERT_VI_LOCKED(vp, "vinactive"); 3750 VNPASS((vp->v_iflag & VI_DOINGINACT) == 0, vp); 3751 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3752 vp->v_iflag |= VI_DOINGINACT; 3753 vp->v_iflag &= ~VI_OWEINACT; 3754 VI_UNLOCK(vp); 3755 /* 3756 * Before moving off the active list, we must be sure that any 3757 * modified pages are converted into the vnode's dirty 3758 * buffers, since these will no longer be checked once the 3759 * vnode is on the inactive list. 3760 * 3761 * The write-out of the dirty pages is asynchronous. 
At the 3762 * point that VOP_INACTIVE() is called, there could still be 3763 * pending I/O and dirty pages in the object. 3764 */ 3765 if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 3766 vm_object_mightbedirty(obj)) { 3767 VM_OBJECT_WLOCK(obj); 3768 vm_object_page_clean(obj, 0, 0, 0); 3769 VM_OBJECT_WUNLOCK(obj); 3770 } 3771 error = VOP_INACTIVE(vp); 3772 VI_LOCK(vp); 3773 VNPASS(vp->v_iflag & VI_DOINGINACT, vp); 3774 vp->v_iflag &= ~VI_DOINGINACT; 3775 return (error); 3776 } 3777 3778 int 3779 vinactive(struct vnode *vp) 3780 { 3781 3782 ASSERT_VOP_ELOCKED(vp, "vinactive"); 3783 ASSERT_VI_LOCKED(vp, "vinactive"); 3784 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3785 3786 if ((vp->v_iflag & VI_OWEINACT) == 0) 3787 return (0); 3788 if (vp->v_iflag & VI_DOINGINACT) 3789 return (0); 3790 if (vp->v_usecount > 0) { 3791 vp->v_iflag &= ~VI_OWEINACT; 3792 return (0); 3793 } 3794 return (vinactivef(vp)); 3795 } 3796 3797 /* 3798 * Remove any vnodes in the vnode table belonging to mount point mp. 3799 * 3800 * If FORCECLOSE is not specified, there should not be any active ones, 3801 * return error if any are found (nb: this is a user error, not a 3802 * system error). If FORCECLOSE is specified, detach any active vnodes 3803 * that are found. 3804 * 3805 * If WRITECLOSE is set, only flush out regular file vnodes open for 3806 * writing. 3807 * 3808 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 3809 * 3810 * `rootrefs' specifies the base reference count for the root vnode 3811 * of this filesystem. The root vnode is considered busy if its 3812 * v_usecount exceeds this value. On a successful return, vflush(, td) 3813 * will call vrele() on the root vnode exactly rootrefs times. 3814 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 3815 * be zero. 3816 */ 3817 #ifdef DIAGNOSTIC 3818 static int busyprt = 0; /* print out busy vnodes */ 3819 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); 3820 #endif 3821 3822 int 3823 vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) 3824 { 3825 struct vnode *vp, *mvp, *rootvp = NULL; 3826 struct vattr vattr; 3827 int busy = 0, error; 3828 3829 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, 3830 rootrefs, flags); 3831 if (rootrefs > 0) { 3832 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 3833 ("vflush: bad args")); 3834 /* 3835 * Get the filesystem root vnode. We can vput() it 3836 * immediately, since with rootrefs > 0, it won't go away. 3837 */ 3838 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { 3839 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", 3840 __func__, error); 3841 return (error); 3842 } 3843 vput(rootvp); 3844 } 3845 loop: 3846 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 3847 vholdl(vp); 3848 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); 3849 if (error) { 3850 vdrop(vp); 3851 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 3852 goto loop; 3853 } 3854 /* 3855 * Skip over a vnodes marked VV_SYSTEM. 3856 */ 3857 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 3858 VOP_UNLOCK(vp); 3859 vdrop(vp); 3860 continue; 3861 } 3862 /* 3863 * If WRITECLOSE is set, flush out unlinked but still open 3864 * files (even if open only for reading) and regular file 3865 * vnodes open for writing. 
3866 */ 3867 if (flags & WRITECLOSE) { 3868 if (vp->v_object != NULL) { 3869 VM_OBJECT_WLOCK(vp->v_object); 3870 vm_object_page_clean(vp->v_object, 0, 0, 0); 3871 VM_OBJECT_WUNLOCK(vp->v_object); 3872 } 3873 do { 3874 error = VOP_FSYNC(vp, MNT_WAIT, td); 3875 } while (error == ERELOOKUP); 3876 if (error != 0) { 3877 VOP_UNLOCK(vp); 3878 vdrop(vp); 3879 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 3880 return (error); 3881 } 3882 error = VOP_GETATTR(vp, &vattr, td->td_ucred); 3883 VI_LOCK(vp); 3884 3885 if ((vp->v_type == VNON || 3886 (error == 0 && vattr.va_nlink > 0)) && 3887 (vp->v_writecount <= 0 || vp->v_type != VREG)) { 3888 VOP_UNLOCK(vp); 3889 vdropl(vp); 3890 continue; 3891 } 3892 } else 3893 VI_LOCK(vp); 3894 /* 3895 * With v_usecount == 0, all we need to do is clear out the 3896 * vnode data structures and we are done. 3897 * 3898 * If FORCECLOSE is set, forcibly close the vnode. 3899 */ 3900 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 3901 vgonel(vp); 3902 } else { 3903 busy++; 3904 #ifdef DIAGNOSTIC 3905 if (busyprt) 3906 vn_printf(vp, "vflush: busy vnode "); 3907 #endif 3908 } 3909 VOP_UNLOCK(vp); 3910 vdropl(vp); 3911 } 3912 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 3913 /* 3914 * If just the root vnode is busy, and if its refcount 3915 * is equal to `rootrefs', then go ahead and kill it. 3916 */ 3917 VI_LOCK(rootvp); 3918 KASSERT(busy > 0, ("vflush: not busy")); 3919 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 3920 ("vflush: usecount %d < rootrefs %d", 3921 rootvp->v_usecount, rootrefs)); 3922 if (busy == 1 && rootvp->v_usecount == rootrefs) { 3923 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 3924 vgone(rootvp); 3925 VOP_UNLOCK(rootvp); 3926 busy = 0; 3927 } else 3928 VI_UNLOCK(rootvp); 3929 } 3930 if (busy) { 3931 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 3932 busy); 3933 return (EBUSY); 3934 } 3935 for (; rootrefs > 0; rootrefs--) 3936 vrele(rootvp); 3937 return (0); 3938 } 3939 3940 /* 3941 * Recycle an unused vnode to the front of the free list. 3942 */ 3943 int 3944 vrecycle(struct vnode *vp) 3945 { 3946 int recycled; 3947 3948 VI_LOCK(vp); 3949 recycled = vrecyclel(vp); 3950 VI_UNLOCK(vp); 3951 return (recycled); 3952 } 3953 3954 /* 3955 * vrecycle, with the vp interlock held. 3956 */ 3957 int 3958 vrecyclel(struct vnode *vp) 3959 { 3960 int recycled; 3961 3962 ASSERT_VOP_ELOCKED(vp, __func__); 3963 ASSERT_VI_LOCKED(vp, __func__); 3964 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 3965 recycled = 0; 3966 if (vp->v_usecount == 0) { 3967 recycled = 1; 3968 vgonel(vp); 3969 } 3970 return (recycled); 3971 } 3972 3973 /* 3974 * Eliminate all activity associated with a vnode 3975 * in preparation for reuse. 3976 */ 3977 void 3978 vgone(struct vnode *vp) 3979 { 3980 VI_LOCK(vp); 3981 vgonel(vp); 3982 VI_UNLOCK(vp); 3983 } 3984 3985 /* 3986 * Notify upper mounts about reclaimed or unlinked vnode. 
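 *
 * (Added note with an illustrative call, not part of the original
 * source: vgonel() below reports reclamation with
 *
 *      vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
 *
 * and a lower filesystem's unlink path can report
 * VFS_NOTIFY_UPPER_UNLINK the same way; stacked filesystems such as
 * nullfs then receive the event through the VFS_RECLAIM_LOWERVP() and
 * VFS_UNLINK_LOWERVP() hooks invoked here.)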
3987 */ 3988 void 3989 vfs_notify_upper(struct vnode *vp, enum vfs_notify_upper_type event) 3990 { 3991 struct mount *mp; 3992 struct mount_upper_node *ump; 3993 3994 mp = atomic_load_ptr(&vp->v_mount); 3995 if (mp == NULL) 3996 return; 3997 if (TAILQ_EMPTY(&mp->mnt_notify)) 3998 return; 3999 4000 MNT_ILOCK(mp); 4001 mp->mnt_upper_pending++; 4002 KASSERT(mp->mnt_upper_pending > 0, 4003 ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending)); 4004 TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) { 4005 MNT_IUNLOCK(mp); 4006 switch (event) { 4007 case VFS_NOTIFY_UPPER_RECLAIM: 4008 VFS_RECLAIM_LOWERVP(ump->mp, vp); 4009 break; 4010 case VFS_NOTIFY_UPPER_UNLINK: 4011 VFS_UNLINK_LOWERVP(ump->mp, vp); 4012 break; 4013 } 4014 MNT_ILOCK(mp); 4015 } 4016 mp->mnt_upper_pending--; 4017 if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && 4018 mp->mnt_upper_pending == 0) { 4019 mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; 4020 wakeup(&mp->mnt_uppers); 4021 } 4022 MNT_IUNLOCK(mp); 4023 } 4024 4025 /* 4026 * vgone, with the vp interlock held. 4027 */ 4028 static void 4029 vgonel(struct vnode *vp) 4030 { 4031 struct thread *td; 4032 struct mount *mp; 4033 vm_object_t object; 4034 bool active, doinginact, oweinact; 4035 4036 ASSERT_VOP_ELOCKED(vp, "vgonel"); 4037 ASSERT_VI_LOCKED(vp, "vgonel"); 4038 VNASSERT(vp->v_holdcnt, vp, 4039 ("vgonel: vp %p has no reference.", vp)); 4040 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 4041 td = curthread; 4042 4043 /* 4044 * Don't vgonel if we're already doomed. 4045 */ 4046 if (VN_IS_DOOMED(vp)) { 4047 VNPASS(vn_get_state(vp) == VSTATE_DESTROYING || \ 4048 vn_get_state(vp) == VSTATE_DEAD, vp); 4049 return; 4050 } 4051 /* 4052 * Paired with freevnode. 4053 */ 4054 vn_seqc_write_begin_locked(vp); 4055 vunlazy_gone(vp); 4056 vn_irflag_set_locked(vp, VIRF_DOOMED); 4057 vn_set_state(vp, VSTATE_DESTROYING); 4058 4059 /* 4060 * Check to see if the vnode is in use. If so, we have to 4061 * call VOP_CLOSE() and VOP_INACTIVE(). 4062 * 4063 * It could be that VOP_INACTIVE() requested reclamation, in 4064 * which case we should avoid recursion, so check 4065 * VI_DOINGINACT. This is not precise but good enough. 4066 */ 4067 active = vp->v_usecount > 0; 4068 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4069 doinginact = (vp->v_iflag & VI_DOINGINACT) != 0; 4070 4071 /* 4072 * If we need to do inactive VI_OWEINACT will be set. 4073 */ 4074 if (vp->v_iflag & VI_DEFINACT) { 4075 VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); 4076 vp->v_iflag &= ~VI_DEFINACT; 4077 vdropl(vp); 4078 } else { 4079 VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); 4080 VI_UNLOCK(vp); 4081 } 4082 cache_purge_vgone(vp); 4083 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 4084 4085 /* 4086 * If purging an active vnode, it must be closed and 4087 * deactivated before being reclaimed. 4088 */ 4089 if (active) 4090 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 4091 if (!doinginact) { 4092 do { 4093 if (oweinact || active) { 4094 VI_LOCK(vp); 4095 vinactivef(vp); 4096 oweinact = (vp->v_iflag & VI_OWEINACT) != 0; 4097 VI_UNLOCK(vp); 4098 } 4099 } while (oweinact); 4100 } 4101 if (vp->v_type == VSOCK) 4102 vfs_unp_reclaim(vp); 4103 4104 /* 4105 * Clean out any buffers associated with the vnode. 4106 * If the flush fails, just toss the buffers. 
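 * The code below first obtains a secondary write reference if there
 * are dirty buffers (so the writes are accounted against a possible
 * suspension of the mount), then calls vinvalbuf() with V_SAVE to try
 * to write the dirty buffers out; if that fails, it retries with no
 * flags, which simply discards them.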
4107 */ 4108 mp = NULL; 4109 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 4110 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 4111 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { 4112 while (vinvalbuf(vp, 0, 0, 0) != 0) 4113 ; 4114 } 4115 4116 BO_LOCK(&vp->v_bufobj); 4117 KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && 4118 vp->v_bufobj.bo_dirty.bv_cnt == 0 && 4119 TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && 4120 vp->v_bufobj.bo_clean.bv_cnt == 0, 4121 ("vp %p bufobj not invalidated", vp)); 4122 4123 /* 4124 * For VMIO bufobj, BO_DEAD is set later, or in 4125 * vm_object_terminate() after the object's page queue is 4126 * flushed. 4127 */ 4128 object = vp->v_bufobj.bo_object; 4129 if (object == NULL) 4130 vp->v_bufobj.bo_flag |= BO_DEAD; 4131 BO_UNLOCK(&vp->v_bufobj); 4132 4133 /* 4134 * Handle the VM part. Tmpfs handles v_object on its own (the 4135 * OBJT_VNODE check). Nullfs or other bypassing filesystems 4136 * should not touch the object borrowed from the lower vnode 4137 * (the handle check). 4138 */ 4139 if (object != NULL && object->type == OBJT_VNODE && 4140 object->handle == vp) 4141 vnode_destroy_vobject(vp); 4142 4143 /* 4144 * Reclaim the vnode. 4145 */ 4146 if (VOP_RECLAIM(vp)) 4147 panic("vgone: cannot reclaim"); 4148 if (mp != NULL) 4149 vn_finished_secondary_write(mp); 4150 VNASSERT(vp->v_object == NULL, vp, 4151 ("vop_reclaim left v_object vp=%p", vp)); 4152 /* 4153 * Clear the advisory locks and wake up waiting threads. 4154 */ 4155 if (vp->v_lockf != NULL) { 4156 (void)VOP_ADVLOCKPURGE(vp); 4157 vp->v_lockf = NULL; 4158 } 4159 /* 4160 * Delete from old mount point vnode list. 4161 */ 4162 if (vp->v_mount == NULL) { 4163 VI_LOCK(vp); 4164 } else { 4165 delmntque(vp); 4166 ASSERT_VI_LOCKED(vp, "vgonel 2"); 4167 } 4168 /* 4169 * Done with purge, reset to the standard lock and invalidate 4170 * the vnode. 4171 */ 4172 vp->v_vnlock = &vp->v_lock; 4173 vp->v_op = &dead_vnodeops; 4174 vp->v_type = VBAD; 4175 vn_set_state(vp, VSTATE_DEAD); 4176 } 4177 4178 /* 4179 * Print out a description of a vnode. 4180 */ 4181 static const char *const vtypename[] = { 4182 [VNON] = "VNON", 4183 [VREG] = "VREG", 4184 [VDIR] = "VDIR", 4185 [VBLK] = "VBLK", 4186 [VCHR] = "VCHR", 4187 [VLNK] = "VLNK", 4188 [VSOCK] = "VSOCK", 4189 [VFIFO] = "VFIFO", 4190 [VBAD] = "VBAD", 4191 [VMARKER] = "VMARKER", 4192 }; 4193 _Static_assert(nitems(vtypename) == VLASTTYPE + 1, 4194 "vnode type name not added to vtypename"); 4195 4196 static const char *const vstatename[] = { 4197 [VSTATE_UNINITIALIZED] = "VSTATE_UNINITIALIZED", 4198 [VSTATE_CONSTRUCTED] = "VSTATE_CONSTRUCTED", 4199 [VSTATE_DESTROYING] = "VSTATE_DESTROYING", 4200 [VSTATE_DEAD] = "VSTATE_DEAD", 4201 }; 4202 _Static_assert(nitems(vstatename) == VLASTSTATE + 1, 4203 "vnode state name not added to vstatename"); 4204 4205 _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, 4206 "new hold count flag not added to vn_printf"); 4207 4208 void 4209 vn_printf(struct vnode *vp, const char *fmt, ...) 
4210 { 4211 va_list ap; 4212 char buf[256], buf2[16]; 4213 u_long flags; 4214 u_int holdcnt; 4215 short irflag; 4216 4217 va_start(ap, fmt); 4218 vprintf(fmt, ap); 4219 va_end(ap); 4220 printf("%p: ", (void *)vp); 4221 printf("type %s state %s op %p\n", vtypename[vp->v_type], 4222 vstatename[vp->v_state], vp->v_op); 4223 holdcnt = atomic_load_int(&vp->v_holdcnt); 4224 printf(" usecount %d, writecount %d, refcount %d seqc users %d", 4225 vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, 4226 vp->v_seqc_users); 4227 switch (vp->v_type) { 4228 case VDIR: 4229 printf(" mountedhere %p\n", vp->v_mountedhere); 4230 break; 4231 case VCHR: 4232 printf(" rdev %p\n", vp->v_rdev); 4233 break; 4234 case VSOCK: 4235 printf(" socket %p\n", vp->v_unpcb); 4236 break; 4237 case VFIFO: 4238 printf(" fifoinfo %p\n", vp->v_fifoinfo); 4239 break; 4240 default: 4241 printf("\n"); 4242 break; 4243 } 4244 buf[0] = '\0'; 4245 buf[1] = '\0'; 4246 if (holdcnt & VHOLD_NO_SMR) 4247 strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); 4248 printf(" hold count flags (%s)\n", buf + 1); 4249 4250 buf[0] = '\0'; 4251 buf[1] = '\0'; 4252 irflag = vn_irflag_read(vp); 4253 if (irflag & VIRF_DOOMED) 4254 strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); 4255 if (irflag & VIRF_PGREAD) 4256 strlcat(buf, "|VIRF_PGREAD", sizeof(buf)); 4257 if (irflag & VIRF_MOUNTPOINT) 4258 strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf)); 4259 if (irflag & VIRF_TEXT_REF) 4260 strlcat(buf, "|VIRF_TEXT_REF", sizeof(buf)); 4261 flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT | VIRF_TEXT_REF); 4262 if (flags != 0) { 4263 snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); 4264 strlcat(buf, buf2, sizeof(buf)); 4265 } 4266 if (vp->v_vflag & VV_ROOT) 4267 strlcat(buf, "|VV_ROOT", sizeof(buf)); 4268 if (vp->v_vflag & VV_ISTTY) 4269 strlcat(buf, "|VV_ISTTY", sizeof(buf)); 4270 if (vp->v_vflag & VV_NOSYNC) 4271 strlcat(buf, "|VV_NOSYNC", sizeof(buf)); 4272 if (vp->v_vflag & VV_ETERNALDEV) 4273 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); 4274 if (vp->v_vflag & VV_CACHEDLABEL) 4275 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); 4276 if (vp->v_vflag & VV_VMSIZEVNLOCK) 4277 strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); 4278 if (vp->v_vflag & VV_COPYONWRITE) 4279 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); 4280 if (vp->v_vflag & VV_SYSTEM) 4281 strlcat(buf, "|VV_SYSTEM", sizeof(buf)); 4282 if (vp->v_vflag & VV_PROCDEP) 4283 strlcat(buf, "|VV_PROCDEP", sizeof(buf)); 4284 if (vp->v_vflag & VV_DELETED) 4285 strlcat(buf, "|VV_DELETED", sizeof(buf)); 4286 if (vp->v_vflag & VV_MD) 4287 strlcat(buf, "|VV_MD", sizeof(buf)); 4288 if (vp->v_vflag & VV_FORCEINSMQ) 4289 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); 4290 if (vp->v_vflag & VV_READLINK) 4291 strlcat(buf, "|VV_READLINK", sizeof(buf)); 4292 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | 4293 VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM | 4294 VV_PROCDEP | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK); 4295 if (flags != 0) { 4296 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); 4297 strlcat(buf, buf2, sizeof(buf)); 4298 } 4299 if (vp->v_iflag & VI_MOUNT) 4300 strlcat(buf, "|VI_MOUNT", sizeof(buf)); 4301 if (vp->v_iflag & VI_DOINGINACT) 4302 strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); 4303 if (vp->v_iflag & VI_OWEINACT) 4304 strlcat(buf, "|VI_OWEINACT", sizeof(buf)); 4305 if (vp->v_iflag & VI_DEFINACT) 4306 strlcat(buf, "|VI_DEFINACT", sizeof(buf)); 4307 if (vp->v_iflag & VI_FOPENING) 4308 strlcat(buf, "|VI_FOPENING", sizeof(buf)); 4309 flags = 
vp->v_iflag & ~(VI_MOUNT | VI_DOINGINACT | 4310 VI_OWEINACT | VI_DEFINACT | VI_FOPENING); 4311 if (flags != 0) { 4312 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); 4313 strlcat(buf, buf2, sizeof(buf)); 4314 } 4315 if (vp->v_mflag & VMP_LAZYLIST) 4316 strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); 4317 flags = vp->v_mflag & ~(VMP_LAZYLIST); 4318 if (flags != 0) { 4319 snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); 4320 strlcat(buf, buf2, sizeof(buf)); 4321 } 4322 printf(" flags (%s)", buf + 1); 4323 if (mtx_owned(VI_MTX(vp))) 4324 printf(" VI_LOCKed"); 4325 printf("\n"); 4326 if (vp->v_object != NULL) 4327 printf(" v_object %p ref %d pages %d " 4328 "cleanbuf %d dirtybuf %d\n", 4329 vp->v_object, vp->v_object->ref_count, 4330 vp->v_object->resident_page_count, 4331 vp->v_bufobj.bo_clean.bv_cnt, 4332 vp->v_bufobj.bo_dirty.bv_cnt); 4333 printf(" "); 4334 lockmgr_printinfo(vp->v_vnlock); 4335 if (vp->v_data != NULL) 4336 VOP_PRINT(vp); 4337 } 4338 4339 #ifdef DDB 4340 /* 4341 * List all of the locked vnodes in the system. 4342 * Called when debugging the kernel. 4343 */ 4344 DB_SHOW_COMMAND_FLAGS(lockedvnods, lockedvnodes, DB_CMD_MEMSAFE) 4345 { 4346 struct mount *mp; 4347 struct vnode *vp; 4348 4349 /* 4350 * Note: because this is DDB, we can't obey the locking semantics 4351 * for these structures, which means we could catch an inconsistent 4352 * state and dereference a nasty pointer. Not much to be done 4353 * about that. 4354 */ 4355 db_printf("Locked vnodes\n"); 4356 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4357 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4358 if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) 4359 vn_printf(vp, "vnode "); 4360 } 4361 } 4362 } 4363 4364 /* 4365 * Show details about the given vnode. 4366 */ 4367 DB_SHOW_COMMAND(vnode, db_show_vnode) 4368 { 4369 struct vnode *vp; 4370 4371 if (!have_addr) 4372 return; 4373 vp = (struct vnode *)addr; 4374 vn_printf(vp, "vnode "); 4375 } 4376 4377 /* 4378 * Show details about the given mount point. 4379 */ 4380 DB_SHOW_COMMAND(mount, db_show_mount) 4381 { 4382 struct mount *mp; 4383 struct vfsopt *opt; 4384 struct statfs *sp; 4385 struct vnode *vp; 4386 char buf[512]; 4387 uint64_t mflags; 4388 u_int flags; 4389 4390 if (!have_addr) { 4391 /* No address given, print short info about all mount points. 
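 * Illustrative DDB session (hypothetical addresses): "show mount"
 * prints this one-line summary for every mount, after which
 * "show mount <addr>" with one of the printed pointers produces the
 * detailed dump generated below.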
*/ 4392 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 4393 db_printf("%p %s on %s (%s)\n", mp, 4394 mp->mnt_stat.f_mntfromname, 4395 mp->mnt_stat.f_mntonname, 4396 mp->mnt_stat.f_fstypename); 4397 if (db_pager_quit) 4398 break; 4399 } 4400 db_printf("\nMore info: show mount <addr>\n"); 4401 return; 4402 } 4403 4404 mp = (struct mount *)addr; 4405 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 4406 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 4407 4408 buf[0] = '\0'; 4409 mflags = mp->mnt_flag; 4410 #define MNT_FLAG(flag) do { \ 4411 if (mflags & (flag)) { \ 4412 if (buf[0] != '\0') \ 4413 strlcat(buf, ", ", sizeof(buf)); \ 4414 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 4415 mflags &= ~(flag); \ 4416 } \ 4417 } while (0) 4418 MNT_FLAG(MNT_RDONLY); 4419 MNT_FLAG(MNT_SYNCHRONOUS); 4420 MNT_FLAG(MNT_NOEXEC); 4421 MNT_FLAG(MNT_NOSUID); 4422 MNT_FLAG(MNT_NFS4ACLS); 4423 MNT_FLAG(MNT_UNION); 4424 MNT_FLAG(MNT_ASYNC); 4425 MNT_FLAG(MNT_SUIDDIR); 4426 MNT_FLAG(MNT_SOFTDEP); 4427 MNT_FLAG(MNT_NOSYMFOLLOW); 4428 MNT_FLAG(MNT_GJOURNAL); 4429 MNT_FLAG(MNT_MULTILABEL); 4430 MNT_FLAG(MNT_ACLS); 4431 MNT_FLAG(MNT_NOATIME); 4432 MNT_FLAG(MNT_NOCLUSTERR); 4433 MNT_FLAG(MNT_NOCLUSTERW); 4434 MNT_FLAG(MNT_SUJ); 4435 MNT_FLAG(MNT_EXRDONLY); 4436 MNT_FLAG(MNT_EXPORTED); 4437 MNT_FLAG(MNT_DEFEXPORTED); 4438 MNT_FLAG(MNT_EXPORTANON); 4439 MNT_FLAG(MNT_EXKERB); 4440 MNT_FLAG(MNT_EXPUBLIC); 4441 MNT_FLAG(MNT_LOCAL); 4442 MNT_FLAG(MNT_QUOTA); 4443 MNT_FLAG(MNT_ROOTFS); 4444 MNT_FLAG(MNT_USER); 4445 MNT_FLAG(MNT_IGNORE); 4446 MNT_FLAG(MNT_UPDATE); 4447 MNT_FLAG(MNT_DELEXPORT); 4448 MNT_FLAG(MNT_RELOAD); 4449 MNT_FLAG(MNT_FORCE); 4450 MNT_FLAG(MNT_SNAPSHOT); 4451 MNT_FLAG(MNT_BYFSID); 4452 #undef MNT_FLAG 4453 if (mflags != 0) { 4454 if (buf[0] != '\0') 4455 strlcat(buf, ", ", sizeof(buf)); 4456 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4457 "0x%016jx", mflags); 4458 } 4459 db_printf(" mnt_flag = %s\n", buf); 4460 4461 buf[0] = '\0'; 4462 flags = mp->mnt_kern_flag; 4463 #define MNT_KERN_FLAG(flag) do { \ 4464 if (flags & (flag)) { \ 4465 if (buf[0] != '\0') \ 4466 strlcat(buf, ", ", sizeof(buf)); \ 4467 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 4468 flags &= ~(flag); \ 4469 } \ 4470 } while (0) 4471 MNT_KERN_FLAG(MNTK_UNMOUNTF); 4472 MNT_KERN_FLAG(MNTK_ASYNC); 4473 MNT_KERN_FLAG(MNTK_SOFTDEP); 4474 MNT_KERN_FLAG(MNTK_NOMSYNC); 4475 MNT_KERN_FLAG(MNTK_DRAINING); 4476 MNT_KERN_FLAG(MNTK_REFEXPIRE); 4477 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 4478 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 4479 MNT_KERN_FLAG(MNTK_NO_IOPF); 4480 MNT_KERN_FLAG(MNTK_RECURSE); 4481 MNT_KERN_FLAG(MNTK_UPPER_WAITER); 4482 MNT_KERN_FLAG(MNTK_UNLOCKED_INSMNTQUE); 4483 MNT_KERN_FLAG(MNTK_USES_BCACHE); 4484 MNT_KERN_FLAG(MNTK_VMSETSIZE_BUG); 4485 MNT_KERN_FLAG(MNTK_FPLOOKUP); 4486 MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER); 4487 MNT_KERN_FLAG(MNTK_NOASYNC); 4488 MNT_KERN_FLAG(MNTK_UNMOUNT); 4489 MNT_KERN_FLAG(MNTK_MWAIT); 4490 MNT_KERN_FLAG(MNTK_SUSPEND); 4491 MNT_KERN_FLAG(MNTK_SUSPEND2); 4492 MNT_KERN_FLAG(MNTK_SUSPENDED); 4493 MNT_KERN_FLAG(MNTK_NULL_NOCACHE); 4494 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 4495 #undef MNT_KERN_FLAG 4496 if (flags != 0) { 4497 if (buf[0] != '\0') 4498 strlcat(buf, ", ", sizeof(buf)); 4499 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 4500 "0x%08x", flags); 4501 } 4502 db_printf(" mnt_kern_flag = %s\n", buf); 4503 4504 db_printf(" mnt_opt = "); 4505 opt = TAILQ_FIRST(mp->mnt_opt); 4506 if (opt != NULL) { 4507 db_printf("%s", opt->name); 4508 opt = TAILQ_NEXT(opt, link); 4509 while (opt != 
NULL) { 4510 db_printf(", %s", opt->name); 4511 opt = TAILQ_NEXT(opt, link); 4512 } 4513 } 4514 db_printf("\n"); 4515 4516 sp = &mp->mnt_stat; 4517 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " 4518 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " 4519 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " 4520 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", 4521 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, 4522 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, 4523 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, 4524 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, 4525 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, 4526 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 4527 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, 4528 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); 4529 4530 db_printf(" mnt_cred = { uid=%u ruid=%u", 4531 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); 4532 if (jailed(mp->mnt_cred)) 4533 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); 4534 db_printf(" }\n"); 4535 db_printf(" mnt_ref = %d (with %d in the struct)\n", 4536 vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); 4537 db_printf(" mnt_gen = %d\n", mp->mnt_gen); 4538 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); 4539 db_printf(" mnt_lazyvnodelistsize = %d\n", 4540 mp->mnt_lazyvnodelistsize); 4541 db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", 4542 vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); 4543 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 4544 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 4545 db_printf(" mnt_lockref = %d (with %d in the struct)\n", 4546 vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); 4547 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 4548 db_printf(" mnt_secondary_accwrites = %d\n", 4549 mp->mnt_secondary_accwrites); 4550 db_printf(" mnt_gjprovider = %s\n", 4551 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); 4552 db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); 4553 4554 db_printf("\n\nList of active vnodes\n"); 4555 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4556 if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { 4557 vn_printf(vp, "vnode "); 4558 if (db_pager_quit) 4559 break; 4560 } 4561 } 4562 db_printf("\n\nList of inactive vnodes\n"); 4563 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 4564 if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { 4565 vn_printf(vp, "vnode "); 4566 if (db_pager_quit) 4567 break; 4568 } 4569 } 4570 } 4571 #endif /* DDB */ 4572 4573 /* 4574 * Fill in a struct xvfsconf based on a struct vfsconf. 4575 */ 4576 static int 4577 vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) 4578 { 4579 struct xvfsconf xvfsp; 4580 4581 bzero(&xvfsp, sizeof(xvfsp)); 4582 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4583 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4584 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4585 xvfsp.vfc_flags = vfsp->vfc_flags; 4586 /* 4587 * These are unused in userland, we keep them 4588 * to not break binary compatibility. 
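 * Userland readers such as getvfsbyname(3) fetch these records via
 * the vfs.conflist sysctl defined below; the kernel pointers are
 * zeroed here because they would be meaningless (and unsafe to leak)
 * outside the kernel.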
4589 */ 4590 xvfsp.vfc_vfsops = NULL; 4591 xvfsp.vfc_next = NULL; 4592 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4593 } 4594 4595 #ifdef COMPAT_FREEBSD32 4596 struct xvfsconf32 { 4597 uint32_t vfc_vfsops; 4598 char vfc_name[MFSNAMELEN]; 4599 int32_t vfc_typenum; 4600 int32_t vfc_refcount; 4601 int32_t vfc_flags; 4602 uint32_t vfc_next; 4603 }; 4604 4605 static int 4606 vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) 4607 { 4608 struct xvfsconf32 xvfsp; 4609 4610 bzero(&xvfsp, sizeof(xvfsp)); 4611 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 4612 xvfsp.vfc_typenum = vfsp->vfc_typenum; 4613 xvfsp.vfc_refcount = vfsp->vfc_refcount; 4614 xvfsp.vfc_flags = vfsp->vfc_flags; 4615 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 4616 } 4617 #endif 4618 4619 /* 4620 * Top level filesystem related information gathering. 4621 */ 4622 static int 4623 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 4624 { 4625 struct vfsconf *vfsp; 4626 int error; 4627 4628 error = 0; 4629 vfsconf_slock(); 4630 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4631 #ifdef COMPAT_FREEBSD32 4632 if (req->flags & SCTL_MASK32) 4633 error = vfsconf2x32(req, vfsp); 4634 else 4635 #endif 4636 error = vfsconf2x(req, vfsp); 4637 if (error) 4638 break; 4639 } 4640 vfsconf_sunlock(); 4641 return (error); 4642 } 4643 4644 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | 4645 CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, 4646 "S,xvfsconf", "List of all configured filesystems"); 4647 4648 #ifndef BURN_BRIDGES 4649 static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 4650 4651 static int 4652 vfs_sysctl(SYSCTL_HANDLER_ARGS) 4653 { 4654 int *name = (int *)arg1 - 1; /* XXX */ 4655 u_int namelen = arg2 + 1; /* XXX */ 4656 struct vfsconf *vfsp; 4657 4658 log(LOG_WARNING, "userland calling deprecated sysctl, " 4659 "please rebuild world\n"); 4660 4661 #if 1 || defined(COMPAT_PRELITE2) 4662 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ 4663 if (namelen == 1) 4664 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 4665 #endif 4666 4667 switch (name[1]) { 4668 case VFS_MAXTYPENUM: 4669 if (namelen != 2) 4670 return (ENOTDIR); 4671 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 4672 case VFS_CONF: 4673 if (namelen != 3) 4674 return (ENOTDIR); /* overloaded */ 4675 vfsconf_slock(); 4676 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4677 if (vfsp->vfc_typenum == name[2]) 4678 break; 4679 } 4680 vfsconf_sunlock(); 4681 if (vfsp == NULL) 4682 return (EOPNOTSUPP); 4683 #ifdef COMPAT_FREEBSD32 4684 if (req->flags & SCTL_MASK32) 4685 return (vfsconf2x32(req, vfsp)); 4686 else 4687 #endif 4688 return (vfsconf2x(req, vfsp)); 4689 } 4690 return (EOPNOTSUPP); 4691 } 4692 4693 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | 4694 CTLFLAG_MPSAFE, vfs_sysctl, 4695 "Generic filesystem"); 4696 4697 #if 1 || defined(COMPAT_PRELITE2) 4698 4699 static int 4700 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 4701 { 4702 int error; 4703 struct vfsconf *vfsp; 4704 struct ovfsconf ovfs; 4705 4706 vfsconf_slock(); 4707 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 4708 bzero(&ovfs, sizeof(ovfs)); 4709 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 4710 strcpy(ovfs.vfc_name, vfsp->vfc_name); 4711 ovfs.vfc_index = vfsp->vfc_typenum; 4712 ovfs.vfc_refcount = vfsp->vfc_refcount; 4713 ovfs.vfc_flags = vfsp->vfc_flags; 4714 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 4715 if (error != 0) { 4716 vfsconf_sunlock(); 4717 return (error); 4718 } 4719 } 4720 vfsconf_sunlock(); 4721 return (0); 4722 } 4723 4724 #endif /* 1 || COMPAT_PRELITE2 */ 4725 #endif /* !BURN_BRIDGES */ 4726 4727 static void 4728 unmount_or_warn(struct mount *mp) 4729 { 4730 int error; 4731 4732 error = dounmount(mp, MNT_FORCE, curthread); 4733 if (error != 0) { 4734 printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); 4735 if (error == EBUSY) 4736 printf("BUSY)\n"); 4737 else 4738 printf("%d)\n", error); 4739 } 4740 } 4741 4742 /* 4743 * Unmount all filesystems. The list is traversed in reverse order 4744 * of mounting to avoid dependencies. 4745 */ 4746 void 4747 vfs_unmountall(void) 4748 { 4749 struct mount *mp, *tmp; 4750 4751 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); 4752 4753 /* 4754 * Since this only runs when rebooting, it is not interlocked. 4755 */ 4756 TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { 4757 vfs_ref(mp); 4758 4759 /* 4760 * Forcibly unmounting "/dev" before "/" would prevent clean 4761 * unmount of the latter. 
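 * The root device filesystem is therefore skipped inside the loop and
 * handed to unmount_or_warn() separately once every other filesystem
 * has been unmounted (see the rootdevmp handling after the loop).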
4762 */ 4763 if (mp == rootdevmp) 4764 continue; 4765 4766 unmount_or_warn(mp); 4767 } 4768 4769 if (rootdevmp != NULL) 4770 unmount_or_warn(rootdevmp); 4771 } 4772 4773 static void 4774 vfs_deferred_inactive(struct vnode *vp, int lkflags) 4775 { 4776 4777 ASSERT_VI_LOCKED(vp, __func__); 4778 VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); 4779 if ((vp->v_iflag & VI_OWEINACT) == 0) { 4780 vdropl(vp); 4781 return; 4782 } 4783 if (vn_lock(vp, lkflags) == 0) { 4784 VI_LOCK(vp); 4785 vinactive(vp); 4786 VOP_UNLOCK(vp); 4787 vdropl(vp); 4788 return; 4789 } 4790 vdefer_inactive_unlocked(vp); 4791 } 4792 4793 static int 4794 vfs_periodic_inactive_filter(struct vnode *vp, void *arg) 4795 { 4796 4797 return (vp->v_iflag & VI_DEFINACT); 4798 } 4799 4800 static void __noinline 4801 vfs_periodic_inactive(struct mount *mp, int flags) 4802 { 4803 struct vnode *vp, *mvp; 4804 int lkflags; 4805 4806 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 4807 if (flags != MNT_WAIT) 4808 lkflags |= LK_NOWAIT; 4809 4810 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { 4811 if ((vp->v_iflag & VI_DEFINACT) == 0) { 4812 VI_UNLOCK(vp); 4813 continue; 4814 } 4815 vp->v_iflag &= ~VI_DEFINACT; 4816 vfs_deferred_inactive(vp, lkflags); 4817 } 4818 } 4819 4820 static inline bool 4821 vfs_want_msync(struct vnode *vp) 4822 { 4823 struct vm_object *obj; 4824 4825 /* 4826 * This test may be performed without any locks held. 4827 * We rely on vm_object's type stability. 4828 */ 4829 if (vp->v_vflag & VV_NOSYNC) 4830 return (false); 4831 obj = vp->v_object; 4832 return (obj != NULL && vm_object_mightbedirty(obj)); 4833 } 4834 4835 static int 4836 vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) 4837 { 4838 4839 if (vp->v_vflag & VV_NOSYNC) 4840 return (false); 4841 if (vp->v_iflag & VI_DEFINACT) 4842 return (true); 4843 return (vfs_want_msync(vp)); 4844 } 4845 4846 static void __noinline 4847 vfs_periodic_msync_inactive(struct mount *mp, int flags) 4848 { 4849 struct vnode *vp, *mvp; 4850 struct vm_object *obj; 4851 int lkflags, objflags; 4852 bool seen_defer; 4853 4854 lkflags = LK_EXCLUSIVE | LK_INTERLOCK; 4855 if (flags != MNT_WAIT) { 4856 lkflags |= LK_NOWAIT; 4857 objflags = OBJPC_NOSYNC; 4858 } else { 4859 objflags = OBJPC_SYNC; 4860 } 4861 4862 MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { 4863 seen_defer = false; 4864 if (vp->v_iflag & VI_DEFINACT) { 4865 vp->v_iflag &= ~VI_DEFINACT; 4866 seen_defer = true; 4867 } 4868 if (!vfs_want_msync(vp)) { 4869 if (seen_defer) 4870 vfs_deferred_inactive(vp, lkflags); 4871 else 4872 VI_UNLOCK(vp); 4873 continue; 4874 } 4875 if (vget(vp, lkflags) == 0) { 4876 obj = vp->v_object; 4877 if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) { 4878 VM_OBJECT_WLOCK(obj); 4879 vm_object_page_clean(obj, 0, 0, objflags); 4880 VM_OBJECT_WUNLOCK(obj); 4881 } 4882 vput(vp); 4883 if (seen_defer) 4884 vdrop(vp); 4885 } else { 4886 if (seen_defer) 4887 vdefer_inactive_unlocked(vp); 4888 } 4889 } 4890 } 4891 4892 void 4893 vfs_periodic(struct mount *mp, int flags) 4894 { 4895 4896 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 4897 4898 if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0) 4899 vfs_periodic_inactive(mp, flags); 4900 else 4901 vfs_periodic_msync_inactive(mp, flags); 4902 } 4903 4904 static void 4905 destroy_vpollinfo_free(struct vpollinfo *vi) 4906 { 4907 4908 knlist_destroy(&vi->vpi_selinfo.si_note); 4909 mtx_destroy(&vi->vpi_lock); 4910 free(vi, M_VNODEPOLL); 4911 } 4912 4913 static void 4914 destroy_vpollinfo(struct vpollinfo 
*vi) 4915 { 4916 4917 knlist_clear(&vi->vpi_selinfo.si_note, 1); 4918 seldrain(&vi->vpi_selinfo); 4919 destroy_vpollinfo_free(vi); 4920 } 4921 4922 /* 4923 * Initialize per-vnode helper structure to hold poll-related state. 4924 */ 4925 void 4926 v_addpollinfo(struct vnode *vp) 4927 { 4928 struct vpollinfo *vi; 4929 4930 if (vp->v_pollinfo != NULL) 4931 return; 4932 vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO); 4933 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); 4934 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, 4935 vfs_knlunlock, vfs_knl_assert_lock); 4936 VI_LOCK(vp); 4937 if (vp->v_pollinfo != NULL) { 4938 VI_UNLOCK(vp); 4939 destroy_vpollinfo_free(vi); 4940 return; 4941 } 4942 vp->v_pollinfo = vi; 4943 VI_UNLOCK(vp); 4944 } 4945 4946 /* 4947 * Record a process's interest in events which might happen to 4948 * a vnode. Because poll uses the historic select-style interface 4949 * internally, this routine serves as both the ``check for any 4950 * pending events'' and the ``record my interest in future events'' 4951 * functions. (These are done together, while the lock is held, 4952 * to avoid race conditions.) 4953 */ 4954 int 4955 vn_pollrecord(struct vnode *vp, struct thread *td, int events) 4956 { 4957 4958 v_addpollinfo(vp); 4959 mtx_lock(&vp->v_pollinfo->vpi_lock); 4960 if (vp->v_pollinfo->vpi_revents & events) { 4961 /* 4962 * This leaves events we are not interested 4963 * in available for the other process which 4964 * presumably had requested them 4965 * (otherwise they would never have been 4966 * recorded). 4967 */ 4968 events &= vp->v_pollinfo->vpi_revents; 4969 vp->v_pollinfo->vpi_revents &= ~events; 4970 4971 mtx_unlock(&vp->v_pollinfo->vpi_lock); 4972 return (events); 4973 } 4974 vp->v_pollinfo->vpi_events |= events; 4975 selrecord(td, &vp->v_pollinfo->vpi_selinfo); 4976 mtx_unlock(&vp->v_pollinfo->vpi_lock); 4977 return (0); 4978 } 4979 4980 /* 4981 * Routine to create and manage a filesystem syncer vnode. 4982 */ 4983 #define sync_close ((int (*)(struct vop_close_args *))nullop) 4984 static int sync_fsync(struct vop_fsync_args *); 4985 static int sync_inactive(struct vop_inactive_args *); 4986 static int sync_reclaim(struct vop_reclaim_args *); 4987 4988 static struct vop_vector sync_vnodeops = { 4989 .vop_bypass = VOP_EOPNOTSUPP, 4990 .vop_close = sync_close, 4991 .vop_fsync = sync_fsync, 4992 .vop_getwritemount = vop_stdgetwritemount, 4993 .vop_inactive = sync_inactive, 4994 .vop_need_inactive = vop_stdneed_inactive, 4995 .vop_reclaim = sync_reclaim, 4996 .vop_lock1 = vop_stdlock, 4997 .vop_unlock = vop_stdunlock, 4998 .vop_islocked = vop_stdislocked, 4999 .vop_fplookup_vexec = VOP_EAGAIN, 5000 .vop_fplookup_symlink = VOP_EAGAIN, 5001 }; 5002 VFS_VOP_VECTOR_REGISTER(sync_vnodeops); 5003 5004 /* 5005 * Create a new filesystem syncer vnode for the specified mount point.
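 * The syncer vnode holds no data of its own; it exists solely to sit
 * on the syncer worklist for this mount so that sync_fsync() below
 * gets invoked periodically (roughly every syncdelay seconds) and
 * performs a lazy VFS_SYNC() of the filesystem.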
5006 */ 5007 void 5008 vfs_allocate_syncvnode(struct mount *mp) 5009 { 5010 struct vnode *vp; 5011 struct bufobj *bo; 5012 static long start, incr, next; 5013 int error; 5014 5015 /* Allocate a new vnode */ 5016 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); 5017 if (error != 0) 5018 panic("vfs_allocate_syncvnode: getnewvnode() failed"); 5019 vp->v_type = VNON; 5020 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 5021 vp->v_vflag |= VV_FORCEINSMQ; 5022 error = insmntque1(vp, mp); 5023 if (error != 0) 5024 panic("vfs_allocate_syncvnode: insmntque() failed"); 5025 vp->v_vflag &= ~VV_FORCEINSMQ; 5026 vn_set_state(vp, VSTATE_CONSTRUCTED); 5027 VOP_UNLOCK(vp); 5028 /* 5029 * Place the vnode onto the syncer worklist. We attempt to 5030 * scatter them about on the list so that they will go off 5031 * at evenly distributed times even if all the filesystems 5032 * are mounted at once. 5033 */ 5034 next += incr; 5035 if (next == 0 || next > syncer_maxdelay) { 5036 start /= 2; 5037 incr /= 2; 5038 if (start == 0) { 5039 start = syncer_maxdelay / 2; 5040 incr = syncer_maxdelay; 5041 } 5042 next = start; 5043 } 5044 bo = &vp->v_bufobj; 5045 BO_LOCK(bo); 5046 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); 5047 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ 5048 mtx_lock(&sync_mtx); 5049 sync_vnode_count++; 5050 if (mp->mnt_syncer == NULL) { 5051 mp->mnt_syncer = vp; 5052 vp = NULL; 5053 } 5054 mtx_unlock(&sync_mtx); 5055 BO_UNLOCK(bo); 5056 if (vp != NULL) { 5057 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 5058 vgone(vp); 5059 vput(vp); 5060 } 5061 } 5062 5063 void 5064 vfs_deallocate_syncvnode(struct mount *mp) 5065 { 5066 struct vnode *vp; 5067 5068 mtx_lock(&sync_mtx); 5069 vp = mp->mnt_syncer; 5070 if (vp != NULL) 5071 mp->mnt_syncer = NULL; 5072 mtx_unlock(&sync_mtx); 5073 if (vp != NULL) 5074 vrele(vp); 5075 } 5076 5077 /* 5078 * Do a lazy sync of the filesystem. 5079 */ 5080 static int 5081 sync_fsync(struct vop_fsync_args *ap) 5082 { 5083 struct vnode *syncvp = ap->a_vp; 5084 struct mount *mp = syncvp->v_mount; 5085 int error, save; 5086 struct bufobj *bo; 5087 5088 /* 5089 * We only need to do something if this is a lazy evaluation. 5090 */ 5091 if (ap->a_waitfor != MNT_LAZY) 5092 return (0); 5093 5094 /* 5095 * Move ourselves to the back of the sync list. 5096 */ 5097 bo = &syncvp->v_bufobj; 5098 BO_LOCK(bo); 5099 vn_syncer_add_to_worklist(bo, syncdelay); 5100 BO_UNLOCK(bo); 5101 5102 /* 5103 * Walk the list of vnodes pushing all that are dirty and 5104 * not already on the sync list. 5105 */ 5106 if (vfs_busy(mp, MBF_NOWAIT) != 0) 5107 return (0); 5108 VOP_UNLOCK(syncvp); 5109 save = curthread_pflags_set(TDP_SYNCIO); 5110 /* 5111 * The filesystem at hand may be idle with free vnodes stored in the 5112 * batch. Return them instead of letting them stay there indefinitely. 5113 */ 5114 vfs_periodic(mp, MNT_NOWAIT); 5115 error = VFS_SYNC(mp, MNT_LAZY); 5116 curthread_pflags_restore(save); 5117 vn_lock(syncvp, LK_EXCLUSIVE | LK_RETRY); 5118 vfs_unbusy(mp); 5119 return (error); 5120 } 5121 5122 /* 5123 * The syncer vnode is no longer referenced. 5124 */ 5125 static int 5126 sync_inactive(struct vop_inactive_args *ap) 5127 { 5128 5129 vgone(ap->a_vp); 5130 return (0); 5131 } 5132 5133 /* 5134 * The syncer vnode is no longer needed and is being decommissioned. 5135 * 5136 * Modifications to the worklist must be protected by sync_mtx.
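 * This is typically reached from vfs_deallocate_syncvnode() above:
 * the vrele() there drops the last use reference, sync_inactive()
 * responds by calling vgone(), and reclamation lands here, where the
 * bufobj is taken off the worklist under sync_mtx.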
5137 */ 5138 static int 5139 sync_reclaim(struct vop_reclaim_args *ap) 5140 { 5141 struct vnode *vp = ap->a_vp; 5142 struct bufobj *bo; 5143 5144 bo = &vp->v_bufobj; 5145 BO_LOCK(bo); 5146 mtx_lock(&sync_mtx); 5147 if (vp->v_mount->mnt_syncer == vp) 5148 vp->v_mount->mnt_syncer = NULL; 5149 if (bo->bo_flag & BO_ONWORKLST) { 5150 LIST_REMOVE(bo, bo_synclist); 5151 syncer_worklist_len--; 5152 sync_vnode_count--; 5153 bo->bo_flag &= ~BO_ONWORKLST; 5154 } 5155 mtx_unlock(&sync_mtx); 5156 BO_UNLOCK(bo); 5157 5158 return (0); 5159 } 5160 5161 int 5162 vn_need_pageq_flush(struct vnode *vp) 5163 { 5164 struct vm_object *obj; 5165 5166 obj = vp->v_object; 5167 return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && 5168 vm_object_mightbedirty(obj)); 5169 } 5170 5171 /* 5172 * Check if vnode represents a disk device 5173 */ 5174 bool 5175 vn_isdisk_error(struct vnode *vp, int *errp) 5176 { 5177 int error; 5178 5179 if (vp->v_type != VCHR) { 5180 error = ENOTBLK; 5181 goto out; 5182 } 5183 error = 0; 5184 dev_lock(); 5185 if (vp->v_rdev == NULL) 5186 error = ENXIO; 5187 else if (vp->v_rdev->si_devsw == NULL) 5188 error = ENXIO; 5189 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 5190 error = ENOTBLK; 5191 dev_unlock(); 5192 out: 5193 *errp = error; 5194 return (error == 0); 5195 } 5196 5197 bool 5198 vn_isdisk(struct vnode *vp) 5199 { 5200 int error; 5201 5202 return (vn_isdisk_error(vp, &error)); 5203 } 5204 5205 /* 5206 * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see 5207 * the comment above cache_fplookup for details. 5208 */ 5209 int 5210 vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) 5211 { 5212 int error; 5213 5214 VFS_SMR_ASSERT_ENTERED(); 5215 5216 /* Check the owner. */ 5217 if (cred->cr_uid == file_uid) { 5218 if (file_mode & S_IXUSR) 5219 return (0); 5220 goto out_error; 5221 } 5222 5223 /* Otherwise, check the groups (first match) */ 5224 if (groupmember(file_gid, cred)) { 5225 if (file_mode & S_IXGRP) 5226 return (0); 5227 goto out_error; 5228 } 5229 5230 /* Otherwise, check everyone else. */ 5231 if (file_mode & S_IXOTH) 5232 return (0); 5233 out_error: 5234 /* 5235 * Permission check failed, but it is possible the denial will be overridden 5236 * (e.g., when root is traversing through a 700 directory owned by someone 5237 * else). 5238 * 5239 * vaccess() calls priv_check_cred which in turn can descend into MAC 5240 * modules overriding this result. It's quite unclear what semantics 5241 * are allowed for them to operate, thus for safety we don't call them 5242 * from within the SMR section. This also means if any such modules 5243 * are present, we have to let the regular lookup decide. 5244 */ 5245 error = priv_check_cred_vfs_lookup_nomac(cred); 5246 switch (error) { 5247 case 0: 5248 return (0); 5249 case EAGAIN: 5250 /* 5251 * MAC modules present. 5252 */ 5253 return (EAGAIN); 5254 case EPERM: 5255 return (EACCES); 5256 default: 5257 return (error); 5258 } 5259 } 5260 5261 /* 5262 * Common filesystem object access control check routine. Accepts a 5263 * vnode's type, "mode", uid and gid, requested access mode, and credentials. 5264 * Returns 0 on success, or an errno on failure.
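 * A minimal sketch of the intended use, assuming a ufs-like inode
 * with i_mode/i_uid/i_gid fields (hypothetical VOP_ACCESS
 * implementation; VTOI() and struct inode are not defined here):
 *
 *	static int
 *	xxx_access(struct vop_access_args *ap)
 *	{
 *		struct inode *ip = VTOI(ap->a_vp);
 *
 *		return (vaccess(ap->a_vp->v_type, ip->i_mode, ip->i_uid,
 *		    ip->i_gid, ap->a_accmode, ap->a_cred));
 *	}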
5265 */ 5266 int 5267 vaccess(__enum_uint8(vtype) type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 5268 accmode_t accmode, struct ucred *cred) 5269 { 5270 accmode_t dac_granted; 5271 accmode_t priv_granted; 5272 5273 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 5274 ("invalid bit in accmode")); 5275 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 5276 ("VAPPEND without VWRITE")); 5277 5278 /* 5279 * Look for a normal, non-privileged way to access the file/directory 5280 * as requested. If it exists, go with that. 5281 */ 5282 5283 dac_granted = 0; 5284 5285 /* Check the owner. */ 5286 if (cred->cr_uid == file_uid) { 5287 dac_granted |= VADMIN; 5288 if (file_mode & S_IXUSR) 5289 dac_granted |= VEXEC; 5290 if (file_mode & S_IRUSR) 5291 dac_granted |= VREAD; 5292 if (file_mode & S_IWUSR) 5293 dac_granted |= (VWRITE | VAPPEND); 5294 5295 if ((accmode & dac_granted) == accmode) 5296 return (0); 5297 5298 goto privcheck; 5299 } 5300 5301 /* Otherwise, check the groups (first match) */ 5302 if (groupmember(file_gid, cred)) { 5303 if (file_mode & S_IXGRP) 5304 dac_granted |= VEXEC; 5305 if (file_mode & S_IRGRP) 5306 dac_granted |= VREAD; 5307 if (file_mode & S_IWGRP) 5308 dac_granted |= (VWRITE | VAPPEND); 5309 5310 if ((accmode & dac_granted) == accmode) 5311 return (0); 5312 5313 goto privcheck; 5314 } 5315 5316 /* Otherwise, check everyone else. */ 5317 if (file_mode & S_IXOTH) 5318 dac_granted |= VEXEC; 5319 if (file_mode & S_IROTH) 5320 dac_granted |= VREAD; 5321 if (file_mode & S_IWOTH) 5322 dac_granted |= (VWRITE | VAPPEND); 5323 if ((accmode & dac_granted) == accmode) 5324 return (0); 5325 5326 privcheck: 5327 /* 5328 * Build a privilege mask to determine if the set of privileges 5329 * satisfies the requirements when combined with the granted mask 5330 * from above. For each privilege, if the privilege is required, 5331 * bitwise or the request type onto the priv_granted mask. 5332 */ 5333 priv_granted = 0; 5334 5335 if (type == VDIR) { 5336 /* 5337 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 5338 * requests, instead of PRIV_VFS_EXEC. 5339 */ 5340 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5341 !priv_check_cred(cred, PRIV_VFS_LOOKUP)) 5342 priv_granted |= VEXEC; 5343 } else { 5344 /* 5345 * Ensure that at least one execute bit is on. Otherwise, 5346 * a privileged user will always succeed, and we don't want 5347 * this to happen unless the file really is executable. 5348 */ 5349 if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 5350 (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 5351 !priv_check_cred(cred, PRIV_VFS_EXEC)) 5352 priv_granted |= VEXEC; 5353 } 5354 5355 if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 5356 !priv_check_cred(cred, PRIV_VFS_READ)) 5357 priv_granted |= VREAD; 5358 5359 if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 5360 !priv_check_cred(cred, PRIV_VFS_WRITE)) 5361 priv_granted |= (VWRITE | VAPPEND); 5362 5363 if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 5364 !priv_check_cred(cred, PRIV_VFS_ADMIN)) 5365 priv_granted |= VADMIN; 5366 5367 if ((accmode & (priv_granted | dac_granted)) == accmode) { 5368 return (0); 5369 } 5370 5371 return ((accmode & VADMIN) ? EPERM : EACCES); 5372 } 5373 5374 /* 5375 * Credential check based on process requesting service, and per-attribute 5376 * permissions. 
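 * Filesystems implementing extended attributes are expected to run
 * this check before acting on an attribute, e.g. (illustrative
 * sketch, with VREAD for retrieval or listing and VWRITE for
 * modification):
 *
 *	error = extattr_check_cred(vp, attrnamespace, cred, td, VWRITE);
 *	if (error != 0)
 *		return (error);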
5377 */ 5378 int 5379 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 5380 struct thread *td, accmode_t accmode) 5381 { 5382 5383 /* 5384 * Kernel-invoked always succeeds. 5385 */ 5386 if (cred == NOCRED) 5387 return (0); 5388 5389 /* 5390 * Do not allow privileged processes in jail to directly manipulate 5391 * system attributes. 5392 */ 5393 switch (attrnamespace) { 5394 case EXTATTR_NAMESPACE_SYSTEM: 5395 /* Potentially should be: return (EPERM); */ 5396 return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); 5397 case EXTATTR_NAMESPACE_USER: 5398 return (VOP_ACCESS(vp, accmode, cred, td)); 5399 default: 5400 return (EPERM); 5401 } 5402 } 5403 5404 #ifdef DEBUG_VFS_LOCKS 5405 int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 5406 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, 5407 "Drop into debugger on lock violation"); 5408 5409 int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 5410 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 5411 0, "Check for interlock across VOPs"); 5412 5413 int vfs_badlock_print = 1; /* Print lock violations. */ 5414 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 5415 0, "Print lock violations"); 5416 5417 int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ 5418 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 5419 0, "Print vnode details on lock violations"); 5420 5421 #ifdef KDB 5422 int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ 5423 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, 5424 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); 5425 #endif 5426 5427 static void 5428 vfs_badlock(const char *msg, const char *str, struct vnode *vp) 5429 { 5430 5431 #ifdef KDB 5432 if (vfs_badlock_backtrace) 5433 kdb_backtrace(); 5434 #endif 5435 if (vfs_badlock_vnode) 5436 vn_printf(vp, "vnode "); 5437 if (vfs_badlock_print) 5438 printf("%s: %p %s\n", str, (void *)vp, msg); 5439 if (vfs_badlock_ddb) 5440 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5441 } 5442 5443 void 5444 assert_vi_locked(struct vnode *vp, const char *str) 5445 { 5446 5447 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 5448 vfs_badlock("interlock is not locked but should be", str, vp); 5449 } 5450 5451 void 5452 assert_vi_unlocked(struct vnode *vp, const char *str) 5453 { 5454 5455 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 5456 vfs_badlock("interlock is locked but should not be", str, vp); 5457 } 5458 5459 void 5460 assert_vop_locked(struct vnode *vp, const char *str) 5461 { 5462 if (KERNEL_PANICKED() || vp == NULL) 5463 return; 5464 5465 #ifdef WITNESS 5466 if ((vp->v_irflag & VIRF_CROSSMP) == 0 && 5467 witness_is_owned(&vp->v_vnlock->lock_object) == -1) 5468 #else 5469 int locked = VOP_ISLOCKED(vp); 5470 if (locked == 0 || locked == LK_EXCLOTHER) 5471 #endif 5472 vfs_badlock("is not locked but should be", str, vp); 5473 } 5474 5475 void 5476 assert_vop_unlocked(struct vnode *vp, const char *str) 5477 { 5478 if (KERNEL_PANICKED() || vp == NULL) 5479 return; 5480 5481 #ifdef WITNESS 5482 if ((vp->v_irflag & VIRF_CROSSMP) == 0 && 5483 witness_is_owned(&vp->v_vnlock->lock_object) == 1) 5484 #else 5485 if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 5486 #endif 5487 vfs_badlock("is locked but should not be", str, vp); 5488 } 5489 5490 void 5491 assert_vop_elocked(struct vnode *vp, const char *str) 5492 { 5493 if (KERNEL_PANICKED() || vp == 
NULL) 5494 return; 5495 5496 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 5497 vfs_badlock("is not exclusive locked but should be", str, vp); 5498 } 5499 #endif /* DEBUG_VFS_LOCKS */ 5500 5501 void 5502 vop_rename_fail(struct vop_rename_args *ap) 5503 { 5504 5505 if (ap->a_tvp != NULL) 5506 vput(ap->a_tvp); 5507 if (ap->a_tdvp == ap->a_tvp) 5508 vrele(ap->a_tdvp); 5509 else 5510 vput(ap->a_tdvp); 5511 vrele(ap->a_fdvp); 5512 vrele(ap->a_fvp); 5513 } 5514 5515 void 5516 vop_rename_pre(void *ap) 5517 { 5518 struct vop_rename_args *a = ap; 5519 5520 #ifdef DEBUG_VFS_LOCKS 5521 if (a->a_tvp) 5522 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 5523 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 5524 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 5525 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 5526 5527 /* Check the source (from). */ 5528 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && 5529 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) 5530 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 5531 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) 5532 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); 5533 5534 /* Check the target. */ 5535 if (a->a_tvp) 5536 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 5537 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 5538 #endif 5539 /* 5540 * It may be tempting to add vn_seqc_write_begin/end calls here and 5541 * in vop_rename_post but that's not going to work out since some 5542 * filesystems relookup vnodes mid-rename. This is probably a bug. 5543 * 5544 * For now filesystems are expected to do the relevant calls after they 5545 * decide what vnodes to operate on. 5546 */ 5547 if (a->a_tdvp != a->a_fdvp) 5548 vhold(a->a_fdvp); 5549 if (a->a_tvp != a->a_fvp) 5550 vhold(a->a_fvp); 5551 vhold(a->a_tdvp); 5552 if (a->a_tvp) 5553 vhold(a->a_tvp); 5554 } 5555 5556 #ifdef DEBUG_VFS_LOCKS 5557 void 5558 vop_fplookup_vexec_debugpre(void *ap __unused) 5559 { 5560 5561 VFS_SMR_ASSERT_ENTERED(); 5562 } 5563 5564 void 5565 vop_fplookup_vexec_debugpost(void *ap, int rc) 5566 { 5567 struct vop_fplookup_vexec_args *a; 5568 struct vnode *vp; 5569 5570 a = ap; 5571 vp = a->a_vp; 5572 5573 VFS_SMR_ASSERT_ENTERED(); 5574 if (rc == EOPNOTSUPP) 5575 VNPASS(VN_IS_DOOMED(vp), vp); 5576 } 5577 5578 void 5579 vop_fplookup_symlink_debugpre(void *ap __unused) 5580 { 5581 5582 VFS_SMR_ASSERT_ENTERED(); 5583 } 5584 5585 void 5586 vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused) 5587 { 5588 5589 VFS_SMR_ASSERT_ENTERED(); 5590 } 5591 5592 static void 5593 vop_fsync_debugprepost(struct vnode *vp, const char *name) 5594 { 5595 if (vp->v_type == VCHR) 5596 ; 5597 else if (MNT_EXTENDED_SHARED(vp->v_mount)) 5598 ASSERT_VOP_LOCKED(vp, name); 5599 else 5600 ASSERT_VOP_ELOCKED(vp, name); 5601 } 5602 5603 void 5604 vop_fsync_debugpre(void *a) 5605 { 5606 struct vop_fsync_args *ap; 5607 5608 ap = a; 5609 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5610 } 5611 5612 void 5613 vop_fsync_debugpost(void *a, int rc __unused) 5614 { 5615 struct vop_fsync_args *ap; 5616 5617 ap = a; 5618 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5619 } 5620 5621 void 5622 vop_fdatasync_debugpre(void *a) 5623 { 5624 struct vop_fdatasync_args *ap; 5625 5626 ap = a; 5627 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5628 } 5629 5630 void 5631 vop_fdatasync_debugpost(void *a, int rc __unused) 5632 { 5633 struct vop_fdatasync_args *ap; 5634 5635 ap = a; 5636 vop_fsync_debugprepost(ap->a_vp, "fsync"); 5637 } 5638 5639 void 5640 
vop_strategy_debugpre(void *ap) 5641 { 5642 struct vop_strategy_args *a; 5643 struct buf *bp; 5644 5645 a = ap; 5646 bp = a->a_bp; 5647 5648 /* 5649 * Cluster ops lock their component buffers but not the IO container. 5650 */ 5651 if ((bp->b_flags & B_CLUSTER) != 0) 5652 return; 5653 5654 if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { 5655 if (vfs_badlock_print) 5656 printf( 5657 "VOP_STRATEGY: bp is not locked but should be\n"); 5658 if (vfs_badlock_ddb) 5659 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 5660 } 5661 } 5662 5663 void 5664 vop_lock_debugpre(void *ap) 5665 { 5666 struct vop_lock1_args *a = ap; 5667 5668 if ((a->a_flags & LK_INTERLOCK) == 0) 5669 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 5670 else 5671 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 5672 } 5673 5674 void 5675 vop_lock_debugpost(void *ap, int rc) 5676 { 5677 struct vop_lock1_args *a = ap; 5678 5679 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 5680 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) 5681 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 5682 } 5683 5684 void 5685 vop_unlock_debugpre(void *ap) 5686 { 5687 struct vop_unlock_args *a = ap; 5688 struct vnode *vp = a->a_vp; 5689 5690 VNPASS(vn_get_state(vp) != VSTATE_UNINITIALIZED, vp); 5691 ASSERT_VOP_LOCKED(vp, "VOP_UNLOCK"); 5692 } 5693 5694 void 5695 vop_need_inactive_debugpre(void *ap) 5696 { 5697 struct vop_need_inactive_args *a = ap; 5698 5699 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 5700 } 5701 5702 void 5703 vop_need_inactive_debugpost(void *ap, int rc) 5704 { 5705 struct vop_need_inactive_args *a = ap; 5706 5707 ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); 5708 } 5709 #endif 5710 5711 void 5712 vop_create_pre(void *ap) 5713 { 5714 struct vop_create_args *a; 5715 struct vnode *dvp; 5716 5717 a = ap; 5718 dvp = a->a_dvp; 5719 vn_seqc_write_begin(dvp); 5720 } 5721 5722 void 5723 vop_create_post(void *ap, int rc) 5724 { 5725 struct vop_create_args *a; 5726 struct vnode *dvp; 5727 5728 a = ap; 5729 dvp = a->a_dvp; 5730 vn_seqc_write_end(dvp); 5731 if (!rc) 5732 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5733 } 5734 5735 void 5736 vop_whiteout_pre(void *ap) 5737 { 5738 struct vop_whiteout_args *a; 5739 struct vnode *dvp; 5740 5741 a = ap; 5742 dvp = a->a_dvp; 5743 vn_seqc_write_begin(dvp); 5744 } 5745 5746 void 5747 vop_whiteout_post(void *ap, int rc) 5748 { 5749 struct vop_whiteout_args *a; 5750 struct vnode *dvp; 5751 5752 a = ap; 5753 dvp = a->a_dvp; 5754 vn_seqc_write_end(dvp); 5755 } 5756 5757 void 5758 vop_deleteextattr_pre(void *ap) 5759 { 5760 struct vop_deleteextattr_args *a; 5761 struct vnode *vp; 5762 5763 a = ap; 5764 vp = a->a_vp; 5765 vn_seqc_write_begin(vp); 5766 } 5767 5768 void 5769 vop_deleteextattr_post(void *ap, int rc) 5770 { 5771 struct vop_deleteextattr_args *a; 5772 struct vnode *vp; 5773 5774 a = ap; 5775 vp = a->a_vp; 5776 vn_seqc_write_end(vp); 5777 if (!rc) 5778 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 5779 } 5780 5781 void 5782 vop_link_pre(void *ap) 5783 { 5784 struct vop_link_args *a; 5785 struct vnode *vp, *tdvp; 5786 5787 a = ap; 5788 vp = a->a_vp; 5789 tdvp = a->a_tdvp; 5790 vn_seqc_write_begin(vp); 5791 vn_seqc_write_begin(tdvp); 5792 } 5793 5794 void 5795 vop_link_post(void *ap, int rc) 5796 { 5797 struct vop_link_args *a; 5798 struct vnode *vp, *tdvp; 5799 5800 a = ap; 5801 vp = a->a_vp; 5802 tdvp = a->a_tdvp; 5803 vn_seqc_write_end(vp); 5804 vn_seqc_write_end(tdvp); 5805 if (!rc) { 5806 VFS_KNOTE_LOCKED(vp, NOTE_LINK); 5807 VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); 5808 } 5809 } 5810 5811 void 5812 vop_mkdir_pre(void *ap) 5813 { 5814 struct 
vop_mkdir_args *a; 5815 struct vnode *dvp; 5816 5817 a = ap; 5818 dvp = a->a_dvp; 5819 vn_seqc_write_begin(dvp); 5820 } 5821 5822 void 5823 vop_mkdir_post(void *ap, int rc) 5824 { 5825 struct vop_mkdir_args *a; 5826 struct vnode *dvp; 5827 5828 a = ap; 5829 dvp = a->a_dvp; 5830 vn_seqc_write_end(dvp); 5831 if (!rc) 5832 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 5833 } 5834 5835 #ifdef DEBUG_VFS_LOCKS 5836 void 5837 vop_mkdir_debugpost(void *ap, int rc) 5838 { 5839 struct vop_mkdir_args *a; 5840 5841 a = ap; 5842 if (!rc) 5843 cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp); 5844 } 5845 #endif 5846 5847 void 5848 vop_mknod_pre(void *ap) 5849 { 5850 struct vop_mknod_args *a; 5851 struct vnode *dvp; 5852 5853 a = ap; 5854 dvp = a->a_dvp; 5855 vn_seqc_write_begin(dvp); 5856 } 5857 5858 void 5859 vop_mknod_post(void *ap, int rc) 5860 { 5861 struct vop_mknod_args *a; 5862 struct vnode *dvp; 5863 5864 a = ap; 5865 dvp = a->a_dvp; 5866 vn_seqc_write_end(dvp); 5867 if (!rc) 5868 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5869 } 5870 5871 void 5872 vop_reclaim_post(void *ap, int rc) 5873 { 5874 struct vop_reclaim_args *a; 5875 struct vnode *vp; 5876 5877 a = ap; 5878 vp = a->a_vp; 5879 ASSERT_VOP_IN_SEQC(vp); 5880 if (!rc) 5881 VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); 5882 } 5883 5884 void 5885 vop_remove_pre(void *ap) 5886 { 5887 struct vop_remove_args *a; 5888 struct vnode *dvp, *vp; 5889 5890 a = ap; 5891 dvp = a->a_dvp; 5892 vp = a->a_vp; 5893 vn_seqc_write_begin(dvp); 5894 vn_seqc_write_begin(vp); 5895 } 5896 5897 void 5898 vop_remove_post(void *ap, int rc) 5899 { 5900 struct vop_remove_args *a; 5901 struct vnode *dvp, *vp; 5902 5903 a = ap; 5904 dvp = a->a_dvp; 5905 vp = a->a_vp; 5906 vn_seqc_write_end(dvp); 5907 vn_seqc_write_end(vp); 5908 if (!rc) { 5909 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 5910 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 5911 } 5912 } 5913 5914 void 5915 vop_rename_post(void *ap, int rc) 5916 { 5917 struct vop_rename_args *a = ap; 5918 long hint; 5919 5920 if (!rc) { 5921 hint = NOTE_WRITE; 5922 if (a->a_fdvp == a->a_tdvp) { 5923 if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) 5924 hint |= NOTE_LINK; 5925 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 5926 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 5927 } else { 5928 hint |= NOTE_EXTEND; 5929 if (a->a_fvp->v_type == VDIR) 5930 hint |= NOTE_LINK; 5931 VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); 5932 5933 if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && 5934 a->a_tvp->v_type == VDIR) 5935 hint &= ~NOTE_LINK; 5936 VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); 5937 } 5938 5939 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 5940 if (a->a_tvp) 5941 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 5942 } 5943 if (a->a_tdvp != a->a_fdvp) 5944 vdrop(a->a_fdvp); 5945 if (a->a_tvp != a->a_fvp) 5946 vdrop(a->a_fvp); 5947 vdrop(a->a_tdvp); 5948 if (a->a_tvp) 5949 vdrop(a->a_tvp); 5950 } 5951 5952 void 5953 vop_rmdir_pre(void *ap) 5954 { 5955 struct vop_rmdir_args *a; 5956 struct vnode *dvp, *vp; 5957 5958 a = ap; 5959 dvp = a->a_dvp; 5960 vp = a->a_vp; 5961 vn_seqc_write_begin(dvp); 5962 vn_seqc_write_begin(vp); 5963 } 5964 5965 void 5966 vop_rmdir_post(void *ap, int rc) 5967 { 5968 struct vop_rmdir_args *a; 5969 struct vnode *dvp, *vp; 5970 5971 a = ap; 5972 dvp = a->a_dvp; 5973 vp = a->a_vp; 5974 vn_seqc_write_end(dvp); 5975 vn_seqc_write_end(vp); 5976 if (!rc) { 5977 vp->v_vflag |= VV_UNLINKED; 5978 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); 5979 VFS_KNOTE_LOCKED(vp, NOTE_DELETE); 5980 } 5981 } 5982 5983 void 5984 vop_setattr_pre(void *ap) 5985 { 5986 struct vop_setattr_args *a; 5987 
struct vnode *vp; 5988 5989 a = ap; 5990 vp = a->a_vp; 5991 vn_seqc_write_begin(vp); 5992 } 5993 5994 void 5995 vop_setattr_post(void *ap, int rc) 5996 { 5997 struct vop_setattr_args *a; 5998 struct vnode *vp; 5999 6000 a = ap; 6001 vp = a->a_vp; 6002 vn_seqc_write_end(vp); 6003 if (!rc) 6004 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 6005 } 6006 6007 void 6008 vop_setacl_pre(void *ap) 6009 { 6010 struct vop_setacl_args *a; 6011 struct vnode *vp; 6012 6013 a = ap; 6014 vp = a->a_vp; 6015 vn_seqc_write_begin(vp); 6016 } 6017 6018 void 6019 vop_setacl_post(void *ap, int rc __unused) 6020 { 6021 struct vop_setacl_args *a; 6022 struct vnode *vp; 6023 6024 a = ap; 6025 vp = a->a_vp; 6026 vn_seqc_write_end(vp); 6027 } 6028 6029 void 6030 vop_setextattr_pre(void *ap) 6031 { 6032 struct vop_setextattr_args *a; 6033 struct vnode *vp; 6034 6035 a = ap; 6036 vp = a->a_vp; 6037 vn_seqc_write_begin(vp); 6038 } 6039 6040 void 6041 vop_setextattr_post(void *ap, int rc) 6042 { 6043 struct vop_setextattr_args *a; 6044 struct vnode *vp; 6045 6046 a = ap; 6047 vp = a->a_vp; 6048 vn_seqc_write_end(vp); 6049 if (!rc) 6050 VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); 6051 } 6052 6053 void 6054 vop_symlink_pre(void *ap) 6055 { 6056 struct vop_symlink_args *a; 6057 struct vnode *dvp; 6058 6059 a = ap; 6060 dvp = a->a_dvp; 6061 vn_seqc_write_begin(dvp); 6062 } 6063 6064 void 6065 vop_symlink_post(void *ap, int rc) 6066 { 6067 struct vop_symlink_args *a; 6068 struct vnode *dvp; 6069 6070 a = ap; 6071 dvp = a->a_dvp; 6072 vn_seqc_write_end(dvp); 6073 if (!rc) 6074 VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); 6075 } 6076 6077 void 6078 vop_open_post(void *ap, int rc) 6079 { 6080 struct vop_open_args *a = ap; 6081 6082 if (!rc) 6083 VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); 6084 } 6085 6086 void 6087 vop_close_post(void *ap, int rc) 6088 { 6089 struct vop_close_args *a = ap; 6090 6091 if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ 6092 !VN_IS_DOOMED(a->a_vp))) { 6093 VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 6094 NOTE_CLOSE_WRITE : NOTE_CLOSE); 6095 } 6096 } 6097 6098 void 6099 vop_read_post(void *ap, int rc) 6100 { 6101 struct vop_read_args *a = ap; 6102 6103 if (!rc) 6104 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6105 } 6106 6107 void 6108 vop_read_pgcache_post(void *ap, int rc) 6109 { 6110 struct vop_read_pgcache_args *a = ap; 6111 6112 if (!rc) 6113 VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ); 6114 } 6115 6116 void 6117 vop_readdir_post(void *ap, int rc) 6118 { 6119 struct vop_readdir_args *a = ap; 6120 6121 if (!rc) 6122 VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); 6123 } 6124 6125 static struct knlist fs_knlist; 6126 6127 static void 6128 vfs_event_init(void *arg) 6129 { 6130 knlist_init_mtx(&fs_knlist, NULL); 6131 } 6132 /* XXX - correct order? 
*/ 6133 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 6134 6135 void 6136 vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 6137 { 6138 6139 KNOTE_UNLOCKED(&fs_knlist, event); 6140 } 6141 6142 static int filt_fsattach(struct knote *kn); 6143 static void filt_fsdetach(struct knote *kn); 6144 static int filt_fsevent(struct knote *kn, long hint); 6145 6146 struct filterops fs_filtops = { 6147 .f_isfd = 0, 6148 .f_attach = filt_fsattach, 6149 .f_detach = filt_fsdetach, 6150 .f_event = filt_fsevent 6151 }; 6152 6153 static int 6154 filt_fsattach(struct knote *kn) 6155 { 6156 6157 kn->kn_flags |= EV_CLEAR; 6158 knlist_add(&fs_knlist, kn, 0); 6159 return (0); 6160 } 6161 6162 static void 6163 filt_fsdetach(struct knote *kn) 6164 { 6165 6166 knlist_remove(&fs_knlist, kn, 0); 6167 } 6168 6169 static int 6170 filt_fsevent(struct knote *kn, long hint) 6171 { 6172 6173 kn->kn_fflags |= kn->kn_sfflags & hint; 6174 6175 return (kn->kn_fflags != 0); 6176 } 6177 6178 static int 6179 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 6180 { 6181 struct vfsidctl vc; 6182 int error; 6183 struct mount *mp; 6184 6185 error = SYSCTL_IN(req, &vc, sizeof(vc)); 6186 if (error) 6187 return (error); 6188 if (vc.vc_vers != VFS_CTL_VERS1) 6189 return (EINVAL); 6190 mp = vfs_getvfs(&vc.vc_fsid); 6191 if (mp == NULL) 6192 return (ENOENT); 6193 /* ensure that a specific sysctl goes to the right filesystem. */ 6194 if (strcmp(vc.vc_fstypename, "*") != 0 && 6195 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 6196 vfs_rel(mp); 6197 return (EINVAL); 6198 } 6199 VCTLTOREQ(&vc, req); 6200 error = VFS_SYSCTL(mp, vc.vc_op, req); 6201 vfs_rel(mp); 6202 return (error); 6203 } 6204 6205 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, 6206 NULL, 0, sysctl_vfs_ctl, "", 6207 "Sysctl by fsid"); 6208 6209 /* 6210 * Function to initialize a va_filerev field sensibly. 6211 * XXX: Wouldn't a random number make a lot more sense ?? 
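 * The returned value packs the boot-relative bintime: the seconds go
 * in the upper 32 bits and the top half of the fraction in the lower
 * 32 bits, so successive calls within a boot yield non-decreasing
 * (effectively increasing) values, which is all a change counter
 * needs.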
6212 */ 6213 u_quad_t 6214 init_va_filerev(void) 6215 { 6216 struct bintime bt; 6217 6218 getbinuptime(&bt); 6219 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 6220 } 6221 6222 static int filt_vfsread(struct knote *kn, long hint); 6223 static int filt_vfswrite(struct knote *kn, long hint); 6224 static int filt_vfsvnode(struct knote *kn, long hint); 6225 static void filt_vfsdetach(struct knote *kn); 6226 static struct filterops vfsread_filtops = { 6227 .f_isfd = 1, 6228 .f_detach = filt_vfsdetach, 6229 .f_event = filt_vfsread 6230 }; 6231 static struct filterops vfswrite_filtops = { 6232 .f_isfd = 1, 6233 .f_detach = filt_vfsdetach, 6234 .f_event = filt_vfswrite 6235 }; 6236 static struct filterops vfsvnode_filtops = { 6237 .f_isfd = 1, 6238 .f_detach = filt_vfsdetach, 6239 .f_event = filt_vfsvnode 6240 }; 6241 6242 static void 6243 vfs_knllock(void *arg) 6244 { 6245 struct vnode *vp = arg; 6246 6247 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 6248 } 6249 6250 static void 6251 vfs_knlunlock(void *arg) 6252 { 6253 struct vnode *vp = arg; 6254 6255 VOP_UNLOCK(vp); 6256 } 6257 6258 static void 6259 vfs_knl_assert_lock(void *arg, int what) 6260 { 6261 #ifdef DEBUG_VFS_LOCKS 6262 struct vnode *vp = arg; 6263 6264 if (what == LA_LOCKED) 6265 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); 6266 else 6267 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); 6268 #endif 6269 } 6270 6271 int 6272 vfs_kqfilter(struct vop_kqfilter_args *ap) 6273 { 6274 struct vnode *vp = ap->a_vp; 6275 struct knote *kn = ap->a_kn; 6276 struct knlist *knl; 6277 6278 KASSERT(vp->v_type != VFIFO || (kn->kn_filter != EVFILT_READ && 6279 kn->kn_filter != EVFILT_WRITE), 6280 ("READ/WRITE filter on a FIFO leaked through")); 6281 switch (kn->kn_filter) { 6282 case EVFILT_READ: 6283 kn->kn_fop = &vfsread_filtops; 6284 break; 6285 case EVFILT_WRITE: 6286 kn->kn_fop = &vfswrite_filtops; 6287 break; 6288 case EVFILT_VNODE: 6289 kn->kn_fop = &vfsvnode_filtops; 6290 break; 6291 default: 6292 return (EINVAL); 6293 } 6294 6295 kn->kn_hook = (caddr_t)vp; 6296 6297 v_addpollinfo(vp); 6298 if (vp->v_pollinfo == NULL) 6299 return (ENOMEM); 6300 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 6301 vhold(vp); 6302 knlist_add(knl, kn, 0); 6303 6304 return (0); 6305 } 6306 6307 /* 6308 * Detach knote from vnode 6309 */ 6310 static void 6311 filt_vfsdetach(struct knote *kn) 6312 { 6313 struct vnode *vp = (struct vnode *)kn->kn_hook; 6314 6315 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 6316 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 6317 vdrop(vp); 6318 } 6319 6320 /*ARGSUSED*/ 6321 static int 6322 filt_vfsread(struct knote *kn, long hint) 6323 { 6324 struct vnode *vp = (struct vnode *)kn->kn_hook; 6325 off_t size; 6326 int res; 6327 6328 /* 6329 * filesystem is gone, so set the EOF flag and schedule 6330 * the knote for deletion. 
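	 * (EV_ONESHOT causes kqueue to delete the knote once the event
	 * has been delivered to userspace.)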
6331 */ 6332 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6333 VI_LOCK(vp); 6334 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6335 VI_UNLOCK(vp); 6336 return (1); 6337 } 6338 6339 if (vn_getsize_locked(vp, &size, curthread->td_ucred) != 0) 6340 return (0); 6341 6342 VI_LOCK(vp); 6343 kn->kn_data = size - kn->kn_fp->f_offset; 6344 res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; 6345 VI_UNLOCK(vp); 6346 return (res); 6347 } 6348 6349 /*ARGSUSED*/ 6350 static int 6351 filt_vfswrite(struct knote *kn, long hint) 6352 { 6353 struct vnode *vp = (struct vnode *)kn->kn_hook; 6354 6355 VI_LOCK(vp); 6356 6357 /* 6358 * filesystem is gone, so set the EOF flag and schedule 6359 * the knote for deletion. 6360 */ 6361 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) 6362 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 6363 6364 kn->kn_data = 0; 6365 VI_UNLOCK(vp); 6366 return (1); 6367 } 6368 6369 static int 6370 filt_vfsvnode(struct knote *kn, long hint) 6371 { 6372 struct vnode *vp = (struct vnode *)kn->kn_hook; 6373 int res; 6374 6375 VI_LOCK(vp); 6376 if (kn->kn_sfflags & hint) 6377 kn->kn_fflags |= hint; 6378 if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { 6379 kn->kn_flags |= EV_EOF; 6380 VI_UNLOCK(vp); 6381 return (1); 6382 } 6383 res = (kn->kn_fflags != 0); 6384 VI_UNLOCK(vp); 6385 return (res); 6386 } 6387 6388 int 6389 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) 6390 { 6391 int error; 6392 6393 if (dp->d_reclen > ap->a_uio->uio_resid) 6394 return (ENAMETOOLONG); 6395 error = uiomove(dp, dp->d_reclen, ap->a_uio); 6396 if (error) { 6397 if (ap->a_ncookies != NULL) { 6398 if (ap->a_cookies != NULL) 6399 free(ap->a_cookies, M_TEMP); 6400 ap->a_cookies = NULL; 6401 *ap->a_ncookies = 0; 6402 } 6403 return (error); 6404 } 6405 if (ap->a_ncookies == NULL) 6406 return (0); 6407 6408 KASSERT(ap->a_cookies, 6409 ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); 6410 6411 *ap->a_cookies = realloc(*ap->a_cookies, 6412 (*ap->a_ncookies + 1) * sizeof(uint64_t), M_TEMP, M_WAITOK | M_ZERO); 6413 (*ap->a_cookies)[*ap->a_ncookies] = off; 6414 *ap->a_ncookies += 1; 6415 return (0); 6416 } 6417 6418 /* 6419 * The purpose of this routine is to remove granularity from accmode_t, 6420 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, 6421 * VADMIN and VAPPEND. 6422 * 6423 * If it returns 0, the caller is supposed to continue with the usual 6424 * access checks using 'accmode' as modified by this routine. If it 6425 * returns nonzero value, the caller is supposed to return that value 6426 * as errno. 6427 * 6428 * Note that after this routine runs, accmode may be zero. 6429 */ 6430 int 6431 vfs_unixify_accmode(accmode_t *accmode) 6432 { 6433 /* 6434 * There is no way to specify explicit "deny" rule using 6435 * file mode or POSIX.1e ACLs. 6436 */ 6437 if (*accmode & VEXPLICIT_DENY) { 6438 *accmode = 0; 6439 return (0); 6440 } 6441 6442 /* 6443 * None of these can be translated into usual access bits. 6444 * Also, the common case for NFSv4 ACLs is to not contain 6445 * either of these bits. Caller should check for VWRITE 6446 * on the containing directory instead. 6447 */ 6448 if (*accmode & (VDELETE_CHILD | VDELETE)) 6449 return (EPERM); 6450 6451 if (*accmode & VADMIN_PERMS) { 6452 *accmode &= ~VADMIN_PERMS; 6453 *accmode |= VADMIN; 6454 } 6455 6456 /* 6457 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL 6458 * or VSYNCHRONIZE using file mode or POSIX.1e ACL. 
6459 */ 6460 *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); 6461 6462 return (0); 6463 } 6464 6465 /* 6466 * Clear out a doomed vnode (if any) and replace it with a new one as long 6467 * as the fs is not being unmounted. Return the root vnode to the caller. 6468 */ 6469 static int __noinline 6470 vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) 6471 { 6472 struct vnode *vp; 6473 int error; 6474 6475 restart: 6476 if (mp->mnt_rootvnode != NULL) { 6477 MNT_ILOCK(mp); 6478 vp = mp->mnt_rootvnode; 6479 if (vp != NULL) { 6480 if (!VN_IS_DOOMED(vp)) { 6481 vrefact(vp); 6482 MNT_IUNLOCK(mp); 6483 error = vn_lock(vp, flags); 6484 if (error == 0) { 6485 *vpp = vp; 6486 return (0); 6487 } 6488 vrele(vp); 6489 goto restart; 6490 } 6491 /* 6492 * Clear the old one. 6493 */ 6494 mp->mnt_rootvnode = NULL; 6495 } 6496 MNT_IUNLOCK(mp); 6497 if (vp != NULL) { 6498 vfs_op_barrier_wait(mp); 6499 vrele(vp); 6500 } 6501 } 6502 error = VFS_CACHEDROOT(mp, flags, vpp); 6503 if (error != 0) 6504 return (error); 6505 if (mp->mnt_vfs_ops == 0) { 6506 MNT_ILOCK(mp); 6507 if (mp->mnt_vfs_ops != 0) { 6508 MNT_IUNLOCK(mp); 6509 return (0); 6510 } 6511 if (mp->mnt_rootvnode == NULL) { 6512 vrefact(*vpp); 6513 mp->mnt_rootvnode = *vpp; 6514 } else { 6515 if (mp->mnt_rootvnode != *vpp) { 6516 if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { 6517 panic("%s: mismatch between vnode returned " 6518 " by VFS_CACHEDROOT and the one cached " 6519 " (%p != %p)", 6520 __func__, *vpp, mp->mnt_rootvnode); 6521 } 6522 } 6523 } 6524 MNT_IUNLOCK(mp); 6525 } 6526 return (0); 6527 } 6528 6529 int 6530 vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) 6531 { 6532 struct mount_pcpu *mpcpu; 6533 struct vnode *vp; 6534 int error; 6535 6536 if (!vfs_op_thread_enter(mp, mpcpu)) 6537 return (vfs_cache_root_fallback(mp, flags, vpp)); 6538 vp = atomic_load_ptr(&mp->mnt_rootvnode); 6539 if (vp == NULL || VN_IS_DOOMED(vp)) { 6540 vfs_op_thread_exit(mp, mpcpu); 6541 return (vfs_cache_root_fallback(mp, flags, vpp)); 6542 } 6543 vrefact(vp); 6544 vfs_op_thread_exit(mp, mpcpu); 6545 error = vn_lock(vp, flags); 6546 if (error != 0) { 6547 vrele(vp); 6548 return (vfs_cache_root_fallback(mp, flags, vpp)); 6549 } 6550 *vpp = vp; 6551 return (0); 6552 } 6553 6554 struct vnode * 6555 vfs_cache_root_clear(struct mount *mp) 6556 { 6557 struct vnode *vp; 6558 6559 /* 6560 * ops > 0 guarantees there is nobody who can see this vnode 6561 */ 6562 MPASS(mp->mnt_vfs_ops > 0); 6563 vp = mp->mnt_rootvnode; 6564 if (vp != NULL) 6565 vn_seqc_write_begin(vp); 6566 mp->mnt_rootvnode = NULL; 6567 return (vp); 6568 } 6569 6570 void 6571 vfs_cache_root_set(struct mount *mp, struct vnode *vp) 6572 { 6573 6574 MPASS(mp->mnt_vfs_ops > 0); 6575 vrefact(vp); 6576 mp->mnt_rootvnode = vp; 6577 } 6578 6579 /* 6580 * These are helper functions for filesystems to traverse all 6581 * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. 6582 * 6583 * This interface replaces MNT_VNODE_FOREACH. 6584 */ 6585 6586 struct vnode * 6587 __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) 6588 { 6589 struct vnode *vp; 6590 6591 maybe_yield(); 6592 MNT_ILOCK(mp); 6593 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6594 for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; 6595 vp = TAILQ_NEXT(vp, v_nmntvnodes)) { 6596 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. 
*/ 6597 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 6598 continue; 6599 VI_LOCK(vp); 6600 if (VN_IS_DOOMED(vp)) { 6601 VI_UNLOCK(vp); 6602 continue; 6603 } 6604 break; 6605 } 6606 if (vp == NULL) { 6607 __mnt_vnode_markerfree_all(mvp, mp); 6608 /* MNT_IUNLOCK(mp); -- done in above function */ 6609 mtx_assert(MNT_MTX(mp), MA_NOTOWNED); 6610 return (NULL); 6611 } 6612 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 6613 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 6614 MNT_IUNLOCK(mp); 6615 return (vp); 6616 } 6617 6618 struct vnode * 6619 __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) 6620 { 6621 struct vnode *vp; 6622 6623 *mvp = vn_alloc_marker(mp); 6624 MNT_ILOCK(mp); 6625 MNT_REF(mp); 6626 6627 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 6628 /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ 6629 if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) 6630 continue; 6631 VI_LOCK(vp); 6632 if (VN_IS_DOOMED(vp)) { 6633 VI_UNLOCK(vp); 6634 continue; 6635 } 6636 break; 6637 } 6638 if (vp == NULL) { 6639 MNT_REL(mp); 6640 MNT_IUNLOCK(mp); 6641 vn_free_marker(*mvp); 6642 *mvp = NULL; 6643 return (NULL); 6644 } 6645 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 6646 MNT_IUNLOCK(mp); 6647 return (vp); 6648 } 6649 6650 void 6651 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) 6652 { 6653 6654 if (*mvp == NULL) { 6655 MNT_IUNLOCK(mp); 6656 return; 6657 } 6658 6659 mtx_assert(MNT_MTX(mp), MA_OWNED); 6660 6661 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6662 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 6663 MNT_REL(mp); 6664 MNT_IUNLOCK(mp); 6665 vn_free_marker(*mvp); 6666 *mvp = NULL; 6667 } 6668 6669 /* 6670 * These are helper functions for filesystems to traverse their 6671 * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h 6672 */ 6673 static void 6674 mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 6675 { 6676 6677 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6678 6679 MNT_ILOCK(mp); 6680 MNT_REL(mp); 6681 MNT_IUNLOCK(mp); 6682 vn_free_marker(*mvp); 6683 *mvp = NULL; 6684 } 6685 6686 /* 6687 * Relock the mp mount vnode list lock with the vp vnode interlock in the 6688 * conventional lock order during mnt_vnode_next_lazy iteration. 6689 * 6690 * On entry, the mount vnode list lock is held and the vnode interlock is not. 6691 * The list lock is dropped and reacquired. On success, both locks are held. 6692 * On failure, the mount vnode list lock is held but the vnode interlock is 6693 * not, and the procedure may have yielded. 6694 */ 6695 static bool 6696 mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, 6697 struct vnode *vp) 6698 { 6699 6700 VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && 6701 TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, 6702 ("%s: bad marker", __func__)); 6703 VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, 6704 ("%s: inappropriate vnode", __func__)); 6705 ASSERT_VI_UNLOCKED(vp, __func__); 6706 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 6707 6708 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); 6709 TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); 6710 6711 /* 6712 * Note we may be racing against vdrop which transitioned the hold 6713 * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, 6714 * if we are the only user after we get the interlock we will just 6715 * vdrop. 
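	 * That case is caught by the refcount_release_if_not_last() call
	 * below: if this thread holds the last reference, the relock is
	 * abandoned and the vnode is dropped via vdropl().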
6716 */ 6717 vhold(vp); 6718 mtx_unlock(&mp->mnt_listmtx); 6719 VI_LOCK(vp); 6720 if (VN_IS_DOOMED(vp)) { 6721 VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); 6722 goto out_lost; 6723 } 6724 VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); 6725 /* 6726 * There is nothing to do if we are the last user. 6727 */ 6728 if (!refcount_release_if_not_last(&vp->v_holdcnt)) 6729 goto out_lost; 6730 mtx_lock(&mp->mnt_listmtx); 6731 return (true); 6732 out_lost: 6733 vdropl(vp); 6734 maybe_yield(); 6735 mtx_lock(&mp->mnt_listmtx); 6736 return (false); 6737 } 6738 6739 static struct vnode * 6740 mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 6741 void *cbarg) 6742 { 6743 struct vnode *vp; 6744 6745 mtx_assert(&mp->mnt_listmtx, MA_OWNED); 6746 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 6747 restart: 6748 vp = TAILQ_NEXT(*mvp, v_lazylist); 6749 while (vp != NULL) { 6750 if (vp->v_type == VMARKER) { 6751 vp = TAILQ_NEXT(vp, v_lazylist); 6752 continue; 6753 } 6754 /* 6755 * See if we want to process the vnode. Note we may encounter a 6756 * long string of vnodes we don't care about and hog the list 6757 * as a result. Check for it and requeue the marker. 6758 */ 6759 VNPASS(!VN_IS_DOOMED(vp), vp); 6760 if (!cb(vp, cbarg)) { 6761 if (!should_yield()) { 6762 vp = TAILQ_NEXT(vp, v_lazylist); 6763 continue; 6764 } 6765 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, 6766 v_lazylist); 6767 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, 6768 v_lazylist); 6769 mtx_unlock(&mp->mnt_listmtx); 6770 kern_yield(PRI_USER); 6771 mtx_lock(&mp->mnt_listmtx); 6772 goto restart; 6773 } 6774 /* 6775 * Try-lock because this is the wrong lock order. 6776 */ 6777 if (!VI_TRYLOCK(vp) && 6778 !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) 6779 goto restart; 6780 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); 6781 KASSERT(vp->v_mount == mp || vp->v_mount == NULL, 6782 ("alien vnode on the lazy list %p %p", vp, mp)); 6783 VNPASS(vp->v_mount == mp, vp); 6784 VNPASS(!VN_IS_DOOMED(vp), vp); 6785 break; 6786 } 6787 TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 6788 6789 /* Check if we are done */ 6790 if (vp == NULL) { 6791 mtx_unlock(&mp->mnt_listmtx); 6792 mnt_vnode_markerfree_lazy(mvp, mp); 6793 return (NULL); 6794 } 6795 TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); 6796 mtx_unlock(&mp->mnt_listmtx); 6797 ASSERT_VI_LOCKED(vp, "lazy iter"); 6798 return (vp); 6799 } 6800 6801 struct vnode * 6802 __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 6803 void *cbarg) 6804 { 6805 6806 maybe_yield(); 6807 mtx_lock(&mp->mnt_listmtx); 6808 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 6809 } 6810 6811 struct vnode * 6812 __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, 6813 void *cbarg) 6814 { 6815 struct vnode *vp; 6816 6817 if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) 6818 return (NULL); 6819 6820 *mvp = vn_alloc_marker(mp); 6821 MNT_ILOCK(mp); 6822 MNT_REF(mp); 6823 MNT_IUNLOCK(mp); 6824 6825 mtx_lock(&mp->mnt_listmtx); 6826 vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); 6827 if (vp == NULL) { 6828 mtx_unlock(&mp->mnt_listmtx); 6829 mnt_vnode_markerfree_lazy(mvp, mp); 6830 return (NULL); 6831 } 6832 TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); 6833 return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); 6834 } 6835 6836 void 6837 __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) 6838 { 6839 6840 if (*mvp == NULL) 6841 return; 6842 6843 mtx_lock(&mp->mnt_listmtx); 6844 
TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); 6845 mtx_unlock(&mp->mnt_listmtx); 6846 mnt_vnode_markerfree_lazy(mvp, mp); 6847 } 6848 6849 int 6850 vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) 6851 { 6852 6853 if ((cnp->cn_flags & NOEXECCHECK) != 0) { 6854 cnp->cn_flags &= ~NOEXECCHECK; 6855 return (0); 6856 } 6857 6858 return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, curthread)); 6859 } 6860 6861 /* 6862 * Do not use this variant unless you have means other than the hold count 6863 * to prevent the vnode from getting freed. 6864 */ 6865 void 6866 vn_seqc_write_begin_locked(struct vnode *vp) 6867 { 6868 6869 ASSERT_VI_LOCKED(vp, __func__); 6870 VNPASS(vp->v_holdcnt > 0, vp); 6871 VNPASS(vp->v_seqc_users >= 0, vp); 6872 vp->v_seqc_users++; 6873 if (vp->v_seqc_users == 1) 6874 seqc_sleepable_write_begin(&vp->v_seqc); 6875 } 6876 6877 void 6878 vn_seqc_write_begin(struct vnode *vp) 6879 { 6880 6881 VI_LOCK(vp); 6882 vn_seqc_write_begin_locked(vp); 6883 VI_UNLOCK(vp); 6884 } 6885 6886 void 6887 vn_seqc_write_end_locked(struct vnode *vp) 6888 { 6889 6890 ASSERT_VI_LOCKED(vp, __func__); 6891 VNPASS(vp->v_seqc_users > 0, vp); 6892 vp->v_seqc_users--; 6893 if (vp->v_seqc_users == 0) 6894 seqc_sleepable_write_end(&vp->v_seqc); 6895 } 6896 6897 void 6898 vn_seqc_write_end(struct vnode *vp) 6899 { 6900 6901 VI_LOCK(vp); 6902 vn_seqc_write_end_locked(vp); 6903 VI_UNLOCK(vp); 6904 } 6905 6906 /* 6907 * Special case handling for allocating and freeing vnodes. 6908 * 6909 * The counter remains unchanged on free so that a doomed vnode will 6910 * keep testing as in modify as long as it is accessible with SMR. 6911 */ 6912 static void 6913 vn_seqc_init(struct vnode *vp) 6914 { 6915 6916 vp->v_seqc = 0; 6917 vp->v_seqc_users = 0; 6918 } 6919 6920 static void 6921 vn_seqc_write_end_free(struct vnode *vp) 6922 { 6923 6924 VNPASS(seqc_in_modify(vp->v_seqc), vp); 6925 VNPASS(vp->v_seqc_users == 1, vp); 6926 } 6927 6928 void 6929 vn_irflag_set_locked(struct vnode *vp, short toset) 6930 { 6931 short flags; 6932 6933 ASSERT_VI_LOCKED(vp, __func__); 6934 flags = vn_irflag_read(vp); 6935 VNASSERT((flags & toset) == 0, vp, 6936 ("%s: some of the passed flags already set (have %d, passed %d)\n", 6937 __func__, flags, toset)); 6938 atomic_store_short(&vp->v_irflag, flags | toset); 6939 } 6940 6941 void 6942 vn_irflag_set(struct vnode *vp, short toset) 6943 { 6944 6945 VI_LOCK(vp); 6946 vn_irflag_set_locked(vp, toset); 6947 VI_UNLOCK(vp); 6948 } 6949 6950 void 6951 vn_irflag_set_cond_locked(struct vnode *vp, short toset) 6952 { 6953 short flags; 6954 6955 ASSERT_VI_LOCKED(vp, __func__); 6956 flags = vn_irflag_read(vp); 6957 atomic_store_short(&vp->v_irflag, flags | toset); 6958 } 6959 6960 void 6961 vn_irflag_set_cond(struct vnode *vp, short toset) 6962 { 6963 6964 VI_LOCK(vp); 6965 vn_irflag_set_cond_locked(vp, toset); 6966 VI_UNLOCK(vp); 6967 } 6968 6969 void 6970 vn_irflag_unset_locked(struct vnode *vp, short tounset) 6971 { 6972 short flags; 6973 6974 ASSERT_VI_LOCKED(vp, __func__); 6975 flags = vn_irflag_read(vp); 6976 VNASSERT((flags & tounset) == tounset, vp, 6977 ("%s: some of the passed flags not set (have %d, passed %d)\n", 6978 __func__, flags, tounset)); 6979 atomic_store_short(&vp->v_irflag, flags & ~tounset); 6980 } 6981 6982 void 6983 vn_irflag_unset(struct vnode *vp, short tounset) 6984 { 6985 6986 VI_LOCK(vp); 6987 vn_irflag_unset_locked(vp, tounset); 6988 VI_UNLOCK(vp); 6989 } 6990 6991 int 6992 vn_getsize_locked(struct vnode *vp, off_t *size, struct ucred *cred) 6993 { 6994 
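	/*
	 * Fetch the full attribute set and hand back only the size,
	 * rejecting anything that does not fit into an off_t with EFBIG.
	 */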
struct vattr vattr; 6995 int error; 6996 6997 ASSERT_VOP_LOCKED(vp, __func__); 6998 error = VOP_GETATTR(vp, &vattr, cred); 6999 if (__predict_true(error == 0)) { 7000 if (vattr.va_size <= OFF_MAX) 7001 *size = vattr.va_size; 7002 else 7003 error = EFBIG; 7004 } 7005 return (error); 7006 } 7007 7008 int 7009 vn_getsize(struct vnode *vp, off_t *size, struct ucred *cred) 7010 { 7011 int error; 7012 7013 VOP_LOCK(vp, LK_SHARED); 7014 error = vn_getsize_locked(vp, size, cred); 7015 VOP_UNLOCK(vp); 7016 return (error); 7017 } 7018 7019 #ifdef INVARIANTS 7020 void 7021 vn_set_state_validate(struct vnode *vp, __enum_uint8(vstate) state) 7022 { 7023 7024 switch (vp->v_state) { 7025 case VSTATE_UNINITIALIZED: 7026 switch (state) { 7027 case VSTATE_CONSTRUCTED: 7028 case VSTATE_DESTROYING: 7029 return; 7030 default: 7031 break; 7032 } 7033 break; 7034 case VSTATE_CONSTRUCTED: 7035 ASSERT_VOP_ELOCKED(vp, __func__); 7036 switch (state) { 7037 case VSTATE_DESTROYING: 7038 return; 7039 default: 7040 break; 7041 } 7042 break; 7043 case VSTATE_DESTROYING: 7044 ASSERT_VOP_ELOCKED(vp, __func__); 7045 switch (state) { 7046 case VSTATE_DEAD: 7047 return; 7048 default: 7049 break; 7050 } 7051 break; 7052 case VSTATE_DEAD: 7053 switch (state) { 7054 case VSTATE_UNINITIALIZED: 7055 return; 7056 default: 7057 break; 7058 } 7059 break; 7060 } 7061 7062 vn_printf(vp, "invalid state transition %d -> %d\n", vp->v_state, state); 7063 panic("invalid state transition %d -> %d\n", vp->v_state, state); 7064 } 7065 #endif 7066
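/*
 * Illustrative sketch (not part of the original file): a typical walk over
 * all vnodes of a mount point "mp" using the MNT_VNODE_FOREACH_ALL()
 * helpers implemented above.  The iterator hands back each vnode with its
 * interlock held; "fs_handle_vnode" is a hypothetical per-vnode routine and
 * the vget() flags may differ per filesystem.
 *
 *	struct vnode *vp, *mvp;
 *
 *	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 *		if (vp->v_type == VNON) {
 *			VI_UNLOCK(vp);
 *			continue;
 *		}
 *		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK) != 0)
 *			continue;
 *		fs_handle_vnode(vp);
 *		vput(vp);
 *	}
 *
 * Breaking out of the loop early would additionally require
 * MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp) to free the marker.
 */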